diff options
Diffstat (limited to 'kernel')
35 files changed, 783 insertions, 510 deletions
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 112581cf97e7..106735145948 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -283,7 +283,6 @@ static int iter_release(struct inode *inode, struct file *file) const struct file_operations bpf_iter_fops = { .open = iter_open, - .llseek = no_llseek, .read = bpf_seq_read, .release = iter_release, }; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 4e07cc057d6f..5e77c58e0601 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -40,7 +40,7 @@ #include <linux/execmem.h> #include <asm/barrier.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> /* Registers */ #define BPF_R0 regs[BPF_REG_0] diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 9d34d2364b5a..f625172d4b67 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -33,7 +33,7 @@ #include <linux/reboot.h> #include <linux/uaccess.h> #include <asm/cacheflush.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include "debug_core.h" #define KGDB_MAX_THREAD_QUERY 17 diff --git a/kernel/events/core.c b/kernel/events/core.c index 5a8071c45c80..e3589c4287cb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6821,7 +6821,6 @@ static int perf_fasync(int fd, struct file *filp, int on) } static const struct file_operations perf_fops = { - .llseek = no_llseek, .release = perf_release, .read = perf_read, .poll = perf_poll, diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2ec796e2f055..4b52cb2ae6d6 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1545,7 +1545,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) if (!area->bitmap) goto free_area; - area->page = alloc_page(GFP_HIGHUSER); + area->page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); if (!area->page) goto free_bitmap; diff --git a/kernel/fork.c b/kernel/fork.c index cbdaca45d0c1..89ceb4a68af2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -999,7 +999,7 @@ void __init __weak arch_task_cache_init(void) { } static void __init set_max_threads(unsigned int max_threads_suggested) { u64 threads; - unsigned long nr_pages = PHYS_PFN(memblock_phys_mem_size() - memblock_reserved_size()); + unsigned long nr_pages = memblock_estimated_nr_free_pages(); /* * The number of threads shall be limited such that the thread @@ -1756,33 +1756,30 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk, int no_files) { struct files_struct *oldf, *newf; - int error = 0; /* * A background process may not have any files ... */ oldf = current->files; if (!oldf) - goto out; + return 0; if (no_files) { tsk->files = NULL; - goto out; + return 0; } if (clone_flags & CLONE_FILES) { atomic_inc(&oldf->count); - goto out; + return 0; } - newf = dup_fd(oldf, NR_OPEN_MAX, &error); - if (!newf) - goto out; + newf = dup_fd(oldf, NULL); + if (IS_ERR(newf)) + return PTR_ERR(newf); tsk->files = newf; - error = 0; -out: - return error; + return 0; } static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) @@ -3238,17 +3235,16 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) /* * Unshare file descriptor table if it is being shared */ -int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, - struct files_struct **new_fdp) +static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) { struct files_struct *fd = current->files; - int error = 0; if ((unshare_flags & CLONE_FILES) && (fd && atomic_read(&fd->count) > 1)) { - *new_fdp = dup_fd(fd, max_fds, &error); - if (!*new_fdp) - return error; + fd = dup_fd(fd, NULL); + if (IS_ERR(fd)) + return PTR_ERR(fd); + *new_fdp = fd; } return 0; @@ -3306,7 +3302,7 @@ int ksys_unshare(unsigned long unshare_flags) err = unshare_fs(unshare_flags, &new_fs); if (err) goto bad_unshare_out; - err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd); + err = unshare_fd(unshare_flags, &new_fd); if (err) goto bad_unshare_cleanup_fs; err = unshare_userns(unshare_flags, &new_cred); @@ -3398,7 +3394,7 @@ int unshare_files(void) struct files_struct *old, *copy = NULL; int error; - error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, ©); + error = unshare_fd(CLONE_FILES, ©); if (error || !copy) return error; diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 6dc76b590703..93a822d3c468 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -168,7 +168,7 @@ bool static_key_slow_inc_cpuslocked(struct static_key *key) jump_label_update(key); /* * Ensure that when static_key_fast_inc_not_disabled() or - * static_key_slow_try_dec() observe the positive value, + * static_key_dec_not_one() observe the positive value, * they must also observe all the text changes. */ atomic_set_release(&key->enabled, 1); @@ -250,7 +250,7 @@ void static_key_disable(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_disable); -static bool static_key_slow_try_dec(struct static_key *key) +static bool static_key_dec_not_one(struct static_key *key) { int v; @@ -274,6 +274,14 @@ static bool static_key_slow_try_dec(struct static_key *key) * enabled. This suggests an ordering problem on the user side. */ WARN_ON_ONCE(v < 0); + + /* + * Warn about underflow, and lie about success in an attempt to + * not make things worse. + */ + if (WARN_ON_ONCE(v == 0)) + return true; + if (v <= 1) return false; } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v - 1))); @@ -284,15 +292,27 @@ static bool static_key_slow_try_dec(struct static_key *key) static void __static_key_slow_dec_cpuslocked(struct static_key *key) { lockdep_assert_cpus_held(); + int val; - if (static_key_slow_try_dec(key)) + if (static_key_dec_not_one(key)) return; guard(mutex)(&jump_label_mutex); - if (atomic_cmpxchg(&key->enabled, 1, 0) == 1) + val = atomic_read(&key->enabled); + /* + * It should be impossible to observe -1 with jump_label_mutex held, + * see static_key_slow_inc_cpuslocked(). + */ + if (WARN_ON_ONCE(val == -1)) + return; + /* + * Cannot already be 0, something went sideways. + */ + if (WARN_ON_ONCE(val == 0)) + return; + + if (atomic_dec_and_test(&key->enabled)) jump_label_update(key); - else - WARN_ON_ONCE(!static_key_slow_try_dec(key)); } static void __static_key_slow_dec(struct static_key *key) @@ -329,7 +349,7 @@ void __static_key_slow_dec_deferred(struct static_key *key, { STATIC_KEY_CHECK_USE(key); - if (static_key_slow_try_dec(key)) + if (static_key_dec_not_one(key)) return; schedule_delayed_work(work, timeout); diff --git a/kernel/kthread.c b/kernel/kthread.c index db4ceb0f503c..9bb36897b6c6 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -623,6 +623,8 @@ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = to_kthread(k); + if (!test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)) + return; /* * Newly created kthread was parked when the CPU was offline. * The binding was lost and we need to set it again. diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 7963deac33c3..536bd471557f 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -788,7 +788,7 @@ static void lockdep_print_held_locks(struct task_struct *p) printk("no locks held by %s/%d.\n", p->comm, task_pid_nr(p)); else printk("%d lock%s held by %s/%d:\n", depth, - depth > 1 ? "s" : "", p->comm, task_pid_nr(p)); + str_plural(depth), p->comm, task_pid_nr(p)); /* * It's not reliable to print a task's held locks if it's not sleeping * and it's not the current task. @@ -2084,6 +2084,9 @@ static noinline void print_bfs_bug(int ret) /* * Breadth-first-search failed, graph got corrupted? */ + if (ret == BFS_EQUEUEFULL) + pr_warn("Increase LOCKDEP_CIRCULAR_QUEUE_BITS to avoid this warning:\n"); + WARN(1, "lockdep bfs error:%d\n", ret); } @@ -6263,25 +6266,27 @@ static struct pending_free *get_pending_free(void) static void free_zapped_rcu(struct rcu_head *cb); /* - * Schedule an RCU callback if no RCU callback is pending. Must be called with - * the graph lock held. - */ -static void call_rcu_zapped(struct pending_free *pf) +* See if we need to queue an RCU callback, must called with +* the lockdep lock held, returns false if either we don't have +* any pending free or the callback is already scheduled. +* Otherwise, a call_rcu() must follow this function call. +*/ +static bool prepare_call_rcu_zapped(struct pending_free *pf) { WARN_ON_ONCE(inside_selftest()); if (list_empty(&pf->zapped)) - return; + return false; if (delayed_free.scheduled) - return; + return false; delayed_free.scheduled = true; WARN_ON_ONCE(delayed_free.pf + delayed_free.index != pf); delayed_free.index ^= 1; - call_rcu(&delayed_free.rcu_head, free_zapped_rcu); + return true; } /* The caller must hold the graph lock. May be called from RCU context. */ @@ -6307,6 +6312,7 @@ static void free_zapped_rcu(struct rcu_head *ch) { struct pending_free *pf; unsigned long flags; + bool need_callback; if (WARN_ON_ONCE(ch != &delayed_free.rcu_head)) return; @@ -6318,14 +6324,18 @@ static void free_zapped_rcu(struct rcu_head *ch) pf = delayed_free.pf + (delayed_free.index ^ 1); __free_zapped_classes(pf); delayed_free.scheduled = false; + need_callback = + prepare_call_rcu_zapped(delayed_free.pf + delayed_free.index); + lockdep_unlock(); + raw_local_irq_restore(flags); /* - * If there's anything on the open list, close and start a new callback. - */ - call_rcu_zapped(delayed_free.pf + delayed_free.index); + * If there's pending free and its callback has not been scheduled, + * queue an RCU callback. + */ + if (need_callback) + call_rcu(&delayed_free.rcu_head, free_zapped_rcu); - lockdep_unlock(); - raw_local_irq_restore(flags); } /* @@ -6365,6 +6375,7 @@ static void lockdep_free_key_range_reg(void *start, unsigned long size) { struct pending_free *pf; unsigned long flags; + bool need_callback; init_data_structures_once(); @@ -6372,10 +6383,11 @@ static void lockdep_free_key_range_reg(void *start, unsigned long size) lockdep_lock(); pf = get_pending_free(); __lockdep_free_key_range(pf, start, size); - call_rcu_zapped(pf); + need_callback = prepare_call_rcu_zapped(pf); lockdep_unlock(); raw_local_irq_restore(flags); - + if (need_callback) + call_rcu(&delayed_free.rcu_head, free_zapped_rcu); /* * Wait for any possible iterators from look_up_lock_class() to pass * before continuing to free the memory they refer to. @@ -6469,6 +6481,7 @@ static void lockdep_reset_lock_reg(struct lockdep_map *lock) struct pending_free *pf; unsigned long flags; int locked; + bool need_callback = false; raw_local_irq_save(flags); locked = graph_lock(); @@ -6477,11 +6490,13 @@ static void lockdep_reset_lock_reg(struct lockdep_map *lock) pf = get_pending_free(); __lockdep_reset_lock(pf, lock); - call_rcu_zapped(pf); + need_callback = prepare_call_rcu_zapped(pf); graph_unlock(); out_irq: raw_local_irq_restore(flags); + if (need_callback) + call_rcu(&delayed_free.rcu_head, free_zapped_rcu); } /* @@ -6525,6 +6540,7 @@ void lockdep_unregister_key(struct lock_class_key *key) struct pending_free *pf; unsigned long flags; bool found = false; + bool need_callback = false; might_sleep(); @@ -6545,11 +6561,14 @@ void lockdep_unregister_key(struct lock_class_key *key) if (found) { pf = get_pending_free(); __lockdep_free_key_range(pf, key, 1); - call_rcu_zapped(pf); + need_callback = prepare_call_rcu_zapped(pf); } lockdep_unlock(); raw_local_irq_restore(flags); + if (need_callback) + call_rcu(&delayed_free.rcu_head, free_zapped_rcu); + /* Wait until is_dynamic_key() has finished accessing k->hash_entry. */ synchronize_rcu(); } diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index e2bfb1db589d..6db0f43fc4df 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -424,7 +424,7 @@ static void seq_line(struct seq_file *m, char c, int offset, int length) for (i = 0; i < offset; i++) seq_puts(m, " "); for (i = 0; i < length; i++) - seq_printf(m, "%c", c); + seq_putc(m, c); seq_puts(m, "\n"); } diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 5ded7dff46ef..2bbb6eca5144 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -181,12 +181,21 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) __rwsem_set_reader_owned(sem, current); } +#ifdef CONFIG_DEBUG_RWSEMS +/* + * Return just the real task structure pointer of the owner + */ +static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem) +{ + return (struct task_struct *) + (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK); +} + /* * Return true if the rwsem is owned by a reader. */ static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) { -#ifdef CONFIG_DEBUG_RWSEMS /* * Check the count to see if it is write-locked. */ @@ -194,11 +203,9 @@ static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) if (count & RWSEM_WRITER_MASK) return false; -#endif return rwsem_test_oflags(sem, RWSEM_READER_OWNED); } -#ifdef CONFIG_DEBUG_RWSEMS /* * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there * is a task pointer in owner of a reader-owned rwsem, it will be the @@ -266,15 +273,6 @@ static inline bool rwsem_write_trylock(struct rw_semaphore *sem) } /* - * Return just the real task structure pointer of the owner - */ -static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem) -{ - return (struct task_struct *) - (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK); -} - -/* * Return the real task structure pointer of the owner and the embedded * flags in the owner. pflags must be non-NULL. */ diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index 4047b6d48255..7c6588148d42 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -160,6 +160,7 @@ config MODULE_UNLOAD_TAINT_TRACKING config MODVERSIONS bool "Module versioning support" + depends on !COMPILE_TEST help Usually, you have to use modules compiled with your kernel. Saying Y here makes it sometimes possible to use modules @@ -228,7 +229,7 @@ comment "Do not forget to sign required modules with scripts/sign-file" depends on MODULE_SIG_FORCE && !MODULE_SIG_ALL choice - prompt "Which hash algorithm should modules be signed with?" + prompt "Hash algorithm to sign modules" depends on MODULE_SIG || IMA_APPRAISE_MODSIG help This determines which sort of hashing algorithm will be used during @@ -238,31 +239,31 @@ choice the signature on that module. config MODULE_SIG_SHA1 - bool "Sign modules with SHA-1" + bool "SHA-1" select CRYPTO_SHA1 config MODULE_SIG_SHA256 - bool "Sign modules with SHA-256" + bool "SHA-256" select CRYPTO_SHA256 config MODULE_SIG_SHA384 - bool "Sign modules with SHA-384" + bool "SHA-384" select CRYPTO_SHA512 config MODULE_SIG_SHA512 - bool "Sign modules with SHA-512" + bool "SHA-512" select CRYPTO_SHA512 config MODULE_SIG_SHA3_256 - bool "Sign modules with SHA3-256" + bool "SHA3-256" select CRYPTO_SHA3 config MODULE_SIG_SHA3_384 - bool "Sign modules with SHA3-384" + bool "SHA3-384" select CRYPTO_SHA3 config MODULE_SIG_SHA3_512 - bool "Sign modules with SHA3-512" + bool "SHA3-512" select CRYPTO_SHA3 endchoice @@ -278,64 +279,65 @@ config MODULE_SIG_HASH default "sha3-384" if MODULE_SIG_SHA3_384 default "sha3-512" if MODULE_SIG_SHA3_512 -choice - prompt "Module compression mode" +config MODULE_COMPRESS + bool "Module compression" help - This option allows you to choose the algorithm which will be used to - compress modules when 'make modules_install' is run. (or, you can - choose to not compress modules at all.) - - External modules will also be compressed in the same way during the - installation. - - For modules inside an initrd or initramfs, it's more efficient to - compress the whole initrd or initramfs instead. - + Enable module compression to reduce on-disk size of module binaries. This is fully compatible with signed modules. - Please note that the tool used to load modules needs to support the - corresponding algorithm. module-init-tools MAY support gzip, and kmod - MAY support gzip, xz and zstd. + The tool used to work with modules needs to support the selected + compression type. kmod MAY support gzip, xz and zstd. Other tools + might have a limited selection of the supported types. - Your build system needs to provide the appropriate compression tool - to compress the modules. + Note that for modules inside an initrd or initramfs, it's more + efficient to compress the whole ramdisk instead. - If in doubt, select 'None'. + If unsure, say N. -config MODULE_COMPRESS_NONE - bool "None" +choice + prompt "Module compression type" + depends on MODULE_COMPRESS help - Do not compress modules. The installed modules are suffixed - with .ko. + Choose the supported algorithm for module compression. config MODULE_COMPRESS_GZIP bool "GZIP" help - Compress modules with GZIP. The installed modules are suffixed - with .ko.gz. + Support modules compressed with GZIP. The installed modules are + suffixed with .ko.gz. config MODULE_COMPRESS_XZ bool "XZ" help - Compress modules with XZ. The installed modules are suffixed - with .ko.xz. + Support modules compressed with XZ. The installed modules are + suffixed with .ko.xz. config MODULE_COMPRESS_ZSTD bool "ZSTD" help - Compress modules with ZSTD. The installed modules are suffixed - with .ko.zst. + Support modules compressed with ZSTD. The installed modules are + suffixed with .ko.zst. endchoice +config MODULE_COMPRESS_ALL + bool "Automatically compress all modules" + default y + depends on MODULE_COMPRESS + help + Compress all modules during 'make modules_install'. + + Your build system needs to provide the appropriate compression tool + for the selected compression type. External modules will also be + compressed in the same way during the installation. + config MODULE_DECOMPRESS bool "Support in-kernel module decompression" - depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ || MODULE_COMPRESS_ZSTD + depends on MODULE_COMPRESS select ZLIB_INFLATE if MODULE_COMPRESS_GZIP select XZ_DEC if MODULE_COMPRESS_XZ select ZSTD_DECOMPRESS if MODULE_COMPRESS_ZSTD help - Support for decompressing kernel modules by the kernel itself instead of relying on userspace to perform this task. Useful when load pinning security policy is enabled. diff --git a/kernel/module/debug_kmemleak.c b/kernel/module/debug_kmemleak.c index 12a569d361e8..b4cc03842d70 100644 --- a/kernel/module/debug_kmemleak.c +++ b/kernel/module/debug_kmemleak.c @@ -12,19 +12,9 @@ void kmemleak_load_module(const struct module *mod, const struct load_info *info) { - unsigned int i; - - /* only scan the sections containing data */ - kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); - - for (i = 1; i < info->hdr->e_shnum; i++) { - /* Scan all writable sections that's not executable */ - if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) || - !(info->sechdrs[i].sh_flags & SHF_WRITE) || - (info->sechdrs[i].sh_flags & SHF_EXECINSTR)) - continue; - - kmemleak_scan_area((void *)info->sechdrs[i].sh_addr, - info->sechdrs[i].sh_size, GFP_KERNEL); + /* only scan writable, non-executable sections */ + for_each_mod_mem_type(type) { + if (type != MOD_DATA && type != MOD_INIT_DATA) + kmemleak_no_scan(mod->mem[type].base); } } diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c index 26efe1305c12..456358e1fdc4 100644 --- a/kernel/module/sysfs.c +++ b/kernel/module/sysfs.c @@ -69,12 +69,13 @@ static void free_sect_attrs(struct module_sect_attrs *sect_attrs) kfree(sect_attrs); } -static void add_sect_attrs(struct module *mod, const struct load_info *info) +static int add_sect_attrs(struct module *mod, const struct load_info *info) { unsigned int nloaded = 0, i, size[2]; struct module_sect_attrs *sect_attrs; struct module_sect_attr *sattr; struct bin_attribute **gattr; + int ret; /* Count loaded sections and allocate structures */ for (i = 0; i < info->hdr->e_shnum; i++) @@ -85,7 +86,7 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info) size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]); sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL); if (!sect_attrs) - return; + return -ENOMEM; /* Setup section attributes. */ sect_attrs->grp.name = "sections"; @@ -103,8 +104,10 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info) sattr->address = sec->sh_addr; sattr->battr.attr.name = kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL); - if (!sattr->battr.attr.name) + if (!sattr->battr.attr.name) { + ret = -ENOMEM; goto out; + } sect_attrs->nsections++; sattr->battr.read = module_sect_read; sattr->battr.size = MODULE_SECT_READ_SIZE; @@ -113,13 +116,15 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info) } *gattr = NULL; - if (sysfs_create_group(&mod->mkobj.kobj, §_attrs->grp)) + ret = sysfs_create_group(&mod->mkobj.kobj, §_attrs->grp); + if (ret) goto out; mod->sect_attrs = sect_attrs; - return; + return 0; out: free_sect_attrs(sect_attrs); + return ret; } static void remove_sect_attrs(struct module *mod) @@ -158,15 +163,12 @@ static void free_notes_attrs(struct module_notes_attrs *notes_attrs, kfree(notes_attrs); } -static void add_notes_attrs(struct module *mod, const struct load_info *info) +static int add_notes_attrs(struct module *mod, const struct load_info *info) { unsigned int notes, loaded, i; struct module_notes_attrs *notes_attrs; struct bin_attribute *nattr; - - /* failed to create section attributes, so can't create notes */ - if (!mod->sect_attrs) - return; + int ret; /* Count notes sections and allocate structures. */ notes = 0; @@ -176,12 +178,12 @@ static void add_notes_attrs(struct module *mod, const struct load_info *info) ++notes; if (notes == 0) - return; + return 0; notes_attrs = kzalloc(struct_size(notes_attrs, attrs, notes), GFP_KERNEL); if (!notes_attrs) - return; + return -ENOMEM; notes_attrs->notes = notes; nattr = ¬es_attrs->attrs[0]; @@ -201,19 +203,23 @@ static void add_notes_attrs(struct module *mod, const struct load_info *info) } notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj); - if (!notes_attrs->dir) + if (!notes_attrs->dir) { + ret = -ENOMEM; goto out; + } - for (i = 0; i < notes; ++i) - if (sysfs_create_bin_file(notes_attrs->dir, - ¬es_attrs->attrs[i])) + for (i = 0; i < notes; ++i) { + ret = sysfs_create_bin_file(notes_attrs->dir, ¬es_attrs->attrs[i]); + if (ret) goto out; + } mod->notes_attrs = notes_attrs; - return; + return 0; out: free_notes_attrs(notes_attrs, i); + return ret; } static void remove_notes_attrs(struct module *mod) @@ -223,9 +229,15 @@ static void remove_notes_attrs(struct module *mod) } #else /* !CONFIG_KALLSYMS */ -static inline void add_sect_attrs(struct module *mod, const struct load_info *info) { } +static inline int add_sect_attrs(struct module *mod, const struct load_info *info) +{ + return 0; +} static inline void remove_sect_attrs(struct module *mod) { } -static inline void add_notes_attrs(struct module *mod, const struct load_info *info) { } +static inline int add_notes_attrs(struct module *mod, const struct load_info *info) +{ + return 0; +} static inline void remove_notes_attrs(struct module *mod) { } #endif /* CONFIG_KALLSYMS */ @@ -385,11 +397,20 @@ int mod_sysfs_setup(struct module *mod, if (err) goto out_unreg_modinfo_attrs; - add_sect_attrs(mod, info); - add_notes_attrs(mod, info); + err = add_sect_attrs(mod, info); + if (err) + goto out_del_usage_links; + + err = add_notes_attrs(mod, info); + if (err) + goto out_unreg_sect_attrs; return 0; +out_unreg_sect_attrs: + remove_sect_attrs(mod); +out_del_usage_links: + del_usage_links(mod); out_unreg_modinfo_attrs: module_remove_modinfo_attrs(mod, -1); out_unreg_param: diff --git a/kernel/power/user.c b/kernel/power/user.c index 3aa41ba22129..3f9e3efb9f6e 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -447,7 +447,6 @@ static const struct file_operations snapshot_fops = { .release = snapshot_release, .read = snapshot_read, .write = snapshot_write, - .llseek = no_llseek, .unlocked_ioctl = snapshot_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = snapshot_compat_ioctl, diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a60616e69b66..b1f883fcd918 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3607,11 +3607,12 @@ kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp) } // One work is per one batch, so there are three - // "free channels", the batch can handle. It can - // be that the work is in the pending state when - // channels have been detached following by each - // other. + // "free channels", the batch can handle. Break + // the loop since it is done with this CPU thus + // queuing an RCU work is _always_ success here. queued = queue_rcu_work(system_unbound_wq, &krwp->rcu_work); + WARN_ON_ONCE(!queued); + break; } } diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 97b99cd06923..16865475120b 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -554,13 +554,19 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, rcu_nocb_unlock(rdp); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY, TPS("WakeLazy")); - } else if (!irqs_disabled_flags(flags)) { + } else if (!irqs_disabled_flags(flags) && cpu_online(rdp->cpu)) { /* ... if queue was empty ... */ rcu_nocb_unlock(rdp); wake_nocb_gp(rdp, false); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeEmpty")); } else { + /* + * Don't do the wake-up upfront on fragile paths. + * Also offline CPUs can't call swake_up_one_online() from + * (soft-)IRQs. Rely on the final deferred wake-up from + * rcutree_report_cpu_dead() + */ rcu_nocb_unlock(rdp); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE, TPS("WakeEmptyIsDeferred")); diff --git a/kernel/relay.c b/kernel/relay.c index a8e90e98bf2c..a8ae436dc77e 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1079,7 +1079,6 @@ const struct file_operations relay_file_operations = { .poll = relay_file_poll, .mmap = relay_file_mmap, .read = relay_file_read, - .llseek = no_llseek, .release = relay_file_release, }; EXPORT_SYMBOL_GPL(relay_file_operations); diff --git a/kernel/resource_kunit.c b/kernel/resource_kunit.c index 42d2d8d20f5d..b8ef75b99eb2 100644 --- a/kernel/resource_kunit.c +++ b/kernel/resource_kunit.c @@ -169,6 +169,8 @@ static void resource_test_intersection(struct kunit *test) #define RES_TEST_RAM3_SIZE SZ_1M #define RES_TEST_TOTAL_SIZE ((RES_TEST_WIN1_OFFSET + RES_TEST_WIN1_SIZE)) +KUNIT_DEFINE_ACTION_WRAPPER(kfree_wrapper, kfree, const void *); + static void remove_free_resource(void *ctx) { struct resource *res = (struct resource *)ctx; @@ -177,6 +179,14 @@ static void remove_free_resource(void *ctx) kfree(res); } +static void resource_test_add_action_or_abort( + struct kunit *test, void (*action)(void *), void *ctx) +{ + KUNIT_ASSERT_EQ_MSG(test, 0, + kunit_add_action_or_reset(test, action, ctx), + "Fail to add action"); +} + static void resource_test_request_region(struct kunit *test, struct resource *parent, resource_size_t start, resource_size_t size, const char *name, unsigned long flags) @@ -185,7 +195,7 @@ static void resource_test_request_region(struct kunit *test, struct resource *pa res = __request_region(parent, start, size, name, flags); KUNIT_ASSERT_NOT_NULL(test, res); - kunit_add_action_or_reset(test, remove_free_resource, res); + resource_test_add_action_or_abort(test, remove_free_resource, res); } static void resource_test_insert_resource(struct kunit *test, struct resource *parent, @@ -202,11 +212,11 @@ static void resource_test_insert_resource(struct kunit *test, struct resource *p res->end = start + size - 1; res->flags = flags; if (insert_resource(parent, res)) { - kfree(res); + resource_test_add_action_or_abort(test, kfree_wrapper, res); KUNIT_FAIL_AND_ABORT(test, "Fail to insert resource %pR\n", res); } - kunit_add_action_or_reset(test, remove_free_resource, res); + resource_test_add_action_or_abort(test, remove_free_resource, res); } static void resource_test_region_intersects(struct kunit *test) @@ -220,7 +230,7 @@ static void resource_test_region_intersects(struct kunit *test) "test resources"); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, parent); start = parent->start; - kunit_add_action_or_reset(test, remove_free_resource, parent); + resource_test_add_action_or_abort(test, remove_free_resource, parent); resource_test_request_region(test, parent, start + RES_TEST_RAM0_OFFSET, RES_TEST_RAM0_SIZE, "Test System RAM 0", flags); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 43e453ab7e20..aeb595514461 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3518,14 +3518,16 @@ out: * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable. */ static inline -int select_task_rq(struct task_struct *p, int cpu, int wake_flags) +int select_task_rq(struct task_struct *p, int cpu, int *wake_flags) { lockdep_assert_held(&p->pi_lock); - if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) - cpu = p->sched_class->select_task_rq(p, cpu, wake_flags); - else + if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) { + cpu = p->sched_class->select_task_rq(p, cpu, *wake_flags); + *wake_flags |= WF_RQ_SELECTED; + } else { cpu = cpumask_any(p->cpus_ptr); + } /* * In order not to call set_task_cpu() on a blocking task we need @@ -3659,6 +3661,8 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, rq->nr_uninterruptible--; #ifdef CONFIG_SMP + if (wake_flags & WF_RQ_SELECTED) + en_flags |= ENQUEUE_RQ_SELECTED; if (wake_flags & WF_MIGRATED) en_flags |= ENQUEUE_MIGRATED; else @@ -4120,6 +4124,8 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) guard(preempt)(); int cpu, success = 0; + wake_flags |= WF_TTWU; + if (p == current) { /* * We're waking current, this means 'p->on_rq' and 'task_cpu(p) @@ -4252,7 +4258,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_cond_load_acquire(&p->on_cpu, !VAL); - cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU); + cpu = select_task_rq(p, p->wake_cpu, &wake_flags); if (task_cpu(p) != cpu) { if (p->in_iowait) { delayacct_blkio_end(p); @@ -4793,6 +4799,7 @@ void wake_up_new_task(struct task_struct *p) { struct rq_flags rf; struct rq *rq; + int wake_flags = WF_FORK; raw_spin_lock_irqsave(&p->pi_lock, rf.flags); WRITE_ONCE(p->__state, TASK_RUNNING); @@ -4807,7 +4814,7 @@ void wake_up_new_task(struct task_struct *p) */ p->recent_used_cpu = task_cpu(p); rseq_migrate(p); - __set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK)); + __set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags)); #endif rq = __task_rq_lock(p, &rf); update_rq_clock(rq); @@ -4815,7 +4822,7 @@ void wake_up_new_task(struct task_struct *p) activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL); trace_sched_wakeup_new(p); - wakeup_preempt(rq, p, WF_FORK); + wakeup_preempt(rq, p, wake_flags); #ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index c09e3dc38c34..6eae3b69bf6e 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -18,6 +18,12 @@ enum scx_consts { SCX_EXIT_DUMP_DFL_LEN = 32768, SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, + + /* + * Iterating all tasks may take a while. Periodically drop + * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. + */ + SCX_OPS_TASK_ITER_BATCH = 32, }; enum scx_exit_kind { @@ -624,6 +630,10 @@ struct sched_ext_ops { /** * exit - Clean up after the BPF scheduler * @info: Exit info + * + * ops.exit() is also called on ops.init() failure, which is a bit + * unusual. This is to allow rich reporting through @info on how + * ops.init() failed. */ void (*exit)(struct scx_exit_info *info); @@ -691,6 +701,7 @@ enum scx_enq_flags { /* expose select ENQUEUE_* flags as enums */ SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, SCX_ENQ_HEAD = ENQUEUE_HEAD, + SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED, /* high 32bits are SCX specific */ @@ -778,7 +789,6 @@ enum scx_tg_flags { }; enum scx_ops_enable_state { - SCX_OPS_PREPPING, SCX_OPS_ENABLING, SCX_OPS_ENABLED, SCX_OPS_DISABLING, @@ -786,7 +796,6 @@ enum scx_ops_enable_state { }; static const char *scx_ops_enable_state_str[] = { - [SCX_OPS_PREPPING] = "prepping", [SCX_OPS_ENABLING] = "enabling", [SCX_OPS_ENABLED] = "enabled", [SCX_OPS_DISABLING] = "disabling", @@ -854,6 +863,7 @@ DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0); +static bool scx_ops_init_task_enabled; static bool scx_switching_all; DEFINE_STATIC_KEY_FALSE(__scx_switched_all); @@ -925,8 +935,15 @@ static unsigned long __percpu *scx_kick_cpus_pnt_seqs; */ static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task); -/* dispatch queues */ -static struct scx_dispatch_q __cacheline_aligned_in_smp scx_dsq_global; +/* + * Dispatch queues. + * + * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. This is + * to avoid live-locking in bypass mode where all tasks are dispatched to + * %SCX_DSQ_GLOBAL and all CPUs consume from it. If per-node split isn't + * sufficient, it can be further split. + */ +static struct scx_dispatch_q **global_dsqs; static const struct rhashtable_params dsq_hash_params = { .key_len = 8, @@ -1029,6 +1046,16 @@ static bool u32_before(u32 a, u32 b) return (s32)(a - b) < 0; } +static struct scx_dispatch_q *find_global_dsq(struct task_struct *p) +{ + return global_dsqs[cpu_to_node(task_cpu(p))]; +} + +static struct scx_dispatch_q *find_user_dsq(u64 dsq_id) +{ + return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); +} + /* * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate @@ -1252,86 +1279,104 @@ struct scx_task_iter { struct task_struct *locked; struct rq *rq; struct rq_flags rf; + u32 cnt; }; /** - * scx_task_iter_init - Initialize a task iterator + * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration * @iter: iterator to init * - * Initialize @iter. Must be called with scx_tasks_lock held. Once initialized, - * @iter must eventually be exited with scx_task_iter_exit(). + * Initialize @iter and return with scx_tasks_lock held. Once initialized, @iter + * must eventually be stopped with scx_task_iter_stop(). * - * scx_tasks_lock may be released between this and the first next() call or - * between any two next() calls. If scx_tasks_lock is released between two - * next() calls, the caller is responsible for ensuring that the task being - * iterated remains accessible either through RCU read lock or obtaining a - * reference count. + * scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock() + * between this and the first next() call or between any two next() calls. If + * the locks are released between two next() calls, the caller is responsible + * for ensuring that the task being iterated remains accessible either through + * RCU read lock or obtaining a reference count. * * All tasks which existed when the iteration started are guaranteed to be * visited as long as they still exist. */ -static void scx_task_iter_init(struct scx_task_iter *iter) +static void scx_task_iter_start(struct scx_task_iter *iter) { - lockdep_assert_held(&scx_tasks_lock); - BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); + spin_lock_irq(&scx_tasks_lock); + iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; list_add(&iter->cursor.tasks_node, &scx_tasks); iter->locked = NULL; + iter->cnt = 0; +} + +static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter) +{ + if (iter->locked) { + task_rq_unlock(iter->rq, iter->locked, &iter->rf); + iter->locked = NULL; + } } /** - * scx_task_iter_rq_unlock - Unlock rq locked by a task iterator - * @iter: iterator to unlock rq for + * scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator + * @iter: iterator to unlock * * If @iter is in the middle of a locked iteration, it may be locking the rq of - * the task currently being visited. Unlock the rq if so. This function can be - * safely called anytime during an iteration. + * the task currently being visited in addition to scx_tasks_lock. Unlock both. + * This function can be safely called anytime during an iteration. + */ +static void scx_task_iter_unlock(struct scx_task_iter *iter) +{ + __scx_task_iter_rq_unlock(iter); + spin_unlock_irq(&scx_tasks_lock); +} + +/** + * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock() + * @iter: iterator to re-lock * - * Returns %true if the rq @iter was locking is unlocked. %false if @iter was - * not locking an rq. + * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it + * doesn't re-lock the rq lock. Must be called before other iterator operations. */ -static bool scx_task_iter_rq_unlock(struct scx_task_iter *iter) +static void scx_task_iter_relock(struct scx_task_iter *iter) { - if (iter->locked) { - task_rq_unlock(iter->rq, iter->locked, &iter->rf); - iter->locked = NULL; - return true; - } else { - return false; - } + spin_lock_irq(&scx_tasks_lock); } /** - * scx_task_iter_exit - Exit a task iterator + * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock * @iter: iterator to exit * - * Exit a previously initialized @iter. Must be called with scx_tasks_lock held. - * If the iterator holds a task's rq lock, that rq lock is released. See - * scx_task_iter_init() for details. + * Exit a previously initialized @iter. Must be called with scx_tasks_lock held + * which is released on return. If the iterator holds a task's rq lock, that rq + * lock is also released. See scx_task_iter_start() for details. */ -static void scx_task_iter_exit(struct scx_task_iter *iter) +static void scx_task_iter_stop(struct scx_task_iter *iter) { - lockdep_assert_held(&scx_tasks_lock); - - scx_task_iter_rq_unlock(iter); list_del_init(&iter->cursor.tasks_node); + scx_task_iter_unlock(iter); } /** * scx_task_iter_next - Next task * @iter: iterator to walk * - * Visit the next task. See scx_task_iter_init() for details. + * Visit the next task. See scx_task_iter_start() for details. Locks are dropped + * and re-acquired every %SCX_OPS_TASK_ITER_BATCH iterations to avoid causing + * stalls by holding scx_tasks_lock for too long. */ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) { struct list_head *cursor = &iter->cursor.tasks_node; struct sched_ext_entity *pos; - lockdep_assert_held(&scx_tasks_lock); + if (!(++iter->cnt % SCX_OPS_TASK_ITER_BATCH)) { + scx_task_iter_unlock(iter); + cond_resched(); + scx_task_iter_relock(iter); + } list_for_each_entry(pos, cursor, tasks_node) { if (&pos->tasks_node == &scx_tasks) @@ -1352,14 +1397,14 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) * @include_dead: Whether we should include dead tasks in the iteration * * Visit the non-idle task with its rq lock held. Allows callers to specify - * whether they would like to filter out dead tasks. See scx_task_iter_init() + * whether they would like to filter out dead tasks. See scx_task_iter_start() * for details. */ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) { struct task_struct *p; - scx_task_iter_rq_unlock(iter); + __scx_task_iter_rq_unlock(iter); while ((p = scx_task_iter_next(iter))) { /* @@ -1637,7 +1682,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, scx_ops_error("attempting to dispatch to a destroyed dsq"); /* fall back to the global dsq */ raw_spin_unlock(&dsq->lock); - dsq = &scx_dsq_global; + dsq = find_global_dsq(p); raw_spin_lock(&dsq->lock); } } @@ -1803,21 +1848,6 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p) raw_spin_unlock(&dsq->lock); } -static struct scx_dispatch_q *find_user_dsq(u64 dsq_id) -{ - return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); -} - -static struct scx_dispatch_q *find_non_local_dsq(u64 dsq_id) -{ - lockdep_assert(rcu_read_lock_any_held()); - - if (dsq_id == SCX_DSQ_GLOBAL) - return &scx_dsq_global; - else - return find_user_dsq(dsq_id); -} - static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, struct task_struct *p) { @@ -1830,16 +1860,20 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) - return &scx_dsq_global; + return find_global_dsq(p); return &cpu_rq(cpu)->scx.local_dsq; } - dsq = find_non_local_dsq(dsq_id); + if (dsq_id == SCX_DSQ_GLOBAL) + dsq = find_global_dsq(p); + else + dsq = find_user_dsq(dsq_id); + if (unlikely(!dsq)) { scx_ops_error("non-existent DSQ 0x%llx for %s[%d]", dsq_id, p->comm, p->pid); - return &scx_dsq_global; + return find_global_dsq(p); } return dsq; @@ -2011,7 +2045,7 @@ local_norefill: global: touch_core_sched(rq, p); /* see the comment in local: */ p->scx.slice = SCX_SLICE_DFL; - dispatch_enqueue(&scx_dsq_global, p, enq_flags); + dispatch_enqueue(find_global_dsq(p), p, enq_flags); } static bool task_runnable(const struct task_struct *p) @@ -2357,6 +2391,7 @@ static bool consume_remote_task(struct rq *this_rq, struct task_struct *p, } } #else /* CONFIG_SMP */ +static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); } static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool trigger_error) { return false; } static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; } #endif /* CONFIG_SMP */ @@ -2396,6 +2431,13 @@ retry: return false; } +static bool consume_global_dsq(struct rq *rq) +{ + int node = cpu_to_node(cpu_of(rq)); + + return consume_dispatch_q(rq, global_dsqs[node]); +} + /** * dispatch_to_local_dsq - Dispatch a task to a local dsq * @rq: current rq which is locked @@ -2429,7 +2471,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, #ifdef CONFIG_SMP if (unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { - dispatch_enqueue(&scx_dsq_global, p, enq_flags | SCX_ENQ_CLEAR_OPSS); + dispatch_enqueue(find_global_dsq(p), p, + enq_flags | SCX_ENQ_CLEAR_OPSS); return; } @@ -2629,7 +2672,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) if (rq->scx.local_dsq.nr) goto has_tasks; - if (consume_dispatch_q(rq, &scx_dsq_global)) + if (consume_global_dsq(rq)) goto has_tasks; if (!SCX_HAS_OP(dispatch) || scx_rq_bypassing(rq) || !scx_rq_online(rq)) @@ -2654,7 +2697,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) if (rq->scx.local_dsq.nr) goto has_tasks; - if (consume_dispatch_q(rq, &scx_dsq_global)) + if (consume_global_dsq(rq)) goto has_tasks; /* @@ -2937,8 +2980,8 @@ static struct task_struct *pick_task_scx(struct rq *rq) if (unlikely(!p->scx.slice)) { if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) { - printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n", - p->comm, p->pid); + printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n", + p->comm, p->pid, __func__); scx_warned_zero_slice = true; } p->scx.slice = SCX_SLICE_DFL; @@ -3043,11 +3086,6 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, *found = false; - if (!static_branch_likely(&scx_builtin_idle_enabled)) { - scx_ops_error("built-in idle tracking is disabled"); - return prev_cpu; - } - /* * If WAKE_SYNC, the waker's local DSQ is empty, and the system is * under utilized, wake up @p to the local DSQ of the waker. Checking @@ -3058,22 +3096,13 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, * there is an idle core elsewhere on the system. */ cpu = smp_processor_id(); - if ((wake_flags & SCX_WAKE_SYNC) && p->nr_cpus_allowed > 1 && + if ((wake_flags & SCX_WAKE_SYNC) && !cpumask_empty(idle_masks.cpu) && !(current->flags & PF_EXITING) && cpu_rq(cpu)->scx.local_dsq.nr == 0) { if (cpumask_test_cpu(cpu, p->cpus_ptr)) goto cpu_found; } - if (p->nr_cpus_allowed == 1) { - if (test_and_clear_cpu_idle(prev_cpu)) { - cpu = prev_cpu; - goto cpu_found; - } else { - return prev_cpu; - } - } - /* * If CPU has SMT, any wholly idle CPU is likely a better pick than * partially idle @prev_cpu. @@ -3121,7 +3150,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag if (unlikely(wake_flags & WF_EXEC)) return prev_cpu; - if (SCX_HAS_OP(select_cpu)) { + if (SCX_HAS_OP(select_cpu) && !scx_rq_bypassing(task_rq(p))) { s32 cpu; struct task_struct **ddsp_taskp; @@ -3186,7 +3215,7 @@ void __scx_update_idle(struct rq *rq, bool idle) { int cpu = cpu_of(rq); - if (SCX_HAS_OP(update_idle)) { + if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq)) { SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); if (!static_branch_unlikely(&scx_builtin_idle_enabled)) return; @@ -3550,7 +3579,7 @@ int scx_fork(struct task_struct *p) { percpu_rwsem_assert_held(&scx_fork_rwsem); - if (scx_enabled()) + if (scx_ops_init_task_enabled) return scx_ops_init_task(p, task_group(p), true); else return 0; @@ -3558,7 +3587,7 @@ int scx_fork(struct task_struct *p) void scx_post_fork(struct task_struct *p) { - if (scx_enabled()) { + if (scx_ops_init_task_enabled) { scx_set_task_state(p, SCX_TASK_READY); /* @@ -3690,6 +3719,7 @@ bool scx_can_stop_tick(struct rq *rq) #ifdef CONFIG_EXT_GROUP_SCHED DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); +static bool scx_cgroup_enabled; static bool cgroup_warned_missing_weight; static bool cgroup_warned_missing_idle; @@ -3709,8 +3739,7 @@ static void scx_cgroup_warn_missing_weight(struct task_group *tg) static void scx_cgroup_warn_missing_idle(struct task_group *tg) { - if (scx_ops_enable_state() == SCX_OPS_DISABLED || - cgroup_warned_missing_idle) + if (!scx_cgroup_enabled || cgroup_warned_missing_idle) return; if (!tg->idle) @@ -3731,15 +3760,18 @@ int scx_tg_online(struct task_group *tg) scx_cgroup_warn_missing_weight(tg); - if (SCX_HAS_OP(cgroup_init)) { - struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; + if (scx_cgroup_enabled) { + if (SCX_HAS_OP(cgroup_init)) { + struct scx_cgroup_init_args args = + { .weight = tg->scx_weight }; - ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, - tg->css.cgroup, &args); - if (!ret) + ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, + tg->css.cgroup, &args); + if (ret) + ret = ops_sanitize_err("cgroup_init", ret); + } + if (ret == 0) tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED; - else - ret = ops_sanitize_err("cgroup_init", ret); } else { tg->scx_flags |= SCX_TG_ONLINE; } @@ -3770,7 +3802,7 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset) /* released in scx_finish/cancel_attach() */ percpu_down_read(&scx_cgroup_rwsem); - if (!scx_enabled()) + if (!scx_cgroup_enabled) return 0; cgroup_taskset_for_each(p, css, tset) { @@ -3813,7 +3845,7 @@ err: void scx_move_task(struct task_struct *p) { - if (!scx_enabled()) + if (!scx_cgroup_enabled) return; /* @@ -3849,7 +3881,7 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) struct cgroup_subsys_state *css; struct task_struct *p; - if (!scx_enabled()) + if (!scx_cgroup_enabled) goto out_unlock; cgroup_taskset_for_each(p, css, tset) { @@ -3866,7 +3898,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) { percpu_down_read(&scx_cgroup_rwsem); - if (tg->scx_weight != weight) { + if (scx_cgroup_enabled && tg->scx_weight != weight) { if (SCX_HAS_OP(cgroup_set_weight)) SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight, tg_cgrp(tg), weight); @@ -4038,6 +4070,8 @@ static void scx_cgroup_exit(void) percpu_rwsem_assert_held(&scx_cgroup_rwsem); + scx_cgroup_enabled = false; + /* * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk * cgroups and exit all the inited ones, all online cgroups are exited. @@ -4104,6 +4138,7 @@ static int scx_cgroup_init(void) css->cgroup, &args); if (ret) { css_put(css); + scx_ops_error("ops.cgroup_init() failed (%d)", ret); return ret; } tg->scx_flags |= SCX_TG_INITED; @@ -4113,6 +4148,9 @@ static int scx_cgroup_init(void) } rcu_read_unlock(); + WARN_ON_ONCE(scx_cgroup_enabled); + scx_cgroup_enabled = true; + return 0; } @@ -4240,21 +4278,23 @@ bool task_should_scx(struct task_struct *p) * the DISABLING state and then cycling the queued tasks through dequeue/enqueue * to force global FIFO scheduling. * - * a. ops.enqueue() is ignored and tasks are queued in simple global FIFO order. - * %SCX_OPS_ENQ_LAST is also ignored. + * - ops.select_cpu() is ignored and the default select_cpu() is used. * - * b. ops.dispatch() is ignored. + * - ops.enqueue() is ignored and tasks are queued in simple global FIFO order. + * %SCX_OPS_ENQ_LAST is also ignored. * - * c. balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice - * can't be trusted. Whenever a tick triggers, the running task is rotated to - * the tail of the queue with core_sched_at touched. + * - ops.dispatch() is ignored. * - * d. pick_next_task() suppresses zero slice warning. + * - balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice + * can't be trusted. Whenever a tick triggers, the running task is rotated to + * the tail of the queue with core_sched_at touched. * - * e. scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM - * operations. + * - pick_next_task() suppresses zero slice warning. * - * f. scx_prio_less() reverts to the default core_sched_at order. + * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM + * operations. + * + * - scx_prio_less() reverts to the default core_sched_at order. */ static void scx_ops_bypass(bool bypass) { @@ -4324,7 +4364,7 @@ static void scx_ops_bypass(bool bypass) rq_unlock_irqrestore(rq, &rf); - /* kick to restore ticks */ + /* resched to restore ticks and idle state */ resched_cpu(cpu); } } @@ -4431,26 +4471,28 @@ static void scx_ops_disable_workfn(struct kthread_work *work) WRITE_ONCE(scx_switching_all, false); /* - * Avoid racing against fork and cgroup changes. See scx_ops_enable() - * for explanation on the locking order. + * Shut down cgroup support before tasks so that the cgroup attach path + * doesn't race against scx_ops_exit_task(). */ - percpu_down_write(&scx_fork_rwsem); - cpus_read_lock(); scx_cgroup_lock(); + scx_cgroup_exit(); + scx_cgroup_unlock(); - spin_lock_irq(&scx_tasks_lock); - scx_task_iter_init(&sti); /* * The BPF scheduler is going away. All tasks including %TASK_DEAD ones * must be switched out and exited synchronously. */ + percpu_down_write(&scx_fork_rwsem); + + scx_ops_init_task_enabled = false; + + scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { const struct sched_class *old_class = p->sched_class; struct sched_enq_and_set_ctx ctx; sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); - p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL); __setscheduler_prio(p, p->prio); check_class_changing(task_rq(p), p, old_class); @@ -4459,25 +4501,19 @@ static void scx_ops_disable_workfn(struct kthread_work *work) check_class_changed(task_rq(p), p, old_class, p->prio); scx_ops_exit_task(p); } - scx_task_iter_exit(&sti); - spin_unlock_irq(&scx_tasks_lock); + scx_task_iter_stop(&sti); + percpu_up_write(&scx_fork_rwsem); /* no task is on scx, turn off all the switches and flush in-progress calls */ - static_branch_disable_cpuslocked(&__scx_ops_enabled); + static_branch_disable(&__scx_ops_enabled); for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) - static_branch_disable_cpuslocked(&scx_has_op[i]); - static_branch_disable_cpuslocked(&scx_ops_enq_last); - static_branch_disable_cpuslocked(&scx_ops_enq_exiting); - static_branch_disable_cpuslocked(&scx_ops_cpu_preempt); - static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); + static_branch_disable(&scx_has_op[i]); + static_branch_disable(&scx_ops_enq_last); + static_branch_disable(&scx_ops_enq_exiting); + static_branch_disable(&scx_ops_cpu_preempt); + static_branch_disable(&scx_builtin_idle_enabled); synchronize_rcu(); - scx_cgroup_exit(); - - scx_cgroup_unlock(); - cpus_read_unlock(); - percpu_up_write(&scx_fork_rwsem); - if (ei->kind >= SCX_EXIT_ERROR) { pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", scx_ops.name, ei->reason); @@ -4929,7 +4965,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) struct scx_task_iter sti; struct task_struct *p; unsigned long timeout; - int i, cpu, ret; + int i, cpu, node, ret; if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN), cpu_possible_mask)) { @@ -4948,6 +4984,34 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) } } + if (!global_dsqs) { + struct scx_dispatch_q **dsqs; + + dsqs = kcalloc(nr_node_ids, sizeof(dsqs[0]), GFP_KERNEL); + if (!dsqs) { + ret = -ENOMEM; + goto err_unlock; + } + + for_each_node_state(node, N_POSSIBLE) { + struct scx_dispatch_q *dsq; + + dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node); + if (!dsq) { + for_each_node_state(node, N_POSSIBLE) + kfree(dsqs[node]); + kfree(dsqs); + ret = -ENOMEM; + goto err_unlock; + } + + init_dsq(dsq, SCX_DSQ_GLOBAL); + dsqs[node] = dsq; + } + + global_dsqs = dsqs; + } + if (scx_ops_enable_state() != SCX_OPS_DISABLED) { ret = -EBUSY; goto err_unlock; @@ -4971,12 +5035,12 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) } /* - * Set scx_ops, transition to PREPPING and clear exit info to arm the + * Set scx_ops, transition to ENABLING and clear exit info to arm the * disable path. Failure triggers full disabling from here on. */ scx_ops = *ops; - WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_PREPPING) != + WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_ENABLING) != SCX_OPS_DISABLED); atomic_set(&scx_exit_kind, SCX_EXIT_NONE); @@ -4997,7 +5061,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init); if (ret) { ret = ops_sanitize_err("init", ret); - goto err_disable_unlock_cpus; + cpus_read_unlock(); + scx_ops_error("ops.init() failed (%d)", ret); + goto err_disable; } } @@ -5005,6 +5071,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (((void (**)(void))ops)[i]) static_branch_enable_cpuslocked(&scx_has_op[i]); + check_hotplug_seq(ops); cpus_read_unlock(); ret = validate_ops(ops); @@ -5032,57 +5099,40 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_watchdog_timeout / 2); /* - * Lock out forks, cgroup on/offlining and moves before opening the - * floodgate so that they don't wander into the operations prematurely. - * - * We don't need to keep the CPUs stable but static_branch_*() requires - * cpus_read_lock() and scx_cgroup_rwsem must nest inside - * cpu_hotplug_lock because of the following dependency chain: - * - * cpu_hotplug_lock --> cgroup_threadgroup_rwsem --> scx_cgroup_rwsem - * - * So, we need to do cpus_read_lock() before scx_cgroup_lock() and use - * static_branch_*_cpuslocked(). - * - * Note that cpu_hotplug_lock must nest inside scx_fork_rwsem due to the - * following dependency chain: - * - * scx_fork_rwsem --> pernet_ops_rwsem --> cpu_hotplug_lock + * Once __scx_ops_enabled is set, %current can be switched to SCX + * anytime. This can lead to stalls as some BPF schedulers (e.g. + * userspace scheduling) may not function correctly before all tasks are + * switched. Init in bypass mode to guarantee forward progress. */ - percpu_down_write(&scx_fork_rwsem); - cpus_read_lock(); - scx_cgroup_lock(); - - check_hotplug_seq(ops); + scx_ops_bypass(true); for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) if (((void (**)(void))ops)[i]) - static_branch_enable_cpuslocked(&scx_has_op[i]); + static_branch_enable(&scx_has_op[i]); if (ops->flags & SCX_OPS_ENQ_LAST) - static_branch_enable_cpuslocked(&scx_ops_enq_last); + static_branch_enable(&scx_ops_enq_last); if (ops->flags & SCX_OPS_ENQ_EXITING) - static_branch_enable_cpuslocked(&scx_ops_enq_exiting); + static_branch_enable(&scx_ops_enq_exiting); if (scx_ops.cpu_acquire || scx_ops.cpu_release) - static_branch_enable_cpuslocked(&scx_ops_cpu_preempt); + static_branch_enable(&scx_ops_cpu_preempt); if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { reset_idle_masks(); - static_branch_enable_cpuslocked(&scx_builtin_idle_enabled); + static_branch_enable(&scx_builtin_idle_enabled); } else { - static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); + static_branch_disable(&scx_builtin_idle_enabled); } /* - * All cgroups should be initialized before letting in tasks. cgroup - * on/offlining and task migrations are already locked out. + * Lock out forks, cgroup on/offlining and moves before opening the + * floodgate so that they don't wander into the operations prematurely. */ - ret = scx_cgroup_init(); - if (ret) - goto err_disable_unlock_all; + percpu_down_write(&scx_fork_rwsem); - static_branch_enable_cpuslocked(&__scx_ops_enabled); + WARN_ON_ONCE(scx_ops_init_task_enabled); + scx_ops_init_task_enabled = true; /* * Enable ops for every task. Fork is excluded by scx_fork_rwsem @@ -5090,10 +5140,19 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) * leaving as sched_ext_free() can handle both prepped and enabled * tasks. Prep all tasks first and then enable them with preemption * disabled. + * + * All cgroups should be initialized before scx_ops_init_task() so that + * the BPF scheduler can reliably track each task's cgroup membership + * from scx_ops_init_task(). Lock out cgroup on/offlining and task + * migrations while tasks are being initialized so that + * scx_cgroup_can_attach() never sees uninitialized tasks. */ - spin_lock_irq(&scx_tasks_lock); + scx_cgroup_lock(); + ret = scx_cgroup_init(); + if (ret) + goto err_disable_unlock_all; - scx_task_iter_init(&sti); + scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { /* * @p may already be dead, have lost all its usages counts and @@ -5103,65 +5162,48 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (!tryget_task_struct(p)) continue; - scx_task_iter_rq_unlock(&sti); - spin_unlock_irq(&scx_tasks_lock); + scx_task_iter_unlock(&sti); ret = scx_ops_init_task(p, task_group(p), false); if (ret) { put_task_struct(p); - spin_lock_irq(&scx_tasks_lock); - scx_task_iter_exit(&sti); - spin_unlock_irq(&scx_tasks_lock); - pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n", - ret, p->comm, p->pid); + scx_task_iter_relock(&sti); + scx_task_iter_stop(&sti); + scx_ops_error("ops.init_task() failed (%d) for %s[%d]", + ret, p->comm, p->pid); goto err_disable_unlock_all; } + scx_set_task_state(p, SCX_TASK_READY); + put_task_struct(p); - spin_lock_irq(&scx_tasks_lock); + scx_task_iter_relock(&sti); } - scx_task_iter_exit(&sti); + scx_task_iter_stop(&sti); + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); /* - * All tasks are prepped but are still ops-disabled. Ensure that - * %current can't be scheduled out and switch everyone. - * preempt_disable() is necessary because we can't guarantee that - * %current won't be starved if scheduled out while switching. + * All tasks are READY. It's safe to turn on scx_enabled() and switch + * all eligible tasks. */ - preempt_disable(); - - /* - * From here on, the disable path must assume that tasks have ops - * enabled and need to be recovered. - * - * Transition to ENABLING fails iff the BPF scheduler has already - * triggered scx_bpf_error(). Returning an error code here would lose - * the recorded error information. Exit indicating success so that the - * error is notified through ops.exit() with all the details. - */ - if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLING, SCX_OPS_PREPPING)) { - preempt_enable(); - spin_unlock_irq(&scx_tasks_lock); - WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); - ret = 0; - goto err_disable_unlock_all; - } + WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); + static_branch_enable(&__scx_ops_enabled); /* - * We're fully committed and can't fail. The PREPPED -> ENABLED + * We're fully committed and can't fail. The task READY -> ENABLED * transitions here are synchronized against sched_ext_free() through * scx_tasks_lock. */ - WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); - - scx_task_iter_init(&sti); + percpu_down_write(&scx_fork_rwsem); + scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { const struct sched_class *old_class = p->sched_class; struct sched_enq_and_set_ctx ctx; sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); - scx_set_task_state(p, SCX_TASK_READY); + p->scx.slice = SCX_SLICE_DFL; __setscheduler_prio(p, p->prio); check_class_changing(task_rq(p), p, old_class); @@ -5169,18 +5211,13 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) check_class_changed(task_rq(p), p, old_class, p->prio); } - scx_task_iter_exit(&sti); - - spin_unlock_irq(&scx_tasks_lock); - preempt_enable(); - scx_cgroup_unlock(); - cpus_read_unlock(); + scx_task_iter_stop(&sti); percpu_up_write(&scx_fork_rwsem); - /* see above ENABLING transition for the explanation on exiting with 0 */ + scx_ops_bypass(false); + if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); - ret = 0; goto err_disable; } @@ -5212,14 +5249,21 @@ err_unlock: err_disable_unlock_all: scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); -err_disable_unlock_cpus: - cpus_read_unlock(); + scx_ops_bypass(false); err_disable: mutex_unlock(&scx_ops_enable_mutex); - /* must be fully disabled before returning */ - scx_ops_disable(SCX_EXIT_ERROR); + /* + * Returning an error code here would not pass all the error information + * to userspace. Record errno using scx_ops_error() for cases + * scx_ops_error() wasn't already invoked and exit indicating success so + * that the error is notified through ops.exit() with all the details. + * + * Flush scx_ops_disable_work to ensure that error is reported before + * init completion. + */ + scx_ops_error("scx_ops_enable() failed (%d)", ret); kthread_flush_work(&scx_ops_disable_work); - return ret; + return 0; } @@ -5782,7 +5826,6 @@ void __init init_sched_ext_class(void) SCX_TG_ONLINE); BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); - init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL); #ifdef CONFIG_SMP BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); @@ -5840,16 +5883,21 @@ __bpf_kfunc_start_defs(); __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) { - if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) { - *is_idle = false; - return prev_cpu; + if (!static_branch_likely(&scx_builtin_idle_enabled)) { + scx_ops_error("built-in idle tracking is disabled"); + goto prev_cpu; } + + if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) + goto prev_cpu; + #ifdef CONFIG_SMP return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle); -#else +#endif + +prev_cpu: *is_idle = false; return prev_cpu; -#endif } __bpf_kfunc_end_defs(); @@ -6058,7 +6106,7 @@ static bool scx_dispatch_from_dsq(struct bpf_iter_scx_dsq_kern *kit, if (dst_dsq->id == SCX_DSQ_LOCAL) { dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); if (!task_can_run_on_remote_rq(p, dst_rq, true)) { - dst_dsq = &scx_dsq_global; + dst_dsq = find_global_dsq(p); dst_rq = src_rq; } } else { @@ -6175,7 +6223,7 @@ __bpf_kfunc bool scx_bpf_consume(u64 dsq_id) flush_dispatch_buf(dspc->rq); - dsq = find_non_local_dsq(dsq_id); + dsq = find_user_dsq(dsq_id); if (unlikely(!dsq)) { scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id); return false; @@ -6496,7 +6544,7 @@ __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id) goto out; } } else { - dsq = find_non_local_dsq(dsq_id); + dsq = find_user_dsq(dsq_id); if (dsq) { ret = READ_ONCE(dsq->nr); goto out; @@ -6545,7 +6593,7 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, if (flags & ~__SCX_DSQ_ITER_USER_FLAGS) return -EINVAL; - kit->dsq = find_non_local_dsq(dsq_id); + kit->dsq = find_user_dsq(dsq_id); if (!kit->dsq) return -ENOENT; diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 020d58967d4e..84dad1511d1e 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -769,12 +769,13 @@ static void record_times(struct psi_group_cpu *groupc, u64 now) } static void psi_group_change(struct psi_group *group, int cpu, - unsigned int clear, unsigned int set, u64 now, + unsigned int clear, unsigned int set, bool wake_clock) { struct psi_group_cpu *groupc; unsigned int t, m; u32 state_mask; + u64 now; lockdep_assert_rq_held(cpu_rq(cpu)); groupc = per_cpu_ptr(group->pcpu, cpu); @@ -789,6 +790,7 @@ static void psi_group_change(struct psi_group *group, int cpu, * SOME and FULL time these may have resulted in. */ write_seqcount_begin(&groupc->seq); + now = cpu_clock(cpu); /* * Start with TSK_ONCPU, which doesn't have a corresponding @@ -899,18 +901,15 @@ void psi_task_change(struct task_struct *task, int clear, int set) { int cpu = task_cpu(task); struct psi_group *group; - u64 now; if (!task->pid) return; psi_flags_change(task, clear, set); - now = cpu_clock(cpu); - group = task_psi_group(task); do { - psi_group_change(group, cpu, clear, set, now, true); + psi_group_change(group, cpu, clear, set, true); } while ((group = group->parent)); } @@ -919,7 +918,6 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, { struct psi_group *group, *common = NULL; int cpu = task_cpu(prev); - u64 now = cpu_clock(cpu); if (next->pid) { psi_flags_change(next, 0, TSK_ONCPU); @@ -936,7 +934,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, break; } - psi_group_change(group, cpu, 0, TSK_ONCPU, now, true); + psi_group_change(group, cpu, 0, TSK_ONCPU, true); } while ((group = group->parent)); } @@ -974,7 +972,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, do { if (group == common) break; - psi_group_change(group, cpu, clear, set, now, wake_clock); + psi_group_change(group, cpu, clear, set, wake_clock); } while ((group = group->parent)); /* @@ -986,7 +984,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) { clear &= ~TSK_ONCPU; for (; group; group = group->parent) - psi_group_change(group, cpu, clear, set, now, wake_clock); + psi_group_change(group, cpu, clear, set, wake_clock); } } } @@ -997,8 +995,8 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st int cpu = task_cpu(curr); struct psi_group *group; struct psi_group_cpu *groupc; - u64 now, irq; s64 delta; + u64 irq; if (static_branch_likely(&psi_disabled)) return; @@ -1011,7 +1009,6 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st if (prev && task_psi_group(prev) == group) return; - now = cpu_clock(cpu); irq = irq_time_read(cpu); delta = (s64)(irq - rq->psi_irq_time); if (delta < 0) @@ -1019,12 +1016,15 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st rq->psi_irq_time = irq; do { + u64 now; + if (!group->enabled) continue; groupc = per_cpu_ptr(group->pcpu, cpu); write_seqcount_begin(&groupc->seq); + now = cpu_clock(cpu); record_times(groupc, now); groupc->times[PSI_IRQ_FULL] += delta; @@ -1223,11 +1223,9 @@ void psi_cgroup_restart(struct psi_group *group) for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu); struct rq_flags rf; - u64 now; rq_lock_irq(rq, &rf); - now = cpu_clock(cpu); - psi_group_change(group, cpu, 0, 0, now, true); + psi_group_change(group, cpu, 0, 0, true); rq_unlock_irq(rq, &rf); } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b1c3588a8f00..6085ef50febf 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2292,6 +2292,7 @@ static inline int task_on_rq_migrating(struct task_struct *p) #define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ #define WF_MIGRATED 0x20 /* Internal use, task got migrated */ #define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */ +#define WF_RQ_SELECTED 0x80 /* ->select_task_rq() was called */ #ifdef CONFIG_SMP static_assert(WF_EXEC == SD_BALANCE_EXEC); @@ -2334,6 +2335,7 @@ extern const u32 sched_prio_to_wmult[40]; * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) * ENQUEUE_MIGRATED - the task was migrated during wakeup + * ENQUEUE_RQ_SELECTED - ->select_task_rq() was called * */ @@ -2360,6 +2362,7 @@ extern const u32 sched_prio_to_wmult[40]; #define ENQUEUE_INITIAL 0x80 #define ENQUEUE_MIGRATING 0x100 #define ENQUEUE_DELAYED 0x200 +#define ENQUEUE_RQ_SELECTED 0x400 #define RETRY_TASK ((void *)-1UL) diff --git a/kernel/signal.c b/kernel/signal.c index 6e57036f947f..4344860ffcac 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2888,8 +2888,6 @@ relock: current->flags |= PF_SIGNALED; if (sig_kernel_coredump(signr)) { - int ret; - if (print_fatal_signals) print_fatal_signal(signr); proc_coredump_connector(current); @@ -2901,24 +2899,7 @@ relock: * first and our do_group_exit call below will use * that value and ignore the one we pass it. */ - ret = do_coredump(&ksig->info); - if (ret) - coredump_report_failure("coredump has not been created, error %d", - ret); - else if (!IS_ENABLED(CONFIG_COREDUMP)) { - /* - * Coredumps are not available, can't fail collecting - * the coredump. - * - * Leave a note though that the coredump is going to be - * not created. This is not an error or a warning as disabling - * support in the kernel for coredumps isn't commonplace, and - * the user must've built the kernel with the custom config so - * let them know all works as desired. - */ - coredump_report("no coredump collected as " - "that is disabled in the kernel configuration"); - } + do_coredump(&ksig->info); } /* diff --git a/kernel/static_call_inline.c b/kernel/static_call_inline.c index 639397b5491c..5259cda486d0 100644 --- a/kernel/static_call_inline.c +++ b/kernel/static_call_inline.c @@ -411,6 +411,17 @@ static void static_call_del_module(struct module *mod) for (site = start; site < stop; site++) { key = static_call_key(site); + + /* + * If the key was not updated due to a memory allocation + * failure in __static_call_init() then treating key::sites + * as key::mods in the code below would cause random memory + * access and #GP. In that case all subsequent sites have + * not been touched either, so stop iterating. + */ + if (!static_call_key_has_mods(key)) + break; + if (key == prev_key) continue; @@ -442,7 +453,7 @@ static int static_call_module_notify(struct notifier_block *nb, case MODULE_STATE_COMING: ret = static_call_add_module(mod); if (ret) { - WARN(1, "Failed to allocate memory for static calls"); + pr_warn("Failed to allocate memory for static calls\n"); static_call_del_module(mod); } break; diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 4782edcbe7b9..316a4e8c97d3 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -168,7 +168,6 @@ static int posix_clock_release(struct inode *inode, struct file *fp) static const struct file_operations posix_clock_file_operations = { .owner = THIS_MODULE, - .llseek = no_llseek, .read = posix_clock_read, .poll = posix_clock_poll, .unlocked_ioctl = posix_clock_ioctl, @@ -319,6 +318,9 @@ static int pc_clock_settime(clockid_t id, const struct timespec64 *ts) goto out; } + if (!timespec64_valid_strict(ts)) + return -EINVAL; + if (cd.clk->ops.clock_settime) err = cd.clk->ops.clock_settime(cd.clk, ts); else diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 77dc0b25140e..3ea4f7bb1837 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2337,9 +2337,12 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, if (!buffer->buffers[cpu]) goto fail_free_buffers; - ret = cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); - if (ret < 0) - goto fail_free_buffers; + /* If already mapped, do not hook to CPU hotplug */ + if (!start) { + ret = cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); + if (ret < 0) + goto fail_free_buffers; + } mutex_init(&buffer->mutex); @@ -6725,39 +6728,38 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) } for_each_buffer_cpu(buffer, cpu) { + struct buffer_data_page *old_free_data_page; + struct list_head old_pages; + unsigned long flags; if (!cpumask_test_cpu(cpu, buffer->cpumask)) continue; cpu_buffer = buffer->buffers[cpu]; + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + /* Clear the head bit to make the link list normal to read */ rb_head_page_deactivate(cpu_buffer); - /* Now walk the list and free all the old sub buffers */ - list_for_each_entry_safe(bpage, tmp, cpu_buffer->pages, list) { - list_del_init(&bpage->list); - free_buffer_page(bpage); - } - /* The above loop stopped an the last page needing to be freed */ - bpage = list_entry(cpu_buffer->pages, struct buffer_page, list); - free_buffer_page(bpage); - - /* Free the current reader page */ - free_buffer_page(cpu_buffer->reader_page); + /* + * Collect buffers from the cpu_buffer pages list and the + * reader_page on old_pages, so they can be freed later when not + * under a spinlock. The pages list is a linked list with no + * head, adding old_pages turns it into a regular list with + * old_pages being the head. + */ + list_add(&old_pages, cpu_buffer->pages); + list_add(&cpu_buffer->reader_page->list, &old_pages); /* One page was allocated for the reader page */ cpu_buffer->reader_page = list_entry(cpu_buffer->new_pages.next, struct buffer_page, list); list_del_init(&cpu_buffer->reader_page->list); - /* The cpu_buffer pages are a link list with no head */ + /* Install the new pages, remove the head from the list */ cpu_buffer->pages = cpu_buffer->new_pages.next; - cpu_buffer->new_pages.next->prev = cpu_buffer->new_pages.prev; - cpu_buffer->new_pages.prev->next = cpu_buffer->new_pages.next; - - /* Clear the new_pages list */ - INIT_LIST_HEAD(&cpu_buffer->new_pages); + list_del_init(&cpu_buffer->new_pages); cpu_buffer->head_page = list_entry(cpu_buffer->pages, struct buffer_page, list); @@ -6766,11 +6768,20 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) cpu_buffer->nr_pages = cpu_buffer->nr_pages_to_update; cpu_buffer->nr_pages_to_update = 0; - free_pages((unsigned long)cpu_buffer->free_page, old_order); + old_free_data_page = cpu_buffer->free_page; cpu_buffer->free_page = NULL; rb_head_page_activate(cpu_buffer); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + /* Free old sub buffers */ + list_for_each_entry_safe(bpage, tmp, &old_pages, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + free_pages((unsigned long)old_free_data_page, old_order); + rb_check_pages(cpu_buffer); } diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index df0745a42a3f..dc819aec43e8 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -306,7 +306,6 @@ static ssize_t monitor_enable_write_data(struct file *filp, const char __user *u static const struct file_operations interface_enable_fops = { .open = simple_open, - .llseek = no_llseek, .write = monitor_enable_write_data, .read = monitor_enable_read_data, }; @@ -329,7 +328,6 @@ static ssize_t monitor_desc_read_data(struct file *filp, char __user *user_buf, static const struct file_operations interface_desc_fops = { .open = simple_open, - .llseek = no_llseek, .read = monitor_desc_read_data, }; @@ -674,7 +672,6 @@ static ssize_t monitoring_on_write_data(struct file *filp, const char __user *us static const struct file_operations monitoring_on_fops = { .open = simple_open, - .llseek = no_llseek, .write = monitoring_on_write_data, .read = monitoring_on_read_data, }; diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c index 6aae106695b6..7b49cbe388d4 100644 --- a/kernel/trace/rv/rv_reactors.c +++ b/kernel/trace/rv/rv_reactors.c @@ -426,7 +426,6 @@ static ssize_t reacting_on_write_data(struct file *filp, const char __user *user static const struct file_operations reacting_on_fops = { .open = simple_open, - .llseek = no_llseek, .write = reacting_on_write_data, .read = reacting_on_read_data, }; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b4f348b4653f..a8f52b6527ca 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3697,8 +3697,8 @@ static void test_can_verify(void) void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, va_list ap) { - long text_delta = iter->tr->text_delta; - long data_delta = iter->tr->data_delta; + long text_delta = 0; + long data_delta = 0; const char *p = fmt; const char *str; bool good; @@ -3710,6 +3710,17 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, if (static_branch_unlikely(&trace_no_verify)) goto print; + /* + * When the kernel is booted with the tp_printk command line + * parameter, trace events go directly through to printk(). + * It also is checked by this function, but it does not + * have an associated trace_array (tr) for it. + */ + if (iter->tr) { + text_delta = iter->tr->text_delta; + data_delta = iter->tr->data_delta; + } + /* Don't bother checking when doing a ftrace_dump() */ if (iter->fmt == static_fmt_buf) goto print; @@ -7557,7 +7568,6 @@ static const struct file_operations tracing_pipe_fops = { .read = tracing_read_pipe, .splice_read = tracing_splice_read_pipe, .release = tracing_release_pipe, - .llseek = no_llseek, }; static const struct file_operations tracing_entries_fops = { @@ -7636,7 +7646,6 @@ static const struct file_operations snapshot_raw_fops = { .read = tracing_buffers_read, .release = tracing_buffers_release, .splice_read = tracing_buffers_splice_read, - .llseek = no_llseek, }; #endif /* CONFIG_TRACER_SNAPSHOT */ @@ -8466,7 +8475,6 @@ static const struct file_operations tracing_buffers_fops = { .flush = tracing_buffers_flush, .splice_read = tracing_buffers_splice_read, .unlocked_ioctl = tracing_buffers_ioctl, - .llseek = no_llseek, .mmap = tracing_buffers_mmap, }; @@ -10613,10 +10621,10 @@ __init static void enable_instances(void) * cannot be deleted by user space, so keep the reference * to it. */ - if (start) + if (start) { tr->flags |= TRACE_ARRAY_FL_BOOT; - else - trace_array_put(tr); + tr->ref++; + } while ((tok = strsep(&curr_str, ","))) { early_enable_events(tr, tok, true); diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index 62e6a8f4aae9..a079abd8955b 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -21,6 +21,7 @@ #define FPROBE_EVENT_SYSTEM "fprobes" #define TRACEPOINT_EVENT_SYSTEM "tracepoints" #define RETHOOK_MAXACTIVE_MAX 4096 +#define TRACEPOINT_STUB ERR_PTR(-ENOENT) static int trace_fprobe_create(const char *raw_command); static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev); @@ -385,6 +386,7 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group, const char *event, const char *symbol, struct tracepoint *tpoint, + struct module *mod, int maxactive, int nargs, bool is_return) { @@ -405,6 +407,7 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group, tf->fp.entry_handler = fentry_dispatcher; tf->tpoint = tpoint; + tf->mod = mod; tf->fp.nr_maxactive = maxactive; ret = trace_probe_init(&tf->tp, event, group, false, nargs); @@ -672,6 +675,24 @@ static int unregister_fprobe_event(struct trace_fprobe *tf) return trace_probe_unregister_event_call(&tf->tp); } +static int __regsiter_tracepoint_fprobe(struct trace_fprobe *tf) +{ + struct tracepoint *tpoint = tf->tpoint; + unsigned long ip = (unsigned long)tpoint->probestub; + int ret; + + /* + * Here, we do 2 steps to enable fprobe on a tracepoint. + * At first, put __probestub_##TP function on the tracepoint + * and put a fprobe on the stub function. + */ + ret = tracepoint_probe_register_prio_may_exist(tpoint, + tpoint->probestub, NULL, 0); + if (ret < 0) + return ret; + return register_fprobe_ips(&tf->fp, &ip, 1); +} + /* Internal register function - just handle fprobe and flags */ static int __register_trace_fprobe(struct trace_fprobe *tf) { @@ -698,18 +719,12 @@ static int __register_trace_fprobe(struct trace_fprobe *tf) tf->fp.flags |= FPROBE_FL_DISABLED; if (trace_fprobe_is_tracepoint(tf)) { - struct tracepoint *tpoint = tf->tpoint; - unsigned long ip = (unsigned long)tpoint->probestub; - /* - * Here, we do 2 steps to enable fprobe on a tracepoint. - * At first, put __probestub_##TP function on the tracepoint - * and put a fprobe on the stub function. - */ - ret = tracepoint_probe_register_prio_may_exist(tpoint, - tpoint->probestub, NULL, 0); - if (ret < 0) - return ret; - return register_fprobe_ips(&tf->fp, &ip, 1); + + /* This tracepoint is not loaded yet */ + if (tf->tpoint == TRACEPOINT_STUB) + return 0; + + return __regsiter_tracepoint_fprobe(tf); } /* TODO: handle filter, nofilter or symbol list */ @@ -862,20 +877,106 @@ end: return ret; } +struct __find_tracepoint_cb_data { + const char *tp_name; + struct tracepoint *tpoint; + struct module *mod; +}; + +static void __find_tracepoint_module_cb(struct tracepoint *tp, struct module *mod, void *priv) +{ + struct __find_tracepoint_cb_data *data = priv; + + if (!data->tpoint && !strcmp(data->tp_name, tp->name)) { + data->tpoint = tp; + if (!data->mod) { + data->mod = mod; + if (!try_module_get(data->mod)) { + data->tpoint = NULL; + data->mod = NULL; + } + } + } +} + +static void __find_tracepoint_cb(struct tracepoint *tp, void *priv) +{ + struct __find_tracepoint_cb_data *data = priv; + + if (!data->tpoint && !strcmp(data->tp_name, tp->name)) + data->tpoint = tp; +} + +/* + * Find a tracepoint from kernel and module. If the tracepoint is in a module, + * this increments the module refcount to prevent unloading until the + * trace_fprobe is registered to the list. After registering the trace_fprobe + * on the trace_fprobe list, the module refcount is decremented because + * tracepoint_probe_module_cb will handle it. + */ +static struct tracepoint *find_tracepoint(const char *tp_name, + struct module **tp_mod) +{ + struct __find_tracepoint_cb_data data = { + .tp_name = tp_name, + .mod = NULL, + }; + + for_each_kernel_tracepoint(__find_tracepoint_cb, &data); + + if (!data.tpoint && IS_ENABLED(CONFIG_MODULES)) { + for_each_module_tracepoint(__find_tracepoint_module_cb, &data); + *tp_mod = data.mod; + } + + return data.tpoint; +} + #ifdef CONFIG_MODULES +static void reenable_trace_fprobe(struct trace_fprobe *tf) +{ + struct trace_probe *tp = &tf->tp; + + list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { + __enable_trace_fprobe(tf); + } +} + +static struct tracepoint *find_tracepoint_in_module(struct module *mod, + const char *tp_name) +{ + struct __find_tracepoint_cb_data data = { + .tp_name = tp_name, + .mod = mod, + }; + + for_each_tracepoint_in_module(mod, __find_tracepoint_module_cb, &data); + return data.tpoint; +} + static int __tracepoint_probe_module_cb(struct notifier_block *self, unsigned long val, void *data) { struct tp_module *tp_mod = data; + struct tracepoint *tpoint; struct trace_fprobe *tf; struct dyn_event *pos; - if (val != MODULE_STATE_GOING) + if (val != MODULE_STATE_GOING && val != MODULE_STATE_COMING) return NOTIFY_DONE; mutex_lock(&event_mutex); for_each_trace_fprobe(tf, pos) { - if (tp_mod->mod == tf->mod) { + if (val == MODULE_STATE_COMING && tf->tpoint == TRACEPOINT_STUB) { + tpoint = find_tracepoint_in_module(tp_mod->mod, tf->symbol); + if (tpoint) { + tf->tpoint = tpoint; + tf->mod = tp_mod->mod; + if (!WARN_ON_ONCE(__regsiter_tracepoint_fprobe(tf)) && + trace_probe_is_enabled(&tf->tp)) + reenable_trace_fprobe(tf); + } + } else if (val == MODULE_STATE_GOING && tp_mod->mod == tf->mod) { tracepoint_probe_unregister(tf->tpoint, tf->tpoint->probestub, NULL); tf->tpoint = NULL; @@ -892,30 +993,6 @@ static struct notifier_block tracepoint_module_nb = { }; #endif /* CONFIG_MODULES */ -struct __find_tracepoint_cb_data { - const char *tp_name; - struct tracepoint *tpoint; -}; - -static void __find_tracepoint_cb(struct tracepoint *tp, void *priv) -{ - struct __find_tracepoint_cb_data *data = priv; - - if (!data->tpoint && !strcmp(data->tp_name, tp->name)) - data->tpoint = tp; -} - -static struct tracepoint *find_tracepoint(const char *tp_name) -{ - struct __find_tracepoint_cb_data data = { - .tp_name = tp_name, - }; - - for_each_kernel_tracepoint(__find_tracepoint_cb, &data); - - return data.tpoint; -} - static int parse_symbol_and_return(int argc, const char *argv[], char **symbol, bool *is_return, bool is_tracepoint) @@ -996,6 +1073,7 @@ static int __trace_fprobe_create(int argc, const char *argv[]) char abuf[MAX_BTF_ARGS_LEN]; char *dbuf = NULL; bool is_tracepoint = false; + struct module *tp_mod = NULL; struct tracepoint *tpoint = NULL; struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE, @@ -1080,15 +1158,20 @@ static int __trace_fprobe_create(int argc, const char *argv[]) if (is_tracepoint) { ctx.flags |= TPARG_FL_TPOINT; - tpoint = find_tracepoint(symbol); - if (!tpoint) { + tpoint = find_tracepoint(symbol, &tp_mod); + if (tpoint) { + ctx.funcname = kallsyms_lookup( + (unsigned long)tpoint->probestub, + NULL, NULL, NULL, sbuf); + } else if (IS_ENABLED(CONFIG_MODULES)) { + /* This *may* be loaded afterwards */ + tpoint = TRACEPOINT_STUB; + ctx.funcname = symbol; + } else { trace_probe_log_set_index(1); trace_probe_log_err(0, NO_TRACEPOINT); goto parse_error; } - ctx.funcname = kallsyms_lookup( - (unsigned long)tpoint->probestub, - NULL, NULL, NULL, sbuf); } else ctx.funcname = symbol; @@ -1110,8 +1193,8 @@ static int __trace_fprobe_create(int argc, const char *argv[]) goto out; /* setup a probe */ - tf = alloc_trace_fprobe(group, event, symbol, tpoint, maxactive, - argc, is_return); + tf = alloc_trace_fprobe(group, event, symbol, tpoint, tp_mod, + maxactive, argc, is_return); if (IS_ERR(tf)) { ret = PTR_ERR(tf); /* This must return -ENOMEM, else there is a bug */ @@ -1119,10 +1202,6 @@ static int __trace_fprobe_create(int argc, const char *argv[]) goto out; /* We know tf is not allocated */ } - if (is_tracepoint) - tf->mod = __module_text_address( - (unsigned long)tf->tpoint->probestub); - /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { trace_probe_log_set_index(i + 2); @@ -1155,6 +1234,8 @@ static int __trace_fprobe_create(int argc, const char *argv[]) } out: + if (tp_mod) + module_put(tp_mod); traceprobe_finish_parse(&ctx); trace_probe_log_clear(); kfree(new_argv); diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index b791524a6536..3bd6071441ad 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -520,6 +520,8 @@ static void hwlat_hotplug_workfn(struct work_struct *dummy) if (!hwlat_busy || hwlat_data.thread_mode != MODE_PER_CPU) goto out_unlock; + if (!cpu_online(cpu)) + goto out_unlock; if (!cpumask_test_cpu(cpu, tr->tracing_cpumask)) goto out_unlock; diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 1439064f65d6..a50ed23bee77 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1953,12 +1953,8 @@ static void stop_kthread(unsigned int cpu) { struct task_struct *kthread; - mutex_lock(&interface_lock); - kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread; + kthread = xchg_relaxed(&(per_cpu(per_cpu_osnoise_var, cpu).kthread), NULL); if (kthread) { - per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; - mutex_unlock(&interface_lock); - if (cpumask_test_and_clear_cpu(cpu, &kthread_cpumask) && !WARN_ON(!test_bit(OSN_WORKLOAD, &osnoise_options))) { kthread_stop(kthread); @@ -1972,7 +1968,6 @@ static void stop_kthread(unsigned int cpu) put_task_struct(kthread); } } else { - mutex_unlock(&interface_lock); /* if no workload, just return */ if (!test_bit(OSN_WORKLOAD, &osnoise_options)) { /* @@ -1994,8 +1989,12 @@ static void stop_per_cpu_kthreads(void) { int cpu; - for_each_possible_cpu(cpu) + cpus_read_lock(); + + for_each_online_cpu(cpu) stop_kthread(cpu); + + cpus_read_unlock(); } /* @@ -2007,6 +2006,10 @@ static int start_kthread(unsigned int cpu) void *main = osnoise_main; char comm[24]; + /* Do not start a new thread if it is already running */ + if (per_cpu(per_cpu_osnoise_var, cpu).kthread) + return 0; + if (timerlat_enabled()) { snprintf(comm, 24, "timerlat/%d", cpu); main = timerlat_main; @@ -2061,11 +2064,10 @@ static int start_per_cpu_kthreads(void) if (cpumask_test_and_clear_cpu(cpu, &kthread_cpumask)) { struct task_struct *kthread; - kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread; + kthread = xchg_relaxed(&(per_cpu(per_cpu_osnoise_var, cpu).kthread), NULL); if (!WARN_ON(!kthread)) kthread_stop(kthread); } - per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; } for_each_cpu(cpu, current_mask) { @@ -2095,6 +2097,8 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy) mutex_lock(&interface_lock); cpus_read_lock(); + if (!cpu_online(cpu)) + goto out_unlock; if (!cpumask_test_cpu(cpu, &osnoise_cpumask)) goto out_unlock; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index f7443e996b1b..c40531d2cbad 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -17,6 +17,7 @@ #include <linux/string.h> #include <linux/rculist.h> #include <linux/filter.h> +#include <linux/percpu.h> #include "trace_dynevent.h" #include "trace_probe.h" @@ -62,7 +63,7 @@ struct trace_uprobe { struct uprobe *uprobe; unsigned long offset; unsigned long ref_ctr_offset; - unsigned long nhit; + unsigned long __percpu *nhits; struct trace_probe tp; }; @@ -337,6 +338,12 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) if (!tu) return ERR_PTR(-ENOMEM); + tu->nhits = alloc_percpu(unsigned long); + if (!tu->nhits) { + ret = -ENOMEM; + goto error; + } + ret = trace_probe_init(&tu->tp, event, group, true, nargs); if (ret < 0) goto error; @@ -349,6 +356,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) return tu; error: + free_percpu(tu->nhits); kfree(tu); return ERR_PTR(ret); @@ -362,6 +370,7 @@ static void free_trace_uprobe(struct trace_uprobe *tu) path_put(&tu->path); trace_probe_cleanup(&tu->tp); kfree(tu->filename); + free_percpu(tu->nhits); kfree(tu); } @@ -815,13 +824,21 @@ static int probes_profile_seq_show(struct seq_file *m, void *v) { struct dyn_event *ev = v; struct trace_uprobe *tu; + unsigned long nhits; + int cpu; if (!is_trace_uprobe(ev)) return 0; tu = to_trace_uprobe(ev); + + nhits = 0; + for_each_possible_cpu(cpu) { + nhits += per_cpu(*tu->nhits, cpu); + } + seq_printf(m, " %s %-44s %15lu\n", tu->filename, - trace_probe_name(&tu->tp), tu->nhit); + trace_probe_name(&tu->tp), nhits); return 0; } @@ -1508,7 +1525,8 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) int ret = 0; tu = container_of(con, struct trace_uprobe, consumer); - tu->nhit++; + + this_cpu_inc(*tu->nhits); udd.tu = tu; udd.bp_addr = instruction_pointer(regs); diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 8d1507dd0724..8879da16ef4d 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -735,6 +735,48 @@ static __init int init_tracepoints(void) return ret; } __initcall(init_tracepoints); + +/** + * for_each_tracepoint_in_module - iteration on all tracepoints in a module + * @mod: module + * @fct: callback + * @priv: private data + */ +void for_each_tracepoint_in_module(struct module *mod, + void (*fct)(struct tracepoint *tp, + struct module *mod, void *priv), + void *priv) +{ + tracepoint_ptr_t *begin, *end, *iter; + + lockdep_assert_held(&tracepoint_module_list_mutex); + + if (!mod) + return; + + begin = mod->tracepoints_ptrs; + end = mod->tracepoints_ptrs + mod->num_tracepoints; + + for (iter = begin; iter < end; iter++) + fct(tracepoint_ptr_deref(iter), mod, priv); +} + +/** + * for_each_module_tracepoint - iteration on all tracepoints in all modules + * @fct: callback + * @priv: private data + */ +void for_each_module_tracepoint(void (*fct)(struct tracepoint *tp, + struct module *mod, void *priv), + void *priv) +{ + struct tp_module *tp_mod; + + mutex_lock(&tracepoint_module_list_mutex); + list_for_each_entry(tp_mod, &tracepoint_module_list, list) + for_each_tracepoint_in_module(tp_mod->mod, fct, priv); + mutex_unlock(&tracepoint_module_list_mutex); +} #endif /* CONFIG_MODULES */ /** |