diff options
Diffstat (limited to 'kernel')
51 files changed, 691 insertions, 539 deletions
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 583ee4fe48ef..e52b3ad231b9 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -212,6 +212,7 @@ static u64 arena_map_mem_usage(const struct bpf_map *map) struct vma_list { struct vm_area_struct *vma; struct list_head head; + atomic_t mmap_count; }; static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma) @@ -221,20 +222,30 @@ static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma) vml = kmalloc(sizeof(*vml), GFP_KERNEL); if (!vml) return -ENOMEM; + atomic_set(&vml->mmap_count, 1); vma->vm_private_data = vml; vml->vma = vma; list_add(&vml->head, &arena->vma_list); return 0; } +static void arena_vm_open(struct vm_area_struct *vma) +{ + struct vma_list *vml = vma->vm_private_data; + + atomic_inc(&vml->mmap_count); +} + static void arena_vm_close(struct vm_area_struct *vma) { struct bpf_map *map = vma->vm_file->private_data; struct bpf_arena *arena = container_of(map, struct bpf_arena, map); - struct vma_list *vml; + struct vma_list *vml = vma->vm_private_data; + if (!atomic_dec_and_test(&vml->mmap_count)) + return; guard(mutex)(&arena->lock); - vml = vma->vm_private_data; + /* update link list under lock */ list_del(&vml->head); vma->vm_private_data = NULL; kfree(vml); @@ -287,6 +298,7 @@ out: } static const struct vm_operations_struct arena_vm_ops = { + .open = arena_vm_open, .close = arena_vm_close, .fault = arena_vm_fault, }; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 976cb258a0ed..c938dea5ddbf 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -782,8 +782,8 @@ bpf_local_storage_map_alloc(union bpf_attr *attr, nbuckets = max_t(u32, 2, nbuckets); smap->bucket_log = ilog2(nbuckets); - smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets), - nbuckets, GFP_USER | __GFP_NOWARN); + smap->buckets = bpf_map_kvcalloc(&smap->map, nbuckets, + sizeof(*smap->buckets), GFP_USER | __GFP_NOWARN); if (!smap->buckets) { err = -ENOMEM; goto free_smap; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 1a6c3faa6e4a..695a0fb2cd4d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -736,11 +736,11 @@ static struct bpf_ksym *bpf_ksym_find(unsigned long addr) return n ? container_of(n, struct bpf_ksym, tnode) : NULL; } -const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, +int __bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char *sym) { struct bpf_ksym *ksym; - char *ret = NULL; + int ret = 0; rcu_read_lock(); ksym = bpf_ksym_find(addr); @@ -748,9 +748,8 @@ const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long symbol_start = ksym->start; unsigned long symbol_end = ksym->end; - strscpy(sym, ksym->name, KSYM_NAME_LEN); + ret = strscpy(sym, ksym->name, KSYM_NAME_LEN); - ret = sym; if (size) *size = symbol_end - symbol_start; if (off) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 2a69a9a36c0f..3243c83ef3e3 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1084,7 +1084,10 @@ struct bpf_async_cb { struct bpf_prog *prog; void __rcu *callback_fn; void *value; - struct rcu_head rcu; + union { + struct rcu_head rcu; + struct work_struct delete_work; + }; u64 flags; }; @@ -1107,6 +1110,7 @@ struct bpf_async_cb { struct bpf_hrtimer { struct bpf_async_cb cb; struct hrtimer timer; + atomic_t cancelling; }; struct bpf_work { @@ -1219,6 +1223,21 @@ static void bpf_wq_delete_work(struct work_struct *work) kfree_rcu(w, cb.rcu); } +static void bpf_timer_delete_work(struct work_struct *work) +{ + struct bpf_hrtimer *t = container_of(work, struct bpf_hrtimer, cb.delete_work); + + /* Cancel the timer and wait for callback to complete if it was running. + * If hrtimer_cancel() can be safely called it's safe to call + * kfree_rcu(t) right after for both preallocated and non-preallocated + * maps. The async->cb = NULL was already done and no code path can see + * address 't' anymore. Timer if armed for existing bpf_hrtimer before + * bpf_timer_cancel_and_free will have been cancelled. + */ + hrtimer_cancel(&t->timer); + kfree_rcu(t, cb.rcu); +} + static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags, enum bpf_async_type type) { @@ -1262,6 +1281,8 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u clockid = flags & (MAX_CLOCKS - 1); t = (struct bpf_hrtimer *)cb; + atomic_set(&t->cancelling, 0); + INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work); hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); t->timer.function = bpf_timer_cb; cb->value = (void *)async - map->record->timer_off; @@ -1440,7 +1461,8 @@ static void drop_prog_refcnt(struct bpf_async_cb *async) BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) { - struct bpf_hrtimer *t; + struct bpf_hrtimer *t, *cur_t; + bool inc = false; int ret = 0; if (in_nmi()) @@ -1452,14 +1474,41 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) ret = -EINVAL; goto out; } - if (this_cpu_read(hrtimer_running) == t) { + + cur_t = this_cpu_read(hrtimer_running); + if (cur_t == t) { /* If bpf callback_fn is trying to bpf_timer_cancel() * its own timer the hrtimer_cancel() will deadlock - * since it waits for callback_fn to finish + * since it waits for callback_fn to finish. */ ret = -EDEADLK; goto out; } + + /* Only account in-flight cancellations when invoked from a timer + * callback, since we want to avoid waiting only if other _callbacks_ + * are waiting on us, to avoid introducing lockups. Non-callback paths + * are ok, since nobody would synchronously wait for their completion. + */ + if (!cur_t) + goto drop; + atomic_inc(&t->cancelling); + /* Need full barrier after relaxed atomic_inc */ + smp_mb__after_atomic(); + inc = true; + if (atomic_read(&cur_t->cancelling)) { + /* We're cancelling timer t, while some other timer callback is + * attempting to cancel us. In such a case, it might be possible + * that timer t belongs to the other callback, or some other + * callback waiting upon it (creating transitive dependencies + * upon us), and we will enter a deadlock if we continue + * cancelling and waiting for it synchronously, since it might + * do the same. Bail! + */ + ret = -EDEADLK; + goto out; + } +drop: drop_prog_refcnt(&t->cb); out: __bpf_spin_unlock_irqrestore(&timer->lock); @@ -1467,6 +1516,8 @@ out: * if it was running. */ ret = ret ?: hrtimer_cancel(&t->timer); + if (inc) + atomic_dec(&t->cancelling); rcu_read_unlock(); return ret; } @@ -1512,25 +1563,39 @@ void bpf_timer_cancel_and_free(void *val) if (!t) return; - /* Cancel the timer and wait for callback to complete if it was running. - * If hrtimer_cancel() can be safely called it's safe to call kfree(t) - * right after for both preallocated and non-preallocated maps. - * The async->cb = NULL was already done and no code path can - * see address 't' anymore. - * - * Check that bpf_map_delete/update_elem() wasn't called from timer - * callback_fn. In such case don't call hrtimer_cancel() (since it will - * deadlock) and don't call hrtimer_try_to_cancel() (since it will just - * return -1). Though callback_fn is still running on this cpu it's + /* We check that bpf_map_delete/update_elem() was called from timer + * callback_fn. In such case we don't call hrtimer_cancel() (since it + * will deadlock) and don't call hrtimer_try_to_cancel() (since it will + * just return -1). Though callback_fn is still running on this cpu it's * safe to do kfree(t) because bpf_timer_cb() read everything it needed * from 't'. The bpf subprog callback_fn won't be able to access 't', * since async->cb = NULL was already done. The timer will be * effectively cancelled because bpf_timer_cb() will return * HRTIMER_NORESTART. + * + * However, it is possible the timer callback_fn calling us armed the + * timer _before_ calling us, such that failing to cancel it here will + * cause it to possibly use struct hrtimer after freeing bpf_hrtimer. + * Therefore, we _need_ to cancel any outstanding timers before we do + * kfree_rcu, even though no more timers can be armed. + * + * Moreover, we need to schedule work even if timer does not belong to + * the calling callback_fn, as on two different CPUs, we can end up in a + * situation where both sides run in parallel, try to cancel one + * another, and we end up waiting on both sides in hrtimer_cancel + * without making forward progress, since timer1 depends on time2 + * callback to finish, and vice versa. + * + * CPU 1 (timer1_cb) CPU 2 (timer2_cb) + * bpf_timer_cancel_and_free(timer2) bpf_timer_cancel_and_free(timer1) + * + * To avoid these issues, punt to workqueue context when we are in a + * timer callback. */ - if (this_cpu_read(hrtimer_running) != t) - hrtimer_cancel(&t->timer); - kfree_rcu(t, cb.rcu); + if (this_cpu_read(hrtimer_running)) + queue_work(system_unbound_wq, &t->cb.delete_work); + else + bpf_timer_delete_work(&t->cb.delete_work); } /* This function is called by map_delete/update_elem for individual element and diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 0ee653a936ea..e20b90c36131 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -51,7 +51,8 @@ struct bpf_ringbuf { * This prevents a user-space application from modifying the * position and ruining in-kernel tracking. The permissions of the * pages depend on who is producing samples: user-space or the - * kernel. + * kernel. Note that the pending counter is placed in the same + * page as the producer, so that it shares the same cache line. * * Kernel-producer * --------------- @@ -70,6 +71,7 @@ struct bpf_ringbuf { */ unsigned long consumer_pos __aligned(PAGE_SIZE); unsigned long producer_pos __aligned(PAGE_SIZE); + unsigned long pending_pos; char data[] __aligned(PAGE_SIZE); }; @@ -179,6 +181,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) rb->mask = data_sz - 1; rb->consumer_pos = 0; rb->producer_pos = 0; + rb->pending_pos = 0; return rb; } @@ -404,9 +407,9 @@ bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr) static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) { - unsigned long cons_pos, prod_pos, new_prod_pos, flags; - u32 len, pg_off; + unsigned long cons_pos, prod_pos, new_prod_pos, pend_pos, flags; struct bpf_ringbuf_hdr *hdr; + u32 len, pg_off, tmp_size, hdr_len; if (unlikely(size > RINGBUF_MAX_RECORD_SZ)) return NULL; @@ -424,13 +427,29 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) spin_lock_irqsave(&rb->spinlock, flags); } + pend_pos = rb->pending_pos; prod_pos = rb->producer_pos; new_prod_pos = prod_pos + len; - /* check for out of ringbuf space by ensuring producer position - * doesn't advance more than (ringbuf_size - 1) ahead + while (pend_pos < prod_pos) { + hdr = (void *)rb->data + (pend_pos & rb->mask); + hdr_len = READ_ONCE(hdr->len); + if (hdr_len & BPF_RINGBUF_BUSY_BIT) + break; + tmp_size = hdr_len & ~BPF_RINGBUF_DISCARD_BIT; + tmp_size = round_up(tmp_size + BPF_RINGBUF_HDR_SZ, 8); + pend_pos += tmp_size; + } + rb->pending_pos = pend_pos; + + /* check for out of ringbuf space: + * - by ensuring producer position doesn't advance more than + * (ringbuf_size - 1) ahead + * - by ensuring oldest not yet committed record until newest + * record does not span more than (ringbuf_size - 1) */ - if (new_prod_pos - cons_pos > rb->mask) { + if (new_prod_pos - cons_pos > rb->mask || + new_prod_pos - pend_pos > rb->mask) { spin_unlock_irqrestore(&rb->spinlock, flags); return NULL; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 010cfee7ffe9..214a9fa8c6fb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6236,6 +6236,7 @@ static void set_sext32_default_val(struct bpf_reg_state *reg, int size) } reg->u32_min_value = 0; reg->u32_max_value = U32_MAX; + reg->var_off = tnum_subreg(tnum_unknown); } static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size) @@ -6280,6 +6281,7 @@ static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size) reg->s32_max_value = s32_max; reg->u32_min_value = (u32)s32_min; reg->u32_max_value = (u32)s32_max; + reg->var_off = tnum_subreg(tnum_range(s32_min, s32_max)); return; } @@ -12719,6 +12721,16 @@ static bool signed_add32_overflows(s32 a, s32 b) return res < a; } +static bool signed_add16_overflows(s16 a, s16 b) +{ + /* Do the add in u16, where overflow is well-defined */ + s16 res = (s16)((u16)a + (u16)b); + + if (b < 0) + return res > a; + return res < a; +} + static bool signed_sub_overflows(s64 a, s64 b) { /* Do the sub in u64, where overflow is well-defined */ @@ -17448,11 +17460,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) goto skip_inf_loop_check; } if (is_may_goto_insn_at(env, insn_idx)) { - if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) { + if (sl->state.may_goto_depth != cur->may_goto_depth && + states_equal(env, &sl->state, cur, RANGE_WITHIN)) { update_loop_entry(cur, &sl->state); goto hit; } - goto skip_inf_loop_check; } if (calls_callback(env, insn_idx)) { if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) @@ -18730,6 +18742,39 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of return new_prog; } +/* + * For all jmp insns in a given 'prog' that point to 'tgt_idx' insn adjust the + * jump offset by 'delta'. + */ +static int adjust_jmp_off(struct bpf_prog *prog, u32 tgt_idx, u32 delta) +{ + struct bpf_insn *insn = prog->insnsi; + u32 insn_cnt = prog->len, i; + + for (i = 0; i < insn_cnt; i++, insn++) { + u8 code = insn->code; + + if ((BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) || + BPF_OP(code) == BPF_CALL || BPF_OP(code) == BPF_EXIT) + continue; + + if (insn->code == (BPF_JMP32 | BPF_JA)) { + if (i + 1 + insn->imm != tgt_idx) + continue; + if (signed_add32_overflows(insn->imm, delta)) + return -ERANGE; + insn->imm += delta; + } else { + if (i + 1 + insn->off != tgt_idx) + continue; + if (signed_add16_overflows(insn->imm, delta)) + return -ERANGE; + insn->off += delta; + } + } + return 0; +} + static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env, u32 off, u32 cnt) { @@ -20004,7 +20049,10 @@ static int do_misc_fixups(struct bpf_verifier_env *env) stack_depth_extra = 8; insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off); - insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 2); + if (insn->off >= 0) + insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 2); + else + insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1); insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1); insn_buf[3] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off); cnt = 4; @@ -20546,6 +20594,13 @@ next_insn: if (!new_prog) return -ENOMEM; env->prog = prog = new_prog; + /* + * If may_goto is a first insn of a prog there could be a jmp + * insn that points to it, hence adjust all such jmps to point + * to insn after BPF_ST that inits may_goto count. + * Adjustment will succeed because bpf_patch_insn_data() didn't fail. + */ + WARN_ON(adjust_jmp_off(env->prog, subprog_start, 1)); } /* Since poke tab is now finalized, publish aux to tracker. */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 563877d6c28b..7121f058674a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1859,6 +1859,9 @@ static inline bool cpuhp_bringup_cpus_parallel(unsigned int ncpus) { return fals void __init bringup_nonboot_cpus(unsigned int max_cpus) { + if (!max_cpus) + return; + /* Try parallel bringup optimization if enabled */ if (cpuhp_bringup_cpus_parallel(max_cpus)) return; @@ -1891,8 +1894,8 @@ int freeze_secondary_cpus(int primary) cpumask_clear(frozen_cpus); pr_info("Disabling non-boot CPUs ...\n"); - for_each_online_cpu(cpu) { - if (cpu == primary) + for (cpu = nr_cpu_ids - 1; cpu >= 0; cpu--) { + if (!cpu_online(cpu) || cpu == primary) continue; if (pm_wakeup_pending()) { @@ -2446,7 +2449,7 @@ EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); * The caller needs to hold cpus read locked while calling this function. * Return: * On success: - * Positive state number if @state is CPUHP_AP_ONLINE_DYN; + * Positive state number if @state is CPUHP_AP_ONLINE_DYN or CPUHP_BP_PREPARE_DYN; * 0 for all other states * On failure: proper (negative) error code */ @@ -2469,7 +2472,7 @@ int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state, ret = cpuhp_store_callbacks(state, name, startup, teardown, multi_instance); - dynstate = state == CPUHP_AP_ONLINE_DYN; + dynstate = state == CPUHP_AP_ONLINE_DYN || state == CPUHP_BP_PREPARE_DYN; if (ret > 0 && dynstate) { state = ret; ret = 0; @@ -2500,8 +2503,8 @@ int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state, out: mutex_unlock(&cpuhp_state_mutex); /* - * If the requested state is CPUHP_AP_ONLINE_DYN, return the - * dynamically allocated state in case of success. + * If the requested state is CPUHP_AP_ONLINE_DYN or CPUHP_BP_PREPARE_DYN, + * return the dynamically allocated state in case of success. */ if (!ret && dynstate) return state; diff --git a/kernel/exit.c b/kernel/exit.c index f95a2c1338a8..81fcee45d630 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -484,6 +484,8 @@ retry: * Search through everything else, we should not get here often. */ for_each_process(g) { + if (atomic_read(&mm->mm_users) <= 1) + break; if (g->flags & PF_KTHREAD) continue; for_each_thread(g, c) { diff --git a/kernel/fork.c b/kernel/fork.c index 99076dbe27d8..763a042eef9c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -616,12 +616,6 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) exe_file = get_mm_exe_file(oldmm); RCU_INIT_POINTER(mm->exe_file, exe_file); - /* - * We depend on the oldmm having properly denied write access to the - * exe_file already. - */ - if (exe_file && deny_write_access(exe_file)) - pr_warn_once("deny_write_access() failed in %s\n", __func__); } #ifdef CONFIG_MMU @@ -1412,20 +1406,11 @@ int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) */ old_exe_file = rcu_dereference_raw(mm->exe_file); - if (new_exe_file) { - /* - * We expect the caller (i.e., sys_execve) to already denied - * write access, so this is unlikely to fail. - */ - if (unlikely(deny_write_access(new_exe_file))) - return -EACCES; + if (new_exe_file) get_file(new_exe_file); - } rcu_assign_pointer(mm->exe_file, new_exe_file); - if (old_exe_file) { - allow_write_access(old_exe_file); + if (old_exe_file) fput(old_exe_file); - } return 0; } @@ -1464,9 +1449,6 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) return ret; } - ret = deny_write_access(new_exe_file); - if (ret) - return -EACCES; get_file(new_exe_file); /* set the new file */ @@ -1475,10 +1457,8 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) rcu_assign_pointer(mm->exe_file, new_exe_file); mmap_write_unlock(mm); - if (old_exe_file) { - allow_write_access(old_exe_file); + if (old_exe_file) fput(old_exe_file); - } return 0; } diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 22ea19a36e6e..98b9622d372e 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -388,12 +388,12 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, !!__bpf_address_lookup(addr, symbolsize, offset, namebuf); } -static const char *kallsyms_lookup_buildid(unsigned long addr, +static int kallsyms_lookup_buildid(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, const unsigned char **modbuildid, char *namebuf) { - const char *ret; + int ret; namebuf[KSYM_NAME_LEN - 1] = 0; namebuf[0] = 0; @@ -410,7 +410,7 @@ static const char *kallsyms_lookup_buildid(unsigned long addr, if (modbuildid) *modbuildid = NULL; - ret = namebuf; + ret = strlen(namebuf); goto found; } @@ -442,8 +442,13 @@ const char *kallsyms_lookup(unsigned long addr, unsigned long *offset, char **modname, char *namebuf) { - return kallsyms_lookup_buildid(addr, symbolsize, offset, modname, - NULL, namebuf); + int ret = kallsyms_lookup_buildid(addr, symbolsize, offset, modname, + NULL, namebuf); + + if (!ret) + return NULL; + + return namebuf; } int lookup_symbol_name(unsigned long addr, char *symname) @@ -478,19 +483,15 @@ static int __sprint_symbol(char *buffer, unsigned long address, { char *modname; const unsigned char *buildid; - const char *name; unsigned long offset, size; int len; address += symbol_offset; - name = kallsyms_lookup_buildid(address, &size, &offset, &modname, &buildid, + len = kallsyms_lookup_buildid(address, &size, &offset, &modname, &buildid, buffer); - if (!name) + if (!len) return sprintf(buffer, "0x%lx", address - symbol_offset); - if (name != buffer) - strcpy(buffer, name); - len = strlen(buffer); offset -= symbol_offset; if (add_offset) diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index 0c17b4c83e1c..117d9d4d3c3b 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -1620,5 +1620,6 @@ static struct kunit_suite kcsan_test_suite = { kunit_test_suites(&kcsan_test_suite); +MODULE_DESCRIPTION("KCSAN test suite"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Marco Elver <elver@google.com>"); diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 415d81e6ce70..de95ec07e477 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -30,6 +30,7 @@ #include <linux/torture.h> #include <linux/reboot.h> +MODULE_DESCRIPTION("torture test facility for locking"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c index 62fb57bb9f16..bf65e0c3c86f 100644 --- a/kernel/module/kallsyms.c +++ b/kernel/module/kallsyms.c @@ -321,14 +321,15 @@ void * __weak dereference_module_function_descriptor(struct module *mod, * For kallsyms to ask for address resolution. NULL means not found. Careful * not to lock to avoid deadlock on oopses, simply disable preemption. */ -const char *module_address_lookup(unsigned long addr, - unsigned long *size, - unsigned long *offset, - char **modname, - const unsigned char **modbuildid, - char *namebuf) +int module_address_lookup(unsigned long addr, + unsigned long *size, + unsigned long *offset, + char **modname, + const unsigned char **modbuildid, + char *namebuf) { - const char *ret = NULL; + const char *sym; + int ret = 0; struct module *mod; preempt_disable(); @@ -344,12 +345,10 @@ const char *module_address_lookup(unsigned long addr, #endif } - ret = find_kallsyms_symbol(mod, addr, size, offset); - } - /* Make a copy in here where it's safe */ - if (ret) { - strscpy(namebuf, ret, KSYM_NAME_LEN); - ret = namebuf; + sym = find_kallsyms_symbol(mod, addr, size, offset); + + if (sym) + ret = strscpy(namebuf, sym, KSYM_NAME_LEN); } preempt_enable(); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 25f3cf679b35..bdf0087d6442 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -249,24 +249,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) set_current_state(TASK_INTERRUPTIBLE); if (pid_ns->pid_allocated == init_pids) break; - /* - * Release tasks_rcu_exit_srcu to avoid following deadlock: - * - * 1) TASK A unshare(CLONE_NEWPID) - * 2) TASK A fork() twice -> TASK B (child reaper for new ns) - * and TASK C - * 3) TASK B exits, kills TASK C, waits for TASK A to reap it - * 4) TASK A calls synchronize_rcu_tasks() - * -> synchronize_srcu(tasks_rcu_exit_srcu) - * 5) *DEADLOCK* - * - * It is considered safe to release tasks_rcu_exit_srcu here - * because we assume the current task can not be concurrently - * reaped at this point. - */ - exit_tasks_rcu_stop(); schedule(); - exit_tasks_rcu_start(); } __set_current_state(TASK_RUNNING); diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index 040fe7d1eda2..39a2b61c7232 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y = printk.o conopt.o +obj-y = printk.o obj-$(CONFIG_PRINTK) += printk_safe.o nbcon.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o obj-$(CONFIG_PRINTK_INDEX) += index.o diff --git a/kernel/printk/conopt.c b/kernel/printk/conopt.c deleted file mode 100644 index 9d507bac3657..000000000000 --- a/kernel/printk/conopt.c +++ /dev/null @@ -1,146 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Kernel command line console options for hardware based addressing - * - * Copyright (C) 2023 Texas Instruments Incorporated - https://www.ti.com/ - * Author: Tony Lindgren <tony@atomide.com> - */ - -#include <linux/console.h> -#include <linux/init.h> -#include <linux/string.h> -#include <linux/types.h> - -#include <asm/errno.h> - -#include "console_cmdline.h" - -/* - * Allow longer DEVNAME:0.0 style console naming such as abcd0000.serial:0.0 - * in addition to the legacy ttyS0 style naming. - */ -#define CONSOLE_NAME_MAX 32 - -#define CONSOLE_OPT_MAX 16 -#define CONSOLE_BRL_OPT_MAX 16 - -struct console_option { - char name[CONSOLE_NAME_MAX]; - char opt[CONSOLE_OPT_MAX]; - char brl_opt[CONSOLE_BRL_OPT_MAX]; - u8 has_brl_opt:1; -}; - -/* Updated only at console_setup() time, no locking needed */ -static struct console_option conopt[MAX_CMDLINECONSOLES]; - -/** - * console_opt_save - Saves kernel command line console option for driver use - * @str: Kernel command line console name and option - * @brl_opt: Braille console options - * - * Saves a kernel command line console option for driver subsystems to use for - * adding a preferred console during init. Called from console_setup() only. - * - * Return: 0 on success, negative error code on failure. - */ -int __init console_opt_save(const char *str, const char *brl_opt) -{ - struct console_option *con; - size_t namelen, optlen; - const char *opt; - int i; - - namelen = strcspn(str, ","); - if (namelen == 0 || namelen >= CONSOLE_NAME_MAX) - return -EINVAL; - - opt = str + namelen; - if (*opt == ',') - opt++; - - optlen = strlen(opt); - if (optlen >= CONSOLE_OPT_MAX) - return -EINVAL; - - for (i = 0; i < MAX_CMDLINECONSOLES; i++) { - con = &conopt[i]; - - if (con->name[0]) { - if (!strncmp(str, con->name, namelen)) - return 0; - continue; - } - - /* - * The name isn't terminated, only opt is. Empty opt is fine, - * but brl_opt can be either empty or NULL. For more info, see - * _braille_console_setup(). - */ - strscpy(con->name, str, namelen + 1); - strscpy(con->opt, opt, CONSOLE_OPT_MAX); - if (brl_opt) { - strscpy(con->brl_opt, brl_opt, CONSOLE_BRL_OPT_MAX); - con->has_brl_opt = 1; - } - - return 0; - } - - return -ENOMEM; -} - -static struct console_option *console_opt_find(const char *name) -{ - struct console_option *con; - int i; - - for (i = 0; i < MAX_CMDLINECONSOLES; i++) { - con = &conopt[i]; - if (!strcmp(name, con->name)) - return con; - } - - return NULL; -} - -/** - * add_preferred_console_match - Adds a preferred console if a match is found - * @match: Expected console on kernel command line, such as console=DEVNAME:0.0 - * @name: Name of the console character device to add such as ttyS - * @idx: Index for the console - * - * Allows driver subsystems to add a console after translating the command - * line name to the character device name used for the console. Options are - * added automatically based on the kernel command line. Duplicate preferred - * consoles are ignored by __add_preferred_console(). - * - * Return: 0 on success, negative error code on failure. - */ -int add_preferred_console_match(const char *match, const char *name, - const short idx) -{ - struct console_option *con; - char *brl_opt = NULL; - - if (!match || !strlen(match) || !name || !strlen(name) || - idx < 0) - return -EINVAL; - - con = console_opt_find(match); - if (!con) - return -ENOENT; - - /* - * See __add_preferred_console(). It checks for NULL brl_options to set - * the preferred_console flag. Empty brl_opt instead of NULL leads into - * the preferred_console flag not set, and CON_CONSDEV not being set, - * and the boot console won't get disabled at the end of console_setup(). - */ - if (con->has_brl_opt) - brl_opt = con->brl_opt; - - console_opt_add_preferred_console(name, idx, con->opt, brl_opt); - - return 0; -} diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h index a125e0235589..3ca74ad391d6 100644 --- a/kernel/printk/console_cmdline.h +++ b/kernel/printk/console_cmdline.h @@ -2,12 +2,6 @@ #ifndef _CONSOLE_CMDLINE_H #define _CONSOLE_CMDLINE_H -#define MAX_CMDLINECONSOLES 8 - -int console_opt_save(const char *str, const char *brl_opt); -int console_opt_add_preferred_console(const char *name, const short idx, - char *options, char *brl_options); - struct console_cmdline { char name[16]; /* Name of the driver */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 420fd310129d..dddb15f48d59 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -383,6 +383,9 @@ static int console_locked; /* * Array of consoles built from command line options (console=) */ + +#define MAX_CMDLINECONSOLES 8 + static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; static int preferred_console = -1; @@ -2500,17 +2503,6 @@ static int __init console_setup(char *str) if (_braille_console_setup(&str, &brl_options)) return 1; - /* Save the console for driver subsystem use */ - if (console_opt_save(str, brl_options)) - return 1; - - /* Flag register_console() to not call try_enable_default_console() */ - console_set_on_cmdline = 1; - - /* Don't attempt to parse a DEVNAME:0.0 style console */ - if (strchr(str, ':')) - return 1; - /* * Decode str into name, index, options. */ @@ -2541,13 +2533,6 @@ static int __init console_setup(char *str) } __setup("console=", console_setup); -/* Only called from add_preferred_console_match() */ -int console_opt_add_preferred_console(const char *name, const short idx, - char *options, char *brl_options) -{ - return __add_preferred_console(name, idx, options, brl_options, true); -} - /** * add_preferred_console - add a device to the list of preferred consoles. * @name: device name @@ -3522,7 +3507,7 @@ void register_console(struct console *newcon) * Note that a console with tty binding will have CON_CONSDEV * flag set and will be first in the list. */ - if (preferred_console < 0 && !console_set_on_cmdline) { + if (preferred_console < 0) { if (hlist_empty(&console_list) || !console_first()->device || console_first()->flags & CON_BOOT) { try_enable_default_console(newcon); diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 8db4fedaaa1e..b53a9e8f5904 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -42,6 +42,7 @@ #include "rcu.h" +MODULE_DESCRIPTION("Read-Copy Update module-based scalability-test facility"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 807fbf6123a7..08bf7c669dd3 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -51,6 +51,7 @@ #include "rcu.h" +MODULE_DESCRIPTION("Read-Copy Update module-based torture test facility"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); @@ -390,6 +391,7 @@ struct rcu_torture_ops { int extendables; int slow_gps; int no_pi_lock; + int debug_objects; const char *name; }; @@ -577,6 +579,7 @@ static struct rcu_torture_ops rcu_ops = { .irq_capable = 1, .can_boost = IS_ENABLED(CONFIG_RCU_BOOST), .extendables = RCUTORTURE_MAX_EXTEND, + .debug_objects = 1, .name = "rcu" }; @@ -747,6 +750,7 @@ static struct rcu_torture_ops srcu_ops = { .cbflood_max = 50000, .irq_capable = 1, .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), + .debug_objects = 1, .name = "srcu" }; @@ -786,6 +790,7 @@ static struct rcu_torture_ops srcud_ops = { .cbflood_max = 50000, .irq_capable = 1, .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), + .debug_objects = 1, .name = "srcud" }; @@ -2626,7 +2631,7 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) spin_lock_irqsave(&rfp->rcu_fwd_lock, flags); rfcpp = rfp->rcu_fwd_cb_tail; rfp->rcu_fwd_cb_tail = &rfcp->rfc_next; - WRITE_ONCE(*rfcpp, rfcp); + smp_store_release(rfcpp, rfcp); WRITE_ONCE(rfp->n_launders_cb, rfp->n_launders_cb + 1); i = ((jiffies - rfp->rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV)); if (i >= ARRAY_SIZE(rfp->n_launders_hist)) @@ -3455,7 +3460,6 @@ rcu_torture_cleanup(void) cur_ops->gp_slow_unregister(NULL); } -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD static void rcu_torture_leak_cb(struct rcu_head *rhp) { } @@ -3473,7 +3477,6 @@ static void rcu_torture_err_cb(struct rcu_head *rhp) */ pr_alert("%s: duplicated callback was invoked.\n", KBUILD_MODNAME); } -#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ /* * Verify that double-free causes debug-objects to complain, but only @@ -3482,39 +3485,43 @@ static void rcu_torture_err_cb(struct rcu_head *rhp) */ static void rcu_test_debug_objects(void) { -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD struct rcu_head rh1; struct rcu_head rh2; + int idx; + + if (!IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD)) { + pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_%s()\n", + KBUILD_MODNAME, cur_ops->name); + return; + } + + if (WARN_ON_ONCE(cur_ops->debug_objects && + (!cur_ops->call || !cur_ops->cb_barrier))) + return; + struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); init_rcu_head_on_stack(&rh1); init_rcu_head_on_stack(&rh2); - pr_alert("%s: WARN: Duplicate call_rcu() test starting.\n", KBUILD_MODNAME); + pr_alert("%s: WARN: Duplicate call_%s() test starting.\n", KBUILD_MODNAME, cur_ops->name); /* Try to queue the rh2 pair of callbacks for the same grace period. */ - preempt_disable(); /* Prevent preemption from interrupting test. */ - rcu_read_lock(); /* Make it impossible to finish a grace period. */ - call_rcu_hurry(&rh1, rcu_torture_leak_cb); /* Start grace period. */ - local_irq_disable(); /* Make it harder to start a new grace period. */ - call_rcu_hurry(&rh2, rcu_torture_leak_cb); - call_rcu_hurry(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ + idx = cur_ops->readlock(); /* Make it impossible to finish a grace period. */ + cur_ops->call(&rh1, rcu_torture_leak_cb); /* Start grace period. */ + cur_ops->call(&rh2, rcu_torture_leak_cb); + cur_ops->call(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ if (rhp) { - call_rcu_hurry(rhp, rcu_torture_leak_cb); - call_rcu_hurry(rhp, rcu_torture_err_cb); /* Another duplicate callback. */ + cur_ops->call(rhp, rcu_torture_leak_cb); + cur_ops->call(rhp, rcu_torture_err_cb); /* Another duplicate callback. */ } - local_irq_enable(); - rcu_read_unlock(); - preempt_enable(); + cur_ops->readunlock(idx); /* Wait for them all to get done so we can safely return. */ - rcu_barrier(); - pr_alert("%s: WARN: Duplicate call_rcu() test complete.\n", KBUILD_MODNAME); + cur_ops->cb_barrier(); + pr_alert("%s: WARN: Duplicate call_%s() test complete.\n", KBUILD_MODNAME, cur_ops->name); destroy_rcu_head_on_stack(&rh1); destroy_rcu_head_on_stack(&rh2); kfree(rhp); -#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ - pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n", KBUILD_MODNAME); -#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ } static void rcutorture_sync(void) diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 2c2648a3ad30..f4ea5b1ec068 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -63,6 +63,7 @@ do { \ #define SCALEOUT_ERRSTRING(s, x...) pr_alert("%s" SCALE_FLAG "!!! " s "\n", scale_type, ## x) +MODULE_DESCRIPTION("Scalability test for object reference mechanisms"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Joel Fernandes (Google) <joel@joelfernandes.org>"); diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 5afd5cf494db..549c03336ee9 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -277,7 +277,8 @@ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) unsigned long cur_s = READ_ONCE(ssp->srcu_idx); barrier(); - return ULONG_CMP_GE(cur_s, cookie) || ULONG_CMP_LT(cur_s, cookie - 3); + return cookie == SRCU_GET_STATE_COMPLETED || + ULONG_CMP_GE(cur_s, cookie) || ULONG_CMP_LT(cur_s, cookie - 3); } EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index bc4b58b0204e..b24db425f16d 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -667,7 +667,10 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n", __func__, ssp, rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)), rcu_seq_current(&sup->srcu_gp_seq), sup->srcu_gp_seq_needed); - return; /* Caller forgot to stop doing call_srcu()? */ + return; // Caller forgot to stop doing call_srcu()? + // Or caller invoked start_poll_synchronize_srcu() + // and then cleanup_srcu_struct() before that grace + // period ended? } kfree(sup->node); sup->node = NULL; @@ -845,7 +848,6 @@ static void srcu_gp_end(struct srcu_struct *ssp) bool cbs; bool last_lvl; int cpu; - unsigned long flags; unsigned long gpseq; int idx; unsigned long mask; @@ -907,12 +909,12 @@ static void srcu_gp_end(struct srcu_struct *ssp) if (!(gpseq & counter_wrap_check)) for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); - spin_lock_irqsave_rcu_node(sdp, flags); + spin_lock_irq_rcu_node(sdp); if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100)) sdp->srcu_gp_seq_needed = gpseq; if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed_exp + 100)) sdp->srcu_gp_seq_needed_exp = gpseq; - spin_unlock_irqrestore_rcu_node(sdp, flags); + spin_unlock_irq_rcu_node(sdp); } /* Callback initiation done, allow grace periods after next. */ @@ -1540,7 +1542,8 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu); */ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) { - if (!rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, cookie)) + if (cookie != SRCU_GET_STATE_COMPLETED && + !rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, cookie)) return false; // Ensure that the end of the SRCU grace period happens before // any subsequent code that the caller might execute. diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index 6c2bd9001adc..da60a9947c00 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -122,7 +122,7 @@ void rcu_sync_enter(struct rcu_sync *rsp) * we are called at early boot time but this shouldn't happen. */ } - WRITE_ONCE(rsp->gp_count, rsp->gp_count + 1); + rsp->gp_count++; spin_unlock_irq(&rsp->rss_lock); if (gp_state == GP_IDLE) { @@ -151,15 +151,11 @@ void rcu_sync_enter(struct rcu_sync *rsp) */ void rcu_sync_exit(struct rcu_sync *rsp) { - int gpc; - WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE); - WARN_ON_ONCE(READ_ONCE(rsp->gp_count) == 0); spin_lock_irq(&rsp->rss_lock); - gpc = rsp->gp_count - 1; - WRITE_ONCE(rsp->gp_count, gpc); - if (!gpc) { + WARN_ON_ONCE(rsp->gp_count == 0); + if (!--rsp->gp_count) { if (rsp->gp_state == GP_PASSED) { WRITE_ONCE(rsp->gp_state, GP_EXIT); rcu_sync_call(rsp); @@ -178,10 +174,10 @@ void rcu_sync_dtor(struct rcu_sync *rsp) { int gp_state; - WARN_ON_ONCE(READ_ONCE(rsp->gp_count)); WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED); spin_lock_irq(&rsp->rss_lock); + WARN_ON_ONCE(rsp->gp_count); if (rsp->gp_state == GP_REPLAY) WRITE_ONCE(rsp->gp_state, GP_EXIT); gp_state = rsp->gp_state; diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index e1bf33018e6d..ba3440a45b6d 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -858,7 +858,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) // not know to synchronize with this RCU Tasks grace period) have // completed exiting. The synchronize_rcu() in rcu_tasks_postgp() // will take care of any tasks stuck in the non-preemptible region -// of do_exit() following its call to exit_tasks_rcu_stop(). +// of do_exit() following its call to exit_tasks_rcu_finish(). // check_all_holdout_tasks(), repeatedly until holdout list is empty: // Scans the holdout list, attempting to identify a quiescent state // for each task on the list. If there is a quiescent state, the @@ -1220,7 +1220,7 @@ void exit_tasks_rcu_start(void) * Remove the task from the "yet another list" because do_exit() is now * non-preemptible, allowing synchronize_rcu() to wait beyond this point. */ -void exit_tasks_rcu_stop(void) +void exit_tasks_rcu_finish(void) { unsigned long flags; struct rcu_tasks_percpu *rtpcp; @@ -1231,22 +1231,12 @@ void exit_tasks_rcu_stop(void) raw_spin_lock_irqsave_rcu_node(rtpcp, flags); list_del_init(&t->rcu_tasks_exit_list); raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); -} -/* - * Contribute to protect against tasklist scan blind spot while the - * task is exiting and may be removed from the tasklist. See - * corresponding synchronize_srcu() for further details. - */ -void exit_tasks_rcu_finish(void) -{ - exit_tasks_rcu_stop(); - exit_tasks_rcu_finish_trace(current); + exit_tasks_rcu_finish_trace(t); } #else /* #ifdef CONFIG_TASKS_RCU */ void exit_tasks_rcu_start(void) { } -void exit_tasks_rcu_stop(void) { } void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); } #endif /* #else #ifdef CONFIG_TASKS_RCU */ @@ -1757,6 +1747,16 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop) // allow safe access to the hop list. for_each_online_cpu(cpu) { rcu_read_lock(); + // Note that cpu_curr_snapshot() picks up the target + // CPU's current task while its runqueue is locked with + // an smp_mb__after_spinlock(). This ensures that either + // the grace-period kthread will see that task's read-side + // critical section or the task will see the updater's pre-GP + // accesses. The trailing smp_mb() in cpu_curr_snapshot() + // does not currently play a role other than simplify + // that function's ordering semantics. If these simplified + // ordering semantics continue to be redundant, that smp_mb() + // might be removed. t = cpu_curr_snapshot(cpu); if (rcu_tasks_trace_pertask_prep(t, true)) trc_add_holdout(t, hop); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 28c7031711a3..e641cc681901 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -96,6 +96,7 @@ static struct rcu_state rcu_state = { .ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED, .srs_cleanup_work = __WORK_INITIALIZER(rcu_state.srs_cleanup_work, rcu_sr_normal_gp_cleanup_work), + .srs_cleanups_pending = ATOMIC_INIT(0), }; /* Dump rcu_node combining tree at boot to verify correct setup. */ @@ -175,6 +176,9 @@ static int gp_init_delay; module_param(gp_init_delay, int, 0444); static int gp_cleanup_delay; module_param(gp_cleanup_delay, int, 0444); +static int nohz_full_patience_delay; +module_param(nohz_full_patience_delay, int, 0444); +static int nohz_full_patience_delay_jiffies; // Add delay to rcu_read_unlock() for strict grace periods. static int rcu_unlock_delay; @@ -296,16 +300,6 @@ static void rcu_dynticks_eqs_online(void) } /* - * Snapshot the ->dynticks counter with full ordering so as to allow - * stable comparison of this counter with past and future snapshots. - */ -static int rcu_dynticks_snap(int cpu) -{ - smp_mb(); // Fundamental RCU ordering guarantee. - return ct_dynticks_cpu_acquire(cpu); -} - -/* * Return true if the snapshot returned from rcu_dynticks_snap() * indicates that RCU is in an extended quiescent state. */ @@ -321,7 +315,15 @@ static bool rcu_dynticks_in_eqs(int snap) */ static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap) { - return snap != rcu_dynticks_snap(rdp->cpu); + /* + * The first failing snapshot is already ordered against the accesses + * performed by the remote CPU after it exits idle. + * + * The second snapshot therefore only needs to order against accesses + * performed by the remote CPU prior to entering idle and therefore can + * rely solely on acquire semantics. + */ + return snap != ct_dynticks_cpu_acquire(rdp->cpu); } /* @@ -769,7 +771,18 @@ static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) */ static int dyntick_save_progress_counter(struct rcu_data *rdp) { - rdp->dynticks_snap = rcu_dynticks_snap(rdp->cpu); + /* + * Full ordering between remote CPU's post idle accesses and updater's + * accesses prior to current GP (and also the started GP sequence number) + * is enforced by rcu_seq_start() implicit barrier and even further by + * smp_mb__after_unlock_lock() barriers chained all the way throughout the + * rnp locking tree since rcu_gp_init() and up to the current leaf rnp + * locking. + * + * Ordering between remote CPU's pre idle accesses and post grace period + * updater's accesses is enforced by the below acquire semantic. + */ + rdp->dynticks_snap = ct_dynticks_cpu_acquire(rdp->cpu); if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti")); rcu_gpnum_ovf(rdp->mynode, rdp); @@ -1660,6 +1673,9 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work) rcu_sr_put_wait_head(rcu); } + + /* Order list manipulations with atomic access. */ + atomic_dec_return_release(&rcu_state.srs_cleanups_pending); } /* @@ -1667,7 +1683,7 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work) */ static void rcu_sr_normal_gp_cleanup(void) { - struct llist_node *wait_tail, *next, *rcu; + struct llist_node *wait_tail, *next = NULL, *rcu = NULL; int done = 0; wait_tail = rcu_state.srs_wait_tail; @@ -1693,16 +1709,34 @@ static void rcu_sr_normal_gp_cleanup(void) break; } - // concurrent sr_normal_gp_cleanup work might observe this update. - smp_store_release(&rcu_state.srs_done_tail, wait_tail); + /* + * Fast path, no more users to process except putting the second last + * wait head if no inflight-workers. If there are in-flight workers, + * they will remove the last wait head. + * + * Note that the ACQUIRE orders atomic access with list manipulation. + */ + if (wait_tail->next && wait_tail->next->next == NULL && + rcu_sr_is_wait_head(wait_tail->next) && + !atomic_read_acquire(&rcu_state.srs_cleanups_pending)) { + rcu_sr_put_wait_head(wait_tail->next); + wait_tail->next = NULL; + } + + /* Concurrent sr_normal_gp_cleanup work might observe this update. */ ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_done_tail); + smp_store_release(&rcu_state.srs_done_tail, wait_tail); /* * We schedule a work in order to perform a final processing * of outstanding users(if still left) and releasing wait-heads * added by rcu_sr_normal_gp_init() call. */ - queue_work(sync_wq, &rcu_state.srs_cleanup_work); + if (wait_tail->next) { + atomic_inc(&rcu_state.srs_cleanups_pending); + if (!queue_work(sync_wq, &rcu_state.srs_cleanup_work)) + atomic_dec(&rcu_state.srs_cleanups_pending); + } } /* @@ -1810,7 +1844,7 @@ static noinline_for_stack bool rcu_gp_init(void) WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF); /* Exclude CPU hotplug operations. */ rcu_for_each_leaf_node(rnp) { - local_irq_save(flags); + local_irq_disable(); arch_spin_lock(&rcu_state.ofl_lock); raw_spin_lock_rcu_node(rnp); if (rnp->qsmaskinit == rnp->qsmaskinitnext && @@ -1818,7 +1852,7 @@ static noinline_for_stack bool rcu_gp_init(void) /* Nothing to do on this leaf rcu_node structure. */ raw_spin_unlock_rcu_node(rnp); arch_spin_unlock(&rcu_state.ofl_lock); - local_irq_restore(flags); + local_irq_enable(); continue; } @@ -1855,7 +1889,7 @@ static noinline_for_stack bool rcu_gp_init(void) raw_spin_unlock_rcu_node(rnp); arch_spin_unlock(&rcu_state.ofl_lock); - local_irq_restore(flags); + local_irq_enable(); } rcu_gp_slow(gp_preinit_delay); /* Races with CPU hotplug. */ @@ -4313,11 +4347,15 @@ static int rcu_pending(int user) return 1; /* Is this a nohz_full CPU in userspace or idle? (Ignore RCU if so.) */ - if ((user || rcu_is_cpu_rrupt_from_idle()) && rcu_nohz_full_cpu()) + gp_in_progress = rcu_gp_in_progress(); + if ((user || rcu_is_cpu_rrupt_from_idle() || + (gp_in_progress && + time_before(jiffies, READ_ONCE(rcu_state.gp_start) + + nohz_full_patience_delay_jiffies))) && + rcu_nohz_full_cpu()) return 0; /* Is the RCU core waiting for a quiescent state from this CPU? */ - gp_in_progress = rcu_gp_in_progress(); if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm && gp_in_progress) return 1; @@ -4767,7 +4805,7 @@ rcu_boot_init_percpu_data(int cpu) rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); INIT_WORK(&rdp->strict_work, strict_work_handler); WARN_ON_ONCE(ct->dynticks_nesting != 1); - WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu))); + WARN_ON_ONCE(rcu_dynticks_in_eqs(ct_dynticks_cpu(cpu))); rdp->barrier_seq_snap = rcu_state.barrier_sequence; rdp->rcu_ofl_gp_seq = rcu_state.gp_seq; rdp->rcu_ofl_gp_state = RCU_GP_CLEANED; @@ -5110,11 +5148,15 @@ void rcutree_migrate_callbacks(int cpu) struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); bool needwake; - if (rcu_rdp_is_offloaded(rdp) || - rcu_segcblist_empty(&rdp->cblist)) - return; /* No callbacks to migrate. */ + if (rcu_rdp_is_offloaded(rdp)) + return; raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags); + if (rcu_segcblist_empty(&rdp->cblist)) { + raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags); + return; /* No callbacks to migrate. */ + } + WARN_ON_ONCE(rcu_rdp_cpu_online(rdp)); rcu_barrier_entrain(rdp); my_rdp = this_cpu_ptr(&rcu_data); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index bae7925c497f..fcf2b4aa3441 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -223,7 +223,6 @@ struct rcu_data { struct swait_queue_head nocb_state_wq; /* For offloading state changes */ struct task_struct *nocb_gp_kthread; raw_spinlock_t nocb_lock; /* Guard following pair of fields. */ - atomic_t nocb_lock_contended; /* Contention experienced. */ int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ struct timer_list nocb_timer; /* Enforce finite deferral. */ unsigned long nocb_gp_adv_time; /* Last call_rcu() CB adv (jiffies). */ @@ -420,6 +419,7 @@ struct rcu_state { struct llist_node *srs_done_tail; /* ready for GP users. */ struct sr_wait_node srs_wait_nodes[SR_NORMAL_GP_WAIT_HEAD_MAX]; struct work_struct srs_cleanup_work; + atomic_t srs_cleanups_pending; /* srs inflight worker cleanups. */ }; /* Values for rcu_state structure's gp_flags field. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 8a1d9c8bd9f7..4acd29d16fdb 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -265,7 +265,12 @@ static bool sync_exp_work_done(unsigned long s) { if (rcu_exp_gp_seq_done(s)) { trace_rcu_exp_grace_period(rcu_state.name, s, TPS("done")); - smp_mb(); /* Ensure test happens before caller kfree(). */ + /* + * Order GP completion with preceding accesses. Order also GP + * completion with post GP update side accesses. Pairs with + * rcu_seq_end(). + */ + smp_mb(); return true; } return false; @@ -357,7 +362,21 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp) !(rnp->qsmaskinitnext & mask)) { mask_ofl_test |= mask; } else { - snap = rcu_dynticks_snap(cpu); + /* + * Full ordering between remote CPU's post idle accesses + * and updater's accesses prior to current GP (and also + * the started GP sequence number) is enforced by + * rcu_seq_start() implicit barrier, relayed by kworkers + * locking and even further by smp_mb__after_unlock_lock() + * barriers chained all the way throughout the rnp locking + * tree since sync_exp_reset_tree() and up to the current + * leaf rnp locking. + * + * Ordering between remote CPU's pre idle accesses and + * post grace period updater's accesses is enforced by the + * below acquire semantic. + */ + snap = ct_dynticks_cpu_acquire(cpu); if (rcu_dynticks_in_eqs(snap)) mask_ofl_test |= mask; else @@ -953,7 +972,6 @@ void synchronize_rcu_expedited(void) rnp = rcu_get_root(); wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], sync_exp_work_done(s)); - smp_mb(); /* Work actions happen before return. */ /* Let the next expedited grace period start. */ mutex_unlock(&rcu_state.exp_mutex); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 3f85577bddd4..3ce30841119a 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -91,8 +91,7 @@ module_param(nocb_nobypass_lim_per_jiffy, int, 0); /* * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the - * lock isn't immediately available, increment ->nocb_lock_contended to - * flag the contention. + * lock isn't immediately available, perform minimal sanity check. */ static void rcu_nocb_bypass_lock(struct rcu_data *rdp) __acquires(&rdp->nocb_bypass_lock) @@ -100,29 +99,12 @@ static void rcu_nocb_bypass_lock(struct rcu_data *rdp) lockdep_assert_irqs_disabled(); if (raw_spin_trylock(&rdp->nocb_bypass_lock)) return; - atomic_inc(&rdp->nocb_lock_contended); + /* + * Contention expected only when local enqueue collide with + * remote flush from kthreads. + */ WARN_ON_ONCE(smp_processor_id() != rdp->cpu); - smp_mb__after_atomic(); /* atomic_inc() before lock. */ raw_spin_lock(&rdp->nocb_bypass_lock); - smp_mb__before_atomic(); /* atomic_dec() after lock. */ - atomic_dec(&rdp->nocb_lock_contended); -} - -/* - * Spinwait until the specified rcu_data structure's ->nocb_lock is - * not contended. Please note that this is extremely special-purpose, - * relying on the fact that at most two kthreads and one CPU contend for - * this lock, and also that the two kthreads are guaranteed to have frequent - * grace-period-duration time intervals between successive acquisitions - * of the lock. This allows us to use an extremely simple throttling - * mechanism, and further to apply it only to the CPU doing floods of - * call_rcu() invocations. Don't try this at home! - */ -static void rcu_nocb_wait_contended(struct rcu_data *rdp) -{ - WARN_ON_ONCE(smp_processor_id() != rdp->cpu); - while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended))) - cpu_relax(); } /* @@ -510,7 +492,6 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, } // We need to use the bypass. - rcu_nocb_wait_contended(rdp); rcu_nocb_bypass_lock(rdp); ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */ @@ -635,8 +616,7 @@ static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head, } } -static int nocb_gp_toggle_rdp(struct rcu_data *rdp, - bool *wake_state) +static int nocb_gp_toggle_rdp(struct rcu_data *rdp) { struct rcu_segcblist *cblist = &rdp->cblist; unsigned long flags; @@ -650,8 +630,6 @@ static int nocb_gp_toggle_rdp(struct rcu_data *rdp, * We will handle this rdp until it ever gets de-offloaded. */ rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP); - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) - *wake_state = true; ret = 1; } else if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) && rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { @@ -660,8 +638,6 @@ static int nocb_gp_toggle_rdp(struct rcu_data *rdp, * We will ignore this rdp until it ever gets re-offloaded. */ rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP); - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) - *wake_state = true; ret = 0; } else { WARN_ON_ONCE(1); @@ -877,16 +853,15 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) } if (rdp_toggling) { - bool wake_state = false; int ret; - ret = nocb_gp_toggle_rdp(rdp_toggling, &wake_state); + ret = nocb_gp_toggle_rdp(rdp_toggling); if (ret == 1) list_add_tail(&rdp_toggling->nocb_entry_rdp, &my_rdp->nocb_head_rdp); else if (ret == 0) list_del(&rdp_toggling->nocb_entry_rdp); - if (wake_state) - swake_up_one(&rdp_toggling->nocb_state_wq); + + swake_up_one(&rdp_toggling->nocb_state_wq); } my_rdp->nocb_gp_seq = -1; @@ -913,16 +888,9 @@ static int rcu_nocb_gp_kthread(void *arg) return 0; } -static inline bool nocb_cb_can_run(struct rcu_data *rdp) -{ - u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB; - - return rcu_segcblist_test_flags(&rdp->cblist, flags); -} - static inline bool nocb_cb_wait_cond(struct rcu_data *rdp) { - return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep); + return !READ_ONCE(rdp->nocb_cb_sleep) || kthread_should_park(); } /* @@ -934,21 +902,19 @@ static void nocb_cb_wait(struct rcu_data *rdp) struct rcu_segcblist *cblist = &rdp->cblist; unsigned long cur_gp_seq; unsigned long flags; - bool needwake_state = false; bool needwake_gp = false; - bool can_sleep = true; struct rcu_node *rnp = rdp->mynode; - do { - swait_event_interruptible_exclusive(rdp->nocb_cb_wq, - nocb_cb_wait_cond(rdp)); - - if (READ_ONCE(rdp->nocb_cb_sleep)) { - WARN_ON(signal_pending(current)); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); - } - } while (!nocb_cb_can_run(rdp)); + swait_event_interruptible_exclusive(rdp->nocb_cb_wq, + nocb_cb_wait_cond(rdp)); + if (kthread_should_park()) { + kthread_parkme(); + } else if (READ_ONCE(rdp->nocb_cb_sleep)) { + WARN_ON(signal_pending(current)); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); + } + WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp)); local_irq_save(flags); rcu_momentary_dyntick_idle(); @@ -971,37 +937,16 @@ static void nocb_cb_wait(struct rcu_data *rdp) raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ } - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) { - rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB); - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) - needwake_state = true; - } - if (rcu_segcblist_ready_cbs(cblist)) - can_sleep = false; + if (!rcu_segcblist_ready_cbs(cblist)) { + WRITE_ONCE(rdp->nocb_cb_sleep, true); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep")); } else { - /* - * De-offloading. Clear our flag and notify the de-offload worker. - * We won't touch the callbacks and keep sleeping until we ever - * get re-offloaded. - */ - WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)); - rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB); - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) - needwake_state = true; + WRITE_ONCE(rdp->nocb_cb_sleep, false); } - WRITE_ONCE(rdp->nocb_cb_sleep, can_sleep); - - if (rdp->nocb_cb_sleep) - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep")); - rcu_nocb_unlock_irqrestore(rdp, flags); if (needwake_gp) rcu_gp_kthread_wake(); - - if (needwake_state) - swake_up_one(&rdp->nocb_state_wq); } /* @@ -1094,17 +1039,8 @@ static int rdp_offload_toggle(struct rcu_data *rdp, bool wake_gp = false; rcu_segcblist_offload(cblist, offload); - - if (rdp->nocb_cb_sleep) - rdp->nocb_cb_sleep = false; rcu_nocb_unlock_irqrestore(rdp, flags); - /* - * Ignore former value of nocb_cb_sleep and force wake up as it could - * have been spuriously set to false already. - */ - swake_up_one(&rdp->nocb_cb_wq); - raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); // Queue this rdp for add/del to/from the list to iterate on rcuog WRITE_ONCE(rdp_gp->nocb_toggling_rdp, rdp); @@ -1161,19 +1097,11 @@ static long rcu_nocb_rdp_deoffload(void *arg) if (wake_gp) wake_up_process(rdp_gp->nocb_gp_kthread); - /* - * If rcuo[p] kthread spawn failed, directly remove SEGCBLIST_KTHREAD_CB. - * Just wait SEGCBLIST_KTHREAD_GP to be cleared by rcuog. - */ - if (!rdp->nocb_cb_kthread) { - rcu_nocb_lock_irqsave(rdp, flags); - rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB); - rcu_nocb_unlock_irqrestore(rdp, flags); - } - swait_event_exclusive(rdp->nocb_state_wq, - !rcu_segcblist_test_flags(cblist, - SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP)); + !rcu_segcblist_test_flags(cblist, + SEGCBLIST_KTHREAD_GP)); + if (rdp->nocb_cb_kthread) + kthread_park(rdp->nocb_cb_kthread); } else { /* * No kthread to clear the flags for us or remove the rdp from the nocb list @@ -1181,8 +1109,7 @@ static long rcu_nocb_rdp_deoffload(void *arg) * but we stick to paranoia in this rare path. */ rcu_nocb_lock_irqsave(rdp, flags); - rcu_segcblist_clear_flags(&rdp->cblist, - SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP); + rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP); rcu_nocb_unlock_irqrestore(rdp, flags); list_del(&rdp->nocb_entry_rdp); @@ -1282,8 +1209,10 @@ static long rcu_nocb_rdp_offload(void *arg) wake_gp = rdp_offload_toggle(rdp, true, flags); if (wake_gp) wake_up_process(rdp_gp->nocb_gp_kthread); + + kthread_unpark(rdp->nocb_cb_kthread); + swait_event_exclusive(rdp->nocb_state_wq, - rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) && rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); /* @@ -1468,7 +1397,7 @@ void __init rcu_init_nohz(void) if (rcu_segcblist_empty(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); rcu_segcblist_offload(&rdp->cblist, true); - rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP); + rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP); rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_RCU_CORE); } rcu_organize_nocb_kthreads(); @@ -1526,11 +1455,16 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); /* Spawn the kthread for this CPU. */ - t = kthread_run(rcu_nocb_cb_kthread, rdp, - "rcuo%c/%d", rcu_state.abbr, cpu); + t = kthread_create(rcu_nocb_cb_kthread, rdp, + "rcuo%c/%d", rcu_state.abbr, cpu); if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__)) goto end; + if (rcu_rdp_is_offloaded(rdp)) + wake_up_process(t); + else + kthread_park(t); + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_CB_BOOST) && kthread_prio) sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); @@ -1678,12 +1612,11 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]); sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]); - pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n", + pr_info(" CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n", rdp->cpu, rdp->nocb_gp_rdp->cpu, nocb_next_rdp ? nocb_next_rdp->cpu : -1, "kK"[!!rdp->nocb_cb_kthread], "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)], - "cC"[!!atomic_read(&rdp->nocb_lock_contended)], "lL"[raw_spin_is_locked(&rdp->nocb_lock)], "sS"[!!rdp->nocb_cb_sleep], ".W"[swait_active(&rdp->nocb_cb_wq)], diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 340bbefe5f65..c569da65b421 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -28,8 +28,8 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp) !(lockdep_is_held(&rcu_state.barrier_mutex) || (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) || rcu_lockdep_is_held_nocb(rdp) || - (rdp == this_cpu_ptr(&rcu_data) && - !(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible())) || + (!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible()) && + rdp == this_cpu_ptr(&rcu_data)) || rcu_current_is_nocb_kthread(rdp)), "Unsafe read of RCU_NOCB offloaded state" ); @@ -93,6 +93,16 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay); if (gp_cleanup_delay) pr_info("\tRCU debug GP cleanup slowdown %d jiffies.\n", gp_cleanup_delay); + if (nohz_full_patience_delay < 0) { + pr_info("\tRCU NOCB CPU patience negative (%d), resetting to zero.\n", nohz_full_patience_delay); + nohz_full_patience_delay = 0; + } else if (nohz_full_patience_delay > 5 * MSEC_PER_SEC) { + pr_info("\tRCU NOCB CPU patience too large (%d), resetting to %ld.\n", nohz_full_patience_delay, 5 * MSEC_PER_SEC); + nohz_full_patience_delay = 5 * MSEC_PER_SEC; + } else if (nohz_full_patience_delay) { + pr_info("\tRCU NOCB CPU patience set to %d milliseconds.\n", nohz_full_patience_delay); + } + nohz_full_patience_delay_jiffies = msecs_to_jiffies(nohz_full_patience_delay); if (!use_softirq) pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n"); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG)) diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 460efecd077b..4b0e9d7c4c68 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -501,7 +501,7 @@ static void print_cpu_stall_info(int cpu) } delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); falsepositive = rcu_is_gp_kthread_starving(NULL) && - rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu)); + rcu_dynticks_in_eqs(ct_dynticks_cpu(cpu)); rcuc_starved = rcu_is_rcuc_kthread_starving(rdp, &j); if (rcuc_starved) // Print signed value, as negative values indicate a probable bug. @@ -515,7 +515,7 @@ static void print_cpu_stall_info(int cpu) rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' : "!."[!delta], ticks_value, ticks_title, - rcu_dynticks_snap(cpu) & 0xffff, + ct_dynticks_cpu(cpu) & 0xffff, ct_dynticks_nesting_cpu(cpu), ct_dynticks_nmi_nesting_cpu(cpu), rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 59032aaccd18..44e83a646264 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -43,6 +43,7 @@ #define SCFTORTOUT_ERRSTRING(s, x...) pr_alert(SCFTORT_FLAG "!!! " s "\n", ## x) +MODULE_DESCRIPTION("Torture tests on the smp_call_function() family of primitives"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@kernel.org>"); @@ -67,7 +68,7 @@ torture_param(int, weight_many_wait, -1, "Testing weight for multi-CPU operation torture_param(int, weight_all, -1, "Testing weight for all-CPU no-wait operations."); torture_param(int, weight_all_wait, -1, "Testing weight for all-CPU operations."); -char *torture_type = ""; +static char *torture_type = ""; #ifdef MODULE # define SCFTORT_SHUTDOWN 0 diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bcf2c4cc0522..35a35e36024b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -723,7 +723,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->prev_irq_time += irq_delta; delta -= irq_delta; - psi_account_irqtime(rq->curr, irq_delta); delayacct_irq(rq->curr, irq_delta); #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING @@ -4467,12 +4466,7 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg) * @cpu: The CPU on which to snapshot the task. * * Returns the task_struct pointer of the task "currently" running on - * the specified CPU. If the same task is running on that CPU throughout, - * the return value will be a pointer to that task's task_struct structure. - * If the CPU did any context switches even vaguely concurrently with the - * execution of this function, the return value will be a pointer to the - * task_struct structure of a randomly chosen task that was running on - * that CPU somewhere around the time that this function was executing. + * the specified CPU. * * If the specified CPU was offline, the return value is whatever it * is, perhaps a pointer to the task_struct structure of that CPU's idle @@ -4486,11 +4480,16 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg) */ struct task_struct *cpu_curr_snapshot(int cpu) { + struct rq *rq = cpu_rq(cpu); struct task_struct *t; + struct rq_flags rf; - smp_mb(); /* Pairing determined by caller's synchronization design. */ + rq_lock_irqsave(rq, &rf); + smp_mb__after_spinlock(); /* Pairing determined by caller's synchronization design. */ t = rcu_dereference(cpu_curr(cpu)); + rq_unlock_irqrestore(rq, &rf); smp_mb(); /* Pairing determined by caller's synchronization design. */ + return t; } @@ -5665,7 +5664,7 @@ void sched_tick(void) { int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); - struct task_struct *curr = rq->curr; + struct task_struct *curr; struct rq_flags rf; unsigned long hw_pressure; u64 resched_latency; @@ -5677,6 +5676,9 @@ void sched_tick(void) rq_lock(rq, &rf); + curr = rq->curr; + psi_account_irqtime(rq, curr, NULL); + update_rq_clock(rq); hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure); @@ -6737,6 +6739,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) ++*switch_count; migrate_disable_switch(rq, prev); + psi_account_irqtime(rq, prev, next); psi_sched_switch(prev, next, !task_on_rq_queued(prev)); trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index c75d1307d86d..9bedd148f007 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1804,8 +1804,13 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * The replenish timer needs to be canceled. No * problem if it fires concurrently: boosted threads * are ignored in dl_task_timer(). + * + * If the timer callback was running (hrtimer_try_to_cancel == -1), + * it will eventually call put_task_struct(). */ - hrtimer_try_to_cancel(&p->dl.dl_timer); + if (hrtimer_try_to_cancel(&p->dl.dl_timer) == 1 && + !dl_server(&p->dl)) + put_task_struct(p); p->dl.dl_throttled = 0; } } else if (!dl_prio(p->normal_prio)) { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8a5b1ae0aa55..24dda708b699 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9149,12 +9149,8 @@ static int detach_tasks(struct lb_env *env) break; env->loop++; - /* - * We've more or less seen every task there is, call it quits - * unless we haven't found any movable task yet. - */ - if (env->loop > env->loop_max && - !(env->flags & LBF_ALL_PINNED)) + /* We've more or less seen every task there is, call it quits */ + if (env->loop > env->loop_max) break; /* take a breather every nr_migrate tasks */ @@ -11393,9 +11389,7 @@ more_balance: if (env.flags & LBF_NEED_BREAK) { env.flags &= ~LBF_NEED_BREAK; - /* Stop if we tried all running tasks */ - if (env.loop < busiest->nr_running) - goto more_balance; + goto more_balance; } /* diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 7b4aa5809c0f..507d7b8d79af 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -773,6 +773,7 @@ static void psi_group_change(struct psi_group *group, int cpu, enum psi_states s; u32 state_mask; + lockdep_assert_rq_held(cpu_rq(cpu)); groupc = per_cpu_ptr(group->pcpu, cpu); /* @@ -991,22 +992,32 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, } #ifdef CONFIG_IRQ_TIME_ACCOUNTING -void psi_account_irqtime(struct task_struct *task, u32 delta) +void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev) { - int cpu = task_cpu(task); + int cpu = task_cpu(curr); struct psi_group *group; struct psi_group_cpu *groupc; - u64 now; + u64 now, irq; + s64 delta; if (static_branch_likely(&psi_disabled)) return; - if (!task->pid) + if (!curr->pid) + return; + + lockdep_assert_rq_held(rq); + group = task_psi_group(curr); + if (prev && task_psi_group(prev) == group) return; now = cpu_clock(cpu); + irq = irq_time_read(cpu); + delta = (s64)(irq - rq->psi_irq_time); + if (delta < 0) + return; + rq->psi_irq_time = irq; - group = task_psi_group(task); do { if (!group->enabled) continue; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a831af102070..ef20c61004eb 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1126,6 +1126,7 @@ struct rq { #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; + u64 psi_irq_time; #endif #ifdef CONFIG_PARAVIRT u64 prev_steal_time; diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 38f3698f5e5b..b02dfc322951 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -110,8 +110,12 @@ __schedstats_from_se(struct sched_entity *se) void psi_task_change(struct task_struct *task, int clear, int set); void psi_task_switch(struct task_struct *prev, struct task_struct *next, bool sleep); -void psi_account_irqtime(struct task_struct *task, u32 delta); - +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev); +#else +static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, + struct task_struct *prev) {} +#endif /*CONFIG_IRQ_TIME_ACCOUNTING */ /* * PSI tracks state that persists across sleeps, such as iowaits and * memory stalls. As a result, it has to distinguish between sleeps, @@ -192,7 +196,8 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {} static inline void psi_sched_switch(struct task_struct *prev, struct task_struct *next, bool sleep) {} -static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {} +static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, + struct task_struct *prev) {} #endif /* CONFIG_PSI */ #ifdef CONFIG_SCHED_INFO diff --git a/kernel/signal.c b/kernel/signal.c index 1f9dd41c04be..60c737e423a1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2600,6 +2600,14 @@ static void do_freezer_trap(void) spin_unlock_irq(¤t->sighand->siglock); cgroup_enter_frozen(); schedule(); + + /* + * We could've been woken by task_work, run it to clear + * TIF_NOTIFY_SIGNAL. The caller will retry if necessary. + */ + clear_notify_signal(); + if (unlikely(task_work_pending(current))) + task_work_run(); } static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type) diff --git a/kernel/smp.c b/kernel/smp.c index f085ebcdf9e7..aaffecdad319 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -25,6 +25,7 @@ #include <linux/nmi.h> #include <linux/sched/debug.h> #include <linux/jump_label.h> +#include <linux/string_choices.h> #include <trace/events/ipi.h> #define CREATE_TRACE_POINTS @@ -982,8 +983,7 @@ void __init smp_init(void) num_nodes = num_online_nodes(); num_cpus = num_online_cpus(); pr_info("Brought up %d node%s, %d CPU%s\n", - num_nodes, (num_nodes > 1 ? "s" : ""), - num_cpus, (num_cpus > 1 ? "s" : "")); + num_nodes, str_plural(num_nodes), num_cpus, str_plural(num_cpus)); /* Any cleanup work */ smp_cpus_done(setup_max_cpus); @@ -1119,6 +1119,7 @@ int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) queue_work_on(cpu, system_wq, &sscs.work); wait_for_completion(&sscs.done); + destroy_work_on_stack(&sscs.work); return sscs.ret; } diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index d7eee421d4bc..b696b85ac63e 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -46,8 +46,8 @@ COND_SYSCALL(io_getevents_time32); COND_SYSCALL(io_getevents); COND_SYSCALL(io_pgetevents_time32); COND_SYSCALL(io_pgetevents); -COND_SYSCALL_COMPAT(io_pgetevents_time32); COND_SYSCALL_COMPAT(io_pgetevents); +COND_SYSCALL_COMPAT(io_pgetevents_time64); COND_SYSCALL(io_uring_setup); COND_SYSCALL(io_uring_enter); COND_SYSCALL(io_uring_register); diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c index d06185e054ea..62e73444ffe4 100644 --- a/kernel/time/clocksource-wdtest.c +++ b/kernel/time/clocksource-wdtest.c @@ -22,6 +22,7 @@ #include "tick-internal.h" MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Clocksource watchdog unit test"); MODULE_AUTHOR("Paul E. McKenney <paulmck@kernel.org>"); static int holdoff = IS_BUILTIN(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) ? 10 : 0; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 492c14aac642..b8ee320208d4 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1285,6 +1285,8 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, struct hrtimer_clock_base *base; unsigned long flags; + if (WARN_ON_ONCE(!timer->function)) + return; /* * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c index 20d5df631570..783f2297111b 100644 --- a/kernel/time/test_udelay.c +++ b/kernel/time/test_udelay.c @@ -155,5 +155,6 @@ static void __exit udelay_test_exit(void) module_exit(udelay_test_exit); +MODULE_DESCRIPTION("udelay test module"); MODULE_AUTHOR("David Riley <davidriley@chromium.org>"); MODULE_LICENSE("GPL"); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 771d1e040303..b4843099a8da 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -1141,6 +1141,7 @@ void tick_broadcast_switch_to_oneshot(void) #ifdef CONFIG_HOTPLUG_CPU void hotplug_cpu__broadcast_tick_pull(int deadcpu) { + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); struct clock_event_device *bc; unsigned long flags; @@ -1148,6 +1149,28 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu) bc = tick_broadcast_device.evtdev; if (bc && broadcast_needs_cpu(bc, deadcpu)) { + /* + * If the broadcast force bit of the current CPU is set, + * then the current CPU has not yet reprogrammed the local + * timer device to avoid a ping-pong race. See + * ___tick_broadcast_oneshot_control(). + * + * If the broadcast device is hrtimer based then + * programming the broadcast event below does not have any + * effect because the local clockevent device is not + * running and not programmed because the broadcast event + * is not earlier than the pending event of the local clock + * event device. As a consequence all CPUs waiting for a + * broadcast event are stuck forever. + * + * Detect this condition and reprogram the cpu local timer + * device to avoid the starvation. + */ + if (tick_check_broadcast_expired()) { + cpumask_clear_cpu(smp_processor_id(), tick_broadcast_force_mask); + tick_program_event(td->evtdev->next_event, 1); + } + /* This moves the broadcast assignment to this CPU: */ clockevents_program_event(bc, bc->next_event, 1); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 71a792cd8936..753a184c7090 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1026,10 +1026,10 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) return; - WARN_ON_ONCE(1); - printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n", - basemono, ts->next_tick, dev->next_event, - hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer)); + WARN_ONCE(1, "basemono: %llu ts->next_tick: %llu dev->next_event: %llu " + "timer->active: %d timer->expires: %llu\n", basemono, ts->next_tick, + dev->next_event, hrtimer_active(&ts->sched_timer), + hrtimer_get_expires(&ts->sched_timer)); } /* @@ -1385,20 +1385,6 @@ unsigned long tick_nohz_get_idle_calls_cpu(int cpu) return ts->idle_calls; } -/** - * tick_nohz_get_idle_calls - return the current idle calls counter value - * - * Called from the schedutil frequency scaling governor in scheduler context. - * - * Return: the current idle calls counter value for the current CPU - */ -unsigned long tick_nohz_get_idle_calls(void) -{ - struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); - - return ts->idle_calls; -} - static void tick_nohz_account_idle_time(struct tick_sched *ts, ktime_t now) { diff --git a/kernel/time/time_test.c b/kernel/time/time_test.c index 3e5d422dd15c..2889763165e5 100644 --- a/kernel/time/time_test.c +++ b/kernel/time/time_test.c @@ -96,4 +96,5 @@ static struct kunit_suite time_test_suite = { }; kunit_test_suite(time_test_suite); +MODULE_DESCRIPTION("time unit test suite"); MODULE_LICENSE("GPL"); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4e18db1819f8..2fa87dcfeda9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1195,6 +1195,108 @@ static bool timestamp_in_interval(u64 start, u64 end, u64 ts) return false; } +static bool convert_clock(u64 *val, u32 numerator, u32 denominator) +{ + u64 rem, res; + + if (!numerator || !denominator) + return false; + + res = div64_u64_rem(*val, denominator, &rem) * numerator; + *val = res + div_u64(rem * numerator, denominator); + return true; +} + +static bool convert_base_to_cs(struct system_counterval_t *scv) +{ + struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock; + struct clocksource_base *base; + u32 num, den; + + /* The timestamp was taken from the time keeper clock source */ + if (cs->id == scv->cs_id) + return true; + + /* + * Check whether cs_id matches the base clock. Prevent the compiler from + * re-evaluating @base as the clocksource might change concurrently. + */ + base = READ_ONCE(cs->base); + if (!base || base->id != scv->cs_id) + return false; + + num = scv->use_nsecs ? cs->freq_khz : base->numerator; + den = scv->use_nsecs ? USEC_PER_SEC : base->denominator; + + if (!convert_clock(&scv->cycles, num, den)) + return false; + + scv->cycles += base->offset; + return true; +} + +static bool convert_cs_to_base(u64 *cycles, enum clocksource_ids base_id) +{ + struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock; + struct clocksource_base *base; + + /* + * Check whether base_id matches the base clock. Prevent the compiler from + * re-evaluating @base as the clocksource might change concurrently. + */ + base = READ_ONCE(cs->base); + if (!base || base->id != base_id) + return false; + + *cycles -= base->offset; + if (!convert_clock(cycles, base->denominator, base->numerator)) + return false; + return true; +} + +static bool convert_ns_to_cs(u64 *delta) +{ + struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono; + + if (BITS_TO_BYTES(fls64(*delta) + tkr->shift) >= sizeof(*delta)) + return false; + + *delta = div_u64((*delta << tkr->shift) - tkr->xtime_nsec, tkr->mult); + return true; +} + +/** + * ktime_real_to_base_clock() - Convert CLOCK_REALTIME timestamp to a base clock timestamp + * @treal: CLOCK_REALTIME timestamp to convert + * @base_id: base clocksource id + * @cycles: pointer to store the converted base clock timestamp + * + * Converts a supplied, future realtime clock value to the corresponding base clock value. + * + * Return: true if the conversion is successful, false otherwise. + */ +bool ktime_real_to_base_clock(ktime_t treal, enum clocksource_ids base_id, u64 *cycles) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + u64 delta; + + do { + seq = read_seqcount_begin(&tk_core.seq); + if ((u64)treal < tk->tkr_mono.base_real) + return false; + delta = (u64)treal - tk->tkr_mono.base_real; + if (!convert_ns_to_cs(&delta)) + return false; + *cycles = tk->tkr_mono.cycle_last + delta; + if (!convert_cs_to_base(cycles, base_id)) + return false; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return true; +} +EXPORT_SYMBOL_GPL(ktime_real_to_base_clock); + /** * get_device_system_crosststamp - Synchronously capture system/device timestamp * @get_time_fn: Callback to get simultaneous device time and @@ -1241,7 +1343,7 @@ int get_device_system_crosststamp(int (*get_time_fn) * installed timekeeper clocksource */ if (system_counterval.cs_id == CSID_GENERIC || - tk->tkr_mono.clock->id != system_counterval.cs_id) + !convert_base_to_cs(&system_counterval)) return -ENODEV; cycles = system_counterval.cycles; @@ -1307,6 +1409,30 @@ int get_device_system_crosststamp(int (*get_time_fn) EXPORT_SYMBOL_GPL(get_device_system_crosststamp); /** + * timekeeping_clocksource_has_base - Check whether the current clocksource + * is based on given a base clock + * @id: base clocksource ID + * + * Note: The return value is a snapshot which can become invalid right + * after the function returns. + * + * Return: true if the timekeeper clocksource has a base clock with @id, + * false otherwise + */ +bool timekeeping_clocksource_has_base(enum clocksource_ids id) +{ + /* + * This is a snapshot, so no point in using the sequence + * count. Just prevent the compiler from re-evaluating @base as the + * clocksource might change concurrently. + */ + struct clocksource_base *base = READ_ONCE(tk_core.timekeeper.tkr_mono.clock->base); + + return base ? base->id == id : false; +} +EXPORT_SYMBOL_GPL(timekeeping_clocksource_has_base); + +/** * do_settimeofday64 - Sets the time of day. * @ts: pointer to the timespec64 variable containing the new time * @@ -2421,6 +2547,7 @@ EXPORT_SYMBOL_GPL(random_get_entropy_fallback); /** * do_adjtimex() - Accessor function to NTP __do_adjtimex function + * @txc: Pointer to kernel_timex structure containing NTP parameters */ int do_adjtimex(struct __kernel_timex *txc) { @@ -2489,6 +2616,8 @@ int do_adjtimex(struct __kernel_timex *txc) #ifdef CONFIG_NTP_PPS /** * hardpps() - Accessor function to NTP __hardpps function + * @phase_ts: Pointer to timespec64 structure representing phase timestamp + * @raw_ts: Pointer to timespec64 structure representing raw timestamp */ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) { diff --git a/kernel/torture.c b/kernel/torture.c index c72ab2d251f4..dede150aef01 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -40,6 +40,7 @@ #include <linux/sched/rt.h> #include "rcu/rcu.h" +MODULE_DESCRIPTION("Common functions for in-kernel torture tests"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 65208d3b5ed9..eacab4020508 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6969,7 +6969,7 @@ allocate_ftrace_mod_map(struct module *mod, return mod_map; } -static const char * +static int ftrace_func_address_lookup(struct ftrace_mod_map *mod_map, unsigned long addr, unsigned long *size, unsigned long *off, char *sym) @@ -6990,21 +6990,18 @@ ftrace_func_address_lookup(struct ftrace_mod_map *mod_map, *size = found_func->size; if (off) *off = addr - found_func->ip; - if (sym) - strscpy(sym, found_func->name, KSYM_NAME_LEN); - - return found_func->name; + return strscpy(sym, found_func->name, KSYM_NAME_LEN); } - return NULL; + return 0; } -const char * +int ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char **modname, char *sym) { struct ftrace_mod_map *mod_map; - const char *ret = NULL; + int ret = 0; /* mod_map is freed via call_rcu() */ preempt_disable(); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 003474c9a77d..3fbaecfc88c2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -125,6 +125,7 @@ enum wq_internal_consts { HIGHPRI_NICE_LEVEL = MIN_NICE, WQ_NAME_LEN = 32, + WORKER_ID_LEN = 10 + WQ_NAME_LEN, /* "kworker/R-" + WQ_NAME_LEN */ }; /* @@ -2742,6 +2743,26 @@ static void worker_detach_from_pool(struct worker *worker) complete(detach_completion); } +static int format_worker_id(char *buf, size_t size, struct worker *worker, + struct worker_pool *pool) +{ + if (worker->rescue_wq) + return scnprintf(buf, size, "kworker/R-%s", + worker->rescue_wq->name); + + if (pool) { + if (pool->cpu >= 0) + return scnprintf(buf, size, "kworker/%d:%d%s", + pool->cpu, worker->id, + pool->attrs->nice < 0 ? "H" : ""); + else + return scnprintf(buf, size, "kworker/u%d:%d", + pool->id, worker->id); + } else { + return scnprintf(buf, size, "kworker/dying"); + } +} + /** * create_worker - create a new workqueue worker * @pool: pool the new worker will belong to @@ -2758,7 +2779,6 @@ static struct worker *create_worker(struct worker_pool *pool) { struct worker *worker; int id; - char id_buf[23]; /* ID is needed to determine kthread name */ id = ida_alloc(&pool->worker_ida, GFP_KERNEL); @@ -2777,17 +2797,14 @@ static struct worker *create_worker(struct worker_pool *pool) worker->id = id; if (!(pool->flags & POOL_BH)) { - if (pool->cpu >= 0) - snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id, - pool->attrs->nice < 0 ? "H" : ""); - else - snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id); + char id_buf[WORKER_ID_LEN]; + format_worker_id(id_buf, sizeof(id_buf), worker, pool); worker->task = kthread_create_on_node(worker_thread, worker, - pool->node, "kworker/%s", id_buf); + pool->node, "%s", id_buf); if (IS_ERR(worker->task)) { if (PTR_ERR(worker->task) == -EINTR) { - pr_err("workqueue: Interrupted when creating a worker thread \"kworker/%s\"\n", + pr_err("workqueue: Interrupted when creating a worker thread \"%s\"\n", id_buf); } else { pr_err_once("workqueue: Failed to create a worker thread: %pe", @@ -3350,7 +3367,6 @@ woke_up: raw_spin_unlock_irq(&pool->lock); set_pf_worker(false); - set_task_comm(worker->task, "kworker/dying"); ida_free(&pool->worker_ida, worker->id); worker_detach_from_pool(worker); WARN_ON_ONCE(!list_empty(&worker->entry)); @@ -5542,6 +5558,7 @@ static int wq_clamp_max_active(int max_active, unsigned int flags, static int init_rescuer(struct workqueue_struct *wq) { struct worker *rescuer; + char id_buf[WORKER_ID_LEN]; int ret; if (!(wq->flags & WQ_MEM_RECLAIM)) @@ -5555,7 +5572,9 @@ static int init_rescuer(struct workqueue_struct *wq) } rescuer->rescue_wq = wq; - rescuer->task = kthread_create(rescuer_thread, rescuer, "kworker/R-%s", wq->name); + format_worker_id(id_buf, sizeof(id_buf), rescuer, NULL); + + rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", id_buf); if (IS_ERR(rescuer->task)) { ret = PTR_ERR(rescuer->task); pr_err("workqueue: Failed to create a rescuer kthread for wq \"%s\": %pe", @@ -6384,19 +6403,15 @@ void show_freezable_workqueues(void) /* used to show worker information through /proc/PID/{comm,stat,status} */ void wq_worker_comm(char *buf, size_t size, struct task_struct *task) { - int off; - - /* always show the actual comm */ - off = strscpy(buf, task->comm, size); - if (off < 0) - return; - /* stabilize PF_WQ_WORKER and worker pool association */ mutex_lock(&wq_pool_attach_mutex); if (task->flags & PF_WQ_WORKER) { struct worker *worker = kthread_data(task); struct worker_pool *pool = worker->pool; + int off; + + off = format_worker_id(buf, size, worker, pool); if (pool) { raw_spin_lock_irq(&pool->lock); @@ -6415,6 +6430,8 @@ void wq_worker_comm(char *buf, size_t size, struct task_struct *task) } raw_spin_unlock_irq(&pool->lock); } + } else { + strscpy(buf, task->comm, size); } mutex_unlock(&wq_pool_attach_mutex); |