summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/acct.c2
-rw-r--r--kernel/bpf/arraymap.c61
-rw-r--r--kernel/bpf/core.c23
-rw-r--r--kernel/bpf/inode.c40
-rw-r--r--kernel/bpf/sockmap.c11
-rw-r--r--kernel/bpf/syscall.c2
-rw-r--r--kernel/bpf/verifier.c105
-rw-r--r--kernel/cgroup/cgroup-v1.c6
-rw-r--r--kernel/cgroup/cgroup.c21
-rw-r--r--kernel/configs/nopm.config15
-rw-r--r--kernel/crash_core.c2
-rw-r--r--kernel/delayacct.c42
-rw-r--r--kernel/events/core.c47
-rw-r--r--kernel/exit.c1
-rw-r--r--kernel/futex.c92
-rw-r--r--kernel/irq/Kconfig10
-rw-r--r--kernel/irq/affinity.c30
-rw-r--r--kernel/irq/irqdomain.c118
-rw-r--r--kernel/irq/matrix.c20
-rw-r--r--kernel/irq_work.c2
-rw-r--r--kernel/jump_label.c12
-rw-r--r--kernel/locking/lockdep.c2
-rw-r--r--kernel/locking/rtmutex.c26
-rw-r--r--kernel/locking/rtmutex_common.h1
-rw-r--r--kernel/module.c11
-rw-r--r--kernel/pid.c22
-rw-r--r--kernel/power/main.c29
-rw-r--r--kernel/power/snapshot.c6
-rw-r--r--kernel/power/swap.c6
-rw-r--r--kernel/sched/completion.c5
-rw-r--r--kernel/sched/core.c6
-rw-r--r--kernel/sched/fair.c4
-rw-r--r--kernel/sched/membarrier.c2
-rw-r--r--kernel/time/hrtimer.c654
-rw-r--r--kernel/time/posix-clock.c2
-rw-r--r--kernel/time/posix-cpu-timers.c7
-rw-r--r--kernel/time/tick-internal.h13
-rw-r--r--kernel/time/tick-sched.c2
-rw-r--r--kernel/time/timer.c92
-rw-r--r--kernel/trace/Kconfig2
-rw-r--r--kernel/trace/ftrace.c29
-rw-r--r--kernel/trace/ring_buffer.c61
-rw-r--r--kernel/trace/trace.c34
-rw-r--r--kernel/trace/trace_events.c16
-rw-r--r--kernel/trace/trace_events_trigger.c13
-rw-r--r--kernel/trace/trace_functions.c49
-rw-r--r--kernel/workqueue.c13
47 files changed, 1158 insertions, 611 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index d15c0ee4d955..addf7732fb56 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -102,7 +102,7 @@ static int check_free_space(struct bsd_acct_struct *acct)
{
struct kstatfs sbuf;
- if (time_is_before_jiffies(acct->needcheck))
+ if (time_is_after_jiffies(acct->needcheck))
goto out;
/* May block */
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 7c25426d3cf5..ab94d304a634 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -53,9 +53,10 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
int numa_node = bpf_map_attr_numa_node(attr);
+ u32 elem_size, index_mask, max_entries;
+ bool unpriv = !capable(CAP_SYS_ADMIN);
struct bpf_array *array;
- u64 array_size;
- u32 elem_size;
+ u64 array_size, mask64;
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
@@ -72,11 +73,32 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
elem_size = round_up(attr->value_size, 8);
+ max_entries = attr->max_entries;
+
+ /* On 32 bit archs roundup_pow_of_two() with max_entries that has
+ * upper most bit set in u32 space is undefined behavior due to
+ * resulting 1U << 32, so do it manually here in u64 space.
+ */
+ mask64 = fls_long(max_entries - 1);
+ mask64 = 1ULL << mask64;
+ mask64 -= 1;
+
+ index_mask = mask64;
+ if (unpriv) {
+ /* round up array size to nearest power of 2,
+ * since cpu will speculate within index_mask limits
+ */
+ max_entries = index_mask + 1;
+ /* Check for overflows. */
+ if (max_entries < attr->max_entries)
+ return ERR_PTR(-E2BIG);
+ }
+
array_size = sizeof(*array);
if (percpu)
- array_size += (u64) attr->max_entries * sizeof(void *);
+ array_size += (u64) max_entries * sizeof(void *);
else
- array_size += (u64) attr->max_entries * elem_size;
+ array_size += (u64) max_entries * elem_size;
/* make sure there is no u32 overflow later in round_up() */
if (array_size >= U32_MAX - PAGE_SIZE)
@@ -86,6 +108,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
array = bpf_map_area_alloc(array_size, numa_node);
if (!array)
return ERR_PTR(-ENOMEM);
+ array->index_mask = index_mask;
+ array->map.unpriv_array = unpriv;
/* copy mandatory map attributes */
array->map.map_type = attr->map_type;
@@ -121,12 +145,13 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
if (unlikely(index >= array->map.max_entries))
return NULL;
- return array->value + array->elem_size * index;
+ return array->value + array->elem_size * (index & array->index_mask);
}
/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_insn *insn = insn_buf;
u32 elem_size = round_up(map->value_size, 8);
const int ret = BPF_REG_0;
@@ -135,7 +160,12 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
- *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
+ if (map->unpriv_array) {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
+ } else {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
+ }
if (is_power_of_2(elem_size)) {
*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
@@ -157,7 +187,7 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
if (unlikely(index >= array->map.max_entries))
return NULL;
- return this_cpu_ptr(array->pptrs[index]);
+ return this_cpu_ptr(array->pptrs[index & array->index_mask]);
}
int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
@@ -177,7 +207,7 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
*/
size = round_up(map->value_size, 8);
rcu_read_lock();
- pptr = array->pptrs[index];
+ pptr = array->pptrs[index & array->index_mask];
for_each_possible_cpu(cpu) {
bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
off += size;
@@ -225,10 +255,11 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
return -EEXIST;
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
- memcpy(this_cpu_ptr(array->pptrs[index]),
+ memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
value, map->value_size);
else
- memcpy(array->value + array->elem_size * index,
+ memcpy(array->value +
+ array->elem_size * (index & array->index_mask),
value, map->value_size);
return 0;
}
@@ -262,7 +293,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
*/
size = round_up(map->value_size, 8);
rcu_read_lock();
- pptr = array->pptrs[index];
+ pptr = array->pptrs[index & array->index_mask];
for_each_possible_cpu(cpu) {
bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
off += size;
@@ -613,6 +644,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
static u32 array_of_map_gen_lookup(struct bpf_map *map,
struct bpf_insn *insn_buf)
{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 elem_size = round_up(map->value_size, 8);
struct bpf_insn *insn = insn_buf;
const int ret = BPF_REG_0;
@@ -621,7 +653,12 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map,
*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
- *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
+ if (map->unpriv_array) {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
+ } else {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
+ }
if (is_power_of_2(elem_size))
*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
else
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 86b50aa26ee8..7949e8b8f94e 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -767,6 +767,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
}
EXPORT_SYMBOL_GPL(__bpf_call_base);
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
/**
* __bpf_prog_run - run eBPF program on a given context
* @ctx: is the data we are operating on
@@ -955,7 +956,7 @@ select_insn:
DST = tmp;
CONT;
ALU_MOD_X:
- if (unlikely(SRC == 0))
+ if (unlikely((u32)SRC == 0))
return 0;
tmp = (u32) DST;
DST = do_div(tmp, (u32) SRC);
@@ -974,7 +975,7 @@ select_insn:
DST = div64_u64(DST, SRC);
CONT;
ALU_DIV_X:
- if (unlikely(SRC == 0))
+ if (unlikely((u32)SRC == 0))
return 0;
tmp = (u32) DST;
do_div(tmp, (u32) SRC);
@@ -1317,6 +1318,14 @@ EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
};
+#else
+static unsigned int __bpf_prog_ret0(const void *ctx,
+ const struct bpf_insn *insn)
+{
+ return 0;
+}
+#endif
+
bool bpf_prog_array_compatible(struct bpf_array *array,
const struct bpf_prog *fp)
{
@@ -1364,9 +1373,13 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
*/
struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
{
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
+#else
+ fp->bpf_func = __bpf_prog_ret0;
+#endif
/* eBPF JITs can rewrite the program in case constant
* blinding is active. However, in case of error during
@@ -1376,6 +1389,12 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
*/
if (!bpf_prog_is_dev_bound(fp->aux)) {
fp = bpf_int_jit_compile(fp);
+#ifdef CONFIG_BPF_JIT_ALWAYS_ON
+ if (!fp->jited) {
+ *err = -ENOTSUPP;
+ return fp;
+ }
+#endif
} else {
*err = bpf_prog_offload_compile(fp);
if (*err)
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 01aaef1a77c5..5bb5e49ef4c3 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -368,7 +368,45 @@ out:
putname(pname);
return ret;
}
-EXPORT_SYMBOL_GPL(bpf_obj_get_user);
+
+static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)
+{
+ struct bpf_prog *prog;
+ int ret = inode_permission(inode, MAY_READ | MAY_WRITE);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (inode->i_op == &bpf_map_iops)
+ return ERR_PTR(-EINVAL);
+ if (inode->i_op != &bpf_prog_iops)
+ return ERR_PTR(-EACCES);
+
+ prog = inode->i_private;
+
+ ret = security_bpf_prog(prog);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ if (!bpf_prog_get_ok(prog, &type, false))
+ return ERR_PTR(-EINVAL);
+
+ return bpf_prog_inc(prog);
+}
+
+struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type)
+{
+ struct bpf_prog *prog;
+ struct path path;
+ int ret = kern_path(name, LOOKUP_FOLLOW, &path);
+ if (ret)
+ return ERR_PTR(ret);
+ prog = __get_prog_inode(d_backing_inode(path.dentry), type);
+ if (!IS_ERR(prog))
+ touch_atime(&path);
+ path_put(&path);
+ return prog;
+}
+EXPORT_SYMBOL(bpf_prog_get_type_path);
static void bpf_evict_inode(struct inode *inode)
{
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 5ee2e41893d9..1712d319c2d8 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -591,8 +591,15 @@ static void sock_map_free(struct bpf_map *map)
write_lock_bh(&sock->sk_callback_lock);
psock = smap_psock_sk(sock);
- smap_list_remove(psock, &stab->sock_map[i]);
- smap_release_sock(psock, sock);
+ /* This check handles a racing sock event that can get the
+ * sk_callback_lock before this case but after xchg happens
+ * causing the refcnt to hit zero and sock user data (psock)
+ * to be null and queued for garbage collection.
+ */
+ if (likely(psock)) {
+ smap_list_remove(psock, &stab->sock_map[i]);
+ smap_release_sock(psock, sock);
+ }
write_unlock_bh(&sock->sk_callback_lock);
}
rcu_read_unlock();
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2c4cfeaa8d5e..5cb783fc8224 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1057,7 +1057,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
}
EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
-static bool bpf_prog_get_ok(struct bpf_prog *prog,
+bool bpf_prog_get_ok(struct bpf_prog *prog,
enum bpf_prog_type *attach_type, bool attach_drv)
{
/* not an attachment, just a refcount inc, always allow */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 04b24876cd23..13551e623501 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -978,6 +978,13 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno);
}
+static bool is_ctx_reg(struct bpf_verifier_env *env, int regno)
+{
+ const struct bpf_reg_state *reg = cur_regs(env) + regno;
+
+ return reg->type == PTR_TO_CTX;
+}
+
static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg,
int off, int size, bool strict)
@@ -1258,6 +1265,12 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
return -EACCES;
}
+ if (is_ctx_reg(env, insn->dst_reg)) {
+ verbose(env, "BPF_XADD stores into R%d context is not allowed\n",
+ insn->dst_reg);
+ return -EACCES;
+ }
+
/* check whether atomic_add can read the memory */
err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_READ, -1);
@@ -1729,6 +1742,13 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta);
if (err)
return err;
+ if (func_id == BPF_FUNC_tail_call) {
+ if (meta.map_ptr == NULL) {
+ verbose(env, "verifier bug\n");
+ return -EINVAL;
+ }
+ env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr;
+ }
err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta);
if (err)
return err;
@@ -1875,17 +1895,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
dst_reg = &regs[dst];
- if (WARN_ON_ONCE(known && (smin_val != smax_val))) {
- print_verifier_state(env, env->cur_state);
- verbose(env,
- "verifier internal error: known but bad sbounds\n");
- return -EINVAL;
- }
- if (WARN_ON_ONCE(known && (umin_val != umax_val))) {
- print_verifier_state(env, env->cur_state);
- verbose(env,
- "verifier internal error: known but bad ubounds\n");
- return -EINVAL;
+ if ((known && (smin_val != smax_val || umin_val != umax_val)) ||
+ smin_val > smax_val || umin_val > umax_val) {
+ /* Taint dst register if offset had invalid bounds derived from
+ * e.g. dead branches.
+ */
+ __mark_reg_unknown(dst_reg);
+ return 0;
}
if (BPF_CLASS(insn->code) != BPF_ALU64) {
@@ -2077,6 +2093,15 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
src_known = tnum_is_const(src_reg.var_off);
dst_known = tnum_is_const(dst_reg->var_off);
+ if ((src_known && (smin_val != smax_val || umin_val != umax_val)) ||
+ smin_val > smax_val || umin_val > umax_val) {
+ /* Taint dst register if offset had invalid bounds derived from
+ * e.g. dead branches.
+ */
+ __mark_reg_unknown(dst_reg);
+ return 0;
+ }
+
if (!src_known &&
opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) {
__mark_reg_unknown(dst_reg);
@@ -2486,6 +2511,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
+ if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) {
+ verbose(env, "BPF_ARSH not supported for 32 bit ALU\n");
+ return -EINVAL;
+ }
+
if ((opcode == BPF_LSH || opcode == BPF_RSH ||
opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) {
int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
@@ -3981,6 +4011,12 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
+ if (is_ctx_reg(env, insn->dst_reg)) {
+ verbose(env, "BPF_ST stores into R%d context is not allowed\n",
+ insn->dst_reg);
+ return -EACCES;
+ }
+
/* check that memory (dst_reg + off) is writeable */
err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE,
@@ -4433,6 +4469,24 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
int i, cnt, delta = 0;
for (i = 0; i < insn_cnt; i++, insn++) {
+ if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
+ insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
+ /* due to JIT bugs clear upper 32-bits of src register
+ * before div/mod operation
+ */
+ insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg);
+ insn_buf[1] = *insn;
+ cnt = 2;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
if (insn->code != (BPF_JMP | BPF_CALL))
continue;
@@ -4456,6 +4510,35 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
*/
insn->imm = 0;
insn->code = BPF_JMP | BPF_TAIL_CALL;
+
+ /* instead of changing every JIT dealing with tail_call
+ * emit two extra insns:
+ * if (index >= max_entries) goto out;
+ * index &= array->index_mask;
+ * to avoid out-of-bounds cpu speculation
+ */
+ map_ptr = env->insn_aux_data[i + delta].map_ptr;
+ if (map_ptr == BPF_MAP_PTR_POISON) {
+ verbose(env, "tail_call abusing map_ptr\n");
+ return -EINVAL;
+ }
+ if (!map_ptr->unpriv_array)
+ continue;
+ insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
+ map_ptr->max_entries, 2);
+ insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
+ container_of(map_ptr,
+ struct bpf_array,
+ map)->index_mask);
+ insn_buf[2] = *insn;
+ cnt = 3;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
continue;
}
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 024085daab1a..a2c05d2476ac 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -123,7 +123,11 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
*/
do {
css_task_iter_start(&from->self, 0, &it);
- task = css_task_iter_next(&it);
+
+ do {
+ task = css_task_iter_next(&it);
+ } while (task && (task->flags & PF_EXITING));
+
if (task)
get_task_struct(task);
css_task_iter_end(&it);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 0b1ffe147f24..7e4c44538119 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1397,7 +1397,7 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
cft->name);
else
- strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
+ strlcpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
return buf;
}
@@ -1864,9 +1864,9 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
root->flags = opts->flags;
if (opts->release_agent)
- strcpy(root->release_agent_path, opts->release_agent);
+ strlcpy(root->release_agent_path, opts->release_agent, PATH_MAX);
if (opts->name)
- strcpy(root->name, opts->name);
+ strlcpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
if (opts->cpuset_clone_children)
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
@@ -4125,26 +4125,24 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
static void css_task_iter_advance(struct css_task_iter *it)
{
- struct list_head *l = it->task_pos;
+ struct list_head *next;
lockdep_assert_held(&css_set_lock);
- WARN_ON_ONCE(!l);
-
repeat:
/*
* Advance iterator to find next entry. cset->tasks is consumed
* first and then ->mg_tasks. After ->mg_tasks, we move onto the
* next cset.
*/
- l = l->next;
+ next = it->task_pos->next;
- if (l == it->tasks_head)
- l = it->mg_tasks_head->next;
+ if (next == it->tasks_head)
+ next = it->mg_tasks_head->next;
- if (l == it->mg_tasks_head)
+ if (next == it->mg_tasks_head)
css_task_iter_advance_css_set(it);
else
- it->task_pos = l;
+ it->task_pos = next;
/* if PROCS, skip over tasks which aren't group leaders */
if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
@@ -4449,6 +4447,7 @@ static struct cftype cgroup_base_files[] = {
},
{
.name = "cgroup.threads",
+ .flags = CFTYPE_NS_DELEGATABLE,
.release = cgroup_procs_release,
.seq_start = cgroup_threads_start,
.seq_next = cgroup_procs_next,
diff --git a/kernel/configs/nopm.config b/kernel/configs/nopm.config
new file mode 100644
index 000000000000..81ff07863576
--- /dev/null
+++ b/kernel/configs/nopm.config
@@ -0,0 +1,15 @@
+CONFIG_PM=n
+CONFIG_SUSPEND=n
+CONFIG_HIBERNATION=n
+
+# Triggers PM on OMAP
+CONFIG_CPU_IDLE=n
+
+# Triggers enablement via hibernate callbacks
+CONFIG_XEN=n
+
+# ARM/ARM64 architectures that select PM unconditionally
+CONFIG_ARCH_OMAP2PLUS_TYPICAL=n
+CONFIG_ARCH_RENESAS=n
+CONFIG_ARCH_TEGRA=n
+CONFIG_ARCH_VEXPRESS=n
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index b3663896278e..4f63597c824d 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -410,7 +410,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_SYMBOL(contig_page_data);
#endif
#ifdef CONFIG_SPARSEMEM
- VMCOREINFO_SYMBOL(mem_section);
+ VMCOREINFO_SYMBOL_ARRAY(mem_section);
VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
VMCOREINFO_STRUCT_SIZE(mem_section);
VMCOREINFO_OFFSET(mem_section, section_mem_map);
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 4a1c33416b6a..e2764d767f18 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -51,16 +51,16 @@ void __delayacct_tsk_init(struct task_struct *tsk)
* Finish delay accounting for a statistic using its timestamps (@start),
* accumalator (@total) and @count
*/
-static void delayacct_end(u64 *start, u64 *total, u32 *count)
+static void delayacct_end(spinlock_t *lock, u64 *start, u64 *total, u32 *count)
{
s64 ns = ktime_get_ns() - *start;
unsigned long flags;
if (ns > 0) {
- spin_lock_irqsave(&current->delays->lock, flags);
+ spin_lock_irqsave(lock, flags);
*total += ns;
(*count)++;
- spin_unlock_irqrestore(&current->delays->lock, flags);
+ spin_unlock_irqrestore(lock, flags);
}
}
@@ -69,17 +69,25 @@ void __delayacct_blkio_start(void)
current->delays->blkio_start = ktime_get_ns();
}
-void __delayacct_blkio_end(void)
+/*
+ * We cannot rely on the `current` macro, as we haven't yet switched back to
+ * the process being woken.
+ */
+void __delayacct_blkio_end(struct task_struct *p)
{
- if (current->delays->flags & DELAYACCT_PF_SWAPIN)
- /* Swapin block I/O */
- delayacct_end(&current->delays->blkio_start,
- &current->delays->swapin_delay,
- &current->delays->swapin_count);
- else /* Other block I/O */
- delayacct_end(&current->delays->blkio_start,
- &current->delays->blkio_delay,
- &current->delays->blkio_count);
+ struct task_delay_info *delays = p->delays;
+ u64 *total;
+ u32 *count;
+
+ if (p->delays->flags & DELAYACCT_PF_SWAPIN) {
+ total = &delays->swapin_delay;
+ count = &delays->swapin_count;
+ } else {
+ total = &delays->blkio_delay;
+ count = &delays->blkio_count;
+ }
+
+ delayacct_end(&delays->lock, &delays->blkio_start, total, count);
}
int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
@@ -153,8 +161,10 @@ void __delayacct_freepages_start(void)
void __delayacct_freepages_end(void)
{
- delayacct_end(&current->delays->freepages_start,
- &current->delays->freepages_delay,
- &current->delays->freepages_count);
+ delayacct_end(
+ &current->delays->lock,
+ &current->delays->freepages_start,
+ &current->delays->freepages_delay,
+ &current->delays->freepages_count);
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4df5b695bf0d..5d8f4031f8d5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1231,6 +1231,10 @@ static void put_ctx(struct perf_event_context *ctx)
* perf_event_context::lock
* perf_event::mmap_mutex
* mmap_sem
+ *
+ * cpu_hotplug_lock
+ * pmus_lock
+ * cpuctx->mutex / perf_event_context::mutex
*/
static struct perf_event_context *
perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
@@ -4196,6 +4200,7 @@ int perf_event_release_kernel(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
struct perf_event *child, *tmp;
+ LIST_HEAD(free_list);
/*
* If we got here through err_file: fput(event_file); we will not have
@@ -4268,8 +4273,7 @@ again:
struct perf_event, child_list);
if (tmp == child) {
perf_remove_from_context(child, DETACH_GROUP);
- list_del(&child->child_list);
- free_event(child);
+ list_move(&child->child_list, &free_list);
/*
* This matches the refcount bump in inherit_event();
* this can't be the last reference.
@@ -4284,6 +4288,11 @@ again:
}
mutex_unlock(&event->child_mutex);
+ list_for_each_entry_safe(child, tmp, &free_list, child_list) {
+ list_del(&child->child_list);
+ free_event(child);
+ }
+
no_ctx:
put_event(event); /* Must be the 'last' reference */
return 0;
@@ -8516,6 +8525,29 @@ fail_clear_files:
return ret;
}
+static int
+perf_tracepoint_set_filter(struct perf_event *event, char *filter_str)
+{
+ struct perf_event_context *ctx = event->ctx;
+ int ret;
+
+ /*
+ * Beware, here be dragons!!
+ *
+ * the tracepoint muck will deadlock against ctx->mutex, but the tracepoint
+ * stuff does not actually need it. So temporarily drop ctx->mutex. As per
+ * perf_event_ctx_lock() we already have a reference on ctx.
+ *
+ * This can result in event getting moved to a different ctx, but that
+ * does not affect the tracepoint state.
+ */
+ mutex_unlock(&ctx->mutex);
+ ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
+ mutex_lock(&ctx->mutex);
+
+ return ret;
+}
+
static int perf_event_set_filter(struct perf_event *event, void __user *arg)
{
char *filter_str;
@@ -8532,8 +8564,7 @@ static int perf_event_set_filter(struct perf_event *event, void __user *arg)
if (IS_ENABLED(CONFIG_EVENT_TRACING) &&
event->attr.type == PERF_TYPE_TRACEPOINT)
- ret = ftrace_profile_set_filter(event, event->attr.config,
- filter_str);
+ ret = perf_tracepoint_set_filter(event, filter_str);
else if (has_addr_filter(event))
ret = perf_event_set_addr_filter(event, filter_str);
@@ -9168,7 +9199,13 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
if (!try_module_get(pmu->module))
return -ENODEV;
- if (event->group_leader != event) {
+ /*
+ * A number of pmu->event_init() methods iterate the sibling_list to,
+ * for example, validate if the group fits on the PMU. Therefore,
+ * if this is a sibling event, acquire the ctx->mutex to protect
+ * the sibling_list.
+ */
+ if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
/*
* This ctx->mutex can nest when we're called through
* inheritance. See the perf_event_ctx_lock_nested() comment.
diff --git a/kernel/exit.c b/kernel/exit.c
index df0c91d5606c..995453d9fb55 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1763,3 +1763,4 @@ __weak void abort(void)
/* if that doesn't kill us, halt */
panic("Oops failed to kill thread");
}
+EXPORT_SYMBOL(abort);
diff --git a/kernel/futex.c b/kernel/futex.c
index 57d0b3657e16..7f719d110908 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1878,6 +1878,9 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
struct futex_q *this, *next;
DEFINE_WAKE_Q(wake_q);
+ if (nr_wake < 0 || nr_requeue < 0)
+ return -EINVAL;
+
/*
* When PI not supported: return -ENOSYS if requeue_pi is true,
* consequently the compiler knows requeue_pi is always false past
@@ -2294,34 +2297,33 @@ static void unqueue_me_pi(struct futex_q *q)
spin_unlock(q->lock_ptr);
}
-/*
- * Fixup the pi_state owner with the new owner.
- *
- * Must be called with hash bucket lock held and mm->sem held for non
- * private futexes.
- */
static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
- struct task_struct *newowner)
+ struct task_struct *argowner)
{
- u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
struct futex_pi_state *pi_state = q->pi_state;
u32 uval, uninitialized_var(curval), newval;
- struct task_struct *oldowner;
+ struct task_struct *oldowner, *newowner;
+ u32 newtid;
int ret;
+ lockdep_assert_held(q->lock_ptr);
+
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
oldowner = pi_state->owner;
- /* Owner died? */
- if (!pi_state->owner)
- newtid |= FUTEX_OWNER_DIED;
/*
- * We are here either because we stole the rtmutex from the
- * previous highest priority waiter or we are the highest priority
- * waiter but have failed to get the rtmutex the first time.
+ * We are here because either:
+ *
+ * - we stole the lock and pi_state->owner needs updating to reflect
+ * that (@argowner == current),
+ *
+ * or:
*
- * We have to replace the newowner TID in the user space variable.
+ * - someone stole our lock and we need to fix things to point to the
+ * new owner (@argowner == NULL).
+ *
+ * Either way, we have to replace the TID in the user space variable.
* This must be atomic as we have to preserve the owner died bit here.
*
* Note: We write the user space value _before_ changing the pi_state
@@ -2334,6 +2336,45 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
* in the PID check in lookup_pi_state.
*/
retry:
+ if (!argowner) {
+ if (oldowner != current) {
+ /*
+ * We raced against a concurrent self; things are
+ * already fixed up. Nothing to do.
+ */
+ ret = 0;
+ goto out_unlock;
+ }
+
+ if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
+ /* We got the lock after all, nothing to fix. */
+ ret = 0;
+ goto out_unlock;
+ }
+
+ /*
+ * Since we just failed the trylock; there must be an owner.
+ */
+ newowner = rt_mutex_owner(&pi_state->pi_mutex);
+ BUG_ON(!newowner);
+ } else {
+ WARN_ON_ONCE(argowner != current);
+ if (oldowner == current) {
+ /*
+ * We raced against a concurrent self; things are
+ * already fixed up. Nothing to do.
+ */
+ ret = 0;
+ goto out_unlock;
+ }
+ newowner = argowner;
+ }
+
+ newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
+ /* Owner died? */
+ if (!pi_state->owner)
+ newtid |= FUTEX_OWNER_DIED;
+
if (get_futex_value_locked(&uval, uaddr))
goto handle_fault;
@@ -2434,9 +2475,9 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
* Got the lock. We might not be the anticipated owner if we
* did a lock-steal - fix up the PI-state in that case:
*
- * We can safely read pi_state->owner without holding wait_lock
- * because we now own the rt_mutex, only the owner will attempt
- * to change it.
+ * Speculative pi_state->owner read (we don't hold wait_lock);
+ * since we own the lock pi_state->owner == current is the
+ * stable state, anything else needs more attention.
*/
if (q->pi_state->owner != current)
ret = fixup_pi_state_owner(uaddr, q, current);
@@ -2444,6 +2485,19 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
}
/*
+ * If we didn't get the lock; check if anybody stole it from us. In
+ * that case, we need to fix up the uval to point to them instead of
+ * us, otherwise bad things happen. [10]
+ *
+ * Another speculative read; pi_state->owner == current is unstable
+ * but needs our attention.
+ */
+ if (q->pi_state->owner == current) {
+ ret = fixup_pi_state_owner(uaddr, q, NULL);
+ goto out;
+ }
+
+ /*
* Paranoia check. If we did not take the lock, then we should not be
* the owner of the rt_mutex.
*/
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 89e355866450..6fc87ccda1d7 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -103,16 +103,6 @@ config GENERIC_IRQ_MATRIX_ALLOCATOR
config GENERIC_IRQ_RESERVATION_MODE
bool
-config IRQ_DOMAIN_DEBUG
- bool "Expose hardware/virtual IRQ mapping via debugfs"
- depends on IRQ_DOMAIN && DEBUG_FS
- help
- This option will show the mapping relationship between hardware irq
- numbers and Linux irq numbers. The mapping is exposed via debugfs
- in the file "irq_domain_mapping".
-
- If you don't know what this means you don't need it.
-
# Support forced irq threading
config IRQ_FORCED_THREADING
bool
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index e12d35108225..a37a3b4b6342 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -39,7 +39,7 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
}
}
-static cpumask_var_t *alloc_node_to_present_cpumask(void)
+static cpumask_var_t *alloc_node_to_possible_cpumask(void)
{
cpumask_var_t *masks;
int node;
@@ -62,7 +62,7 @@ out_unwind:
return NULL;
}
-static void free_node_to_present_cpumask(cpumask_var_t *masks)
+static void free_node_to_possible_cpumask(cpumask_var_t *masks)
{
int node;
@@ -71,22 +71,22 @@ static void free_node_to_present_cpumask(cpumask_var_t *masks)
kfree(masks);
}
-static void build_node_to_present_cpumask(cpumask_var_t *masks)
+static void build_node_to_possible_cpumask(cpumask_var_t *masks)
{
int cpu;
- for_each_present_cpu(cpu)
+ for_each_possible_cpu(cpu)
cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
}
-static int get_nodes_in_cpumask(cpumask_var_t *node_to_present_cpumask,
+static int get_nodes_in_cpumask(cpumask_var_t *node_to_possible_cpumask,
const struct cpumask *mask, nodemask_t *nodemsk)
{
int n, nodes = 0;
/* Calculate the number of nodes in the supplied affinity mask */
for_each_node(n) {
- if (cpumask_intersects(mask, node_to_present_cpumask[n])) {
+ if (cpumask_intersects(mask, node_to_possible_cpumask[n])) {
node_set(n, *nodemsk);
nodes++;
}
@@ -109,7 +109,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
int last_affv = affv + affd->pre_vectors;
nodemask_t nodemsk = NODE_MASK_NONE;
struct cpumask *masks;
- cpumask_var_t nmsk, *node_to_present_cpumask;
+ cpumask_var_t nmsk, *node_to_possible_cpumask;
/*
* If there aren't any vectors left after applying the pre/post
@@ -125,8 +125,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
if (!masks)
goto out;
- node_to_present_cpumask = alloc_node_to_present_cpumask();
- if (!node_to_present_cpumask)
+ node_to_possible_cpumask = alloc_node_to_possible_cpumask();
+ if (!node_to_possible_cpumask)
goto out;
/* Fill out vectors at the beginning that don't need affinity */
@@ -135,8 +135,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
/* Stabilize the cpumasks */
get_online_cpus();
- build_node_to_present_cpumask(node_to_present_cpumask);
- nodes = get_nodes_in_cpumask(node_to_present_cpumask, cpu_present_mask,
+ build_node_to_possible_cpumask(node_to_possible_cpumask);
+ nodes = get_nodes_in_cpumask(node_to_possible_cpumask, cpu_possible_mask,
&nodemsk);
/*
@@ -146,7 +146,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
if (affv <= nodes) {
for_each_node_mask(n, nodemsk) {
cpumask_copy(masks + curvec,
- node_to_present_cpumask[n]);
+ node_to_possible_cpumask[n]);
if (++curvec == last_affv)
break;
}
@@ -160,7 +160,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes;
/* Get the cpus on this node which are in the mask */
- cpumask_and(nmsk, cpu_present_mask, node_to_present_cpumask[n]);
+ cpumask_and(nmsk, cpu_possible_mask, node_to_possible_cpumask[n]);
/* Calculate the number of cpus per vector */
ncpus = cpumask_weight(nmsk);
@@ -192,7 +192,7 @@ done:
/* Fill out vectors at the end that don't need affinity */
for (; curvec < nvecs; curvec++)
cpumask_copy(masks + curvec, irq_default_affinity);
- free_node_to_present_cpumask(node_to_present_cpumask);
+ free_node_to_possible_cpumask(node_to_possible_cpumask);
out:
free_cpumask_var(nmsk);
return masks;
@@ -214,7 +214,7 @@ int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity
return 0;
get_online_cpus();
- ret = min_t(int, cpumask_weight(cpu_present_mask), vecs) + resv;
+ ret = min_t(int, cpumask_weight(cpu_possible_mask), vecs) + resv;
put_online_cpus();
return ret;
}
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 62068ad46930..e6a9c36470ee 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -897,124 +897,6 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
}
EXPORT_SYMBOL_GPL(irq_find_mapping);
-#ifdef CONFIG_IRQ_DOMAIN_DEBUG
-static void virq_debug_show_one(struct seq_file *m, struct irq_desc *desc)
-{
- struct irq_domain *domain;
- struct irq_data *data;
-
- domain = desc->irq_data.domain;
- data = &desc->irq_data;
-
- while (domain) {
- unsigned int irq = data->irq;
- unsigned long hwirq = data->hwirq;
- struct irq_chip *chip;
- bool direct;
-
- if (data == &desc->irq_data)
- seq_printf(m, "%5d ", irq);
- else
- seq_printf(m, "%5d+ ", irq);
- seq_printf(m, "0x%05lx ", hwirq);
-
- chip = irq_data_get_irq_chip(data);
- seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none");
-
- seq_printf(m, "0x%p ", irq_data_get_irq_chip_data(data));
-
- seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' ');
- direct = (irq == hwirq) && (irq < domain->revmap_direct_max_irq);
- seq_printf(m, "%6s%-8s ",
- (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX",
- direct ? "(DIRECT)" : "");
- seq_printf(m, "%s\n", domain->name);
-#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
- domain = domain->parent;
- data = data->parent_data;
-#else
- domain = NULL;
-#endif
- }
-}
-
-static int virq_debug_show(struct seq_file *m, void *private)
-{
- unsigned long flags;
- struct irq_desc *desc;
- struct irq_domain *domain;
- struct radix_tree_iter iter;
- void __rcu **slot;
- int i;
-
- seq_printf(m, " %-16s %-6s %-10s %-10s %s\n",
- "name", "mapped", "linear-max", "direct-max", "devtree-node");
- mutex_lock(&irq_domain_mutex);
- list_for_each_entry(domain, &irq_domain_list, link) {
- struct device_node *of_node;
- const char *name;
-
- int count = 0;
-
- of_node = irq_domain_get_of_node(domain);
- if (of_node)
- name = of_node_full_name(of_node);
- else if (is_fwnode_irqchip(domain->fwnode))
- name = container_of(domain->fwnode, struct irqchip_fwid,
- fwnode)->name;
- else
- name = "";
-
- radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0)
- count++;
- seq_printf(m, "%c%-16s %6u %10u %10u %s\n",
- domain == irq_default_domain ? '*' : ' ', domain->name,
- domain->revmap_size + count, domain->revmap_size,
- domain->revmap_direct_max_irq,
- name);
- }
- mutex_unlock(&irq_domain_mutex);
-
- seq_printf(m, "%-5s %-7s %-15s %-*s %6s %-14s %s\n", "irq", "hwirq",
- "chip name", (int)(2 * sizeof(void *) + 2), "chip data",
- "active", "type", "domain");
-
- for (i = 1; i < nr_irqs; i++) {
- desc = irq_to_desc(i);
- if (!desc)
- continue;
-
- raw_spin_lock_irqsave(&desc->lock, flags);
- virq_debug_show_one(m, desc);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
- }
-
- return 0;
-}
-
-static int virq_debug_open(struct inode *inode, struct file *file)
-{
- return single_open(file, virq_debug_show, inode->i_private);
-}
-
-static const struct file_operations virq_debug_fops = {
- .open = virq_debug_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static int __init irq_debugfs_init(void)
-{
- if (debugfs_create_file("irq_domain_mapping", S_IRUGO, NULL,
- NULL, &virq_debug_fops) == NULL)
- return -ENOMEM;
-
- return 0;
-}
-__initcall(irq_debugfs_init);
-#endif /* CONFIG_IRQ_DOMAIN_DEBUG */
-
/**
* irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings
*
diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index 0ba0dd8863a7..5187dfe809ac 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -321,15 +321,23 @@ void irq_matrix_remove_reserved(struct irq_matrix *m)
int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk,
bool reserved, unsigned int *mapped_cpu)
{
- unsigned int cpu;
+ unsigned int cpu, best_cpu, maxavl = 0;
+ struct cpumap *cm;
+ unsigned int bit;
+ best_cpu = UINT_MAX;
for_each_cpu(cpu, msk) {
- struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
- unsigned int bit;
+ cm = per_cpu_ptr(m->maps, cpu);
- if (!cm->online)
+ if (!cm->online || cm->available <= maxavl)
continue;
+ best_cpu = cpu;
+ maxavl = cm->available;
+ }
+
+ if (maxavl) {
+ cm = per_cpu_ptr(m->maps, best_cpu);
bit = matrix_alloc_area(m, cm, 1, false);
if (bit < m->alloc_end) {
cm->allocated++;
@@ -338,8 +346,8 @@ int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk,
m->global_available--;
if (reserved)
m->global_reserved--;
- *mapped_cpu = cpu;
- trace_irq_matrix_alloc(bit, cpu, m, cm);
+ *mapped_cpu = best_cpu;
+ trace_irq_matrix_alloc(bit, best_cpu, m, cm);
return bit;
}
}
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 40e9d739c169..6b7cdf17ccf8 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -36,7 +36,7 @@ static bool irq_work_claim(struct irq_work *work)
*/
flags = work->flags & ~IRQ_WORK_PENDING;
for (;;) {
- nflags = flags | IRQ_WORK_FLAGS;
+ nflags = flags | IRQ_WORK_CLAIMED;
oflags = cmpxchg(&work->flags, flags, nflags);
if (oflags == flags)
break;
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 8594d24e4adc..b4517095db6a 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -79,7 +79,7 @@ int static_key_count(struct static_key *key)
}
EXPORT_SYMBOL_GPL(static_key_count);
-static void static_key_slow_inc_cpuslocked(struct static_key *key)
+void static_key_slow_inc_cpuslocked(struct static_key *key)
{
int v, v1;
@@ -180,7 +180,7 @@ void static_key_disable(struct static_key *key)
}
EXPORT_SYMBOL_GPL(static_key_disable);
-static void static_key_slow_dec_cpuslocked(struct static_key *key,
+static void __static_key_slow_dec_cpuslocked(struct static_key *key,
unsigned long rate_limit,
struct delayed_work *work)
{
@@ -211,7 +211,7 @@ static void __static_key_slow_dec(struct static_key *key,
struct delayed_work *work)
{
cpus_read_lock();
- static_key_slow_dec_cpuslocked(key, rate_limit, work);
+ __static_key_slow_dec_cpuslocked(key, rate_limit, work);
cpus_read_unlock();
}
@@ -229,6 +229,12 @@ void static_key_slow_dec(struct static_key *key)
}
EXPORT_SYMBOL_GPL(static_key_slow_dec);
+void static_key_slow_dec_cpuslocked(struct static_key *key)
+{
+ STATIC_KEY_CHECK_USE(key);
+ __static_key_slow_dec_cpuslocked(key, 0, NULL);
+}
+
void static_key_slow_dec_deferred(struct static_key_deferred *key)
{
STATIC_KEY_CHECK_USE(key);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 5fa1324a4f29..521659044719 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -49,6 +49,7 @@
#include <linux/gfp.h>
#include <linux/random.h>
#include <linux/jhash.h>
+#include <linux/nmi.h>
#include <asm/sections.h>
@@ -4490,6 +4491,7 @@ retry:
if (!unlock)
if (read_trylock(&tasklist_lock))
unlock = 1;
+ touch_nmi_watchdog();
} while_each_thread(g, p);
pr_warn("\n");
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 6f3dba6e4e9e..65cc0cb984e6 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1290,6 +1290,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
return ret;
}
+static inline int __rt_mutex_slowtrylock(struct rt_mutex *lock)
+{
+ int ret = try_to_take_rt_mutex(lock, current, NULL);
+
+ /*
+ * try_to_take_rt_mutex() sets the lock waiters bit
+ * unconditionally. Clean this up.
+ */
+ fixup_rt_mutex_waiters(lock);
+
+ return ret;
+}
+
/*
* Slow path try-lock function:
*/
@@ -1312,13 +1325,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
*/
raw_spin_lock_irqsave(&lock->wait_lock, flags);
- ret = try_to_take_rt_mutex(lock, current, NULL);
-
- /*
- * try_to_take_rt_mutex() sets the lock waiters bit
- * unconditionally. Clean this up.
- */
- fixup_rt_mutex_waiters(lock);
+ ret = __rt_mutex_slowtrylock(lock);
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
@@ -1505,6 +1512,11 @@ int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
return rt_mutex_slowtrylock(lock);
}
+int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock)
+{
+ return __rt_mutex_slowtrylock(lock);
+}
+
/**
* rt_mutex_timed_lock - lock a rt_mutex interruptible
* the timeout structure is provided
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 124e98ca0b17..68686b3ec3c1 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -148,6 +148,7 @@ extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter);
extern int rt_mutex_futex_trylock(struct rt_mutex *l);
+extern int __rt_mutex_futex_trylock(struct rt_mutex *l);
extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
diff --git a/kernel/module.c b/kernel/module.c
index dea01ac9cb74..09e48eee4d55 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2863,6 +2863,15 @@ static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
}
#endif /* CONFIG_LIVEPATCH */
+static void check_modinfo_retpoline(struct module *mod, struct load_info *info)
+{
+ if (retpoline_module_ok(get_modinfo(info, "retpoline")))
+ return;
+
+ pr_warn("%s: loading module not compiled with retpoline compiler.\n",
+ mod->name);
+}
+
/* Sets info->hdr and info->len. */
static int copy_module_from_user(const void __user *umod, unsigned long len,
struct load_info *info)
@@ -3029,6 +3038,8 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
}
+ check_modinfo_retpoline(mod, info);
+
if (get_modinfo(info, "staging")) {
add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
pr_warn("%s: module is from the staging directory, the quality "
diff --git a/kernel/pid.c b/kernel/pid.c
index b13b624e2c49..5d30c87e3c42 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -41,7 +41,19 @@
#include <linux/sched/task.h>
#include <linux/idr.h>
-struct pid init_struct_pid = INIT_STRUCT_PID;
+struct pid init_struct_pid = {
+ .count = ATOMIC_INIT(1),
+ .tasks = {
+ { .first = NULL },
+ { .first = NULL },
+ { .first = NULL },
+ },
+ .level = 0,
+ .numbers = { {
+ .nr = 0,
+ .ns = &init_pid_ns,
+ }, }
+};
int pid_max = PID_MAX_DEFAULT;
@@ -193,10 +205,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
}
if (unlikely(is_child_reaper(pid))) {
- if (pid_ns_prepare_proc(ns)) {
- disable_pid_allocation(ns);
+ if (pid_ns_prepare_proc(ns))
goto out_free;
- }
}
get_pid_ns(ns);
@@ -226,6 +236,10 @@ out_free:
while (++i <= ns->level)
idr_remove(&ns->idr, (pid->numbers + i)->nr);
+ /* On failure to allocate the first pid, reset the state */
+ if (ns->pid_allocated == PIDNS_ADDING)
+ idr_set_cursor(&ns->idr, 0);
+
spin_unlock_irq(&pidmap_lock);
kmem_cache_free(ns->pid_cachep, pid);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 3a2ca9066583..705c2366dafe 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -22,6 +22,35 @@ DEFINE_MUTEX(pm_mutex);
#ifdef CONFIG_PM_SLEEP
+void lock_system_sleep(void)
+{
+ current->flags |= PF_FREEZER_SKIP;
+ mutex_lock(&pm_mutex);
+}
+EXPORT_SYMBOL_GPL(lock_system_sleep);
+
+void unlock_system_sleep(void)
+{
+ /*
+ * Don't use freezer_count() because we don't want the call to
+ * try_to_freeze() here.
+ *
+ * Reason:
+ * Fundamentally, we just don't need it, because freezing condition
+ * doesn't come into effect until we release the pm_mutex lock,
+ * since the freezer always works with pm_mutex held.
+ *
+ * More importantly, in the case of hibernation,
+ * unlock_system_sleep() gets called in snapshot_read() and
+ * snapshot_write() when the freezing condition is still in effect.
+ * Which means, if we use try_to_freeze() here, it would make them
+ * enter the refrigerator, thus causing hibernation to lockup.
+ */
+ current->flags &= ~PF_FREEZER_SKIP;
+ mutex_unlock(&pm_mutex);
+}
+EXPORT_SYMBOL_GPL(unlock_system_sleep);
+
/* Routines for PM-transition notifications */
static BLOCKING_NOTIFIER_HEAD(pm_chain_head);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index bce0464524d8..3d37c279c090 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1645,8 +1645,7 @@ static unsigned long free_unnecessary_pages(void)
* [number of saveable pages] - [number of pages that can be freed in theory]
*
* where the second term is the sum of (1) reclaimable slab pages, (2) active
- * and (3) inactive anonymous pages, (4) active and (5) inactive file pages,
- * minus mapped file pages.
+ * and (3) inactive anonymous pages, (4) active and (5) inactive file pages.
*/
static unsigned long minimum_image_size(unsigned long saveable)
{
@@ -1656,8 +1655,7 @@ static unsigned long minimum_image_size(unsigned long saveable)
+ global_node_page_state(NR_ACTIVE_ANON)
+ global_node_page_state(NR_INACTIVE_ANON)
+ global_node_page_state(NR_ACTIVE_FILE)
- + global_node_page_state(NR_INACTIVE_FILE)
- - global_node_page_state(NR_FILE_MAPPED);
+ + global_node_page_state(NR_INACTIVE_FILE);
return saveable <= size ? 0 : saveable - size;
}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 293ead59eccc..11b4282c2d20 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -240,7 +240,7 @@ static void hib_init_batch(struct hib_bio_batch *hb)
static void hib_end_io(struct bio *bio)
{
struct hib_bio_batch *hb = bio->bi_private;
- struct page *page = bio->bi_io_vec[0].bv_page;
+ struct page *page = bio_first_page_all(bio);
if (bio->bi_status) {
pr_alert("Read-error on swap-device (%u:%u:%Lu)\n",
@@ -879,7 +879,7 @@ out_clean:
* space avaiable from the resume partition.
*/
-static int enough_swap(unsigned int nr_pages, unsigned int flags)
+static int enough_swap(unsigned int nr_pages)
{
unsigned int free_swap = count_swap_pages(root_swap, 1);
unsigned int required;
@@ -915,7 +915,7 @@ int swsusp_write(unsigned int flags)
return error;
}
if (flags & SF_NOCOMPRESS_MODE) {
- if (!enough_swap(pages, flags)) {
+ if (!enough_swap(pages)) {
pr_err("Not enough free swap\n");
error = -ENOSPC;
goto out_finish;
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 2ddaec40956f..0926aef10dad 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -34,11 +34,6 @@ void complete(struct completion *x)
spin_lock_irqsave(&x->wait.lock, flags);
- /*
- * Perform commit of crossrelease here.
- */
- complete_release_commit(x);
-
if (x->done != UINT_MAX)
x->done++;
__wake_up_locked(&x->wait, TASK_NORMAL, 1);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 729b4fff93b8..5a31a85bbd84 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2057,7 +2057,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
p->state = TASK_WAKING;
if (p->in_iowait) {
- delayacct_blkio_end();
+ delayacct_blkio_end(p);
atomic_dec(&task_rq(p)->nr_iowait);
}
@@ -2070,7 +2070,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
#else /* CONFIG_SMP */
if (p->in_iowait) {
- delayacct_blkio_end();
+ delayacct_blkio_end(p);
atomic_dec(&task_rq(p)->nr_iowait);
}
@@ -2123,7 +2123,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
if (!task_on_rq_queued(p)) {
if (p->in_iowait) {
- delayacct_blkio_end();
+ delayacct_blkio_end(p);
atomic_dec(&rq->nr_iowait);
}
ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2fe3aa853e4d..26a71ebcd3c2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4365,12 +4365,12 @@ static inline bool cfs_bandwidth_used(void)
void cfs_bandwidth_usage_inc(void)
{
- static_key_slow_inc(&__cfs_bandwidth_used);
+ static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
}
void cfs_bandwidth_usage_dec(void)
{
- static_key_slow_dec(&__cfs_bandwidth_used);
+ static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
}
#else /* HAVE_JUMP_LABEL */
static bool cfs_bandwidth_used(void)
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index dd7908743dab..9bcbacba82a8 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -89,7 +89,9 @@ static int membarrier_private_expedited(void)
rcu_read_unlock();
}
if (!fallback) {
+ preempt_disable();
smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+ preempt_enable();
free_cpumask_var(tmpmask);
}
cpus_read_unlock();
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index d32520840fde..ae0c8a411fe7 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -60,6 +60,15 @@
#include "tick-internal.h"
/*
+ * Masks for selecting the soft and hard context timers from
+ * cpu_base->active
+ */
+#define MASK_SHIFT (HRTIMER_BASE_MONOTONIC_SOFT)
+#define HRTIMER_ACTIVE_HARD ((1U << MASK_SHIFT) - 1)
+#define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT)
+#define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)
+
+/*
* The timer bases:
*
* There are more clockids than hrtimer bases. Thus, we index
@@ -70,7 +79,6 @@
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
- .seq = SEQCNT_ZERO(hrtimer_bases.seq),
.clock_base =
{
{
@@ -93,6 +101,26 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
.clockid = CLOCK_TAI,
.get_time = &ktime_get_clocktai,
},
+ {
+ .index = HRTIMER_BASE_MONOTONIC_SOFT,
+ .clockid = CLOCK_MONOTONIC,
+ .get_time = &ktime_get,
+ },
+ {
+ .index = HRTIMER_BASE_REALTIME_SOFT,
+ .clockid = CLOCK_REALTIME,
+ .get_time = &ktime_get_real,
+ },
+ {
+ .index = HRTIMER_BASE_BOOTTIME_SOFT,
+ .clockid = CLOCK_BOOTTIME,
+ .get_time = &ktime_get_boottime,
+ },
+ {
+ .index = HRTIMER_BASE_TAI_SOFT,
+ .clockid = CLOCK_TAI,
+ .get_time = &ktime_get_clocktai,
+ },
}
};
@@ -118,7 +146,6 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
* timer->base->cpu_base
*/
static struct hrtimer_cpu_base migration_cpu_base = {
- .seq = SEQCNT_ZERO(migration_cpu_base),
.clock_base = { { .cpu_base = &migration_cpu_base, }, },
};
@@ -156,45 +183,33 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
}
/*
- * With HIGHRES=y we do not migrate the timer when it is expiring
- * before the next event on the target cpu because we cannot reprogram
- * the target cpu hardware and we would cause it to fire late.
+ * We do not migrate the timer when it is expiring before the next
+ * event on the target cpu. When high resolution is enabled, we cannot
+ * reprogram the target cpu hardware and we would cause it to fire
+ * late. To keep it simple, we handle the high resolution enabled and
+ * disabled case similar.
*
* Called with cpu_base->lock of target cpu held.
*/
static int
hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
{
-#ifdef CONFIG_HIGH_RES_TIMERS
ktime_t expires;
- if (!new_base->cpu_base->hres_active)
- return 0;
-
expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
- return expires <= new_base->cpu_base->expires_next;
-#else
- return 0;
-#endif
+ return expires < new_base->cpu_base->expires_next;
}
-#ifdef CONFIG_NO_HZ_COMMON
-static inline
-struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
- int pinned)
-{
- if (pinned || !base->migration_enabled)
- return base;
- return &per_cpu(hrtimer_bases, get_nohz_timer_target());
-}
-#else
static inline
struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
int pinned)
{
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+ if (static_branch_likely(&timers_migration_enabled) && !pinned)
+ return &per_cpu(hrtimer_bases, get_nohz_timer_target());
+#endif
return base;
}
-#endif
/*
* We switch the timer base to a power-optimized selected CPU target,
@@ -396,7 +411,8 @@ static inline void debug_hrtimer_init(struct hrtimer *timer)
debug_object_init(timer, &hrtimer_debug_descr);
}
-static inline void debug_hrtimer_activate(struct hrtimer *timer)
+static inline void debug_hrtimer_activate(struct hrtimer *timer,
+ enum hrtimer_mode mode)
{
debug_object_activate(timer, &hrtimer_debug_descr);
}
@@ -429,8 +445,10 @@ void destroy_hrtimer_on_stack(struct hrtimer *timer)
EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack);
#else
+
static inline void debug_hrtimer_init(struct hrtimer *timer) { }
-static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
+static inline void debug_hrtimer_activate(struct hrtimer *timer,
+ enum hrtimer_mode mode) { }
static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
#endif
@@ -442,10 +460,11 @@ debug_init(struct hrtimer *timer, clockid_t clockid,
trace_hrtimer_init(timer, clockid, mode);
}
-static inline void debug_activate(struct hrtimer *timer)
+static inline void debug_activate(struct hrtimer *timer,
+ enum hrtimer_mode mode)
{
- debug_hrtimer_activate(timer);
- trace_hrtimer_start(timer);
+ debug_hrtimer_activate(timer, mode);
+ trace_hrtimer_start(timer, mode);
}
static inline void debug_deactivate(struct hrtimer *timer)
@@ -454,35 +473,43 @@ static inline void debug_deactivate(struct hrtimer *timer)
trace_hrtimer_cancel(timer);
}
-#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
-static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base,
- struct hrtimer *timer)
+static struct hrtimer_clock_base *
+__next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active)
{
-#ifdef CONFIG_HIGH_RES_TIMERS
- cpu_base->next_timer = timer;
-#endif
+ unsigned int idx;
+
+ if (!*active)
+ return NULL;
+
+ idx = __ffs(*active);
+ *active &= ~(1U << idx);
+
+ return &cpu_base->clock_base[idx];
}
-static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
+#define for_each_active_base(base, cpu_base, active) \
+ while ((base = __next_base((cpu_base), &(active))))
+
+static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
+ unsigned int active,
+ ktime_t expires_next)
{
- struct hrtimer_clock_base *base = cpu_base->clock_base;
- unsigned int active = cpu_base->active_bases;
- ktime_t expires, expires_next = KTIME_MAX;
+ struct hrtimer_clock_base *base;
+ ktime_t expires;
- hrtimer_update_next_timer(cpu_base, NULL);
- for (; active; base++, active >>= 1) {
+ for_each_active_base(base, cpu_base, active) {
struct timerqueue_node *next;
struct hrtimer *timer;
- if (!(active & 0x01))
- continue;
-
next = timerqueue_getnext(&base->active);
timer = container_of(next, struct hrtimer, node);
expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
if (expires < expires_next) {
expires_next = expires;
- hrtimer_update_next_timer(cpu_base, timer);
+ if (timer->is_soft)
+ cpu_base->softirq_next_timer = timer;
+ else
+ cpu_base->next_timer = timer;
}
}
/*
@@ -494,7 +521,47 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
expires_next = 0;
return expires_next;
}
-#endif
+
+/*
+ * Recomputes cpu_base::*next_timer and returns the earliest expires_next but
+ * does not set cpu_base::*expires_next, that is done by hrtimer_reprogram.
+ *
+ * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases,
+ * those timers will get run whenever the softirq gets handled, at the end of
+ * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases.
+ *
+ * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases.
+ * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual
+ * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD.
+ *
+ * @active_mask must be one of:
+ * - HRTIMER_ACTIVE_ALL,
+ * - HRTIMER_ACTIVE_SOFT, or
+ * - HRTIMER_ACTIVE_HARD.
+ */
+static ktime_t
+__hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask)
+{
+ unsigned int active;
+ struct hrtimer *next_timer = NULL;
+ ktime_t expires_next = KTIME_MAX;
+
+ if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) {
+ active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
+ cpu_base->softirq_next_timer = NULL;
+ expires_next = __hrtimer_next_event_base(cpu_base, active, KTIME_MAX);
+
+ next_timer = cpu_base->softirq_next_timer;
+ }
+
+ if (active_mask & HRTIMER_ACTIVE_HARD) {
+ active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
+ cpu_base->next_timer = next_timer;
+ expires_next = __hrtimer_next_event_base(cpu_base, active, expires_next);
+ }
+
+ return expires_next;
+}
static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
{
@@ -502,36 +569,14 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
- return ktime_get_update_offsets_now(&base->clock_was_set_seq,
+ ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq,
offs_real, offs_boot, offs_tai);
-}
-/* High resolution timer related functions */
-#ifdef CONFIG_HIGH_RES_TIMERS
-
-/*
- * High resolution timer enabled ?
- */
-static bool hrtimer_hres_enabled __read_mostly = true;
-unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
-EXPORT_SYMBOL_GPL(hrtimer_resolution);
-
-/*
- * Enable / Disable high resolution mode
- */
-static int __init setup_hrtimer_hres(char *str)
-{
- return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
-}
-
-__setup("highres=", setup_hrtimer_hres);
+ base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real;
+ base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot;
+ base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai;
-/*
- * hrtimer_high_res_enabled - query, if the highres mode is enabled
- */
-static inline int hrtimer_is_hres_enabled(void)
-{
- return hrtimer_hres_enabled;
+ return now;
}
/*
@@ -539,7 +584,8 @@ static inline int hrtimer_is_hres_enabled(void)
*/
static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
{
- return cpu_base->hres_active;
+ return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ?
+ cpu_base->hres_active : 0;
}
static inline int hrtimer_hres_active(void)
@@ -557,10 +603,23 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
{
ktime_t expires_next;
- if (!cpu_base->hres_active)
- return;
+ /*
+ * Find the current next expiration time.
+ */
+ expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
- expires_next = __hrtimer_get_next_event(cpu_base);
+ if (cpu_base->next_timer && cpu_base->next_timer->is_soft) {
+ /*
+ * When the softirq is activated, hrtimer has to be
+ * programmed with the first hard hrtimer because soft
+ * timer interrupt could occur too late.
+ */
+ if (cpu_base->softirq_activated)
+ expires_next = __hrtimer_get_next_event(cpu_base,
+ HRTIMER_ACTIVE_HARD);
+ else
+ cpu_base->softirq_expires_next = expires_next;
+ }
if (skip_equal && expires_next == cpu_base->expires_next)
return;
@@ -568,6 +627,9 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
cpu_base->expires_next = expires_next;
/*
+ * If hres is not active, hardware does not have to be
+ * reprogrammed yet.
+ *
* If a hang was detected in the last timer interrupt then we
* leave the hang delay active in the hardware. We want the
* system to make progress. That also prevents the following
@@ -581,81 +643,38 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
* set. So we'd effectivly block all timers until the T2 event
* fires.
*/
- if (cpu_base->hang_detected)
+ if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
return;
tick_program_event(cpu_base->expires_next, 1);
}
+/* High resolution timer related functions */
+#ifdef CONFIG_HIGH_RES_TIMERS
+
/*
- * When a timer is enqueued and expires earlier than the already enqueued
- * timers, we have to check, whether it expires earlier than the timer for
- * which the clock event device was armed.
- *
- * Called with interrupts disabled and base->cpu_base.lock held
+ * High resolution timer enabled ?
*/
-static void hrtimer_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base)
-{
- struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
- ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
-
- WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
-
- /*
- * If the timer is not on the current cpu, we cannot reprogram
- * the other cpus clock event device.
- */
- if (base->cpu_base != cpu_base)
- return;
-
- /*
- * If the hrtimer interrupt is running, then it will
- * reevaluate the clock bases and reprogram the clock event
- * device. The callbacks are always executed in hard interrupt
- * context so we don't need an extra check for a running
- * callback.
- */
- if (cpu_base->in_hrtirq)
- return;
-
- /*
- * CLOCK_REALTIME timer might be requested with an absolute
- * expiry time which is less than base->offset. Set it to 0.
- */
- if (expires < 0)
- expires = 0;
-
- if (expires >= cpu_base->expires_next)
- return;
-
- /* Update the pointer to the next expiring timer */
- cpu_base->next_timer = timer;
-
- /*
- * If a hang was detected in the last timer interrupt then we
- * do not schedule a timer which is earlier than the expiry
- * which we enforced in the hang detection. We want the system
- * to make progress.
- */
- if (cpu_base->hang_detected)
- return;
+static bool hrtimer_hres_enabled __read_mostly = true;
+unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
+EXPORT_SYMBOL_GPL(hrtimer_resolution);
- /*
- * Program the timer hardware. We enforce the expiry for
- * events which are already in the past.
- */
- cpu_base->expires_next = expires;
- tick_program_event(expires, 1);
+/*
+ * Enable / Disable high resolution mode
+ */
+static int __init setup_hrtimer_hres(char *str)
+{
+ return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
}
+__setup("highres=", setup_hrtimer_hres);
+
/*
- * Initialize the high resolution related parts of cpu_base
+ * hrtimer_high_res_enabled - query, if the highres mode is enabled
*/
-static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
+static inline int hrtimer_is_hres_enabled(void)
{
- base->expires_next = KTIME_MAX;
- base->hres_active = 0;
+ return hrtimer_hres_enabled;
}
/*
@@ -667,7 +686,7 @@ static void retrigger_next_event(void *arg)
{
struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
- if (!base->hres_active)
+ if (!__hrtimer_hres_active(base))
return;
raw_spin_lock(&base->lock);
@@ -714,23 +733,102 @@ void clock_was_set_delayed(void)
#else
-static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; }
-static inline int hrtimer_hres_active(void) { return 0; }
static inline int hrtimer_is_hres_enabled(void) { return 0; }
static inline void hrtimer_switch_to_hres(void) { }
-static inline void
-hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
-static inline int hrtimer_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base)
-{
- return 0;
-}
-static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
static inline void retrigger_next_event(void *arg) { }
#endif /* CONFIG_HIGH_RES_TIMERS */
/*
+ * When a timer is enqueued and expires earlier than the already enqueued
+ * timers, we have to check, whether it expires earlier than the timer for
+ * which the clock event device was armed.
+ *
+ * Called with interrupts disabled and base->cpu_base.lock held
+ */
+static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
+{
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+ struct hrtimer_clock_base *base = timer->base;
+ ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
+
+ WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
+
+ /*
+ * CLOCK_REALTIME timer might be requested with an absolute
+ * expiry time which is less than base->offset. Set it to 0.
+ */
+ if (expires < 0)
+ expires = 0;
+
+ if (timer->is_soft) {
+ /*
+ * soft hrtimer could be started on a remote CPU. In this
+ * case softirq_expires_next needs to be updated on the
+ * remote CPU. The soft hrtimer will not expire before the
+ * first hard hrtimer on the remote CPU -
+ * hrtimer_check_target() prevents this case.
+ */
+ struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base;
+
+ if (timer_cpu_base->softirq_activated)
+ return;
+
+ if (!ktime_before(expires, timer_cpu_base->softirq_expires_next))
+ return;
+
+ timer_cpu_base->softirq_next_timer = timer;
+ timer_cpu_base->softirq_expires_next = expires;
+
+ if (!ktime_before(expires, timer_cpu_base->expires_next) ||
+ !reprogram)
+ return;
+ }
+
+ /*
+ * If the timer is not on the current cpu, we cannot reprogram
+ * the other cpus clock event device.
+ */
+ if (base->cpu_base != cpu_base)
+ return;
+
+ /*
+ * If the hrtimer interrupt is running, then it will
+ * reevaluate the clock bases and reprogram the clock event
+ * device. The callbacks are always executed in hard interrupt
+ * context so we don't need an extra check for a running
+ * callback.
+ */
+ if (cpu_base->in_hrtirq)
+ return;
+
+ if (expires >= cpu_base->expires_next)
+ return;
+
+ /* Update the pointer to the next expiring timer */
+ cpu_base->next_timer = timer;
+ cpu_base->expires_next = expires;
+
+ /*
+ * If hres is not active, hardware does not have to be
+ * programmed yet.
+ *
+ * If a hang was detected in the last timer interrupt then we
+ * do not schedule a timer which is earlier than the expiry
+ * which we enforced in the hang detection. We want the system
+ * to make progress.
+ */
+ if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
+ return;
+
+ /*
+ * Program the timer hardware. We enforce the expiry for
+ * events which are already in the past.
+ */
+ tick_program_event(expires, 1);
+}
+
+/*
* Clock realtime was set
*
* Change the offset of the realtime clock vs. the monotonic
@@ -835,9 +933,10 @@ EXPORT_SYMBOL_GPL(hrtimer_forward);
* Returns 1 when the new timer is the leftmost timer in the tree.
*/
static int enqueue_hrtimer(struct hrtimer *timer,
- struct hrtimer_clock_base *base)
+ struct hrtimer_clock_base *base,
+ enum hrtimer_mode mode)
{
- debug_activate(timer);
+ debug_activate(timer, mode);
base->cpu_base->active_bases |= 1 << base->index;
@@ -870,7 +969,6 @@ static void __remove_hrtimer(struct hrtimer *timer,
if (!timerqueue_del(&base->active, &timer->node))
cpu_base->active_bases &= ~(1 << base->index);
-#ifdef CONFIG_HIGH_RES_TIMERS
/*
* Note: If reprogram is false we do not update
* cpu_base->next_timer. This happens when we remove the first
@@ -881,7 +979,6 @@ static void __remove_hrtimer(struct hrtimer *timer,
*/
if (reprogram && timer == cpu_base->next_timer)
hrtimer_force_reprogram(cpu_base, 1);
-#endif
}
/*
@@ -930,22 +1027,36 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
return tim;
}
-/**
- * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
- * @timer: the timer to be added
- * @tim: expiry time
- * @delta_ns: "slack" range for the timer
- * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
- * relative (HRTIMER_MODE_REL)
- */
-void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
- u64 delta_ns, const enum hrtimer_mode mode)
+static void
+hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram)
{
- struct hrtimer_clock_base *base, *new_base;
- unsigned long flags;
- int leftmost;
+ ktime_t expires;
- base = lock_hrtimer_base(timer, &flags);
+ /*
+ * Find the next SOFT expiration.
+ */
+ expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
+
+ /*
+ * reprogramming needs to be triggered, even if the next soft
+ * hrtimer expires at the same time than the next hard
+ * hrtimer. cpu_base->softirq_expires_next needs to be updated!
+ */
+ if (expires == KTIME_MAX)
+ return;
+
+ /*
+ * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event()
+ * cpu_base->*expires_next is only set by hrtimer_reprogram()
+ */
+ hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram);
+}
+
+static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+ u64 delta_ns, const enum hrtimer_mode mode,
+ struct hrtimer_clock_base *base)
+{
+ struct hrtimer_clock_base *new_base;
/* Remove an active timer from the queue: */
remove_hrtimer(timer, base, true);
@@ -960,21 +1071,35 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
/* Switch the timer base, if necessary: */
new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
- leftmost = enqueue_hrtimer(timer, new_base);
- if (!leftmost)
- goto unlock;
+ return enqueue_hrtimer(timer, new_base, mode);
+}
+
+/**
+ * hrtimer_start_range_ns - (re)start an hrtimer
+ * @timer: the timer to be added
+ * @tim: expiry time
+ * @delta_ns: "slack" range for the timer
+ * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or
+ * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
+ * softirq based mode is considered for debug purpose only!
+ */
+void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+ u64 delta_ns, const enum hrtimer_mode mode)
+{
+ struct hrtimer_clock_base *base;
+ unsigned long flags;
+
+ /*
+ * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft
+ * match.
+ */
+ WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);
+
+ base = lock_hrtimer_base(timer, &flags);
+
+ if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base))
+ hrtimer_reprogram(timer, true);
- if (!hrtimer_is_hres_active(timer)) {
- /*
- * Kick to reschedule the next tick to handle the new timer
- * on dynticks target.
- */
- if (new_base->cpu_base->nohz_active)
- wake_up_nohz_cpu(new_base->cpu_base->cpu);
- } else {
- hrtimer_reprogram(timer, new_base);
- }
-unlock:
unlock_hrtimer_base(timer, &flags);
}
EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
@@ -1072,7 +1197,7 @@ u64 hrtimer_get_next_event(void)
raw_spin_lock_irqsave(&cpu_base->lock, flags);
if (!__hrtimer_hres_active(cpu_base))
- expires = __hrtimer_get_next_event(cpu_base);
+ expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
@@ -1095,17 +1220,24 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
enum hrtimer_mode mode)
{
+ bool softtimer = !!(mode & HRTIMER_MODE_SOFT);
+ int base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0;
struct hrtimer_cpu_base *cpu_base;
- int base;
memset(timer, 0, sizeof(struct hrtimer));
cpu_base = raw_cpu_ptr(&hrtimer_bases);
- if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
+ /*
+ * POSIX magic: Relative CLOCK_REALTIME timers are not affected by
+ * clock modifications, so they needs to become CLOCK_MONOTONIC to
+ * ensure POSIX compliance.
+ */
+ if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL)
clock_id = CLOCK_MONOTONIC;
- base = hrtimer_clockid_to_base(clock_id);
+ base += hrtimer_clockid_to_base(clock_id);
+ timer->is_soft = softtimer;
timer->base = &cpu_base->clock_base[base];
timerqueue_init(&timer->node);
}
@@ -1114,7 +1246,13 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
* hrtimer_init - initialize a timer to the given clock
* @timer: the timer to be initialized
* @clock_id: the clock to be used
- * @mode: timer mode abs/rel
+ * @mode: The modes which are relevant for intitialization:
+ * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT,
+ * HRTIMER_MODE_REL_SOFT
+ *
+ * The PINNED variants of the above can be handed in,
+ * but the PINNED bit is ignored as pinning happens
+ * when the hrtimer is started
*/
void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
enum hrtimer_mode mode)
@@ -1133,19 +1271,19 @@ EXPORT_SYMBOL_GPL(hrtimer_init);
*/
bool hrtimer_active(const struct hrtimer *timer)
{
- struct hrtimer_cpu_base *cpu_base;
+ struct hrtimer_clock_base *base;
unsigned int seq;
do {
- cpu_base = READ_ONCE(timer->base->cpu_base);
- seq = raw_read_seqcount_begin(&cpu_base->seq);
+ base = READ_ONCE(timer->base);
+ seq = raw_read_seqcount_begin(&base->seq);
if (timer->state != HRTIMER_STATE_INACTIVE ||
- cpu_base->running == timer)
+ base->running == timer)
return true;
- } while (read_seqcount_retry(&cpu_base->seq, seq) ||
- cpu_base != READ_ONCE(timer->base->cpu_base));
+ } while (read_seqcount_retry(&base->seq, seq) ||
+ base != READ_ONCE(timer->base));
return false;
}
@@ -1171,7 +1309,8 @@ EXPORT_SYMBOL_GPL(hrtimer_active);
static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
struct hrtimer_clock_base *base,
- struct hrtimer *timer, ktime_t *now)
+ struct hrtimer *timer, ktime_t *now,
+ unsigned long flags)
{
enum hrtimer_restart (*fn)(struct hrtimer *);
int restart;
@@ -1179,16 +1318,16 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
lockdep_assert_held(&cpu_base->lock);
debug_deactivate(timer);
- cpu_base->running = timer;
+ base->running = timer;
/*
* Separate the ->running assignment from the ->state assignment.
*
* As with a regular write barrier, this ensures the read side in
- * hrtimer_active() cannot observe cpu_base->running == NULL &&
+ * hrtimer_active() cannot observe base->running == NULL &&
* timer->state == INACTIVE.
*/
- raw_write_seqcount_barrier(&cpu_base->seq);
+ raw_write_seqcount_barrier(&base->seq);
__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
fn = timer->function;
@@ -1202,15 +1341,15 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
timer->is_rel = false;
/*
- * Because we run timers from hardirq context, there is no chance
- * they get migrated to another cpu, therefore its safe to unlock
- * the timer base.
+ * The timer is marked as running in the CPU base, so it is
+ * protected against migration to a different CPU even if the lock
+ * is dropped.
*/
- raw_spin_unlock(&cpu_base->lock);
+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
trace_hrtimer_expire_entry(timer, now);
restart = fn(timer);
trace_hrtimer_expire_exit(timer);
- raw_spin_lock(&cpu_base->lock);
+ raw_spin_lock_irq(&cpu_base->lock);
/*
* Note: We clear the running state after enqueue_hrtimer and
@@ -1223,33 +1362,31 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
*/
if (restart != HRTIMER_NORESTART &&
!(timer->state & HRTIMER_STATE_ENQUEUED))
- enqueue_hrtimer(timer, base);
+ enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS);
/*
* Separate the ->running assignment from the ->state assignment.
*
* As with a regular write barrier, this ensures the read side in
- * hrtimer_active() cannot observe cpu_base->running == NULL &&
+ * hrtimer_active() cannot observe base->running.timer == NULL &&
* timer->state == INACTIVE.
*/
- raw_write_seqcount_barrier(&cpu_base->seq);
+ raw_write_seqcount_barrier(&base->seq);
- WARN_ON_ONCE(cpu_base->running != timer);
- cpu_base->running = NULL;
+ WARN_ON_ONCE(base->running != timer);
+ base->running = NULL;
}
-static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
+static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
+ unsigned long flags, unsigned int active_mask)
{
- struct hrtimer_clock_base *base = cpu_base->clock_base;
- unsigned int active = cpu_base->active_bases;
+ struct hrtimer_clock_base *base;
+ unsigned int active = cpu_base->active_bases & active_mask;
- for (; active; base++, active >>= 1) {
+ for_each_active_base(base, cpu_base, active) {
struct timerqueue_node *node;
ktime_t basenow;
- if (!(active & 0x01))
- continue;
-
basenow = ktime_add(now, base->offset);
while ((node = timerqueue_getnext(&base->active))) {
@@ -1272,11 +1409,28 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
if (basenow < hrtimer_get_softexpires_tv64(timer))
break;
- __run_hrtimer(cpu_base, base, timer, &basenow);
+ __run_hrtimer(cpu_base, base, timer, &basenow, flags);
}
}
}
+static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
+{
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+ unsigned long flags;
+ ktime_t now;
+
+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
+
+ now = hrtimer_update_base(cpu_base);
+ __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT);
+
+ cpu_base->softirq_activated = 0;
+ hrtimer_update_softirq_timer(cpu_base, true);
+
+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+}
+
#ifdef CONFIG_HIGH_RES_TIMERS
/*
@@ -1287,13 +1441,14 @@ void hrtimer_interrupt(struct clock_event_device *dev)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
ktime_t expires_next, now, entry_time, delta;
+ unsigned long flags;
int retries = 0;
BUG_ON(!cpu_base->hres_active);
cpu_base->nr_events++;
dev->next_event = KTIME_MAX;
- raw_spin_lock(&cpu_base->lock);
+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
entry_time = now = hrtimer_update_base(cpu_base);
retry:
cpu_base->in_hrtirq = 1;
@@ -1306,17 +1461,23 @@ retry:
*/
cpu_base->expires_next = KTIME_MAX;
- __hrtimer_run_queues(cpu_base, now);
+ if (!ktime_before(now, cpu_base->softirq_expires_next)) {
+ cpu_base->softirq_expires_next = KTIME_MAX;
+ cpu_base->softirq_activated = 1;
+ raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ }
+
+ __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
/* Reevaluate the clock bases for the next expiry */
- expires_next = __hrtimer_get_next_event(cpu_base);
+ expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
/*
* Store the new expiry value so the migration code can verify
* against it.
*/
cpu_base->expires_next = expires_next;
cpu_base->in_hrtirq = 0;
- raw_spin_unlock(&cpu_base->lock);
+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
/* Reprogramming necessary ? */
if (!tick_program_event(expires_next, 0)) {
@@ -1337,7 +1498,7 @@ retry:
* Acquire base lock for updating the offsets and retrieving
* the current time.
*/
- raw_spin_lock(&cpu_base->lock);
+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
now = hrtimer_update_base(cpu_base);
cpu_base->nr_retries++;
if (++retries < 3)
@@ -1350,7 +1511,8 @@ retry:
*/
cpu_base->nr_hangs++;
cpu_base->hang_detected = 1;
- raw_spin_unlock(&cpu_base->lock);
+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+
delta = ktime_sub(now, entry_time);
if ((unsigned int)delta > cpu_base->max_hang_time)
cpu_base->max_hang_time = (unsigned int) delta;
@@ -1392,6 +1554,7 @@ static inline void __hrtimer_peek_ahead_timers(void) { }
void hrtimer_run_queues(void)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+ unsigned long flags;
ktime_t now;
if (__hrtimer_hres_active(cpu_base))
@@ -1409,10 +1572,17 @@ void hrtimer_run_queues(void)
return;
}
- raw_spin_lock(&cpu_base->lock);
+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
now = hrtimer_update_base(cpu_base);
- __hrtimer_run_queues(cpu_base, now);
- raw_spin_unlock(&cpu_base->lock);
+
+ if (!ktime_before(now, cpu_base->softirq_expires_next)) {
+ cpu_base->softirq_expires_next = KTIME_MAX;
+ cpu_base->softirq_activated = 1;
+ raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ }
+
+ __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
}
/*
@@ -1590,7 +1760,13 @@ int hrtimers_prepare_cpu(unsigned int cpu)
}
cpu_base->cpu = cpu;
- hrtimer_init_hres(cpu_base);
+ cpu_base->active_bases = 0;
+ cpu_base->hres_active = 0;
+ cpu_base->hang_detected = 0;
+ cpu_base->next_timer = NULL;
+ cpu_base->softirq_next_timer = NULL;
+ cpu_base->expires_next = KTIME_MAX;
+ cpu_base->softirq_expires_next = KTIME_MAX;
return 0;
}
@@ -1622,7 +1798,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
* sort out already expired timers and reprogram the
* event device.
*/
- enqueue_hrtimer(timer, new_base);
+ enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS);
}
}
@@ -1634,6 +1810,12 @@ int hrtimers_dead_cpu(unsigned int scpu)
BUG_ON(cpu_online(scpu));
tick_cancel_sched_timer(scpu);
+ /*
+ * this BH disable ensures that raise_softirq_irqoff() does
+ * not wakeup ksoftirqd (and acquire the pi-lock) while
+ * holding the cpu_base lock
+ */
+ local_bh_disable();
local_irq_disable();
old_base = &per_cpu(hrtimer_bases, scpu);
new_base = this_cpu_ptr(&hrtimer_bases);
@@ -1649,12 +1831,19 @@ int hrtimers_dead_cpu(unsigned int scpu)
&new_base->clock_base[i]);
}
+ /*
+ * The migration might have changed the first expiring softirq
+ * timer on this CPU. Update it.
+ */
+ hrtimer_update_softirq_timer(new_base, false);
+
raw_spin_unlock(&old_base->lock);
raw_spin_unlock(&new_base->lock);
/* Check, if we got expired work to do */
__hrtimer_peek_ahead_timers();
local_irq_enable();
+ local_bh_enable();
return 0;
}
@@ -1663,18 +1852,19 @@ int hrtimers_dead_cpu(unsigned int scpu)
void __init hrtimers_init(void)
{
hrtimers_prepare_cpu(smp_processor_id());
+ open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq);
}
/**
* schedule_hrtimeout_range_clock - sleep until timeout
* @expires: timeout value (ktime_t)
* @delta: slack in expires timeout (ktime_t)
- * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
- * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME
+ * @mode: timer mode
+ * @clock_id: timer clock to be used
*/
int __sched
schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
- const enum hrtimer_mode mode, int clock)
+ const enum hrtimer_mode mode, clockid_t clock_id)
{
struct hrtimer_sleeper t;
@@ -1695,7 +1885,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
return -EINTR;
}
- hrtimer_init_on_stack(&t.timer, clock, mode);
+ hrtimer_init_on_stack(&t.timer, clock_id, mode);
hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
hrtimer_init_sleeper(&t, current);
@@ -1717,7 +1907,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
* schedule_hrtimeout_range - sleep until timeout
* @expires: timeout value (ktime_t)
* @delta: slack in expires timeout (ktime_t)
- * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
+ * @mode: timer mode
*
* Make the current task sleep until the given expiry time has
* elapsed. The routine will return immediately unless
@@ -1756,7 +1946,7 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
/**
* schedule_hrtimeout - sleep until timeout
* @expires: timeout value (ktime_t)
- * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
+ * @mode: timer mode
*
* Make the current task sleep until the given expiry time has
* elapsed. The routine will return immediately unless
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 17cdc554c9fe..cc91d90abd84 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -216,7 +216,7 @@ struct posix_clock_desc {
static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd)
{
- struct file *fp = fget(CLOCKID_TO_FD(id));
+ struct file *fp = fget(clockid_to_fd(id));
int err = -EINVAL;
if (!fp)
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 1f27887aa194..ec9f5da6f163 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1189,9 +1189,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
u64 now;
WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED);
- cpu_timer_sample_group(clock_idx, tsk, &now);
- if (oldval) {
+ if (oldval && cpu_timer_sample_group(clock_idx, tsk, &now) != -EINVAL) {
/*
* We are setting itimer. The *oldval is absolute and we update
* it to be relative, *newval argument is relative and we update
@@ -1363,8 +1362,8 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t);
}
-#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
-#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
+#define PROCESS_CLOCK make_process_cpuclock(0, CPUCLOCK_SCHED)
+#define THREAD_CLOCK make_thread_cpuclock(0, CPUCLOCK_SCHED)
static int process_cpu_clock_getres(const clockid_t which_clock,
struct timespec64 *tp)
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index f8e1845aa464..e277284c2831 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -150,16 +150,15 @@ static inline void tick_nohz_init(void) { }
#ifdef CONFIG_NO_HZ_COMMON
extern unsigned long tick_nohz_active;
-#else
+extern void timers_update_nohz(void);
+# ifdef CONFIG_SMP
+extern struct static_key_false timers_migration_enabled;
+# endif
+#else /* CONFIG_NO_HZ_COMMON */
+static inline void timers_update_nohz(void) { }
#define tick_nohz_active (0)
#endif
-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
-extern void timers_update_migration(bool update_nohz);
-#else
-static inline void timers_update_migration(bool update_nohz) { }
-#endif
-
DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f7cc7abfcf25..29a5733eff83 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1107,7 +1107,7 @@ static inline void tick_nohz_activate(struct tick_sched *ts, int mode)
ts->nohz_mode = mode;
/* One update is enough */
if (!test_and_set_bit(0, &tick_nohz_active))
- timers_update_migration(true);
+ timers_update_nohz();
}
/**
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 89a9e1b4264a..48150ab42de9 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -200,8 +200,6 @@ struct timer_base {
unsigned long clk;
unsigned long next_expiry;
unsigned int cpu;
- bool migration_enabled;
- bool nohz_active;
bool is_idle;
bool must_forward_clk;
DECLARE_BITMAP(pending_map, WHEEL_SIZE);
@@ -210,45 +208,64 @@ struct timer_base {
static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);
-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+#ifdef CONFIG_NO_HZ_COMMON
+
+static DEFINE_STATIC_KEY_FALSE(timers_nohz_active);
+static DEFINE_MUTEX(timer_keys_mutex);
+
+static void timer_update_keys(struct work_struct *work);
+static DECLARE_WORK(timer_update_work, timer_update_keys);
+
+#ifdef CONFIG_SMP
unsigned int sysctl_timer_migration = 1;
-void timers_update_migration(bool update_nohz)
+DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);
+
+static void timers_update_migration(void)
{
- bool on = sysctl_timer_migration && tick_nohz_active;
- unsigned int cpu;
+ if (sysctl_timer_migration && tick_nohz_active)
+ static_branch_enable(&timers_migration_enabled);
+ else
+ static_branch_disable(&timers_migration_enabled);
+}
+#else
+static inline void timers_update_migration(void) { }
+#endif /* !CONFIG_SMP */
- /* Avoid the loop, if nothing to update */
- if (this_cpu_read(timer_bases[BASE_STD].migration_enabled) == on)
- return;
+static void timer_update_keys(struct work_struct *work)
+{
+ mutex_lock(&timer_keys_mutex);
+ timers_update_migration();
+ static_branch_enable(&timers_nohz_active);
+ mutex_unlock(&timer_keys_mutex);
+}
- for_each_possible_cpu(cpu) {
- per_cpu(timer_bases[BASE_STD].migration_enabled, cpu) = on;
- per_cpu(timer_bases[BASE_DEF].migration_enabled, cpu) = on;
- per_cpu(hrtimer_bases.migration_enabled, cpu) = on;
- if (!update_nohz)
- continue;
- per_cpu(timer_bases[BASE_STD].nohz_active, cpu) = true;
- per_cpu(timer_bases[BASE_DEF].nohz_active, cpu) = true;
- per_cpu(hrtimer_bases.nohz_active, cpu) = true;
- }
+void timers_update_nohz(void)
+{
+ schedule_work(&timer_update_work);
}
int timer_migration_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
- static DEFINE_MUTEX(mutex);
int ret;
- mutex_lock(&mutex);
+ mutex_lock(&timer_keys_mutex);
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!ret && write)
- timers_update_migration(false);
- mutex_unlock(&mutex);
+ timers_update_migration();
+ mutex_unlock(&timer_keys_mutex);
return ret;
}
-#endif
+
+static inline bool is_timers_nohz_active(void)
+{
+ return static_branch_unlikely(&timers_nohz_active);
+}
+#else
+static inline bool is_timers_nohz_active(void) { return false; }
+#endif /* NO_HZ_COMMON */
static unsigned long round_jiffies_common(unsigned long j, int cpu,
bool force_up)
@@ -534,7 +551,7 @@ __internal_add_timer(struct timer_base *base, struct timer_list *timer)
static void
trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
{
- if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
+ if (!is_timers_nohz_active())
return;
/*
@@ -849,21 +866,20 @@ static inline struct timer_base *get_timer_base(u32 tflags)
return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
}
-#ifdef CONFIG_NO_HZ_COMMON
static inline struct timer_base *
get_target_base(struct timer_base *base, unsigned tflags)
{
-#ifdef CONFIG_SMP
- if ((tflags & TIMER_PINNED) || !base->migration_enabled)
- return get_timer_this_cpu_base(tflags);
- return get_timer_cpu_base(tflags, get_nohz_timer_target());
-#else
- return get_timer_this_cpu_base(tflags);
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+ if (static_branch_likely(&timers_migration_enabled) &&
+ !(tflags & TIMER_PINNED))
+ return get_timer_cpu_base(tflags, get_nohz_timer_target());
#endif
+ return get_timer_this_cpu_base(tflags);
}
static inline void forward_timer_base(struct timer_base *base)
{
+#ifdef CONFIG_NO_HZ_COMMON
unsigned long jnow;
/*
@@ -887,16 +903,8 @@ static inline void forward_timer_base(struct timer_base *base)
base->clk = jnow;
else
base->clk = base->next_expiry;
-}
-#else
-static inline struct timer_base *
-get_target_base(struct timer_base *base, unsigned tflags)
-{
- return get_timer_this_cpu_base(tflags);
-}
-
-static inline void forward_timer_base(struct timer_base *base) { }
#endif
+}
/*
@@ -1696,7 +1704,7 @@ void run_local_timers(void)
hrtimer_run_queues();
/* Raise the softirq only if required. */
if (time_before(jiffies, base->clk)) {
- if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
+ if (!IS_ENABLED(CONFIG_NO_HZ_COMMON))
return;
/* CPU is awake, so check the deferrable base. */
base++;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 904c952ac383..f54dc62b599c 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -355,7 +355,7 @@ config PROFILE_ANNOTATED_BRANCHES
on if you need to profile the system's use of these macros.
config PROFILE_ALL_BRANCHES
- bool "Profile all if conditionals"
+ bool "Profile all if conditionals" if !FORTIFY_SOURCE
select TRACE_BRANCH_PROFILING
help
This tracer profiles all branch conditions. Every if ()
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ccdf3664e4a9..554b517c61a0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1119,15 +1119,11 @@ static struct ftrace_ops global_ops = {
};
/*
- * This is used by __kernel_text_address() to return true if the
- * address is on a dynamically allocated trampoline that would
- * not return true for either core_kernel_text() or
- * is_module_text_address().
+ * Used by the stack undwinder to know about dynamic ftrace trampolines.
*/
-bool is_ftrace_trampoline(unsigned long addr)
+struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr)
{
- struct ftrace_ops *op;
- bool ret = false;
+ struct ftrace_ops *op = NULL;
/*
* Some of the ops may be dynamically allocated,
@@ -1144,15 +1140,24 @@ bool is_ftrace_trampoline(unsigned long addr)
if (op->trampoline && op->trampoline_size)
if (addr >= op->trampoline &&
addr < op->trampoline + op->trampoline_size) {
- ret = true;
- goto out;
+ preempt_enable_notrace();
+ return op;
}
} while_for_each_ftrace_op(op);
-
- out:
preempt_enable_notrace();
- return ret;
+ return NULL;
+}
+
+/*
+ * This is used by __kernel_text_address() to return true if the
+ * address is on a dynamically allocated trampoline that would
+ * not return true for either core_kernel_text() or
+ * is_module_text_address().
+ */
+bool is_ftrace_trampoline(unsigned long addr)
+{
+ return ftrace_ops_trampoline(addr) != NULL;
}
struct ftrace_page {
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9ab18995ff1e..5af2842dea96 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2534,29 +2534,58 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
* The lock and unlock are done within a preempt disable section.
* The current_context per_cpu variable can only be modified
* by the current task between lock and unlock. But it can
- * be modified more than once via an interrupt. There are four
- * different contexts that we need to consider.
+ * be modified more than once via an interrupt. To pass this
+ * information from the lock to the unlock without having to
+ * access the 'in_interrupt()' functions again (which do show
+ * a bit of overhead in something as critical as function tracing,
+ * we use a bitmask trick.
*
- * Normal context.
- * SoftIRQ context
- * IRQ context
- * NMI context
+ * bit 0 = NMI context
+ * bit 1 = IRQ context
+ * bit 2 = SoftIRQ context
+ * bit 3 = normal context.
*
- * If for some reason the ring buffer starts to recurse, we
- * only allow that to happen at most 4 times (one for each
- * context). If it happens 5 times, then we consider this a
- * recusive loop and do not let it go further.
+ * This works because this is the order of contexts that can
+ * preempt other contexts. A SoftIRQ never preempts an IRQ
+ * context.
+ *
+ * When the context is determined, the corresponding bit is
+ * checked and set (if it was set, then a recursion of that context
+ * happened).
+ *
+ * On unlock, we need to clear this bit. To do so, just subtract
+ * 1 from the current_context and AND it to itself.
+ *
+ * (binary)
+ * 101 - 1 = 100
+ * 101 & 100 = 100 (clearing bit zero)
+ *
+ * 1010 - 1 = 1001
+ * 1010 & 1001 = 1000 (clearing bit 1)
+ *
+ * The least significant bit can be cleared this way, and it
+ * just so happens that it is the same bit corresponding to
+ * the current context.
*/
static __always_inline int
trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
{
- if (cpu_buffer->current_context >= 4)
+ unsigned int val = cpu_buffer->current_context;
+ unsigned long pc = preempt_count();
+ int bit;
+
+ if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
+ bit = RB_CTX_NORMAL;
+ else
+ bit = pc & NMI_MASK ? RB_CTX_NMI :
+ pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ;
+
+ if (unlikely(val & (1 << bit)))
return 1;
- cpu_buffer->current_context++;
- /* Interrupts must see this update */
- barrier();
+ val |= (1 << bit);
+ cpu_buffer->current_context = val;
return 0;
}
@@ -2564,9 +2593,7 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
static __always_inline void
trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
{
- /* Don't let the dec leak out */
- barrier();
- cpu_buffer->current_context--;
+ cpu_buffer->current_context &= cpu_buffer->current_context - 1;
}
/**
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index aaf882a8bf1f..4f3a8e24b426 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2374,6 +2374,15 @@ void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
}
EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
+/*
+ * Skip 3:
+ *
+ * trace_buffer_unlock_commit_regs()
+ * trace_event_buffer_commit()
+ * trace_event_raw_event_xxx()
+*/
+# define STACK_SKIP 3
+
void trace_buffer_unlock_commit_regs(struct trace_array *tr,
struct ring_buffer *buffer,
struct ring_buffer_event *event,
@@ -2383,16 +2392,12 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
__buffer_unlock_commit(buffer, event);
/*
- * If regs is not set, then skip the following callers:
- * trace_buffer_unlock_commit_regs
- * event_trigger_unlock_commit
- * trace_event_buffer_commit
- * trace_event_raw_event_sched_switch
+ * If regs is not set, then skip the necessary functions.
* Note, we can still get here via blktrace, wakeup tracer
* and mmiotrace, but that's ok if they lose a function or
- * two. They are that meaningful.
+ * two. They are not that meaningful.
*/
- ftrace_trace_stack(tr, buffer, flags, regs ? 0 : 4, pc, regs);
+ ftrace_trace_stack(tr, buffer, flags, regs ? 0 : STACK_SKIP, pc, regs);
ftrace_trace_userstack(buffer, flags, pc);
}
@@ -2579,11 +2584,13 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
trace.skip = skip;
/*
- * Add two, for this function and the call to save_stack_trace()
+ * Add one, for this function and the call to save_stack_trace()
* If regs is set, then these functions will not be in the way.
*/
+#ifndef CONFIG_UNWINDER_ORC
if (!regs)
- trace.skip += 2;
+ trace.skip++;
+#endif
/*
* Since events can happen in NMIs there's no safe way to
@@ -2700,11 +2707,10 @@ void trace_dump_stack(int skip)
local_save_flags(flags);
- /*
- * Skip 3 more, seems to get us at the caller of
- * this function.
- */
- skip += 3;
+#ifndef CONFIG_UNWINDER_ORC
+ /* Skip 1 to skip this function. */
+ skip++;
+#endif
__ftrace_trace_stack(global_trace.trace_buffer.buffer,
flags, skip, preempt_count(), NULL);
}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index ec0f9aa4e151..1b87157edbff 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2213,6 +2213,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
{
struct trace_event_call *call, *p;
const char *last_system = NULL;
+ bool first = false;
int last_i;
int i;
@@ -2220,15 +2221,28 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
list_for_each_entry_safe(call, p, &ftrace_events, list) {
/* events are usually grouped together with systems */
if (!last_system || call->class->system != last_system) {
+ first = true;
last_i = 0;
last_system = call->class->system;
}
+ /*
+ * Since calls are grouped by systems, the likelyhood that the
+ * next call in the iteration belongs to the same system as the
+ * previous call is high. As an optimization, we skip seaching
+ * for a map[] that matches the call's system if the last call
+ * was from the same system. That's what last_i is for. If the
+ * call has the same system as the previous call, then last_i
+ * will be the index of the first map[] that has a matching
+ * system.
+ */
for (i = last_i; i < len; i++) {
if (call->class->system == map[i]->system) {
/* Save the first system if need be */
- if (!last_i)
+ if (first) {
last_i = i;
+ first = false;
+ }
update_event_printk(call, map[i]);
}
}
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index f2ac9d44f6c4..87411482a46f 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -1123,13 +1123,22 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; }
#endif /* CONFIG_TRACER_SNAPSHOT */
#ifdef CONFIG_STACKTRACE
+#ifdef CONFIG_UNWINDER_ORC
+/* Skip 2:
+ * event_triggers_post_call()
+ * trace_event_raw_event_xxx()
+ */
+# define STACK_SKIP 2
+#else
/*
- * Skip 3:
+ * Skip 4:
* stacktrace_trigger()
* event_triggers_post_call()
+ * trace_event_buffer_commit()
* trace_event_raw_event_xxx()
*/
-#define STACK_SKIP 3
+#define STACK_SKIP 4
+#endif
static void
stacktrace_trigger(struct event_trigger_data *data, void *rec)
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 27f7ad12c4b1..b611cd36e22d 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -154,6 +154,24 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
preempt_enable_notrace();
}
+#ifdef CONFIG_UNWINDER_ORC
+/*
+ * Skip 2:
+ *
+ * function_stack_trace_call()
+ * ftrace_call()
+ */
+#define STACK_SKIP 2
+#else
+/*
+ * Skip 3:
+ * __trace_stack()
+ * function_stack_trace_call()
+ * ftrace_call()
+ */
+#define STACK_SKIP 3
+#endif
+
static void
function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *pt_regs)
@@ -180,15 +198,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
if (likely(disabled == 1)) {
pc = preempt_count();
trace_function(tr, ip, parent_ip, flags, pc);
- /*
- * skip over 5 funcs:
- * __ftrace_trace_stack,
- * __trace_stack,
- * function_stack_trace_call
- * ftrace_list_func
- * ftrace_call
- */
- __trace_stack(tr, flags, 5, pc);
+ __trace_stack(tr, flags, STACK_SKIP, pc);
}
atomic_dec(&data->disabled);
@@ -367,14 +377,27 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip,
tracer_tracing_off(tr);
}
+#ifdef CONFIG_UNWINDER_ORC
/*
- * Skip 4:
+ * Skip 3:
+ *
+ * function_trace_probe_call()
+ * ftrace_ops_assist_func()
+ * ftrace_call()
+ */
+#define FTRACE_STACK_SKIP 3
+#else
+/*
+ * Skip 5:
+ *
+ * __trace_stack()
* ftrace_stacktrace()
* function_trace_probe_call()
- * ftrace_ops_list_func()
+ * ftrace_ops_assist_func()
* ftrace_call()
*/
-#define STACK_SKIP 4
+#define FTRACE_STACK_SKIP 5
+#endif
static __always_inline void trace_stack(struct trace_array *tr)
{
@@ -384,7 +407,7 @@ static __always_inline void trace_stack(struct trace_array *tr)
local_save_flags(flags);
pc = preempt_count();
- __trace_stack(tr, flags, STACK_SKIP, pc);
+ __trace_stack(tr, flags, FTRACE_STACK_SKIP, pc);
}
static void
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index acf485eada1b..8c34981d90ad 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -48,6 +48,7 @@
#include <linux/moduleparam.h>
#include <linux/uaccess.h>
#include <linux/sched/isolation.h>
+#include <linux/nmi.h>
#include "workqueue_internal.h"
@@ -4463,6 +4464,12 @@ void show_workqueue_state(void)
if (pwq->nr_active || !list_empty(&pwq->delayed_works))
show_pwq(pwq);
spin_unlock_irqrestore(&pwq->pool->lock, flags);
+ /*
+ * We could be printing a lot from atomic context, e.g.
+ * sysrq-t -> show_workqueue_state(). Avoid triggering
+ * hard lockup.
+ */
+ touch_nmi_watchdog();
}
}
@@ -4490,6 +4497,12 @@ void show_workqueue_state(void)
pr_cont("\n");
next_pool:
spin_unlock_irqrestore(&pool->lock, flags);
+ /*
+ * We could be printing a lot from atomic context, e.g.
+ * sysrq-t -> show_workqueue_state(). Avoid triggering
+ * hard lockup.
+ */
+ touch_nmi_watchdog();
}
rcu_read_unlock_sched();