diff options
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/bpf/bloom_filter.c | 3 | ||||
| -rw-r--r-- | kernel/bpf/bpf_local_storage.c | 3 | ||||
| -rw-r--r-- | kernel/bpf/bpf_struct_ops.c | 3 | ||||
| -rw-r--r-- | kernel/bpf/btf.c | 39 | ||||
| -rw-r--r-- | kernel/bpf/core.c | 8 | ||||
| -rw-r--r-- | kernel/bpf/cpumap.c | 4 | ||||
| -rw-r--r-- | kernel/bpf/cpumask.c | 38 | ||||
| -rw-r--r-- | kernel/bpf/devmap.c | 3 | ||||
| -rw-r--r-- | kernel/bpf/hashtab.c | 6 | ||||
| -rw-r--r-- | kernel/bpf/helpers.c | 12 | ||||
| -rw-r--r-- | kernel/bpf/lpm_trie.c | 3 | ||||
| -rw-r--r-- | kernel/bpf/memalloc.c | 31 | ||||
| -rw-r--r-- | kernel/bpf/preload/bpf_preload_kern.c | 4 | ||||
| -rw-r--r-- | kernel/bpf/queue_stack_maps.c | 4 | ||||
| -rw-r--r-- | kernel/bpf/reuseport_array.c | 3 | ||||
| -rw-r--r-- | kernel/bpf/stackmap.c | 3 | ||||
| -rw-r--r-- | kernel/bpf/syscall.c | 189 | ||||
| -rw-r--r-- | kernel/bpf/verifier.c | 258 | ||||
| -rw-r--r-- | kernel/cgroup/cgroup.c | 20 | ||||
| -rw-r--r-- | kernel/cgroup/legacy_freezer.c | 8 | ||||
| -rw-r--r-- | kernel/time/tick-common.c | 13 | ||||
| -rw-r--r-- | kernel/time/tick-sched.c | 13 | ||||
| -rw-r--r-- | kernel/trace/trace_events_user.c | 290 | ||||
| -rw-r--r-- | kernel/trace/trace_output.c | 2 |
24 files changed, 660 insertions, 300 deletions
diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c index 540331b610a9..addf3dd57b59 100644 --- a/kernel/bpf/bloom_filter.c +++ b/kernel/bpf/bloom_filter.c @@ -86,9 +86,6 @@ static struct bpf_map *bloom_map_alloc(union bpf_attr *attr) int numa_node = bpf_map_attr_numa_node(attr); struct bpf_bloom_filter *bloom; - if (!bpf_capable()) - return ERR_PTR(-EPERM); - if (attr->key_size != 0 || attr->value_size == 0 || attr->max_entries == 0 || attr->map_flags & ~BLOOM_CREATE_FLAG_MASK || diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 47d9948d768f..b5149cfce7d4 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -723,9 +723,6 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr) !attr->btf_key_type_id || !attr->btf_value_type_id) return -EINVAL; - if (!bpf_capable()) - return -EPERM; - if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE) return -E2BIG; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index d3f0a4825fa6..116a0ce378ec 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -655,9 +655,6 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) const struct btf_type *t, *vt; struct bpf_map *map; - if (!bpf_capable()) - return ERR_PTR(-EPERM); - st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id); if (!st_ops) return ERR_PTR(-ENOTSUPP); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 947f0b83bfad..29fe21099298 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -492,25 +492,26 @@ static bool btf_type_is_fwd(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_FWD; } -static bool btf_type_nosize(const struct btf_type *t) +static bool btf_type_is_datasec(const struct btf_type *t) { - return btf_type_is_void(t) || btf_type_is_fwd(t) || - btf_type_is_func(t) || btf_type_is_func_proto(t); + return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; } -static bool btf_type_nosize_or_null(const struct btf_type *t) +static bool btf_type_is_decl_tag(const struct btf_type *t) { - return !t || btf_type_nosize(t); + return BTF_INFO_KIND(t->info) == BTF_KIND_DECL_TAG; } -static bool btf_type_is_datasec(const struct btf_type *t) +static bool btf_type_nosize(const struct btf_type *t) { - return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; + return btf_type_is_void(t) || btf_type_is_fwd(t) || + btf_type_is_func(t) || btf_type_is_func_proto(t) || + btf_type_is_decl_tag(t); } -static bool btf_type_is_decl_tag(const struct btf_type *t) +static bool btf_type_nosize_or_null(const struct btf_type *t) { - return BTF_INFO_KIND(t->info) == BTF_KIND_DECL_TAG; + return !t || btf_type_nosize(t); } static bool btf_type_is_decl_tag_target(const struct btf_type *t) @@ -751,13 +752,12 @@ static bool btf_name_offset_valid(const struct btf *btf, u32 offset) return offset < btf->hdr.str_len; } -static bool __btf_name_char_ok(char c, bool first, bool dot_ok) +static bool __btf_name_char_ok(char c, bool first) { if ((first ? !isalpha(c) : !isalnum(c)) && c != '_' && - ((c == '.' && !dot_ok) || - c != '.')) + c != '.') return false; return true; } @@ -774,20 +774,20 @@ static const char *btf_str_by_offset(const struct btf *btf, u32 offset) return NULL; } -static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok) +static bool __btf_name_valid(const struct btf *btf, u32 offset) { /* offset must be valid */ const char *src = btf_str_by_offset(btf, offset); const char *src_limit; - if (!__btf_name_char_ok(*src, true, dot_ok)) + if (!__btf_name_char_ok(*src, true)) return false; /* set a limit on identifier length */ src_limit = src + KSYM_NAME_LEN; src++; while (*src && src < src_limit) { - if (!__btf_name_char_ok(*src, false, dot_ok)) + if (!__btf_name_char_ok(*src, false)) return false; src++; } @@ -795,17 +795,14 @@ static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok) return !*src; } -/* Only C-style identifier is permitted. This can be relaxed if - * necessary. - */ static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) { - return __btf_name_valid(btf, offset, false); + return __btf_name_valid(btf, offset); } static bool btf_name_valid_section(const struct btf *btf, u32 offset) { - return __btf_name_valid(btf, offset, true); + return __btf_name_valid(btf, offset); } static const char *__btf_name_by_offset(const struct btf *btf, u32 offset) @@ -4429,7 +4426,7 @@ static s32 btf_var_check_meta(struct btf_verifier_env *env, } if (!t->name_off || - !__btf_name_valid(env->btf, t->name_off, true)) { + !__btf_name_valid(env->btf, t->name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7421487422d4..dc85240a0134 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2064,14 +2064,16 @@ EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) }; #undef PROG_NAME_LIST #define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size), -static u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, - const struct bpf_insn *insn) = { +static __maybe_unused +u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, + const struct bpf_insn *insn) = { EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) }; #undef PROG_NAME_LIST +#ifdef CONFIG_BPF_SYSCALL void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) { stack_depth = max_t(u32, stack_depth, 1); @@ -2080,7 +2082,7 @@ void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) __bpf_call_base_args; insn->code = BPF_JMP | BPF_CALL_ARGS; } - +#endif #else static unsigned int __bpf_prog_ret0_warn(const void *ctx, const struct bpf_insn *insn) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 8ec18faa74ac..8a33e8747a0e 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -28,7 +28,6 @@ #include <linux/sched.h> #include <linux/workqueue.h> #include <linux/kthread.h> -#include <linux/capability.h> #include <trace/events/xdp.h> #include <linux/btf_ids.h> @@ -89,9 +88,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) u32 value_size = attr->value_size; struct bpf_cpu_map *cmap; - if (!bpf_capable()) - return ERR_PTR(-EPERM); - /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || (value_size != offsetofend(struct bpf_cpumap_val, qsize) && diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index 7efdf5d770ca..938a60ff4295 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -132,6 +132,21 @@ __bpf_kfunc u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) } /** + * bpf_cpumask_first_and() - Return the index of the first nonzero bit from the + * AND of two cpumasks. + * @src1: The first cpumask. + * @src2: The second cpumask. + * + * Find the index of the first nonzero bit of the AND of two cpumasks. + * struct bpf_cpumask pointers may be safely passed to @src1 and @src2. + */ +__bpf_kfunc u32 bpf_cpumask_first_and(const struct cpumask *src1, + const struct cpumask *src2) +{ + return cpumask_first_and(src1, src2); +} + +/** * bpf_cpumask_set_cpu() - Set a bit for a CPU in a BPF cpumask. * @cpu: The CPU to be set in the cpumask. * @cpumask: The BPF cpumask in which a bit is being set. @@ -367,7 +382,7 @@ __bpf_kfunc void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask } /** - * bpf_cpumask_any() - Return a random set CPU from a cpumask. + * bpf_cpumask_any_distribute() - Return a random set CPU from a cpumask. * @cpumask: The cpumask being queried. * * Return: @@ -376,26 +391,28 @@ __bpf_kfunc void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask * * A struct bpf_cpumask pointer may be safely passed to @src. */ -__bpf_kfunc u32 bpf_cpumask_any(const struct cpumask *cpumask) +__bpf_kfunc u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask) { - return cpumask_any(cpumask); + return cpumask_any_distribute(cpumask); } /** - * bpf_cpumask_any_and() - Return a random set CPU from the AND of two - * cpumasks. + * bpf_cpumask_any_and_distribute() - Return a random set CPU from the AND of + * two cpumasks. * @src1: The first cpumask. * @src2: The second cpumask. * * Return: - * * A random set bit within [0, num_cpus) if at least one bit is set. + * * A random set bit within [0, num_cpus) from the AND of two cpumasks, if at + * least one bit is set. * * >= num_cpus if no bit is set. * * struct bpf_cpumask pointers may be safely passed to @src1 and @src2. */ -__bpf_kfunc u32 bpf_cpumask_any_and(const struct cpumask *src1, const struct cpumask *src2) +__bpf_kfunc u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, + const struct cpumask *src2) { - return cpumask_any_and(src1, src2); + return cpumask_any_and_distribute(src1, src2); } __diag_pop(); @@ -406,6 +423,7 @@ BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_cpumask_first, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_first_and, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_set_cpu, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_clear_cpu, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_test_cpu, KF_RCU) @@ -422,8 +440,8 @@ BTF_ID_FLAGS(func, bpf_cpumask_subset, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_empty, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_full, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU) -BTF_ID_FLAGS(func, bpf_cpumask_any, KF_RCU) -BTF_ID_FLAGS(func, bpf_cpumask_any_and, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU) BTF_SET8_END(cpumask_kfunc_btf_ids) static const struct btf_kfunc_id_set cpumask_kfunc_set = { diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 802692fa3905..49cc0b5671c6 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -160,9 +160,6 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) struct bpf_dtab *dtab; int err; - if (!capable(CAP_NET_ADMIN)) - return ERR_PTR(-EPERM); - dtab = bpf_map_area_alloc(sizeof(*dtab), NUMA_NO_NODE); if (!dtab) return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 9901efee4339..56d3da7d0bc6 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -422,12 +422,6 @@ static int htab_map_alloc_check(union bpf_attr *attr) BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != offsetof(struct htab_elem, hash_node.pprev)); - if (lru && !bpf_capable()) - /* LRU implementation is much complicated than other - * maps. Hence, limit to CAP_BPF. - */ - return -EPERM; - if (zero_seed && !capable(CAP_SYS_ADMIN)) /* Guard against local DoS, and discourage production use. */ return -EPERM; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 4ef4c4f8a355..9e80efa59a5d 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1933,8 +1933,12 @@ __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta * bpf_refcount type so that it is emitted in vmlinux BTF */ ref = (struct bpf_refcount *)(p__refcounted_kptr + meta->record->refcount_off); + if (!refcount_inc_not_zero((refcount_t *)ref)) + return NULL; - refcount_inc((refcount_t *)ref); + /* Verifier strips KF_RET_NULL if input is owned ref, see is_kfunc_ret_null + * in verifier.c + */ return (void *)p__refcounted_kptr; } @@ -1950,7 +1954,7 @@ static int __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head INIT_LIST_HEAD(h); if (!list_empty(n)) { /* Only called from BPF prog, no need to migrate_disable */ - __bpf_obj_drop_impl(n - off, rec); + __bpf_obj_drop_impl((void *)n - off, rec); return -EINVAL; } @@ -2032,7 +2036,7 @@ static int __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, if (!RB_EMPTY_NODE(n)) { /* Only called from BPF prog, no need to migrate_disable */ - __bpf_obj_drop_impl(n - off, rec); + __bpf_obj_drop_impl((void *)n - off, rec); return -EINVAL; } @@ -2406,7 +2410,7 @@ BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) #endif BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE) +BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_push_front_impl) BTF_ID_FLAGS(func, bpf_list_push_back_impl) BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index e0d3ddf2037a..17c7e7782a1f 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -544,9 +544,6 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) { struct lpm_trie *trie; - if (!bpf_capable()) - return ERR_PTR(-EPERM); - /* check sanity of attributes */ if (attr->max_entries == 0 || !(attr->map_flags & BPF_F_NO_PREALLOC) || diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 410637c225fb..0668bcd7c926 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -211,9 +211,9 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node) mem_cgroup_put(memcg); } -static void free_one(struct bpf_mem_cache *c, void *obj) +static void free_one(void *obj, bool percpu) { - if (c->percpu_size) { + if (percpu) { free_percpu(((void **)obj)[1]); kfree(obj); return; @@ -222,14 +222,19 @@ static void free_one(struct bpf_mem_cache *c, void *obj) kfree(obj); } -static void __free_rcu(struct rcu_head *head) +static void free_all(struct llist_node *llnode, bool percpu) { - struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu); - struct llist_node *llnode = llist_del_all(&c->waiting_for_gp); struct llist_node *pos, *t; llist_for_each_safe(pos, t, llnode) - free_one(c, pos); + free_one(pos, percpu); +} + +static void __free_rcu(struct rcu_head *head) +{ + struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu); + + free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size); atomic_set(&c->call_rcu_in_progress, 0); } @@ -432,7 +437,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) static void drain_mem_cache(struct bpf_mem_cache *c) { - struct llist_node *llnode, *t; + bool percpu = !!c->percpu_size; /* No progs are using this bpf_mem_cache, but htab_map_free() called * bpf_mem_cache_free() for all remaining elements and they can be in @@ -441,14 +446,10 @@ static void drain_mem_cache(struct bpf_mem_cache *c) * Except for waiting_for_gp list, there are no concurrent operations * on these lists, so it is safe to use __llist_del_all(). */ - llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu)) - free_one(c, llnode); - llist_for_each_safe(llnode, t, llist_del_all(&c->waiting_for_gp)) - free_one(c, llnode); - llist_for_each_safe(llnode, t, __llist_del_all(&c->free_llist)) - free_one(c, llnode); - llist_for_each_safe(llnode, t, __llist_del_all(&c->free_llist_extra)) - free_one(c, llnode); + free_all(__llist_del_all(&c->free_by_rcu), percpu); + free_all(llist_del_all(&c->waiting_for_gp), percpu); + free_all(__llist_del_all(&c->free_llist), percpu); + free_all(__llist_del_all(&c->free_llist_extra), percpu); } static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma) diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c index b56f9f3314fd..0c63bc2cd895 100644 --- a/kernel/bpf/preload/bpf_preload_kern.c +++ b/kernel/bpf/preload/bpf_preload_kern.c @@ -23,9 +23,9 @@ static void free_links_and_skel(void) static int preload(struct bpf_preload_info *obj) { - strlcpy(obj[0].link_name, "maps.debug", sizeof(obj[0].link_name)); + strscpy(obj[0].link_name, "maps.debug", sizeof(obj[0].link_name)); obj[0].link = maps_link; - strlcpy(obj[1].link_name, "progs.debug", sizeof(obj[1].link_name)); + strscpy(obj[1].link_name, "progs.debug", sizeof(obj[1].link_name)); obj[1].link = progs_link; return 0; } diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 601609164ef3..8d2ddcb7566b 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -7,7 +7,6 @@ #include <linux/bpf.h> #include <linux/list.h> #include <linux/slab.h> -#include <linux/capability.h> #include <linux/btf_ids.h> #include "percpu_freelist.h" @@ -46,9 +45,6 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs) /* Called from syscall */ static int queue_stack_map_alloc_check(union bpf_attr *attr) { - if (!bpf_capable()) - return -EPERM; - /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 0 || attr->value_size == 0 || diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index cbf2d8d784b8..4b4f9670f1a9 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -151,9 +151,6 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) int numa_node = bpf_map_attr_numa_node(attr); struct reuseport_array *array; - if (!bpf_capable()) - return ERR_PTR(-EPERM); - /* allocate all map elements and zero-initialize them */ array = bpf_map_area_alloc(struct_size(array, ptrs, attr->max_entries), numa_node); if (!array) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index b25fce425b2c..458bb80b14d5 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -74,9 +74,6 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) u64 cost, n_buckets; int err; - if (!bpf_capable()) - return ERR_PTR(-EPERM); - if (attr->map_flags & ~STACK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 476ec959fab4..a2aef900519c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -109,37 +109,6 @@ const struct bpf_map_ops bpf_map_offload_ops = { .map_mem_usage = bpf_map_offload_map_mem_usage, }; -static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) -{ - const struct bpf_map_ops *ops; - u32 type = attr->map_type; - struct bpf_map *map; - int err; - - if (type >= ARRAY_SIZE(bpf_map_types)) - return ERR_PTR(-EINVAL); - type = array_index_nospec(type, ARRAY_SIZE(bpf_map_types)); - ops = bpf_map_types[type]; - if (!ops) - return ERR_PTR(-EINVAL); - - if (ops->map_alloc_check) { - err = ops->map_alloc_check(attr); - if (err) - return ERR_PTR(err); - } - if (attr->map_ifindex) - ops = &bpf_map_offload_ops; - if (!ops->map_mem_usage) - return ERR_PTR(-EINVAL); - map = ops->map_alloc(attr); - if (IS_ERR(map)) - return map; - map->ops = ops; - map->map_type = type; - return map; -} - static void bpf_map_write_active_inc(struct bpf_map *map) { atomic64_inc(&map->writecnt); @@ -1127,7 +1096,9 @@ free_map_tab: /* called via syscall */ static int map_create(union bpf_attr *attr) { + const struct bpf_map_ops *ops; int numa_node = bpf_map_attr_numa_node(attr); + u32 map_type = attr->map_type; struct bpf_map *map; int f_flags; int err; @@ -1158,9 +1129,85 @@ static int map_create(union bpf_attr *attr) return -EINVAL; /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ - map = find_and_alloc_map(attr); + map_type = attr->map_type; + if (map_type >= ARRAY_SIZE(bpf_map_types)) + return -EINVAL; + map_type = array_index_nospec(map_type, ARRAY_SIZE(bpf_map_types)); + ops = bpf_map_types[map_type]; + if (!ops) + return -EINVAL; + + if (ops->map_alloc_check) { + err = ops->map_alloc_check(attr); + if (err) + return err; + } + if (attr->map_ifindex) + ops = &bpf_map_offload_ops; + if (!ops->map_mem_usage) + return -EINVAL; + + /* Intent here is for unprivileged_bpf_disabled to block BPF map + * creation for unprivileged users; other actions depend + * on fd availability and access to bpffs, so are dependent on + * object creation success. Even with unprivileged BPF disabled, + * capability checks are still carried out. + */ + if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) + return -EPERM; + + /* check privileged map type permissions */ + switch (map_type) { + case BPF_MAP_TYPE_ARRAY: + case BPF_MAP_TYPE_PERCPU_ARRAY: + case BPF_MAP_TYPE_PROG_ARRAY: + case BPF_MAP_TYPE_PERF_EVENT_ARRAY: + case BPF_MAP_TYPE_CGROUP_ARRAY: + case BPF_MAP_TYPE_ARRAY_OF_MAPS: + case BPF_MAP_TYPE_HASH: + case BPF_MAP_TYPE_PERCPU_HASH: + case BPF_MAP_TYPE_HASH_OF_MAPS: + case BPF_MAP_TYPE_RINGBUF: + case BPF_MAP_TYPE_USER_RINGBUF: + case BPF_MAP_TYPE_CGROUP_STORAGE: + case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE: + /* unprivileged */ + break; + case BPF_MAP_TYPE_SK_STORAGE: + case BPF_MAP_TYPE_INODE_STORAGE: + case BPF_MAP_TYPE_TASK_STORAGE: + case BPF_MAP_TYPE_CGRP_STORAGE: + case BPF_MAP_TYPE_BLOOM_FILTER: + case BPF_MAP_TYPE_LPM_TRIE: + case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: + case BPF_MAP_TYPE_STACK_TRACE: + case BPF_MAP_TYPE_QUEUE: + case BPF_MAP_TYPE_STACK: + case BPF_MAP_TYPE_LRU_HASH: + case BPF_MAP_TYPE_LRU_PERCPU_HASH: + case BPF_MAP_TYPE_STRUCT_OPS: + case BPF_MAP_TYPE_CPUMAP: + if (!bpf_capable()) + return -EPERM; + break; + case BPF_MAP_TYPE_SOCKMAP: + case BPF_MAP_TYPE_SOCKHASH: + case BPF_MAP_TYPE_DEVMAP: + case BPF_MAP_TYPE_DEVMAP_HASH: + case BPF_MAP_TYPE_XSKMAP: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + break; + default: + WARN(1, "unsupported map type %d", map_type); + return -EPERM; + } + + map = ops->map_alloc(attr); if (IS_ERR(map)) return PTR_ERR(map); + map->ops = ops; + map->map_type = map_type; err = bpf_obj_name_cpy(map->name, attr->map_name, sizeof(attr->map_name)); @@ -2507,7 +2554,6 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) struct btf *attach_btf = NULL; int err; char license[128]; - bool is_gpl; if (CHECK_ATTR(BPF_PROG_LOAD)) return -EINVAL; @@ -2526,15 +2572,15 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) !bpf_capable()) return -EPERM; - /* copy eBPF program license from user space */ - if (strncpy_from_bpfptr(license, - make_bpfptr(attr->license, uattr.is_kernel), - sizeof(license) - 1) < 0) - return -EFAULT; - license[sizeof(license) - 1] = 0; - - /* eBPF programs must be GPL compatible to use GPL-ed functions */ - is_gpl = license_is_gpl_compatible(license); + /* Intent here is for unprivileged_bpf_disabled to block BPF program + * creation for unprivileged users; other actions depend + * on fd availability and access to bpffs, so are dependent on + * object creation success. Even with unprivileged BPF disabled, + * capability checks are still carried out for these + * and other operations. + */ + if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) + return -EPERM; if (attr->insn_cnt == 0 || attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) @@ -2618,12 +2664,20 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) make_bpfptr(attr->insns, uattr.is_kernel), bpf_prog_insn_size(prog)) != 0) goto free_prog_sec; + /* copy eBPF program license from user space */ + if (strncpy_from_bpfptr(license, + make_bpfptr(attr->license, uattr.is_kernel), + sizeof(license) - 1) < 0) + goto free_prog_sec; + license[sizeof(license) - 1] = 0; + + /* eBPF programs must be GPL compatible to use GPL-ed functions */ + prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0; prog->orig_prog = NULL; prog->jited = 0; atomic64_set(&prog->aux->refcnt, 1); - prog->gpl_compatible = is_gpl ? 1 : 0; if (bpf_prog_is_dev_bound(prog->aux)) { err = bpf_prog_dev_bound_init(prog, attr); @@ -2797,28 +2851,31 @@ static void bpf_link_put_deferred(struct work_struct *work) bpf_link_free(link); } -/* bpf_link_put can be called from atomic context, but ensures that resources - * are freed from process context +/* bpf_link_put might be called from atomic context. It needs to be called + * from sleepable context in order to acquire sleeping locks during the process. */ void bpf_link_put(struct bpf_link *link) { if (!atomic64_dec_and_test(&link->refcnt)) return; - if (in_atomic()) { - INIT_WORK(&link->work, bpf_link_put_deferred); - schedule_work(&link->work); - } else { - bpf_link_free(link); - } + INIT_WORK(&link->work, bpf_link_put_deferred); + schedule_work(&link->work); } EXPORT_SYMBOL(bpf_link_put); +static void bpf_link_put_direct(struct bpf_link *link) +{ + if (!atomic64_dec_and_test(&link->refcnt)) + return; + bpf_link_free(link); +} + static int bpf_link_release(struct inode *inode, struct file *filp) { struct bpf_link *link = filp->private_data; - bpf_link_put(link); + bpf_link_put_direct(link); return 0; } @@ -3463,6 +3520,11 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, return prog->enforce_expected_attach_type && prog->expected_attach_type != attach_type ? -EINVAL : 0; + case BPF_PROG_TYPE_KPROBE: + if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI && + attach_type != BPF_TRACE_KPROBE_MULTI) + return -EINVAL; + return 0; default: return 0; } @@ -4796,7 +4858,7 @@ out_put_progs: if (ret) bpf_prog_put(new_prog); out_put_link: - bpf_link_put(link); + bpf_link_put_direct(link); return ret; } @@ -4819,7 +4881,7 @@ static int link_detach(union bpf_attr *attr) else ret = -EOPNOTSUPP; - bpf_link_put(link); + bpf_link_put_direct(link); return ret; } @@ -4889,7 +4951,7 @@ static int bpf_link_get_fd_by_id(const union bpf_attr *attr) fd = bpf_link_new_fd(link); if (fd < 0) - bpf_link_put(link); + bpf_link_put_direct(link); return fd; } @@ -4966,7 +5028,7 @@ static int bpf_iter_create(union bpf_attr *attr) return PTR_ERR(link); err = bpf_iter_new_fd(link); - bpf_link_put(link); + bpf_link_put_direct(link); return err; } @@ -5036,23 +5098,8 @@ out_prog_put: static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) { union bpf_attr attr; - bool capable; int err; - capable = bpf_capable() || !sysctl_unprivileged_bpf_disabled; - - /* Intent here is for unprivileged_bpf_disabled to block key object - * creation commands for unprivileged users; other actions depend - * of fd availability and access to bpffs, so are dependent on - * object creation success. Capabilities are later verified for - * operations such as load and map create, so even with unprivileged - * BPF disabled, capability checks are still carried out for these - * and other operations. - */ - if (!capable && - (cmd == BPF_MAP_CREATE || cmd == BPF_PROG_LOAD)) - return -EPERM; - err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); if (err) return err; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 086b2a14905b..11e54dd8b6dd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -197,6 +197,7 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg); static void specialize_kfunc(struct bpf_verifier_env *env, u32 func_id, u16 offset, unsigned long *addr); +static bool is_trusted_reg(const struct bpf_reg_state *reg); static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) { @@ -298,16 +299,19 @@ struct bpf_kfunc_call_arg_meta { bool found; } arg_constant; - /* arg_btf and arg_btf_id are used by kfunc-specific handling, + /* arg_{btf,btf_id,owning_ref} are used by kfunc-specific handling, * generally to pass info about user-defined local kptr types to later * verification logic * bpf_obj_drop * Record the local kptr type to be drop'd * bpf_refcount_acquire (via KF_ARG_PTR_TO_REFCOUNTED_KPTR arg type) - * Record the local kptr type to be refcount_incr'd + * Record the local kptr type to be refcount_incr'd and use + * arg_owning_ref to determine whether refcount_acquire should be + * fallible */ struct btf *arg_btf; u32 arg_btf_id; + bool arg_owning_ref; struct { struct btf_field *field; @@ -439,8 +443,11 @@ static bool type_may_be_null(u32 type) return type & PTR_MAYBE_NULL; } -static bool reg_type_not_null(enum bpf_reg_type type) +static bool reg_not_null(const struct bpf_reg_state *reg) { + enum bpf_reg_type type; + + type = reg->type; if (type_may_be_null(type)) return false; @@ -450,6 +457,7 @@ static bool reg_type_not_null(enum bpf_reg_type type) type == PTR_TO_MAP_VALUE || type == PTR_TO_MAP_KEY || type == PTR_TO_SOCK_COMMON || + (type == PTR_TO_BTF_ID && is_trusted_reg(reg)) || type == PTR_TO_MEM; } @@ -3771,6 +3779,96 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_ } } +static bool idset_contains(struct bpf_idset *s, u32 id) +{ + u32 i; + + for (i = 0; i < s->count; ++i) + if (s->ids[i] == id) + return true; + + return false; +} + +static int idset_push(struct bpf_idset *s, u32 id) +{ + if (WARN_ON_ONCE(s->count >= ARRAY_SIZE(s->ids))) + return -EFAULT; + s->ids[s->count++] = id; + return 0; +} + +static void idset_reset(struct bpf_idset *s) +{ + s->count = 0; +} + +/* Collect a set of IDs for all registers currently marked as precise in env->bt. + * Mark all registers with these IDs as precise. + */ +static int mark_precise_scalar_ids(struct bpf_verifier_env *env, struct bpf_verifier_state *st) +{ + struct bpf_idset *precise_ids = &env->idset_scratch; + struct backtrack_state *bt = &env->bt; + struct bpf_func_state *func; + struct bpf_reg_state *reg; + DECLARE_BITMAP(mask, 64); + int i, fr; + + idset_reset(precise_ids); + + for (fr = bt->frame; fr >= 0; fr--) { + func = st->frame[fr]; + + bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr)); + for_each_set_bit(i, mask, 32) { + reg = &func->regs[i]; + if (!reg->id || reg->type != SCALAR_VALUE) + continue; + if (idset_push(precise_ids, reg->id)) + return -EFAULT; + } + + bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr)); + for_each_set_bit(i, mask, 64) { + if (i >= func->allocated_stack / BPF_REG_SIZE) + break; + if (!is_spilled_scalar_reg(&func->stack[i])) + continue; + reg = &func->stack[i].spilled_ptr; + if (!reg->id) + continue; + if (idset_push(precise_ids, reg->id)) + return -EFAULT; + } + } + + for (fr = 0; fr <= st->curframe; ++fr) { + func = st->frame[fr]; + + for (i = BPF_REG_0; i < BPF_REG_10; ++i) { + reg = &func->regs[i]; + if (!reg->id) + continue; + if (!idset_contains(precise_ids, reg->id)) + continue; + bt_set_frame_reg(bt, fr, i); + } + for (i = 0; i < func->allocated_stack / BPF_REG_SIZE; ++i) { + if (!is_spilled_scalar_reg(&func->stack[i])) + continue; + reg = &func->stack[i].spilled_ptr; + if (!reg->id) + continue; + if (!idset_contains(precise_ids, reg->id)) + continue; + bt_set_frame_slot(bt, fr, i); + } + } + + return 0; +} + /* * __mark_chain_precision() backtracks BPF program instruction sequence and * chain of verifier states making sure that register *regno* (if regno >= 0) @@ -3902,6 +4000,31 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) bt->frame, last_idx, first_idx, subseq_idx); } + /* If some register with scalar ID is marked as precise, + * make sure that all registers sharing this ID are also precise. + * This is needed to estimate effect of find_equal_scalars(). + * Do this at the last instruction of each state, + * bpf_reg_state::id fields are valid for these instructions. + * + * Allows to track precision in situation like below: + * + * r2 = unknown value + * ... + * --- state #0 --- + * ... + * r1 = r2 // r1 and r2 now share the same ID + * ... + * --- state #1 {r1.id = A, r2.id = A} --- + * ... + * if (r2 > 10) goto exit; // find_equal_scalars() assigns range to r1 + * ... + * --- state #2 {r1.id = A, r2.id = A} --- + * r3 = r10 + * r3 += r1 // need to mark both r1 and r2 + */ + if (mark_precise_scalar_ids(env, st)) + return -EFAULT; + if (last_idx < 0) { /* we are at the entry into subprog, which * is expected for global funcs, but only if @@ -4222,6 +4345,9 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, return err; } save_register_state(state, spi, reg, size); + /* Break the relation on a narrowing spill. */ + if (fls64(reg->umax_value) > BITS_PER_BYTE * size) + state->stack[spi].spilled_ptr.id = 0; } else if (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) && insn->imm != 0 && env->bpf_capable) { struct bpf_reg_state fake_reg = {}; @@ -5891,7 +6017,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, * program allocated objects (which always have ref_obj_id > 0), * but not for untrusted PTR_TO_BTF_ID | MEM_ALLOC. */ - if (atype != BPF_READ && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { + if (atype != BPF_READ && !type_is_ptr_alloc_obj(reg->type)) { verbose(env, "only read is supported\n"); return -EACCES; } @@ -7511,7 +7637,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, if (base_type(arg_type) == ARG_PTR_TO_MEM) type &= ~DYNPTR_TYPE_FLAG_MASK; - if (meta->func_id == BPF_FUNC_kptr_xchg && type & MEM_ALLOC) + if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type)) type &= ~MEM_ALLOC; for (i = 0; i < ARRAY_SIZE(compatible->types); i++) { @@ -9678,11 +9804,6 @@ static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta) return meta->kfunc_flags & KF_ACQUIRE; } -static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) -{ - return meta->kfunc_flags & KF_RET_NULL; -} - static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta) { return meta->kfunc_flags & KF_RELEASE; @@ -9998,6 +10119,16 @@ BTF_ID(func, bpf_dynptr_slice) BTF_ID(func, bpf_dynptr_slice_rdwr) BTF_ID(func, bpf_dynptr_clone) +static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) +{ + if (meta->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] && + meta->arg_owning_ref) { + return false; + } + + return meta->kfunc_flags & KF_RET_NULL; +} + static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta) { return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_lock]; @@ -10475,6 +10606,8 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env, node_off, btf_name_by_offset(reg->btf, t->name_off)); return -EINVAL; } + meta->arg_btf = reg->btf; + meta->arg_btf_id = reg->btf_id; if (node_off != field->graph_root.node_offset) { verbose(env, "arg#1 offset=%d, but expected %s at offset=%d in struct %s\n", @@ -10878,10 +11011,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ meta->subprogno = reg->subprogno; break; case KF_ARG_PTR_TO_REFCOUNTED_KPTR: - if (!type_is_ptr_alloc_obj(reg->type) && !type_is_non_owning_ref(reg->type)) { + if (!type_is_ptr_alloc_obj(reg->type)) { verbose(env, "arg#%d is neither owning or non-owning ref\n", i); return -EINVAL; } + if (!type_is_non_owning_ref(reg->type)) + meta->arg_owning_ref = true; rec = reg_btf_record(reg); if (!rec) { @@ -11044,6 +11179,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { release_ref_obj_id = regs[BPF_REG_2].ref_obj_id; insn_aux->insert_off = regs[BPF_REG_2].off; + insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id); err = ref_convert_owning_non_owning(env, release_ref_obj_id); if (err) { verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n", @@ -12801,12 +12937,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (BPF_SRC(insn->code) == BPF_X) { struct bpf_reg_state *src_reg = regs + insn->src_reg; struct bpf_reg_state *dst_reg = regs + insn->dst_reg; + bool need_id = src_reg->type == SCALAR_VALUE && !src_reg->id && + !tnum_is_const(src_reg->var_off); if (BPF_CLASS(insn->code) == BPF_ALU64) { /* case: R1 = R2 * copy register state to dest reg */ - if (src_reg->type == SCALAR_VALUE && !src_reg->id) + if (need_id) /* Assign src and dst registers the same ID * that will be used by find_equal_scalars() * to propagate min/max range. @@ -12825,7 +12963,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else if (src_reg->type == SCALAR_VALUE) { bool is_src_reg_u32 = src_reg->umax_value <= U32_MAX; - if (is_src_reg_u32 && !src_reg->id) + if (is_src_reg_u32 && need_id) src_reg->id = ++env->id_gen; copy_register_state(dst_reg, src_reg); /* Make sure ID is cleared if src_reg is not in u32 range otherwise @@ -13157,7 +13295,7 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, bool is_jmp32) { if (__is_pointer_value(false, reg)) { - if (!reg_type_not_null(reg->type)) + if (!reg_not_null(reg)) return -1; /* If pointer is valid tests against zero will fail so we can @@ -14981,8 +15119,9 @@ static bool range_within(struct bpf_reg_state *old, * So we look through our idmap to see if this old id has been seen before. If * so, we require the new id to match; otherwise, we add the id pair to the map. */ -static bool check_ids(u32 old_id, u32 cur_id, struct bpf_id_pair *idmap) +static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap) { + struct bpf_id_pair *map = idmap->map; unsigned int i; /* either both IDs should be set or both should be zero */ @@ -14993,20 +15132,34 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_id_pair *idmap) return true; for (i = 0; i < BPF_ID_MAP_SIZE; i++) { - if (!idmap[i].old) { + if (!map[i].old) { /* Reached an empty slot; haven't seen this id before */ - idmap[i].old = old_id; - idmap[i].cur = cur_id; + map[i].old = old_id; + map[i].cur = cur_id; return true; } - if (idmap[i].old == old_id) - return idmap[i].cur == cur_id; + if (map[i].old == old_id) + return map[i].cur == cur_id; + if (map[i].cur == cur_id) + return false; } /* We ran out of idmap slots, which should be impossible */ WARN_ON_ONCE(1); return false; } +/* Similar to check_ids(), but allocate a unique temporary ID + * for 'old_id' or 'cur_id' of zero. + * This makes pairs like '0 vs unique ID', 'unique ID vs 0' valid. + */ +static bool check_scalar_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap) +{ + old_id = old_id ? old_id : ++idmap->tmp_id_gen; + cur_id = cur_id ? cur_id : ++idmap->tmp_id_gen; + + return check_ids(old_id, cur_id, idmap); +} + static void clean_func_state(struct bpf_verifier_env *env, struct bpf_func_state *st) { @@ -15105,7 +15258,7 @@ next: static bool regs_exact(const struct bpf_reg_state *rold, const struct bpf_reg_state *rcur, - struct bpf_id_pair *idmap) + struct bpf_idmap *idmap) { return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && check_ids(rold->id, rcur->id, idmap) && @@ -15114,7 +15267,7 @@ static bool regs_exact(const struct bpf_reg_state *rold, /* Returns true if (rold safe implies rcur safe) */ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, - struct bpf_reg_state *rcur, struct bpf_id_pair *idmap) + struct bpf_reg_state *rcur, struct bpf_idmap *idmap) { if (!(rold->live & REG_LIVE_READ)) /* explored state didn't use this */ @@ -15151,15 +15304,42 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, switch (base_type(rold->type)) { case SCALAR_VALUE: - if (regs_exact(rold, rcur, idmap)) - return true; - if (env->explore_alu_limits) - return false; + if (env->explore_alu_limits) { + /* explore_alu_limits disables tnum_in() and range_within() + * logic and requires everything to be strict + */ + return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && + check_scalar_ids(rold->id, rcur->id, idmap); + } if (!rold->precise) return true; - /* new val must satisfy old val knowledge */ + /* Why check_ids() for scalar registers? + * + * Consider the following BPF code: + * 1: r6 = ... unbound scalar, ID=a ... + * 2: r7 = ... unbound scalar, ID=b ... + * 3: if (r6 > r7) goto +1 + * 4: r6 = r7 + * 5: if (r6 > X) goto ... + * 6: ... memory operation using r7 ... + * + * First verification path is [1-6]: + * - at (4) same bpf_reg_state::id (b) would be assigned to r6 and r7; + * - at (5) r6 would be marked <= X, find_equal_scalars() would also mark + * r7 <= X, because r6 and r7 share same id. + * Next verification path is [1-4, 6]. + * + * Instruction (6) would be reached in two states: + * I. r6{.id=b}, r7{.id=b} via path 1-6; + * II. r6{.id=a}, r7{.id=b} via path 1-4, 6. + * + * Use check_ids() to distinguish these states. + * --- + * Also verify that new value satisfies old value range knowledge. + */ return range_within(rold, rcur) && - tnum_in(rold->var_off, rcur->var_off); + tnum_in(rold->var_off, rcur->var_off) && + check_scalar_ids(rold->id, rcur->id, idmap); case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: case PTR_TO_MEM: @@ -15205,7 +15385,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, } static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, - struct bpf_func_state *cur, struct bpf_id_pair *idmap) + struct bpf_func_state *cur, struct bpf_idmap *idmap) { int i, spi; @@ -15308,7 +15488,7 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, } static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur, - struct bpf_id_pair *idmap) + struct bpf_idmap *idmap) { int i; @@ -15356,13 +15536,13 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat for (i = 0; i < MAX_BPF_REG; i++) if (!regsafe(env, &old->regs[i], &cur->regs[i], - env->idmap_scratch)) + &env->idmap_scratch)) return false; - if (!stacksafe(env, old, cur, env->idmap_scratch)) + if (!stacksafe(env, old, cur, &env->idmap_scratch)) return false; - if (!refsafe(old, cur, env->idmap_scratch)) + if (!refsafe(old, cur, &env->idmap_scratch)) return false; return true; @@ -15377,7 +15557,8 @@ static bool states_equal(struct bpf_verifier_env *env, if (old->curframe != cur->curframe) return false; - memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch)); + env->idmap_scratch.tmp_id_gen = env->id_gen; + memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map)); /* Verification state from speculative execution simulation * must never prune a non-speculative execution one. @@ -15395,7 +15576,7 @@ static bool states_equal(struct bpf_verifier_env *env, return false; if (old->active_lock.id && - !check_ids(old->active_lock.id, cur->active_lock.id, env->idmap_scratch)) + !check_ids(old->active_lock.id, cur->active_lock.id, &env->idmap_scratch)) return false; if (old->active_rcu_lock != cur->active_rcu_lock) @@ -17613,9 +17794,10 @@ static int jit_subprogs(struct bpf_verifier_env *env) } /* finally lock prog and jit images for all functions and - * populate kallsysm + * populate kallsysm. Begin at the first subprogram, since + * bpf_prog_load will add the kallsyms for the main program. */ - for (i = 0; i < env->subprog_cnt; i++) { + for (i = 1; i < env->subprog_cnt; i++) { bpf_prog_lock_ro(func[i]); bpf_prog_kallsyms_add(func[i]); } @@ -17641,6 +17823,8 @@ static int jit_subprogs(struct bpf_verifier_env *env) prog->jited = 1; prog->bpf_func = func[0]->bpf_func; prog->jited_len = func[0]->jited_len; + prog->aux->extable = func[0]->aux->extable; + prog->aux->num_exentries = func[0]->aux->num_exentries; prog->aux->func = func; prog->aux->func_cnt = env->subprog_cnt; bpf_prog_jit_attempt_done(prog); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 245cf62ce85a..4d42f0cbc11e 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1798,7 +1798,7 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; - int ssid, i, ret; + int ssid, ret; u16 dfl_disable_ss_mask = 0; lockdep_assert_held(&cgroup_mutex); @@ -1842,7 +1842,8 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) struct cgroup_root *src_root = ss->root; struct cgroup *scgrp = &src_root->cgrp; struct cgroup_subsys_state *css = cgroup_css(scgrp, ss); - struct css_set *cset; + struct css_set *cset, *cset_pos; + struct css_task_iter *it; WARN_ON(!css || cgroup_css(dcgrp, ss)); @@ -1860,9 +1861,22 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) css->cgroup = dcgrp; spin_lock_irq(&css_set_lock); - hash_for_each(css_set_table, i, cset, hlist) + WARN_ON(!list_empty(&dcgrp->e_csets[ss->id])); + list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id], + e_cset_node[ss->id]) { list_move_tail(&cset->e_cset_node[ss->id], &dcgrp->e_csets[ss->id]); + /* + * all css_sets of scgrp together in same order to dcgrp, + * patch in-flight iterators to preserve correct iteration. + * since the iterator is always advanced right away and + * finished when it->cset_pos meets it->cset_head, so only + * update it->cset_head is enough here. + */ + list_for_each_entry(it, &cset->task_iters, iters_node) + if (it->cset_head == &scgrp->e_csets[ss->id]) + it->cset_head = &dcgrp->e_csets[ss->id]; + } spin_unlock_irq(&css_set_lock); if (ss->css_rstat_flush) { diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 936473203a6b..122dacb3a443 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -108,16 +108,18 @@ static int freezer_css_online(struct cgroup_subsys_state *css) struct freezer *freezer = css_freezer(css); struct freezer *parent = parent_freezer(freezer); + cpus_read_lock(); mutex_lock(&freezer_mutex); freezer->state |= CGROUP_FREEZER_ONLINE; if (parent && (parent->state & CGROUP_FREEZING)) { freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; - static_branch_inc(&freezer_active); + static_branch_inc_cpuslocked(&freezer_active); } mutex_unlock(&freezer_mutex); + cpus_read_unlock(); return 0; } @@ -132,14 +134,16 @@ static void freezer_css_offline(struct cgroup_subsys_state *css) { struct freezer *freezer = css_freezer(css); + cpus_read_lock(); mutex_lock(&freezer_mutex); if (freezer->state & CGROUP_FREEZING) - static_branch_dec(&freezer_active); + static_branch_dec_cpuslocked(&freezer_active); freezer->state = 0; mutex_unlock(&freezer_mutex); + cpus_read_unlock(); } static void freezer_css_free(struct cgroup_subsys_state *css) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 65b8658da829..e9138cd7a0f5 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -218,19 +218,8 @@ static void tick_setup_device(struct tick_device *td, * this cpu: */ if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { - ktime_t next_p; - u32 rem; - tick_do_timer_cpu = cpu; - - next_p = ktime_get(); - div_u64_rem(next_p, TICK_NSEC, &rem); - if (rem) { - next_p -= rem; - next_p += TICK_NSEC; - } - - tick_next_period = next_p; + tick_next_period = ktime_get(); #ifdef CONFIG_NO_HZ_FULL /* * The boot CPU may be nohz_full, in which case set diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 52254679ec48..42c0be3080bd 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -161,8 +161,19 @@ static ktime_t tick_init_jiffy_update(void) raw_spin_lock(&jiffies_lock); write_seqcount_begin(&jiffies_seq); /* Did we start the jiffies update yet ? */ - if (last_jiffies_update == 0) + if (last_jiffies_update == 0) { + u32 rem; + + /* + * Ensure that the tick is aligned to a multiple of + * TICK_NSEC. + */ + div_u64_rem(tick_next_period, TICK_NSEC, &rem); + if (rem) + tick_next_period += TICK_NSEC - rem; + last_jiffies_update = tick_next_period; + } period = last_jiffies_update; write_seqcount_end(&jiffies_seq); raw_spin_unlock(&jiffies_lock); diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index dbb14705d0d3..8df0550415e7 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -50,6 +50,18 @@ #define EVENT_STATUS_OTHER BIT(7) /* + * User register flags are not allowed yet, keep them here until we are + * ready to expose them out to the user ABI. + */ +enum user_reg_flag { + /* Event will not delete upon last reference closing */ + USER_EVENT_REG_PERSIST = 1U << 0, + + /* This value or above is currently non-ABI */ + USER_EVENT_REG_MAX = 1U << 1, +}; + +/* * Stores the system name, tables, and locks for a group of events. This * allows isolation for events by various means. */ @@ -85,8 +97,10 @@ struct user_event { struct hlist_node node; struct list_head fields; struct list_head validators; + struct work_struct put_work; refcount_t refcnt; int min_size; + int reg_flags; char status; }; @@ -165,76 +179,151 @@ typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i, static int user_event_parse(struct user_event_group *group, char *name, char *args, char *flags, - struct user_event **newuser); + struct user_event **newuser, int reg_flags); static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm); static struct user_event_mm *user_event_mm_get_all(struct user_event *user); static void user_event_mm_put(struct user_event_mm *mm); +static int destroy_user_event(struct user_event *user); static u32 user_event_key(char *name) { return jhash(name, strlen(name), 0); } -static void user_event_group_destroy(struct user_event_group *group) +static struct user_event *user_event_get(struct user_event *user) { - kfree(group->system_name); - kfree(group); + refcount_inc(&user->refcnt); + + return user; } -static char *user_event_group_system_name(struct user_namespace *user_ns) +static void delayed_destroy_user_event(struct work_struct *work) { - char *system_name; - int len = sizeof(USER_EVENTS_SYSTEM) + 1; + struct user_event *user = container_of( + work, struct user_event, put_work); - if (user_ns != &init_user_ns) { + mutex_lock(&event_mutex); + + if (!refcount_dec_and_test(&user->refcnt)) + goto out; + + if (destroy_user_event(user)) { /* - * Unexpected at this point: - * We only currently support init_user_ns. - * When we enable more, this will trigger a failure so log. + * The only reason this would fail here is if we cannot + * update the visibility of the event. In this case the + * event stays in the hashtable, waiting for someone to + * attempt to delete it later. */ - pr_warn("user_events: Namespace other than init_user_ns!\n"); - return NULL; + pr_warn("user_events: Unable to delete event\n"); + refcount_set(&user->refcnt, 1); } +out: + mutex_unlock(&event_mutex); +} - system_name = kmalloc(len, GFP_KERNEL); +static void user_event_put(struct user_event *user, bool locked) +{ + bool delete; - if (!system_name) - return NULL; + if (unlikely(!user)) + return; - snprintf(system_name, len, "%s", USER_EVENTS_SYSTEM); + /* + * When the event is not enabled for auto-delete there will always + * be at least 1 reference to the event. During the event creation + * we initially set the refcnt to 2 to achieve this. In those cases + * the caller must acquire event_mutex and after decrement check if + * the refcnt is 1, meaning this is the last reference. When auto + * delete is enabled, there will only be 1 ref, IE: refcnt will be + * only set to 1 during creation to allow the below checks to go + * through upon the last put. The last put must always be done with + * the event mutex held. + */ + if (!locked) { + lockdep_assert_not_held(&event_mutex); + delete = refcount_dec_and_mutex_lock(&user->refcnt, &event_mutex); + } else { + lockdep_assert_held(&event_mutex); + delete = refcount_dec_and_test(&user->refcnt); + } - return system_name; + if (!delete) + return; + + /* + * We now have the event_mutex in all cases, which ensures that + * no new references will be taken until event_mutex is released. + * New references come through find_user_event(), which requires + * the event_mutex to be held. + */ + + if (user->reg_flags & USER_EVENT_REG_PERSIST) { + /* We should not get here when persist flag is set */ + pr_alert("BUG: Auto-delete engaged on persistent event\n"); + goto out; + } + + /* + * Unfortunately we have to attempt the actual destroy in a work + * queue. This is because not all cases handle a trace_event_call + * being removed within the class->reg() operation for unregister. + */ + INIT_WORK(&user->put_work, delayed_destroy_user_event); + + /* + * Since the event is still in the hashtable, we have to re-inc + * the ref count to 1. This count will be decremented and checked + * in the work queue to ensure it's still the last ref. This is + * needed because a user-process could register the same event in + * between the time of event_mutex release and the work queue + * running the delayed destroy. If we removed the item now from + * the hashtable, this would result in a timing window where a + * user process would fail a register because the trace_event_call + * register would fail in the tracing layers. + */ + refcount_set(&user->refcnt, 1); + + if (WARN_ON_ONCE(!schedule_work(&user->put_work))) { + /* + * If we fail we must wait for an admin to attempt delete or + * another register/close of the event, whichever is first. + */ + pr_warn("user_events: Unable to queue delayed destroy\n"); + } +out: + /* Ensure if we didn't have event_mutex before we unlock it */ + if (!locked) + mutex_unlock(&event_mutex); } -static inline struct user_event_group -*user_event_group_from_user_ns(struct user_namespace *user_ns) +static void user_event_group_destroy(struct user_event_group *group) { - if (user_ns == &init_user_ns) - return init_group; - - return NULL; + kfree(group->system_name); + kfree(group); } -static struct user_event_group *current_user_event_group(void) +static char *user_event_group_system_name(void) { - struct user_namespace *user_ns = current_user_ns(); - struct user_event_group *group = NULL; + char *system_name; + int len = sizeof(USER_EVENTS_SYSTEM) + 1; - while (user_ns) { - group = user_event_group_from_user_ns(user_ns); + system_name = kmalloc(len, GFP_KERNEL); - if (group) - break; + if (!system_name) + return NULL; - user_ns = user_ns->parent; - } + snprintf(system_name, len, "%s", USER_EVENTS_SYSTEM); - return group; + return system_name; } -static struct user_event_group -*user_event_group_create(struct user_namespace *user_ns) +static struct user_event_group *current_user_event_group(void) +{ + return init_group; +} + +static struct user_event_group *user_event_group_create(void) { struct user_event_group *group; @@ -243,7 +332,7 @@ static struct user_event_group if (!group) return NULL; - group->system_name = user_event_group_system_name(user_ns); + group->system_name = user_event_group_system_name(); if (!group->system_name) goto error; @@ -259,12 +348,13 @@ error: return NULL; }; -static void user_event_enabler_destroy(struct user_event_enabler *enabler) +static void user_event_enabler_destroy(struct user_event_enabler *enabler, + bool locked) { list_del_rcu(&enabler->mm_enablers_link); /* No longer tracking the event via the enabler */ - refcount_dec(&enabler->event->refcnt); + user_event_put(enabler->event, locked); kfree(enabler); } @@ -326,7 +416,7 @@ static void user_event_enabler_fault_fixup(struct work_struct *work) /* User asked for enabler to be removed during fault */ if (test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler))) { - user_event_enabler_destroy(enabler); + user_event_enabler_destroy(enabler, true); goto out; } @@ -501,14 +591,12 @@ static bool user_event_enabler_dup(struct user_event_enabler *orig, if (!enabler) return false; - enabler->event = orig->event; + enabler->event = user_event_get(orig->event); enabler->addr = orig->addr; /* Only dup part of value (ignore future flags, etc) */ enabler->values = orig->values & ENABLE_VAL_DUP_MASK; - refcount_inc(&enabler->event->refcnt); - /* Enablers not exposed yet, RCU not required */ list_add(&enabler->mm_enablers_link, &mm->enablers); @@ -625,7 +713,7 @@ static void user_event_mm_destroy(struct user_event_mm *mm) struct user_event_enabler *enabler, *next; list_for_each_entry_safe(enabler, next, &mm->enablers, mm_enablers_link) - user_event_enabler_destroy(enabler); + user_event_enabler_destroy(enabler, false); mmdrop(mm->mm); kfree(mm); @@ -780,7 +868,7 @@ retry: * exit or run exec(), which includes forks and clones. */ if (!*write_result) { - refcount_inc(&enabler->event->refcnt); + user_event_get(user); list_add_rcu(&enabler->mm_enablers_link, &user_mm->enablers); } @@ -803,7 +891,12 @@ out: static __always_inline __must_check bool user_event_last_ref(struct user_event *user) { - return refcount_read(&user->refcnt) == 1; + int last = 0; + + if (user->reg_flags & USER_EVENT_REG_PERSIST) + last = 1; + + return refcount_read(&user->refcnt) == last; } static __always_inline __must_check @@ -842,7 +935,8 @@ static struct list_head *user_event_get_fields(struct trace_event_call *call) * Upon success user_event has its ref count increased by 1. */ static int user_event_parse_cmd(struct user_event_group *group, - char *raw_command, struct user_event **newuser) + char *raw_command, struct user_event **newuser, + int reg_flags) { char *name = raw_command; char *args = strpbrk(name, " "); @@ -856,7 +950,7 @@ static int user_event_parse_cmd(struct user_event_group *group, if (flags) *flags++ = '\0'; - return user_event_parse(group, name, args, flags, newuser); + return user_event_parse(group, name, args, flags, newuser, reg_flags); } static int user_field_array_size(const char *type) @@ -1367,10 +1461,8 @@ static struct user_event *find_user_event(struct user_event_group *group, *outkey = key; hash_for_each_possible(group->register_table, user, node, key) - if (!strcmp(EVENT_NAME(user), name)) { - refcount_inc(&user->refcnt); - return user; - } + if (!strcmp(EVENT_NAME(user), name)) + return user_event_get(user); return NULL; } @@ -1432,7 +1524,7 @@ static void user_event_ftrace(struct user_event *user, struct iov_iter *i, if (unlikely(!entry)) return; - if (unlikely(!copy_nofault(entry + 1, i->count, i))) + if (unlikely(i->count != 0 && !copy_nofault(entry + 1, i->count, i))) goto discard; if (!list_empty(&user->validators) && @@ -1473,7 +1565,7 @@ static void user_event_perf(struct user_event *user, struct iov_iter *i, perf_fetch_caller_regs(regs); - if (unlikely(!copy_nofault(perf_entry + 1, i->count, i))) + if (unlikely(i->count != 0 && !copy_nofault(perf_entry + 1, i->count, i))) goto discard; if (!list_empty(&user->validators) && @@ -1584,12 +1676,12 @@ static int user_event_reg(struct trace_event_call *call, return ret; inc: - refcount_inc(&user->refcnt); + user_event_get(user); update_enable_bit_for(user); return 0; dec: update_enable_bit_for(user); - refcount_dec(&user->refcnt); + user_event_put(user, true); return 0; } @@ -1620,10 +1712,11 @@ static int user_event_create(const char *raw_command) mutex_lock(&group->reg_mutex); - ret = user_event_parse_cmd(group, name, &user); + /* Dyn events persist, otherwise they would cleanup immediately */ + ret = user_event_parse_cmd(group, name, &user, USER_EVENT_REG_PERSIST); if (!ret) - refcount_dec(&user->refcnt); + user_event_put(user, false); mutex_unlock(&group->reg_mutex); @@ -1745,6 +1838,8 @@ static bool user_event_match(const char *system, const char *event, if (match && argc > 0) match = user_fields_match(user, argc, argv); + else if (match && argc == 0) + match = list_empty(&user->fields); return match; } @@ -1781,11 +1876,17 @@ static int user_event_trace_register(struct user_event *user) */ static int user_event_parse(struct user_event_group *group, char *name, char *args, char *flags, - struct user_event **newuser) + struct user_event **newuser, int reg_flags) { int ret; u32 key; struct user_event *user; + int argc = 0; + char **argv; + + /* User register flags are not ready yet */ + if (reg_flags != 0 || flags != NULL) + return -EINVAL; /* Prevent dyn_event from racing */ mutex_lock(&event_mutex); @@ -1793,13 +1894,35 @@ static int user_event_parse(struct user_event_group *group, char *name, mutex_unlock(&event_mutex); if (user) { - *newuser = user; - /* - * Name is allocated by caller, free it since it already exists. - * Caller only worries about failure cases for freeing. - */ - kfree(name); + if (args) { + argv = argv_split(GFP_KERNEL, args, &argc); + if (!argv) { + ret = -ENOMEM; + goto error; + } + + ret = user_fields_match(user, argc, (const char **)argv); + argv_free(argv); + + } else + ret = list_empty(&user->fields); + + if (ret) { + *newuser = user; + /* + * Name is allocated by caller, free it since it already exists. + * Caller only worries about failure cases for freeing. + */ + kfree(name); + } else { + ret = -EADDRINUSE; + goto error; + } + return 0; +error: + user_event_put(user, false); + return ret; } user = kzalloc(sizeof(*user), GFP_KERNEL_ACCOUNT); @@ -1852,8 +1975,15 @@ static int user_event_parse(struct user_event_group *group, char *name, if (ret) goto put_user_lock; - /* Ensure we track self ref and caller ref (2) */ - refcount_set(&user->refcnt, 2); + user->reg_flags = reg_flags; + + if (user->reg_flags & USER_EVENT_REG_PERSIST) { + /* Ensure we track self ref and caller ref (2) */ + refcount_set(&user->refcnt, 2); + } else { + /* Ensure we track only caller ref (1) */ + refcount_set(&user->refcnt, 1); + } dyn_event_init(&user->devent, &user_event_dops); dyn_event_add(&user->devent, &user->call); @@ -1885,7 +2015,7 @@ static int delete_user_event(struct user_event_group *group, char *name) if (!user) return -ENOENT; - refcount_dec(&user->refcnt); + user_event_put(user, true); if (!user_event_last_ref(user)) return -EBUSY; @@ -2044,9 +2174,7 @@ static int user_events_ref_add(struct user_event_file_info *info, for (i = 0; i < count; ++i) new_refs->events[i] = refs->events[i]; - new_refs->events[i] = user; - - refcount_inc(&user->refcnt); + new_refs->events[i] = user_event_get(user); rcu_assign_pointer(info->refs, new_refs); @@ -2077,8 +2205,8 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg) if (ret) return ret; - /* Ensure no flags, since we don't support any yet */ - if (kreg->flags != 0) + /* Ensure only valid flags */ + if (kreg->flags & ~(USER_EVENT_REG_MAX-1)) return -EINVAL; /* Ensure supported size */ @@ -2150,7 +2278,7 @@ static long user_events_ioctl_reg(struct user_event_file_info *info, return ret; } - ret = user_event_parse_cmd(info->group, name, &user); + ret = user_event_parse_cmd(info->group, name, &user, reg.flags); if (ret) { kfree(name); @@ -2160,7 +2288,7 @@ static long user_events_ioctl_reg(struct user_event_file_info *info, ret = user_events_ref_add(info, user); /* No longer need parse ref, ref_add either worked or not */ - refcount_dec(&user->refcnt); + user_event_put(user, false); /* Positive number is index and valid */ if (ret < 0) @@ -2309,7 +2437,7 @@ static long user_events_ioctl_unreg(unsigned long uarg) set_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler)); if (!test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler))) - user_event_enabler_destroy(enabler); + user_event_enabler_destroy(enabler, true); /* Removed at least one */ ret = 0; @@ -2367,7 +2495,6 @@ static int user_events_release(struct inode *node, struct file *file) struct user_event_file_info *info = file->private_data; struct user_event_group *group; struct user_event_refs *refs; - struct user_event *user; int i; if (!info) @@ -2391,12 +2518,9 @@ static int user_events_release(struct inode *node, struct file *file) * The underlying user_events are ref counted, and cannot be freed. * After this decrement, the user_events may be freed elsewhere. */ - for (i = 0; i < refs->count; ++i) { - user = refs->events[i]; + for (i = 0; i < refs->count; ++i) + user_event_put(refs->events[i], false); - if (user) - refcount_dec(&user->refcnt); - } out: file->private_data = NULL; @@ -2577,7 +2701,7 @@ static int __init trace_events_user_init(void) if (!fault_cache) return -ENOMEM; - init_group = user_event_group_create(&init_user_ns); + init_group = user_event_group_create(); if (!init_group) { kmem_cache_destroy(fault_cache); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 15f05faaae44..1e33f367783e 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -847,7 +847,7 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c int ret; void *pos; - list_for_each_entry(field, head, link) { + list_for_each_entry_reverse(field, head, link) { trace_seq_printf(&iter->seq, " %s=", field->name); if (field->offset + field->size > iter->ent_size) { trace_seq_puts(&iter->seq, "<OVERFLOW>"); |
