diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/bpf/bpf_struct_ops.c | 14 | ||||
-rw-r--r-- | kernel/bpf/btf.c | 5 | ||||
-rw-r--r-- | kernel/bpf/cgroup.c | 7 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 39 | ||||
-rw-r--r-- | kernel/fork.c | 4 | ||||
-rw-r--r-- | kernel/futex.c | 93 | ||||
-rw-r--r-- | kernel/irq/manage.c | 11 | ||||
-rw-r--r-- | kernel/notifier.c | 2 | ||||
-rw-r--r-- | kernel/sys.c | 2 | ||||
-rw-r--r-- | kernel/trace/bpf_trace.c | 2 |
10 files changed, 114 insertions, 65 deletions
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index ca5cc8cdb6eb..26cb51f2db72 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -490,13 +490,21 @@ static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) prev_state = cmpxchg(&st_map->kvalue.state, BPF_STRUCT_OPS_STATE_INUSE, BPF_STRUCT_OPS_STATE_TOBEFREE); - if (prev_state == BPF_STRUCT_OPS_STATE_INUSE) { + switch (prev_state) { + case BPF_STRUCT_OPS_STATE_INUSE: st_map->st_ops->unreg(&st_map->kvalue.data); if (refcount_dec_and_test(&st_map->kvalue.refcnt)) bpf_map_put(map); + return 0; + case BPF_STRUCT_OPS_STATE_TOBEFREE: + return -EINPROGRESS; + case BPF_STRUCT_OPS_STATE_INIT: + return -ENOENT; + default: + WARN_ON_ONCE(1); + /* Should never happen. Treat it as not found. */ + return -ENOENT; } - - return 0; } static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 3b6dcfb6ea49..d65c6912bdaf 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2418,7 +2418,7 @@ static int btf_enum_check_member(struct btf_verifier_env *env, struct_size = struct_type->size; bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); - if (struct_size - bytes_offset < sizeof(int)) { + if (struct_size - bytes_offset < member_type->size) { btf_verifier_log_member(env, struct_type, member, "Member exceeds struct_size"); return -EINVAL; @@ -4600,7 +4600,7 @@ int btf_get_info_by_fd(const struct btf *btf, union bpf_attr __user *uattr) { struct bpf_btf_info __user *uinfo; - struct bpf_btf_info info = {}; + struct bpf_btf_info info; u32 info_copy, btf_copy; void __user *ubtf; u32 uinfo_len; @@ -4609,6 +4609,7 @@ int btf_get_info_by_fd(const struct btf *btf, uinfo_len = attr->info.info_len; info_copy = min_t(u32, uinfo_len, sizeof(info)); + memset(&info, 0, sizeof(info)); if (copy_from_user(&info, uinfo, info_copy)) return -EFAULT; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 80676fc00d81..cb305e71e7de 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -302,6 +302,9 @@ cleanup: for (i = 0; i < NR; i++) bpf_prog_array_free(arrays[i]); + for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) + cgroup_bpf_put(p); + percpu_ref_exit(&cgrp->bpf.refcnt); return -ENOMEM; @@ -418,8 +421,8 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)); struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE], - *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL}; + struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; + struct bpf_cgroup_storage *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; struct bpf_prog_list *pl; int err; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e0a3b34d7039..64783da34202 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -690,14 +690,15 @@ int bpf_get_file_flag(int flags) offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL -/* dst and src must have at least BPF_OBJ_NAME_LEN number of bytes. - * Return 0 on success and < 0 on error. +/* dst and src must have at least "size" number of bytes. + * Return strlen on success and < 0 on error. */ -static int bpf_obj_name_cpy(char *dst, const char *src) +int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size) { - const char *end = src + BPF_OBJ_NAME_LEN; + const char *end = src + size; + const char *orig_src = src; - memset(dst, 0, BPF_OBJ_NAME_LEN); + memset(dst, 0, size); /* Copy all isalnum(), '_' and '.' chars. */ while (src < end && *src) { if (!isalnum(*src) && @@ -706,11 +707,11 @@ static int bpf_obj_name_cpy(char *dst, const char *src) *dst++ = *src++; } - /* No '\0' found in BPF_OBJ_NAME_LEN number of bytes */ + /* No '\0' found in "size" number of bytes */ if (src == end) return -EINVAL; - return 0; + return src - orig_src; } int map_check_no_btf(const struct bpf_map *map, @@ -804,8 +805,9 @@ static int map_create(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - err = bpf_obj_name_cpy(map->name, attr->map_name); - if (err) + err = bpf_obj_name_cpy(map->name, attr->map_name, + sizeof(attr->map_name)); + if (err < 0) goto free_map; atomic64_set(&map->refcnt, 1); @@ -1515,6 +1517,11 @@ static int map_freeze(const union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { + fdput(f); + return -ENOTSUPP; + } + mutex_lock(&map->freeze_mutex); if (map->writecnt) { @@ -2099,8 +2106,9 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) goto free_prog; prog->aux->load_time = ktime_get_boottime_ns(); - err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name); - if (err) + err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name, + sizeof(attr->prog_name)); + if (err < 0) goto free_prog; /* run eBPF verifier */ @@ -2978,7 +2986,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, union bpf_attr __user *uattr) { struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); - struct bpf_prog_info info = {}; + struct bpf_prog_info info; u32 info_len = attr->info.info_len; struct bpf_prog_stats stats; char __user *uinsns; @@ -2990,6 +2998,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, return err; info_len = min_t(u32, sizeof(info), info_len); + memset(&info, 0, sizeof(info)); if (copy_from_user(&info, uinfo, info_len)) return -EFAULT; @@ -3253,7 +3262,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, union bpf_attr __user *uattr) { struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info); - struct bpf_map_info info = {}; + struct bpf_map_info info; u32 info_len = attr->info.info_len; int err; @@ -3262,6 +3271,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, return err; info_len = min_t(u32, sizeof(info), info_len); + memset(&info, 0, sizeof(info)); info.type = map->map_type; info.id = map->id; info.key_size = map->key_size; @@ -3650,7 +3660,7 @@ out_put_progs: SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { - union bpf_attr attr = {}; + union bpf_attr attr; int err; if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) @@ -3662,6 +3672,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz size = min_t(u32, size, sizeof(attr)); /* copy attributes from user space, may be less than sizeof(bpf_attr) */ + memset(&attr, 0, sizeof(attr)); if (copy_from_user(&attr, uattr, size) != 0) return -EFAULT; diff --git a/kernel/fork.c b/kernel/fork.c index 86425305cd4a..d90af13431c7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -397,8 +397,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account) mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, THREAD_SIZE / 1024 * account); - mod_memcg_page_state(first_page, MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); + mod_memcg_obj_state(stack, MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); } } diff --git a/kernel/futex.c b/kernel/futex.c index 0cf84c8664f2..82dfacb3250e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -385,9 +385,9 @@ static inline int hb_waiters_pending(struct futex_hash_bucket *hb) */ static struct futex_hash_bucket *hash_futex(union futex_key *key) { - u32 hash = jhash2((u32*)&key->both.word, - (sizeof(key->both.word)+sizeof(key->both.ptr))/4, + u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, key->both.offset); + return &futex_queues[hash & (futex_hashsize - 1)]; } @@ -429,7 +429,7 @@ static void get_futex_key_refs(union futex_key *key) switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: - ihold(key->shared.inode); /* implies smp_mb(); (B) */ + smp_mb(); /* explicit smp_mb(); (B) */ break; case FUT_OFF_MMSHARED: futex_get_mm(key); /* implies smp_mb(); (B) */ @@ -463,7 +463,6 @@ static void drop_futex_key_refs(union futex_key *key) switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: - iput(key->shared.inode); break; case FUT_OFF_MMSHARED: mmdrop(key->private.mm); @@ -505,6 +504,46 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, return timeout; } +/* + * Generate a machine wide unique identifier for this inode. + * + * This relies on u64 not wrapping in the life-time of the machine; which with + * 1ns resolution means almost 585 years. + * + * This further relies on the fact that a well formed program will not unmap + * the file while it has a (shared) futex waiting on it. This mapping will have + * a file reference which pins the mount and inode. + * + * If for some reason an inode gets evicted and read back in again, it will get + * a new sequence number and will _NOT_ match, even though it is the exact same + * file. + * + * It is important that match_futex() will never have a false-positive, esp. + * for PI futexes that can mess up the state. The above argues that false-negatives + * are only possible for malformed programs. + */ +static u64 get_inode_sequence_number(struct inode *inode) +{ + static atomic64_t i_seq; + u64 old; + + /* Does the inode already have a sequence number? */ + old = atomic64_read(&inode->i_sequence); + if (likely(old)) + return old; + + for (;;) { + u64 new = atomic64_add_return(1, &i_seq); + if (WARN_ON_ONCE(!new)) + continue; + + old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new); + if (old) + return old; + return new; + } +} + /** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex @@ -517,9 +556,15 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, * * The key words are stored in @key on success. * - * For shared mappings, it's (page->index, file_inode(vma->vm_file), - * offset_within_page). For private mappings, it's (uaddr, current->mm). - * We can usually work out the index without swapping in the page. + * For shared mappings (when @fshared), the key is: + * ( inode->i_sequence, page->index, offset_within_page ) + * [ also see get_inode_sequence_number() ] + * + * For private mappings (or when !@fshared), the key is: + * ( current->mm, address, 0 ) + * + * This allows (cross process, where applicable) identification of the futex + * without keeping the page pinned for the duration of the FUTEX_WAIT. * * lock_page() might sleep, the caller should not hold a spinlock. */ @@ -659,8 +704,6 @@ again: key->private.mm = mm; key->private.address = address; - get_futex_key_refs(key); /* implies smp_mb(); (B) */ - } else { struct inode *inode; @@ -692,40 +735,14 @@ again: goto again; } - /* - * Take a reference unless it is about to be freed. Previously - * this reference was taken by ihold under the page lock - * pinning the inode in place so i_lock was unnecessary. The - * only way for this check to fail is if the inode was - * truncated in parallel which is almost certainly an - * application bug. In such a case, just retry. - * - * We are not calling into get_futex_key_refs() in file-backed - * cases, therefore a successful atomic_inc return below will - * guarantee that get_futex_key() will still imply smp_mb(); (B). - */ - if (!atomic_inc_not_zero(&inode->i_count)) { - rcu_read_unlock(); - put_page(page); - - goto again; - } - - /* Should be impossible but lets be paranoid for now */ - if (WARN_ON_ONCE(inode->i_mapping != mapping)) { - err = -EFAULT; - rcu_read_unlock(); - iput(inode); - - goto out; - } - key->both.offset |= FUT_OFF_INODE; /* inode-based key */ - key->shared.inode = inode; + key->shared.i_seq = get_inode_sequence_number(inode); key->shared.pgoff = basepage_index(tail); rcu_read_unlock(); } + get_futex_key_refs(key); /* implies smp_mb(); (B) */ + out: put_page(page); return err; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 7eee98c38f25..fe40c658f86f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -323,7 +323,11 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, if (desc->affinity_notify) { kref_get(&desc->affinity_notify->kref); - schedule_work(&desc->affinity_notify->work); + if (!schedule_work(&desc->affinity_notify->work)) { + /* Work was already scheduled, drop our extra ref */ + kref_put(&desc->affinity_notify->kref, + desc->affinity_notify->release); + } } irqd_set(data, IRQD_AFFINITY_SET); @@ -423,7 +427,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) raw_spin_unlock_irqrestore(&desc->lock, flags); if (old_notify) { - cancel_work_sync(&old_notify->work); + if (cancel_work_sync(&old_notify->work)) { + /* Pending work had a ref, put that one too */ + kref_put(&old_notify->kref, old_notify->release); + } kref_put(&old_notify->kref, old_notify->release); } diff --git a/kernel/notifier.c b/kernel/notifier.c index 63d7501ac638..5989bbb93039 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -519,7 +519,7 @@ NOKPROBE_SYMBOL(notify_die); int register_die_notifier(struct notifier_block *nb) { - vmalloc_sync_all(); + vmalloc_sync_mappings(); return atomic_notifier_chain_register(&die_chain, nb); } EXPORT_SYMBOL_GPL(register_die_notifier); diff --git a/kernel/sys.c b/kernel/sys.c index f9bc5c303e3f..d325f3ab624a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -47,6 +47,7 @@ #include <linux/syscalls.h> #include <linux/kprobes.h> #include <linux/user_namespace.h> +#include <linux/time_namespace.h> #include <linux/binfmts.h> #include <linux/sched.h> @@ -2546,6 +2547,7 @@ static int do_sysinfo(struct sysinfo *info) memset(info, 0, sizeof(struct sysinfo)); ktime_get_boottime_ts64(&tp); + timens_add_boottime(&tp); info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 37ffceab608f..ca1796747a77 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -730,7 +730,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type) if (unlikely(!nmi_uaccess_okay())) return -EPERM; - if (in_nmi()) { + if (irqs_disabled()) { /* Do an early check on signal validity. Otherwise, * the error is lost in deferred irq_work. */ |