diff options
Diffstat (limited to 'kernel')
112 files changed, 5689 insertions, 1520 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore index 6e699100872f..34d1e77ee9df 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -1,5 +1,6 @@ # # Generated files # +kheaders.md5 timeconst.h hz.bc diff --git a/kernel/Makefile b/kernel/Makefile index 62471e75a2b0..33824f0385b3 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -71,6 +71,7 @@ obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_USER_NS) += user_namespace.o obj-$(CONFIG_PID_NS) += pid_namespace.o obj-$(CONFIG_IKCONFIG) += configs.o +obj-$(CONFIG_IKHEADERS_PROC) += kheaders.o obj-$(CONFIG_SMP) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o @@ -122,3 +123,12 @@ $(obj)/configs.o: $(obj)/config_data.gz targets += config_data.gz $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) + +$(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz + +quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz +cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_ikh_data.sh $@ +$(obj)/kheaders_data.tar.xz: FORCE + $(call cmd,genikh) + +clean-files := kheaders_data.tar.xz kheaders.md5 diff --git a/kernel/acct.c b/kernel/acct.c index addf7732fb56..81f9831a7859 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -227,7 +227,7 @@ static int acct_on(struct filename *pathname) filp_close(file, NULL); return PTR_ERR(internal); } - err = mnt_want_write(internal); + err = __mnt_want_write(internal); if (err) { mntput(internal); kfree(acct); @@ -252,7 +252,7 @@ static int acct_on(struct filename *pathname) old = xchg(&ns->bacct, &acct->pin); mutex_unlock(&acct->lock); pin_kill(old); - mnt_drop_write(mnt); + __mnt_drop_write(mnt); mntput(mnt); return 0; } diff --git a/kernel/async.c b/kernel/async.c index f6bd0d9885e1..12c332e4e13e 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -119,7 +119,7 @@ static void async_run_entry_fn(struct work_struct *work) /* 1) run (and print duration) */ if (initcall_debug && system_state < SYSTEM_RUNNING) { - pr_debug("calling %lli_%pF @ %i\n", + pr_debug("calling %lli_%pS @ %i\n", (long long)entry->cookie, entry->func, task_pid_nr(current)); calltime = ktime_get(); @@ -128,7 +128,7 @@ static void async_run_entry_fn(struct work_struct *work) if (initcall_debug && system_state < SYSTEM_RUNNING) { rettime = ktime_get(); delta = ktime_sub(rettime, calltime); - pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n", + pr_debug("initcall %lli_%pS returned 0 after %lld usecs\n", (long long)entry->cookie, entry->func, (long long)ktime_to_ns(delta) >> 10); diff --git a/kernel/audit.c b/kernel/audit.c index c89ea48c70a6..b96bf69183f4 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -2220,7 +2220,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, if (!audit_enabled) return; - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_LOGIN); if (!ab) return; diff --git a/kernel/audit.h b/kernel/audit.h index 958d5b8fc1b3..2071725a999f 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -231,7 +231,7 @@ extern int audit_comparator(const u32 left, const u32 op, const u32 right); extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right); extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right); extern int parent_len(const char *path); -extern int audit_compare_dname_path(const char *dname, const char *path, int plen); +extern int audit_compare_dname_path(const struct qstr *dname, const char *path, int plen); extern struct sk_buff *audit_make_reply(int seq, int type, int done, int multi, const void *payload, int size); extern void audit_panic(const char *message); diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 37ae95cfb7f4..b5737b826951 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -164,7 +164,7 @@ static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark) static int audit_mark_handle_event(struct fsnotify_group *group, struct inode *to_tell, u32 mask, const void *data, int data_type, - const unsigned char *dname, u32 cookie, + const struct qstr *dname, u32 cookie, struct fsnotify_iter_info *iter_info) { struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index abfb112f26aa..e49c912f862d 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -1040,7 +1040,7 @@ static void evict_chunk(struct audit_chunk *chunk) static int audit_tree_handle_event(struct fsnotify_group *group, struct inode *to_tell, u32 mask, const void *data, int data_type, - const unsigned char *file_name, u32 cookie, + const struct qstr *file_name, u32 cookie, struct fsnotify_iter_info *iter_info) { return 0; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index e8d1adeb2223..b50c574223fa 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -255,7 +255,7 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc /* Update inode info in audit rules based on filesystem event. */ static void audit_update_watch(struct audit_parent *parent, - const char *dname, dev_t dev, + const struct qstr *dname, dev_t dev, unsigned long ino, unsigned invalidating) { struct audit_watch *owatch, *nwatch, *nextw; @@ -482,7 +482,7 @@ void audit_remove_watch_rule(struct audit_krule *krule) static int audit_watch_handle_event(struct fsnotify_group *group, struct inode *to_tell, u32 mask, const void *data, int data_type, - const unsigned char *dname, u32 cookie, + const struct qstr *dname, u32 cookie, struct fsnotify_iter_info *iter_info) { struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 63f8b3f26fab..303fb04770ce 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1114,22 +1114,24 @@ int audit_rule_change(int type, int seq, void *data, size_t datasz) int err = 0; struct audit_entry *entry; - entry = audit_data_to_entry(data, datasz); - if (IS_ERR(entry)) - return PTR_ERR(entry); - switch (type) { case AUDIT_ADD_RULE: + entry = audit_data_to_entry(data, datasz); + if (IS_ERR(entry)) + return PTR_ERR(entry); err = audit_add_rule(entry); audit_log_rule_change("add_rule", &entry->rule, !err); break; case AUDIT_DEL_RULE: + entry = audit_data_to_entry(data, datasz); + if (IS_ERR(entry)) + return PTR_ERR(entry); err = audit_del_rule(entry); audit_log_rule_change("remove_rule", &entry->rule, !err); break; default: - err = -EINVAL; WARN_ON(1); + return -EINVAL; } if (err || type == AUDIT_DEL_RULE) { @@ -1290,12 +1292,12 @@ int parent_len(const char *path) * @parentlen: length of the parent if known. Passing in AUDIT_NAME_FULL * here indicates that we must compute this value. */ -int audit_compare_dname_path(const char *dname, const char *path, int parentlen) +int audit_compare_dname_path(const struct qstr *dname, const char *path, int parentlen) { int dlen, pathlen; const char *p; - dlen = strlen(dname); + dlen = dname->len; pathlen = strlen(path); if (pathlen < dlen) return 1; @@ -1306,7 +1308,7 @@ int audit_compare_dname_path(const char *dname, const char *path, int parentlen) p = path + parentlen; - return strncmp(p, dname, dlen); + return strncmp(p, dname->name, dlen); } int audit_filter(int msgtype, unsigned int listtype) @@ -1315,8 +1317,6 @@ int audit_filter(int msgtype, unsigned int listtype) int ret = 1; /* Audit by default */ rcu_read_lock(); - if (list_empty(&audit_filter_list[listtype])) - goto unlock_and_return; list_for_each_entry_rcu(e, &audit_filter_list[listtype], list) { int i, result = 0; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index d1eab1d4a930..95ae27edd417 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -771,15 +771,13 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, return AUDIT_DISABLED; rcu_read_lock(); - if (!list_empty(list)) { - list_for_each_entry_rcu(e, list, list) { - if (audit_in_mask(&e->rule, ctx->major) && - audit_filter_rules(tsk, &e->rule, ctx, NULL, - &state, false)) { - rcu_read_unlock(); - ctx->current_state = state; - return state; - } + list_for_each_entry_rcu(e, list, list) { + if (audit_in_mask(&e->rule, ctx->major) && + audit_filter_rules(tsk, &e->rule, ctx, NULL, + &state, false)) { + rcu_read_unlock(); + ctx->current_state = state; + return state; } } rcu_read_unlock(); @@ -798,9 +796,6 @@ static int audit_filter_inode_name(struct task_struct *tsk, struct audit_entry *e; enum audit_state state; - if (list_empty(list)) - return 0; - list_for_each_entry_rcu(e, list, list) { if (audit_in_mask(&e->rule, ctx->major) && audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) { @@ -808,7 +803,6 @@ static int audit_filter_inode_name(struct task_struct *tsk, return 1; } } - return 0; } @@ -840,6 +834,13 @@ static inline void audit_proctitle_free(struct audit_context *context) context->proctitle.len = 0; } +static inline void audit_free_module(struct audit_context *context) +{ + if (context->type == AUDIT_KERN_MODULE) { + kfree(context->module.name); + context->module.name = NULL; + } +} static inline void audit_free_names(struct audit_context *context) { struct audit_names *n, *next; @@ -923,6 +924,7 @@ int audit_alloc(struct task_struct *tsk) static inline void audit_free_context(struct audit_context *context) { + audit_free_module(context); audit_free_names(context); unroll_tree_refs(context, NULL, 0); free_tree_refs(context); @@ -1139,7 +1141,8 @@ out: kfree(buf_head); } -void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) +static void audit_log_cap(struct audit_buffer *ab, char *prefix, + kernel_cap_t *cap) { int i; @@ -1266,7 +1269,6 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_format(ab, "name="); if (context->module.name) { audit_log_untrustedstring(ab, context->module.name); - kfree(context->module.name); } else audit_log_format(ab, "(null)"); @@ -1628,7 +1630,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, return; } - context->arch = syscall_get_arch(); + context->arch = syscall_get_arch(current); context->major = major; context->argv[0] = a1; context->argv[1] = a2; @@ -1697,6 +1699,7 @@ void __audit_syscall_exit(int success, long return_code) context->in_syscall = 0; context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; + audit_free_module(context); audit_free_names(context); unroll_tree_refs(context, NULL, 0); audit_free_aux(context); @@ -1897,8 +1900,9 @@ static inline int audit_copy_fcaps(struct audit_names *name, } /* Copy inode data into an audit_names. */ -void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, - struct inode *inode, unsigned int flags) +static void audit_copy_inode(struct audit_names *name, + const struct dentry *dentry, + struct inode *inode, unsigned int flags) { name->ino = inode->i_ino; name->dev = inode->i_sb->s_dev; @@ -1935,18 +1939,16 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, return; rcu_read_lock(); - if (!list_empty(list)) { - list_for_each_entry_rcu(e, list, list) { - for (i = 0; i < e->rule.field_count; i++) { - struct audit_field *f = &e->rule.fields[i]; - - if (f->type == AUDIT_FSTYPE - && audit_comparator(inode->i_sb->s_magic, - f->op, f->val) - && e->rule.action == AUDIT_NEVER) { - rcu_read_unlock(); - return; - } + list_for_each_entry_rcu(e, list, list) { + for (i = 0; i < e->rule.field_count; i++) { + struct audit_field *f = &e->rule.fields[i]; + + if (f->type == AUDIT_FSTYPE + && audit_comparator(inode->i_sb->s_magic, + f->op, f->val) + && e->rule.action == AUDIT_NEVER) { + rcu_read_unlock(); + return; } } } @@ -2045,7 +2047,7 @@ void __audit_inode_child(struct inode *parent, { struct audit_context *context = audit_context(); struct inode *inode = d_backing_inode(dentry); - const char *dname = dentry->d_name.name; + const struct qstr *dname = &dentry->d_name; struct audit_names *n, *found_parent = NULL, *found_child = NULL; struct audit_entry *e; struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS]; @@ -2055,18 +2057,16 @@ void __audit_inode_child(struct inode *parent, return; rcu_read_lock(); - if (!list_empty(list)) { - list_for_each_entry_rcu(e, list, list) { - for (i = 0; i < e->rule.field_count; i++) { - struct audit_field *f = &e->rule.fields[i]; - - if (f->type == AUDIT_FSTYPE - && audit_comparator(parent->i_sb->s_magic, - f->op, f->val) - && e->rule.action == AUDIT_NEVER) { - rcu_read_unlock(); - return; - } + list_for_each_entry_rcu(e, list, list) { + for (i = 0; i < e->rule.field_count; i++) { + struct audit_field *f = &e->rule.fields[i]; + + if (f->type == AUDIT_FSTYPE + && audit_comparator(parent->i_sb->s_magic, + f->op, f->val) + && e->rule.action == AUDIT_NEVER) { + rcu_read_unlock(); + return; } } } @@ -2099,7 +2099,7 @@ void __audit_inode_child(struct inode *parent, (n->type != type && n->type != AUDIT_TYPE_UNKNOWN)) continue; - if (!strcmp(dname, n->name->name) || + if (!strcmp(dname->name, n->name->name) || !audit_compare_dname_path(dname, n->name->name, found_parent ? found_parent->name_len : @@ -2512,6 +2512,35 @@ void __audit_fanotify(unsigned int response) AUDIT_FANOTIFY, "resp=%u", response); } +void __audit_tk_injoffset(struct timespec64 offset) +{ + audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_INJOFFSET, + "sec=%lli nsec=%li", + (long long)offset.tv_sec, offset.tv_nsec); +} + +static void audit_log_ntp_val(const struct audit_ntp_data *ad, + const char *op, enum audit_ntp_type type) +{ + const struct audit_ntp_val *val = &ad->vals[type]; + + if (val->newval == val->oldval) + return; + + audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_ADJNTPVAL, + "op=%s old=%lli new=%lli", op, val->oldval, val->newval); +} + +void __audit_ntp_log(const struct audit_ntp_data *ad) +{ + audit_log_ntp_val(ad, "offset", AUDIT_NTP_OFFSET); + audit_log_ntp_val(ad, "freq", AUDIT_NTP_FREQ); + audit_log_ntp_val(ad, "status", AUDIT_NTP_STATUS); + audit_log_ntp_val(ad, "tai", AUDIT_NTP_TAI); + audit_log_ntp_val(ad, "tick", AUDIT_NTP_TICK); + audit_log_ntp_val(ad, "adjust", AUDIT_NTP_ADJUST); +} + static void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid; @@ -2580,7 +2609,7 @@ void audit_seccomp(unsigned long syscall, long signr, int code) return; audit_log_task(ab); audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x", - signr, syscall_get_arch(), syscall, + signr, syscall_get_arch(current), syscall, in_compat_syscall(), KSTK_EIP(current), code); audit_log_end(ab); } diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index c72e0d8e1e65..584636c9e2eb 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -22,7 +22,7 @@ #include "map_in_map.h" #define ARRAY_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) static void bpf_array_free_percpu(struct bpf_array *array) { @@ -63,6 +63,7 @@ int array_map_alloc_check(union bpf_attr *attr) if (attr->max_entries == 0 || attr->key_size != 4 || attr->value_size == 0 || attr->map_flags & ~ARRAY_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags) || (percpu && numa_node != NUMA_NO_NODE)) return -EINVAL; @@ -160,6 +161,36 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) return array->value + array->elem_size * (index & array->index_mask); } +static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, + u32 off) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + + if (map->max_entries != 1) + return -ENOTSUPP; + if (off >= map->value_size) + return -EINVAL; + + *imm = (unsigned long)array->value; + return 0; +} + +static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm, + u32 *off) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u64 base = (unsigned long)array->value; + u64 range = array->elem_size; + + if (map->max_entries != 1) + return -ENOTSUPP; + if (imm < base || imm >= base + range) + return -ENOENT; + + *off = imm - base; + return 0; +} + /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { @@ -360,7 +391,8 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key, return; } - seq_printf(m, "%u: ", *(u32 *)key); + if (map->btf_key_type_id) + seq_printf(m, "%u: ", *(u32 *)key); btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); seq_puts(m, "\n"); @@ -397,6 +429,18 @@ static int array_map_check_btf(const struct bpf_map *map, { u32 int_data; + /* One exception for keyless BTF: .bss/.data/.rodata map */ + if (btf_type_is_void(key_type)) { + if (map->map_type != BPF_MAP_TYPE_ARRAY || + map->max_entries != 1) + return -EINVAL; + + if (BTF_INFO_KIND(value_type->info) != BTF_KIND_DATASEC) + return -EINVAL; + + return 0; + } + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) return -EINVAL; @@ -419,6 +463,8 @@ const struct bpf_map_ops array_map_ops = { .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, .map_gen_lookup = array_map_gen_lookup, + .map_direct_value_addr = array_map_direct_value_addr, + .map_direct_value_meta = array_map_direct_value_meta, .map_seq_show_elem = array_map_seq_show_elem, .map_check_btf = array_map_check_btf, }; @@ -440,6 +486,9 @@ static int fd_array_map_alloc_check(union bpf_attr *attr) /* only file descriptors can be stored in this type of map */ if (attr->value_size != sizeof(u32)) return -EINVAL; + /* Program read-only/write-only not supported for special maps yet. */ + if (attr->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) + return -EINVAL; return array_map_alloc_check(attr); } diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index bd3921b1514b..cad09858a5f2 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -185,6 +185,16 @@ i < btf_type_vlen(struct_type); \ i++, member++) +#define for_each_vsi(i, struct_type, member) \ + for (i = 0, member = btf_type_var_secinfo(struct_type); \ + i < btf_type_vlen(struct_type); \ + i++, member++) + +#define for_each_vsi_from(i, from, struct_type, member) \ + for (i = from, member = btf_type_var_secinfo(struct_type) + from; \ + i < btf_type_vlen(struct_type); \ + i++, member++) + static DEFINE_IDR(btf_idr); static DEFINE_SPINLOCK(btf_idr_lock); @@ -262,6 +272,8 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_RESTRICT] = "RESTRICT", [BTF_KIND_FUNC] = "FUNC", [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", + [BTF_KIND_VAR] = "VAR", + [BTF_KIND_DATASEC] = "DATASEC", }; struct btf_kind_operations { @@ -314,7 +326,7 @@ static bool btf_type_is_modifier(const struct btf_type *t) return false; } -static bool btf_type_is_void(const struct btf_type *t) +bool btf_type_is_void(const struct btf_type *t) { return t == &btf_void; } @@ -375,13 +387,36 @@ static bool btf_type_is_int(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_INT; } +static bool btf_type_is_var(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_VAR; +} + +static bool btf_type_is_datasec(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; +} + +/* Types that act only as a source, not sink or intermediate + * type when resolving. + */ +static bool btf_type_is_resolve_source_only(const struct btf_type *t) +{ + return btf_type_is_var(t) || + btf_type_is_datasec(t); +} + /* What types need to be resolved? * * btf_type_is_modifier() is an obvious one. * * btf_type_is_struct() because its member refers to * another type (through member->type). - + * + * btf_type_is_var() because the variable refers to + * another type. btf_type_is_datasec() holds multiple + * btf_type_is_var() types that need resolving. + * * btf_type_is_array() because its element (array->type) * refers to another type. Array can be thought of a * special case of struct while array just has the same @@ -390,9 +425,11 @@ static bool btf_type_is_int(const struct btf_type *t) static bool btf_type_needs_resolve(const struct btf_type *t) { return btf_type_is_modifier(t) || - btf_type_is_ptr(t) || - btf_type_is_struct(t) || - btf_type_is_array(t); + btf_type_is_ptr(t) || + btf_type_is_struct(t) || + btf_type_is_array(t) || + btf_type_is_var(t) || + btf_type_is_datasec(t); } /* t->size can be used */ @@ -403,6 +440,7 @@ static bool btf_type_has_size(const struct btf_type *t) case BTF_KIND_STRUCT: case BTF_KIND_UNION: case BTF_KIND_ENUM: + case BTF_KIND_DATASEC: return true; } @@ -467,6 +505,16 @@ static const struct btf_enum *btf_type_enum(const struct btf_type *t) return (const struct btf_enum *)(t + 1); } +static const struct btf_var *btf_type_var(const struct btf_type *t) +{ + return (const struct btf_var *)(t + 1); +} + +static const struct btf_var_secinfo *btf_type_var_secinfo(const struct btf_type *t) +{ + return (const struct btf_var_secinfo *)(t + 1); +} + static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) { return kind_ops[BTF_INFO_KIND(t->info)]; @@ -478,23 +526,31 @@ static bool btf_name_offset_valid(const struct btf *btf, u32 offset) offset < btf->hdr.str_len; } -/* Only C-style identifier is permitted. This can be relaxed if - * necessary. - */ -static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) +static bool __btf_name_char_ok(char c, bool first, bool dot_ok) +{ + if ((first ? !isalpha(c) : + !isalnum(c)) && + c != '_' && + ((c == '.' && !dot_ok) || + c != '.')) + return false; + return true; +} + +static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok) { /* offset must be valid */ const char *src = &btf->strings[offset]; const char *src_limit; - if (!isalpha(*src) && *src != '_') + if (!__btf_name_char_ok(*src, true, dot_ok)) return false; /* set a limit on identifier length */ src_limit = src + KSYM_NAME_LEN; src++; while (*src && src < src_limit) { - if (!isalnum(*src) && *src != '_') + if (!__btf_name_char_ok(*src, false, dot_ok)) return false; src++; } @@ -502,6 +558,19 @@ static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) return !*src; } +/* Only C-style identifier is permitted. This can be relaxed if + * necessary. + */ +static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) +{ + return __btf_name_valid(btf, offset, false); +} + +static bool btf_name_valid_section(const struct btf *btf, u32 offset) +{ + return __btf_name_valid(btf, offset, true); +} + static const char *__btf_name_by_offset(const struct btf *btf, u32 offset) { if (!offset) @@ -697,6 +766,32 @@ static void btf_verifier_log_member(struct btf_verifier_env *env, __btf_verifier_log(log, "\n"); } +__printf(4, 5) +static void btf_verifier_log_vsi(struct btf_verifier_env *env, + const struct btf_type *datasec_type, + const struct btf_var_secinfo *vsi, + const char *fmt, ...) +{ + struct bpf_verifier_log *log = &env->log; + va_list args; + + if (!bpf_verifier_log_needed(log)) + return; + if (env->phase != CHECK_META) + btf_verifier_log_type(env, datasec_type, NULL); + + __btf_verifier_log(log, "\t type_id=%u offset=%u size=%u", + vsi->type, vsi->offset, vsi->size); + if (fmt && *fmt) { + __btf_verifier_log(log, " "); + va_start(args, fmt); + bpf_verifier_vlog(log, fmt, args); + va_end(args); + } + + __btf_verifier_log(log, "\n"); +} + static void btf_verifier_log_hdr(struct btf_verifier_env *env, u32 btf_data_size) { @@ -974,7 +1069,8 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, } else if (btf_type_is_ptr(size_type)) { size = sizeof(void *); } else { - if (WARN_ON_ONCE(!btf_type_is_modifier(size_type))) + if (WARN_ON_ONCE(!btf_type_is_modifier(size_type) && + !btf_type_is_var(size_type))) return NULL; size = btf->resolved_sizes[size_type_id]; @@ -1509,7 +1605,7 @@ static int btf_modifier_resolve(struct btf_verifier_env *env, u32 next_type_size = 0; next_type = btf_type_by_id(btf, next_type_id); - if (!next_type) { + if (!next_type || btf_type_is_resolve_source_only(next_type)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } @@ -1542,6 +1638,53 @@ static int btf_modifier_resolve(struct btf_verifier_env *env, return 0; } +static int btf_var_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + const struct btf_type *next_type; + const struct btf_type *t = v->t; + u32 next_type_id = t->type; + struct btf *btf = env->btf; + u32 next_type_size; + + next_type = btf_type_by_id(btf, next_type_id); + if (!next_type || btf_type_is_resolve_source_only(next_type)) { + btf_verifier_log_type(env, v->t, "Invalid type_id"); + return -EINVAL; + } + + if (!env_type_is_resolve_sink(env, next_type) && + !env_type_is_resolved(env, next_type_id)) + return env_stack_push(env, next_type, next_type_id); + + if (btf_type_is_modifier(next_type)) { + const struct btf_type *resolved_type; + u32 resolved_type_id; + + resolved_type_id = next_type_id; + resolved_type = btf_type_id_resolve(btf, &resolved_type_id); + + if (btf_type_is_ptr(resolved_type) && + !env_type_is_resolve_sink(env, resolved_type) && + !env_type_is_resolved(env, resolved_type_id)) + return env_stack_push(env, resolved_type, + resolved_type_id); + } + + /* We must resolve to something concrete at this point, no + * forward types or similar that would resolve to size of + * zero is allowed. + */ + if (!btf_type_id_size(btf, &next_type_id, &next_type_size)) { + btf_verifier_log_type(env, v->t, "Invalid type_id"); + return -EINVAL; + } + + env_stack_pop_resolved(env, next_type_id, next_type_size); + + return 0; +} + static int btf_ptr_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { @@ -1551,7 +1694,7 @@ static int btf_ptr_resolve(struct btf_verifier_env *env, struct btf *btf = env->btf; next_type = btf_type_by_id(btf, next_type_id); - if (!next_type) { + if (!next_type || btf_type_is_resolve_source_only(next_type)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } @@ -1609,6 +1752,15 @@ static void btf_modifier_seq_show(const struct btf *btf, btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m); } +static void btf_var_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct seq_file *m) +{ + t = btf_type_id_resolve(btf, &type_id); + + btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m); +} + static void btf_ptr_seq_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct seq_file *m) @@ -1776,7 +1928,8 @@ static int btf_array_resolve(struct btf_verifier_env *env, /* Check array->index_type */ index_type_id = array->index_type; index_type = btf_type_by_id(btf, index_type_id); - if (btf_type_nosize_or_null(index_type)) { + if (btf_type_is_resolve_source_only(index_type) || + btf_type_nosize_or_null(index_type)) { btf_verifier_log_type(env, v->t, "Invalid index"); return -EINVAL; } @@ -1795,7 +1948,8 @@ static int btf_array_resolve(struct btf_verifier_env *env, /* Check array->type */ elem_type_id = array->type; elem_type = btf_type_by_id(btf, elem_type_id); - if (btf_type_nosize_or_null(elem_type)) { + if (btf_type_is_resolve_source_only(elem_type) || + btf_type_nosize_or_null(elem_type)) { btf_verifier_log_type(env, v->t, "Invalid elem"); return -EINVAL; @@ -2016,7 +2170,8 @@ static int btf_struct_resolve(struct btf_verifier_env *env, const struct btf_type *member_type = btf_type_by_id(env->btf, member_type_id); - if (btf_type_nosize_or_null(member_type)) { + if (btf_type_is_resolve_source_only(member_type) || + btf_type_nosize_or_null(member_type)) { btf_verifier_log_member(env, v->t, member, "Invalid member"); return -EINVAL; @@ -2411,6 +2566,222 @@ static struct btf_kind_operations func_ops = { .seq_show = btf_df_seq_show, }; +static s32 btf_var_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + const struct btf_var *var; + u32 meta_needed = sizeof(*var); + + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen != 0"); + return -EINVAL; + } + + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + + if (!t->name_off || + !__btf_name_valid(env->btf, t->name_off, true)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + /* A var cannot be in type void */ + if (!t->type || !BTF_TYPE_ID_VALID(t->type)) { + btf_verifier_log_type(env, t, "Invalid type_id"); + return -EINVAL; + } + + var = btf_type_var(t); + if (var->linkage != BTF_VAR_STATIC && + var->linkage != BTF_VAR_GLOBAL_ALLOCATED) { + btf_verifier_log_type(env, t, "Linkage not supported"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + return meta_needed; +} + +static void btf_var_log(struct btf_verifier_env *env, const struct btf_type *t) +{ + const struct btf_var *var = btf_type_var(t); + + btf_verifier_log(env, "type_id=%u linkage=%u", t->type, var->linkage); +} + +static const struct btf_kind_operations var_ops = { + .check_meta = btf_var_check_meta, + .resolve = btf_var_resolve, + .check_member = btf_df_check_member, + .check_kflag_member = btf_df_check_kflag_member, + .log_details = btf_var_log, + .seq_show = btf_var_seq_show, +}; + +static s32 btf_datasec_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + const struct btf_var_secinfo *vsi; + u64 last_vsi_end_off = 0, sum = 0; + u32 i, meta_needed; + + meta_needed = btf_type_vlen(t) * sizeof(*vsi); + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (!btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen == 0"); + return -EINVAL; + } + + if (!t->size) { + btf_verifier_log_type(env, t, "size == 0"); + return -EINVAL; + } + + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + + if (!t->name_off || + !btf_name_valid_section(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + for_each_vsi(i, t, vsi) { + /* A var cannot be in type void */ + if (!vsi->type || !BTF_TYPE_ID_VALID(vsi->type)) { + btf_verifier_log_vsi(env, t, vsi, + "Invalid type_id"); + return -EINVAL; + } + + if (vsi->offset < last_vsi_end_off || vsi->offset >= t->size) { + btf_verifier_log_vsi(env, t, vsi, + "Invalid offset"); + return -EINVAL; + } + + if (!vsi->size || vsi->size > t->size) { + btf_verifier_log_vsi(env, t, vsi, + "Invalid size"); + return -EINVAL; + } + + last_vsi_end_off = vsi->offset + vsi->size; + if (last_vsi_end_off > t->size) { + btf_verifier_log_vsi(env, t, vsi, + "Invalid offset+size"); + return -EINVAL; + } + + btf_verifier_log_vsi(env, t, vsi, NULL); + sum += vsi->size; + } + + if (t->size < sum) { + btf_verifier_log_type(env, t, "Invalid btf_info size"); + return -EINVAL; + } + + return meta_needed; +} + +static int btf_datasec_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + const struct btf_var_secinfo *vsi; + struct btf *btf = env->btf; + u16 i; + + for_each_vsi_from(i, v->next_member, v->t, vsi) { + u32 var_type_id = vsi->type, type_id, type_size = 0; + const struct btf_type *var_type = btf_type_by_id(env->btf, + var_type_id); + if (!var_type || !btf_type_is_var(var_type)) { + btf_verifier_log_vsi(env, v->t, vsi, + "Not a VAR kind member"); + return -EINVAL; + } + + if (!env_type_is_resolve_sink(env, var_type) && + !env_type_is_resolved(env, var_type_id)) { + env_stack_set_next_member(env, i + 1); + return env_stack_push(env, var_type, var_type_id); + } + + type_id = var_type->type; + if (!btf_type_id_size(btf, &type_id, &type_size)) { + btf_verifier_log_vsi(env, v->t, vsi, "Invalid type"); + return -EINVAL; + } + + if (vsi->size < type_size) { + btf_verifier_log_vsi(env, v->t, vsi, "Invalid size"); + return -EINVAL; + } + } + + env_stack_pop_resolved(env, 0, 0); + return 0; +} + +static void btf_datasec_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); +} + +static void btf_datasec_seq_show(const struct btf *btf, + const struct btf_type *t, u32 type_id, + void *data, u8 bits_offset, + struct seq_file *m) +{ + const struct btf_var_secinfo *vsi; + const struct btf_type *var; + u32 i; + + seq_printf(m, "section (\"%s\") = {", __btf_name_by_offset(btf, t->name_off)); + for_each_vsi(i, t, vsi) { + var = btf_type_by_id(btf, vsi->type); + if (i) + seq_puts(m, ","); + btf_type_ops(var)->seq_show(btf, var, vsi->type, + data + vsi->offset, bits_offset, m); + } + seq_puts(m, "}"); +} + +static const struct btf_kind_operations datasec_ops = { + .check_meta = btf_datasec_check_meta, + .resolve = btf_datasec_resolve, + .check_member = btf_df_check_member, + .check_kflag_member = btf_df_check_kflag_member, + .log_details = btf_datasec_log, + .seq_show = btf_datasec_seq_show, +}; + static int btf_func_proto_check(struct btf_verifier_env *env, const struct btf_type *t) { @@ -2542,6 +2913,8 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { [BTF_KIND_RESTRICT] = &modifier_ops, [BTF_KIND_FUNC] = &func_ops, [BTF_KIND_FUNC_PROTO] = &func_proto_ops, + [BTF_KIND_VAR] = &var_ops, + [BTF_KIND_DATASEC] = &datasec_ops, }; static s32 btf_check_meta(struct btf_verifier_env *env, @@ -2622,13 +2995,17 @@ static bool btf_resolve_valid(struct btf_verifier_env *env, if (!env_type_is_resolved(env, type_id)) return false; - if (btf_type_is_struct(t)) + if (btf_type_is_struct(t) || btf_type_is_datasec(t)) return !btf->resolved_ids[type_id] && - !btf->resolved_sizes[type_id]; + !btf->resolved_sizes[type_id]; - if (btf_type_is_modifier(t) || btf_type_is_ptr(t)) { + if (btf_type_is_modifier(t) || btf_type_is_ptr(t) || + btf_type_is_var(t)) { t = btf_type_id_resolve(btf, &type_id); - return t && !btf_type_is_modifier(t); + return t && + !btf_type_is_modifier(t) && + !btf_type_is_var(t) && + !btf_type_is_datasec(t); } if (btf_type_is_array(t)) { diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 4e807973aa80..fcde0f7b2585 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -11,7 +11,10 @@ #include <linux/kernel.h> #include <linux/atomic.h> #include <linux/cgroup.h> +#include <linux/filter.h> #include <linux/slab.h> +#include <linux/sysctl.h> +#include <linux/string.h> #include <linux/bpf.h> #include <linux/bpf-cgroup.h> #include <net/sock.h> @@ -701,7 +704,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission); static const struct bpf_func_proto * -cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -710,6 +713,12 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_update_elem_proto; case BPF_FUNC_map_delete_elem: return &bpf_map_delete_elem_proto; + case BPF_FUNC_map_push_elem: + return &bpf_map_push_elem_proto; + case BPF_FUNC_map_pop_elem: + return &bpf_map_pop_elem_proto; + case BPF_FUNC_map_peek_elem: + return &bpf_map_peek_elem_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_local_storage: @@ -725,6 +734,12 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +static const struct bpf_func_proto * +cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return cgroup_base_func_proto(func_id, prog); +} + static bool cgroup_dev_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, @@ -762,3 +777,356 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { .get_func_proto = cgroup_dev_func_proto, .is_valid_access = cgroup_dev_is_valid_access, }; + +/** + * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl + * + * @head: sysctl table header + * @table: sysctl table + * @write: sysctl is being read (= 0) or written (= 1) + * @buf: pointer to buffer passed by user space + * @pcount: value-result argument: value is size of buffer pointed to by @buf, + * result is size of @new_buf if program set new value, initial value + * otherwise + * @ppos: value-result argument: value is position at which read from or write + * to sysctl is happening, result is new position if program overrode it, + * initial value otherwise + * @new_buf: pointer to pointer to new buffer that will be allocated if program + * overrides new value provided by user space on sysctl write + * NOTE: it's caller responsibility to free *new_buf if it was set + * @type: type of program to be executed + * + * Program is run when sysctl is being accessed, either read or written, and + * can allow or deny such access. + * + * This function will return %-EPERM if an attached program is found and + * returned value != 1 during execution. In all other cases 0 is returned. + */ +int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, + struct ctl_table *table, int write, + void __user *buf, size_t *pcount, + loff_t *ppos, void **new_buf, + enum bpf_attach_type type) +{ + struct bpf_sysctl_kern ctx = { + .head = head, + .table = table, + .write = write, + .ppos = ppos, + .cur_val = NULL, + .cur_len = PAGE_SIZE, + .new_val = NULL, + .new_len = 0, + .new_updated = 0, + }; + struct cgroup *cgrp; + int ret; + + ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL); + if (ctx.cur_val) { + mm_segment_t old_fs; + loff_t pos = 0; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + if (table->proc_handler(table, 0, (void __user *)ctx.cur_val, + &ctx.cur_len, &pos)) { + /* Let BPF program decide how to proceed. */ + ctx.cur_len = 0; + } + set_fs(old_fs); + } else { + /* Let BPF program decide how to proceed. */ + ctx.cur_len = 0; + } + + if (write && buf && *pcount) { + /* BPF program should be able to override new value with a + * buffer bigger than provided by user. + */ + ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL); + ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount); + if (!ctx.new_val || + copy_from_user(ctx.new_val, buf, ctx.new_len)) + /* Let BPF program decide how to proceed. */ + ctx.new_len = 0; + } + + rcu_read_lock(); + cgrp = task_dfl_cgroup(current); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); + rcu_read_unlock(); + + kfree(ctx.cur_val); + + if (ret == 1 && ctx.new_updated) { + *new_buf = ctx.new_val; + *pcount = ctx.new_len; + } else { + kfree(ctx.new_val); + } + + return ret == 1 ? 0 : -EPERM; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); + +static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, + size_t *lenp) +{ + ssize_t tmp_ret = 0, ret; + + if (dir->header.parent) { + tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp); + if (tmp_ret < 0) + return tmp_ret; + } + + ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp); + if (ret < 0) + return ret; + *bufp += ret; + *lenp -= ret; + ret += tmp_ret; + + /* Avoid leading slash. */ + if (!ret) + return ret; + + tmp_ret = strscpy(*bufp, "/", *lenp); + if (tmp_ret < 0) + return tmp_ret; + *bufp += tmp_ret; + *lenp -= tmp_ret; + + return ret + tmp_ret; +} + +BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf, + size_t, buf_len, u64, flags) +{ + ssize_t tmp_ret = 0, ret; + + if (!buf) + return -EINVAL; + + if (!(flags & BPF_F_SYSCTL_BASE_NAME)) { + if (!ctx->head) + return -EINVAL; + tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len); + if (tmp_ret < 0) + return tmp_ret; + } + + ret = strscpy(buf, ctx->table->procname, buf_len); + + return ret < 0 ? ret : tmp_ret + ret; +} + +static const struct bpf_func_proto bpf_sysctl_get_name_proto = { + .func = bpf_sysctl_get_name, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +static int copy_sysctl_value(char *dst, size_t dst_len, char *src, + size_t src_len) +{ + if (!dst) + return -EINVAL; + + if (!dst_len) + return -E2BIG; + + if (!src || !src_len) { + memset(dst, 0, dst_len); + return -EINVAL; + } + + memcpy(dst, src, min(dst_len, src_len)); + + if (dst_len > src_len) { + memset(dst + src_len, '\0', dst_len - src_len); + return src_len; + } + + dst[dst_len - 1] = '\0'; + + return -E2BIG; +} + +BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx, + char *, buf, size_t, buf_len) +{ + return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len); +} + +static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = { + .func = bpf_sysctl_get_current_value, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf, + size_t, buf_len) +{ + if (!ctx->write) { + if (buf && buf_len) + memset(buf, '\0', buf_len); + return -EINVAL; + } + return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len); +} + +static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = { + .func = bpf_sysctl_get_new_value, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx, + const char *, buf, size_t, buf_len) +{ + if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len) + return -EINVAL; + + if (buf_len > PAGE_SIZE - 1) + return -E2BIG; + + memcpy(ctx->new_val, buf, buf_len); + ctx->new_len = buf_len; + ctx->new_updated = 1; + + return 0; +} + +static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = { + .func = bpf_sysctl_set_new_value, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +static const struct bpf_func_proto * +sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_strtol: + return &bpf_strtol_proto; + case BPF_FUNC_strtoul: + return &bpf_strtoul_proto; + case BPF_FUNC_sysctl_get_name: + return &bpf_sysctl_get_name_proto; + case BPF_FUNC_sysctl_get_current_value: + return &bpf_sysctl_get_current_value_proto; + case BPF_FUNC_sysctl_get_new_value: + return &bpf_sysctl_get_new_value_proto; + case BPF_FUNC_sysctl_set_new_value: + return &bpf_sysctl_set_new_value_proto; + default: + return cgroup_base_func_proto(func_id, prog); + } +} + +static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + + if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size) + return false; + + switch (off) { + case offsetof(struct bpf_sysctl, write): + if (type != BPF_READ) + return false; + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + case offsetof(struct bpf_sysctl, file_pos): + if (type == BPF_READ) { + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + } else { + return size == size_default; + } + default: + return false; + } +} + +static u32 sysctl_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct bpf_sysctl, write): + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sysctl_kern, write, + FIELD_SIZEOF(struct bpf_sysctl_kern, + write), + target_size)); + break; + case offsetof(struct bpf_sysctl, file_pos): + /* ppos is a pointer so it should be accessed via indirect + * loads and stores. Also for stores additional temporary + * register is used since neither src_reg nor dst_reg can be + * overridden. + */ + if (type == BPF_WRITE) { + int treg = BPF_REG_9; + + if (si->src_reg == treg || si->dst_reg == treg) + --treg; + if (si->src_reg == treg || si->dst_reg == treg) + --treg; + *insn++ = BPF_STX_MEM( + BPF_DW, si->dst_reg, treg, + offsetof(struct bpf_sysctl_kern, tmp_reg)); + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), + treg, si->dst_reg, + offsetof(struct bpf_sysctl_kern, ppos)); + *insn++ = BPF_STX_MEM( + BPF_SIZEOF(u32), treg, si->src_reg, 0); + *insn++ = BPF_LDX_MEM( + BPF_DW, treg, si->dst_reg, + offsetof(struct bpf_sysctl_kern, tmp_reg)); + } else { + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sysctl_kern, ppos)); + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->dst_reg, 0); + } + *target_size = sizeof(u32); + break; + } + + return insn - insn_buf; +} + +const struct bpf_verifier_ops cg_sysctl_verifier_ops = { + .get_func_proto = sysctl_func_proto, + .is_valid_access = sysctl_is_valid_access, + .convert_ctx_access = sysctl_convert_ctx_access, +}; + +const struct bpf_prog_ops cg_sysctl_prog_ops = { +}; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c605397c79f0..242a643af82f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -292,7 +292,8 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) dst[i] = fp->insnsi[i]; if (!was_ld_map && dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) && - dst[i].src_reg == BPF_PSEUDO_MAP_FD) { + (dst[i].src_reg == BPF_PSEUDO_MAP_FD || + dst[i].src_reg == BPF_PSEUDO_MAP_VALUE)) { was_ld_map = true; dst[i].imm = 0; } else if (was_ld_map && @@ -337,7 +338,7 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) } static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old, - s32 end_new, u32 curr, const bool probe_pass) + s32 end_new, s32 curr, const bool probe_pass) { const s64 imm_min = S32_MIN, imm_max = S32_MAX; s32 delta = end_new - end_old; @@ -355,7 +356,7 @@ static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old, } static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old, - s32 end_new, u32 curr, const bool probe_pass) + s32 end_new, s32 curr, const bool probe_pass) { const s32 off_min = S16_MIN, off_max = S16_MAX; s32 delta = end_new - end_old; @@ -438,6 +439,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, u32 insn_adj_cnt, insn_rest, insn_delta = len - 1; const u32 cnt_max = S16_MAX; struct bpf_prog *prog_adj; + int err; /* Since our patchlet doesn't expand the image, we're done. */ if (insn_delta == 0) { @@ -453,8 +455,8 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, * we afterwards may not fail anymore. */ if (insn_adj_cnt > cnt_max && - bpf_adj_branches(prog, off, off + 1, off + len, true)) - return NULL; + (err = bpf_adj_branches(prog, off, off + 1, off + len, true))) + return ERR_PTR(err); /* Several new instructions need to be inserted. Make room * for them. Likely, there's no need for a new allocation as @@ -463,7 +465,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt), GFP_USER); if (!prog_adj) - return NULL; + return ERR_PTR(-ENOMEM); prog_adj->len = insn_adj_cnt; @@ -1095,13 +1097,13 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) continue; tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten); - if (!tmp) { + if (IS_ERR(tmp)) { /* Patching may have repointed aux->prog during * realloc from the original one, so we need to * fix it up here on error. */ bpf_jit_prog_release_other(prog, clone); - return ERR_PTR(-ENOMEM); + return tmp; } clone = tmp; diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 3c18260403dd..cf727d77c6c6 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -160,12 +160,12 @@ static void cpu_map_kthread_stop(struct work_struct *work) } static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, - struct xdp_frame *xdpf) + struct xdp_frame *xdpf, + struct sk_buff *skb) { unsigned int hard_start_headroom; unsigned int frame_size; void *pkt_data_start; - struct sk_buff *skb; /* Part of headroom was reserved to xdpf */ hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom; @@ -191,8 +191,8 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); pkt_data_start = xdpf->data - hard_start_headroom; - skb = build_skb(pkt_data_start, frame_size); - if (!skb) + skb = build_skb_around(skb, pkt_data_start, frame_size); + if (unlikely(!skb)) return NULL; skb_reserve(skb, hard_start_headroom); @@ -240,6 +240,8 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) } } +#define CPUMAP_BATCH 8 + static int cpu_map_kthread_run(void *data) { struct bpf_cpu_map_entry *rcpu = data; @@ -252,8 +254,11 @@ static int cpu_map_kthread_run(void *data) * kthread_stop signal until queue is empty. */ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { - unsigned int processed = 0, drops = 0, sched = 0; - struct xdp_frame *xdpf; + unsigned int drops = 0, sched = 0; + void *frames[CPUMAP_BATCH]; + void *skbs[CPUMAP_BATCH]; + gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; + int i, n, m; /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { @@ -269,18 +274,38 @@ static int cpu_map_kthread_run(void *data) sched = cond_resched(); } - /* Process packets in rcpu->queue */ - local_bh_disable(); /* * The bpf_cpu_map_entry is single consumer, with this * kthread CPU pinned. Lockless access to ptr_ring * consume side valid as no-resize allowed of queue. */ - while ((xdpf = __ptr_ring_consume(rcpu->queue))) { - struct sk_buff *skb; + n = ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); + + for (i = 0; i < n; i++) { + void *f = frames[i]; + struct page *page = virt_to_page(f); + + /* Bring struct page memory area to curr CPU. Read by + * build_skb_around via page_is_pfmemalloc(), and when + * freed written by page_frag_free call. + */ + prefetchw(page); + } + + m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, n, skbs); + if (unlikely(m == 0)) { + for (i = 0; i < n; i++) + skbs[i] = NULL; /* effect: xdp_return_frame */ + drops = n; + } + + local_bh_disable(); + for (i = 0; i < n; i++) { + struct xdp_frame *xdpf = frames[i]; + struct sk_buff *skb = skbs[i]; int ret; - skb = cpu_map_build_skb(rcpu, xdpf); + skb = cpu_map_build_skb(rcpu, xdpf, skb); if (!skb) { xdp_return_frame(xdpf); continue; @@ -290,13 +315,9 @@ static int cpu_map_kthread_run(void *data) ret = netif_receive_skb_core(skb); if (ret == NET_RX_DROP) drops++; - - /* Limit BH-disable period */ - if (++processed == 8) - break; } /* Feedback loop via tracepoint */ - trace_xdp_cpumap_kthread(rcpu->map_id, processed, drops, sched); + trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched); local_bh_enable(); /* resched point, may call do_softirq() */ } diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index de73f55e42fd..d9ce383c0f9c 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -205,10 +205,11 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, * part of the ldimm64 insn is accessible. */ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; - bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; + bool is_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD || + insn->src_reg == BPF_PSEUDO_MAP_VALUE; char tmp[64]; - if (map_ptr && !allow_ptr_leaks) + if (is_ptr && !allow_ptr_leaks) imm = 0; verbose(cbs->private_data, "(%02x) r%d = %s\n", diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index fed15cf94dca..192d32e77db3 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -23,7 +23,7 @@ #define HTAB_CREATE_FLAG_MASK \ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ - BPF_F_RDONLY | BPF_F_WRONLY | BPF_F_ZERO_SEED) + BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED) struct bucket { struct hlist_nulls_head head; @@ -262,8 +262,8 @@ static int htab_map_alloc_check(union bpf_attr *attr) /* Guard against local DoS, and discourage production use. */ return -EPERM; - if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK) - /* reserved bits should not be used */ + if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags)) return -EINVAL; if (!lru && percpu_lru) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a411fc17d265..4266ffde07ca 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -18,6 +18,9 @@ #include <linux/sched.h> #include <linux/uidgid.h> #include <linux/filter.h> +#include <linux/ctype.h> + +#include "../../lib/kstrtox.h" /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return @@ -363,4 +366,132 @@ const struct bpf_func_proto bpf_get_local_storage_proto = { .arg2_type = ARG_ANYTHING, }; #endif + +#define BPF_STRTOX_BASE_MASK 0x1F + +static int __bpf_strtoull(const char *buf, size_t buf_len, u64 flags, + unsigned long long *res, bool *is_negative) +{ + unsigned int base = flags & BPF_STRTOX_BASE_MASK; + const char *cur_buf = buf; + size_t cur_len = buf_len; + unsigned int consumed; + size_t val_len; + char str[64]; + + if (!buf || !buf_len || !res || !is_negative) + return -EINVAL; + + if (base != 0 && base != 8 && base != 10 && base != 16) + return -EINVAL; + + if (flags & ~BPF_STRTOX_BASE_MASK) + return -EINVAL; + + while (cur_buf < buf + buf_len && isspace(*cur_buf)) + ++cur_buf; + + *is_negative = (cur_buf < buf + buf_len && *cur_buf == '-'); + if (*is_negative) + ++cur_buf; + + consumed = cur_buf - buf; + cur_len -= consumed; + if (!cur_len) + return -EINVAL; + + cur_len = min(cur_len, sizeof(str) - 1); + memcpy(str, cur_buf, cur_len); + str[cur_len] = '\0'; + cur_buf = str; + + cur_buf = _parse_integer_fixup_radix(cur_buf, &base); + val_len = _parse_integer(cur_buf, base, res); + + if (val_len & KSTRTOX_OVERFLOW) + return -ERANGE; + + if (val_len == 0) + return -EINVAL; + + cur_buf += val_len; + consumed += cur_buf - str; + + return consumed; +} + +static int __bpf_strtoll(const char *buf, size_t buf_len, u64 flags, + long long *res) +{ + unsigned long long _res; + bool is_negative; + int err; + + err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative); + if (err < 0) + return err; + if (is_negative) { + if ((long long)-_res > 0) + return -ERANGE; + *res = -_res; + } else { + if ((long long)_res < 0) + return -ERANGE; + *res = _res; + } + return err; +} + +BPF_CALL_4(bpf_strtol, const char *, buf, size_t, buf_len, u64, flags, + long *, res) +{ + long long _res; + int err; + + err = __bpf_strtoll(buf, buf_len, flags, &_res); + if (err < 0) + return err; + if (_res != (long)_res) + return -ERANGE; + *res = _res; + return err; +} + +const struct bpf_func_proto bpf_strtol_proto = { + .func = bpf_strtol, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_LONG, +}; + +BPF_CALL_4(bpf_strtoul, const char *, buf, size_t, buf_len, u64, flags, + unsigned long *, res) +{ + unsigned long long _res; + bool is_negative; + int err; + + err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative); + if (err < 0) + return err; + if (is_negative) + return -EINVAL; + if (_res != (unsigned long)_res) + return -ERANGE; + *res = _res; + return err; +} + +const struct bpf_func_proto bpf_strtoul_proto = { + .func = bpf_strtoul, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_LONG, +}; #endif diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 4a8f390a2b82..bc53e5b20ddc 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -566,9 +566,8 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root) return 0; } -static void bpf_destroy_inode_deferred(struct rcu_head *head) +static void bpf_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); enum bpf_type type; if (S_ISLNK(inode->i_mode)) @@ -578,16 +577,11 @@ static void bpf_destroy_inode_deferred(struct rcu_head *head) free_inode_nonrcu(inode); } -static void bpf_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, bpf_destroy_inode_deferred); -} - static const struct super_operations bpf_super_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, .show_options = bpf_show_options, - .destroy_inode = bpf_destroy_inode, + .free_inode = bpf_free_inode, }; enum { diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 6b572e2de7fb..980e8f1f6cb5 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -14,7 +14,7 @@ DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STO #ifdef CONFIG_CGROUP_BPF #define LOCAL_STORAGE_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) struct bpf_cgroup_storage_map { struct bpf_map map; @@ -282,8 +282,8 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) if (attr->value_size > PAGE_SIZE) return ERR_PTR(-E2BIG); - if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK) - /* reserved bits should not be used */ + if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags)) return ERR_PTR(-EINVAL); if (attr->max_entries) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 93a5cbbde421..e61630c2e50b 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -538,7 +538,7 @@ out: #define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN) #define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE | \ - BPF_F_RDONLY | BPF_F_WRONLY) + BPF_F_ACCESS_MASK) static struct bpf_map *trie_alloc(union bpf_attr *attr) { @@ -553,6 +553,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) if (attr->max_entries == 0 || !(attr->map_flags & BPF_F_NO_PREALLOC) || attr->map_flags & ~LPM_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags) || attr->key_size < LPM_KEY_SIZE_MIN || attr->key_size > LPM_KEY_SIZE_MAX || attr->value_size < LPM_VAL_SIZE_MIN || diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index b384ea9f3254..0b140d236889 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -11,8 +11,7 @@ #include "percpu_freelist.h" #define QUEUE_STACK_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) - + (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) struct bpf_queue_stack { struct bpf_map map; @@ -52,7 +51,8 @@ static int queue_stack_map_alloc_check(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 0 || attr->value_size == 0 || - attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK) + attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags)) return -EINVAL; if (attr->value_size > KMALLOC_MAX_SIZE) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index afca36f53c49..ad3ccf82f31d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -166,13 +166,25 @@ void bpf_map_area_free(void *area) kvfree(area); } +static u32 bpf_map_flags_retain_permanent(u32 flags) +{ + /* Some map creation flags are not tied to the map object but + * rather to the map fd instead, so they have no meaning upon + * map object inspection since multiple file descriptors with + * different (access) properties can exist here. Thus, given + * this has zero meaning for the map itself, lets clear these + * from here. + */ + return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY); +} + void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) { map->map_type = attr->map_type; map->key_size = attr->key_size; map->value_size = attr->value_size; map->max_entries = attr->max_entries; - map->map_flags = attr->map_flags; + map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags); map->numa_node = bpf_map_attr_numa_node(attr); } @@ -343,6 +355,18 @@ static int bpf_map_release(struct inode *inode, struct file *filp) return 0; } +static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f) +{ + fmode_t mode = f.file->f_mode; + + /* Our file permissions may have been overridden by global + * map permissions facing syscall side. + */ + if (READ_ONCE(map->frozen)) + mode &= ~FMODE_CAN_WRITE; + return mode; +} + #ifdef CONFIG_PROC_FS static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) { @@ -364,14 +388,16 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) "max_entries:\t%u\n" "map_flags:\t%#x\n" "memlock:\t%llu\n" - "map_id:\t%u\n", + "map_id:\t%u\n" + "frozen:\t%u\n", map->map_type, map->key_size, map->value_size, map->max_entries, map->map_flags, map->pages * 1ULL << PAGE_SHIFT, - map->id); + map->id, + READ_ONCE(map->frozen)); if (owner_prog_type) { seq_printf(m, "owner_prog_type:\t%u\n", @@ -448,10 +474,10 @@ static int bpf_obj_name_cpy(char *dst, const char *src) const char *end = src + BPF_OBJ_NAME_LEN; memset(dst, 0, BPF_OBJ_NAME_LEN); - - /* Copy all isalnum() and '_' char */ + /* Copy all isalnum(), '_' and '.' chars. */ while (src < end && *src) { - if (!isalnum(*src) && *src != '_') + if (!isalnum(*src) && + *src != '_' && *src != '.') return -EINVAL; *dst++ = *src++; } @@ -478,9 +504,16 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, u32 key_size, value_size; int ret = 0; - key_type = btf_type_id_size(btf, &btf_key_id, &key_size); - if (!key_type || key_size != map->key_size) - return -EINVAL; + /* Some maps allow key to be unspecified. */ + if (btf_key_id) { + key_type = btf_type_id_size(btf, &btf_key_id, &key_size); + if (!key_type || key_size != map->key_size) + return -EINVAL; + } else { + key_type = btf_type_by_id(btf, 0); + if (!map->ops->map_check_btf) + return -EINVAL; + } value_type = btf_type_id_size(btf, &btf_value_id, &value_size); if (!value_type || value_size != map->value_size) @@ -489,9 +522,12 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, map->spin_lock_off = btf_find_spin_lock(btf, value_type); if (map_value_has_spin_lock(map)) { + if (map->map_flags & BPF_F_RDONLY_PROG) + return -EACCES; if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_ARRAY && - map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) + map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && + map->map_type != BPF_MAP_TYPE_SK_STORAGE) return -ENOTSUPP; if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > map->value_size) { @@ -545,7 +581,7 @@ static int map_create(union bpf_attr *attr) if (attr->btf_key_type_id || attr->btf_value_type_id) { struct btf *btf; - if (!attr->btf_key_type_id || !attr->btf_value_type_id) { + if (!attr->btf_value_type_id) { err = -EINVAL; goto free_map_nouncharge; } @@ -713,8 +749,7 @@ static int map_lookup_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_READ)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { err = -EPERM; goto err_put; } @@ -843,8 +878,7 @@ static int map_update_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } @@ -955,8 +989,7 @@ static int map_delete_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } @@ -1007,8 +1040,7 @@ static int map_get_next_key(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_READ)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { err = -EPERM; goto err_put; } @@ -1075,8 +1107,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } @@ -1118,6 +1149,36 @@ err_put: return err; } +#define BPF_MAP_FREEZE_LAST_FIELD map_fd + +static int map_freeze(const union bpf_attr *attr) +{ + int err = 0, ufd = attr->map_fd; + struct bpf_map *map; + struct fd f; + + if (CHECK_ATTR(BPF_MAP_FREEZE)) + return -EINVAL; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + if (READ_ONCE(map->frozen)) { + err = -EBUSY; + goto err_put; + } + if (!capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto err_put; + } + + WRITE_ONCE(map->frozen, true); +err_put: + fdput(f); + return err; +} + static const struct bpf_prog_ops * const bpf_prog_types[] = { #define BPF_PROG_TYPE(_id, _name) \ [_id] = & _name ## _prog_ops, @@ -1557,7 +1618,8 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) /* eBPF programs must be GPL compatible to use GPL-ed functions */ is_gpl = license_is_gpl_compatible(license); - if (attr->insn_cnt == 0 || attr->insn_cnt > BPF_MAXINSNS) + if (attr->insn_cnt == 0 || + attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) return -E2BIG; if (type != BPF_PROG_TYPE_SOCKET_FILTER && type != BPF_PROG_TYPE_CGROUP_SKB && @@ -1728,12 +1790,16 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) } raw_tp->btp = btp; - prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd, - BPF_PROG_TYPE_RAW_TRACEPOINT); + prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); if (IS_ERR(prog)) { err = PTR_ERR(prog); goto out_free_tp; } + if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT && + prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) { + err = -EINVAL; + goto out_put_prog; + } err = bpf_probe_register(raw_tp->btp, prog); if (err) @@ -1827,6 +1893,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_FLOW_DISSECTOR: ptype = BPF_PROG_TYPE_FLOW_DISSECTOR; break; + case BPF_CGROUP_SYSCTL: + ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; + break; default: return -EINVAL; } @@ -1905,6 +1974,9 @@ static int bpf_prog_detach(const union bpf_attr *attr) return lirc_prog_detach(attr); case BPF_FLOW_DISSECTOR: return skb_flow_dissector_bpf_prog_detach(attr); + case BPF_CGROUP_SYSCTL: + ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; + break; default: return -EINVAL; } @@ -1938,9 +2010,12 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: + case BPF_CGROUP_SYSCTL: break; case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); + case BPF_FLOW_DISSECTOR: + return skb_flow_dissector_prog_query(attr, uattr); default: return -EINVAL; } @@ -1948,7 +2023,7 @@ static int bpf_prog_query(const union bpf_attr *attr, return cgroup_bpf_prog_query(attr, uattr); } -#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration +#define BPF_PROG_TEST_RUN_LAST_FIELD test.ctx_out static int bpf_prog_test_run(const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -1961,6 +2036,14 @@ static int bpf_prog_test_run(const union bpf_attr *attr, if (CHECK_ATTR(BPF_PROG_TEST_RUN)) return -EINVAL; + if ((attr->test.ctx_size_in && !attr->test.ctx_in) || + (!attr->test.ctx_size_in && attr->test.ctx_in)) + return -EINVAL; + + if ((attr->test.ctx_size_out && !attr->test.ctx_out) || + (!attr->test.ctx_size_out && attr->test.ctx_out)) + return -EINVAL; + prog = bpf_prog_get(attr->test.prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); @@ -2071,13 +2154,26 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) } static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, - unsigned long addr) + unsigned long addr, u32 *off, + u32 *type) { + const struct bpf_map *map; int i; - for (i = 0; i < prog->aux->used_map_cnt; i++) - if (prog->aux->used_maps[i] == (void *)addr) - return prog->aux->used_maps[i]; + for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) { + map = prog->aux->used_maps[i]; + if (map == (void *)addr) { + *type = BPF_PSEUDO_MAP_FD; + return map; + } + if (!map->ops->map_direct_value_meta) + continue; + if (!map->ops->map_direct_value_meta(map, addr, off)) { + *type = BPF_PSEUDO_MAP_VALUE; + return map; + } + } + return NULL; } @@ -2085,6 +2181,7 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) { const struct bpf_map *map; struct bpf_insn *insns; + u32 off, type; u64 imm; int i; @@ -2112,11 +2209,11 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) continue; imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; - map = bpf_map_from_imm(prog, imm); + map = bpf_map_from_imm(prog, imm, &off, &type); if (map) { - insns[i].src_reg = BPF_PSEUDO_MAP_FD; + insns[i].src_reg = type; insns[i].imm = map->id; - insns[i + 1].imm = 0; + insns[i + 1].imm = off; continue; } } @@ -2706,6 +2803,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_GET_NEXT_KEY: err = map_get_next_key(&attr); break; + case BPF_MAP_FREEZE: + err = map_freeze(&attr); + break; case BPF_PROG_LOAD: err = bpf_prog_load(&attr, uattr); break; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 09d5d972c9ff..95f9354495ad 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -176,7 +176,6 @@ struct bpf_verifier_stack_elem { struct bpf_verifier_stack_elem *next; }; -#define BPF_COMPLEXITY_LIMIT_INSNS 131072 #define BPF_COMPLEXITY_LIMIT_STACK 1024 #define BPF_COMPLEXITY_LIMIT_STATES 64 @@ -377,7 +376,8 @@ static bool is_release_function(enum bpf_func_id func_id) static bool is_acquire_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_sk_lookup_tcp || - func_id == BPF_FUNC_sk_lookup_udp; + func_id == BPF_FUNC_sk_lookup_udp || + func_id == BPF_FUNC_skc_lookup_tcp; } static bool is_ptr_cast_function(enum bpf_func_id func_id) @@ -405,6 +405,7 @@ static const char * const reg_type_str[] = { [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null", [PTR_TO_TCP_SOCK] = "tcp_sock", [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", + [PTR_TO_TP_BUFFER] = "tp_buffer", }; static char slot_type_char[] = { @@ -1091,7 +1092,7 @@ static int check_subprogs(struct bpf_verifier_env *env) */ subprog[env->subprog_cnt].start = insn_cnt; - if (env->log.level > 1) + if (env->log.level & BPF_LOG_LEVEL2) for (i = 0; i < env->subprog_cnt; i++) verbose(env, "func#%d @%d\n", i, subprog[i].start); @@ -1138,6 +1139,7 @@ static int mark_reg_read(struct bpf_verifier_env *env, struct bpf_reg_state *parent) { bool writes = parent == state->parent; /* Observe write marks */ + int cnt = 0; while (parent) { /* if read wasn't screened by an earlier write ... */ @@ -1149,12 +1151,25 @@ static int mark_reg_read(struct bpf_verifier_env *env, parent->var_off.value, parent->off); return -EFAULT; } + if (parent->live & REG_LIVE_READ) + /* The parentage chain never changes and + * this parent was already marked as LIVE_READ. + * There is no need to keep walking the chain again and + * keep re-marking all parents as LIVE_READ. + * This case happens when the same register is read + * multiple times without writes into it in-between. + */ + break; /* ... then we depend on parent's value */ parent->live |= REG_LIVE_READ; state = parent; parent = state->parent; writes = true; + cnt++; } + + if (env->longest_mark_read_walk < cnt) + env->longest_mark_read_walk = cnt; return 0; } @@ -1163,30 +1178,32 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *reg, *regs = state->regs; if (regno >= MAX_BPF_REG) { verbose(env, "R%d is invalid\n", regno); return -EINVAL; } + reg = ®s[regno]; if (t == SRC_OP) { /* check whether register used as source operand can be read */ - if (regs[regno].type == NOT_INIT) { + if (reg->type == NOT_INIT) { verbose(env, "R%d !read_ok\n", regno); return -EACCES; } /* We don't need to worry about FP liveness because it's read-only */ - if (regno != BPF_REG_FP) - return mark_reg_read(env, ®s[regno], - regs[regno].parent); + if (regno == BPF_REG_FP) + return 0; + + return mark_reg_read(env, reg, reg->parent); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { verbose(env, "frame pointer is read only\n"); return -EACCES; } - regs[regno].live |= REG_LIVE_WRITTEN; + reg->live |= REG_LIVE_WRITTEN; if (t == DST_OP) mark_reg_unknown(env, regs, regno); } @@ -1412,7 +1429,7 @@ static int check_stack_access(struct bpf_verifier_env *env, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable stack access var_off=%s off=%d size=%d", + verbose(env, "variable stack access var_off=%s off=%d size=%d\n", tn_buf, off, size); return -EACCES; } @@ -1425,6 +1442,28 @@ static int check_stack_access(struct bpf_verifier_env *env, return 0; } +static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, + int off, int size, enum bpf_access_type type) +{ + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_map *map = regs[regno].map_ptr; + u32 cap = bpf_map_flags_to_cap(map); + + if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) { + verbose(env, "write into map forbidden, value_size=%d off=%d size=%d\n", + map->value_size, off, size); + return -EACCES; + } + + if (type == BPF_READ && !(cap & BPF_MAP_CAN_READ)) { + verbose(env, "read from map forbidden, value_size=%d off=%d size=%d\n", + map->value_size, off, size); + return -EACCES; + } + + return 0; +} + /* check read/write into map element returned by bpf_map_lookup_elem() */ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed) @@ -1454,7 +1493,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * need to try adding each of min_value and max_value to off * to make sure our theoretical access will be safe. */ - if (env->log.level) + if (env->log.level & BPF_LOG_LEVEL) print_verifier_state(env, state); /* The minimum value is only important with signed @@ -1955,6 +1994,32 @@ static int check_ctx_reg(struct bpf_verifier_env *env, return 0; } +static int check_tp_buffer_access(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + int regno, int off, int size) +{ + if (off < 0) { + verbose(env, + "R%d invalid tracepoint buffer access: off=%d, size=%d", + regno, off, size); + return -EACCES; + } + if (!tnum_is_const(reg->var_off) || reg->var_off.value) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, + "R%d invalid variable buffer offset: off=%d, var_off=%s", + regno, off, tn_buf); + return -EACCES; + } + if (off + size > env->prog->aux->max_tp_access) + env->prog->aux->max_tp_access = off + size; + + return 0; +} + + /* truncate register to smaller size (in bytes) * must be called with size < BPF_REG_SIZE */ @@ -2011,7 +2076,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn verbose(env, "R%d leaks addr into map\n", value_regno); return -EACCES; } - + err = check_map_access_type(env, regno, off, size, t); + if (err) + return err; err = check_map_access(env, regno, off, size, false); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); @@ -2097,6 +2164,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_sock_access(env, insn_idx, regno, off, size, t); if (!err && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_TP_BUFFER) { + err = check_tp_buffer_access(env, reg, regno, off, size); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); @@ -2157,6 +2228,29 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins BPF_SIZE(insn->code), BPF_WRITE, -1, true); } +static int __check_stack_boundary(struct bpf_verifier_env *env, u32 regno, + int off, int access_size, + bool zero_size_allowed) +{ + struct bpf_reg_state *reg = reg_state(env, regno); + + if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || + access_size < 0 || (access_size == 0 && !zero_size_allowed)) { + if (tnum_is_const(reg->var_off)) { + verbose(env, "invalid stack type R%d off=%d access_size=%d\n", + regno, off, access_size); + } else { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "invalid stack type R%d var_off=%s access_size=%d\n", + regno, tn_buf, access_size); + } + return -EACCES; + } + return 0; +} + /* when register 'regno' is passed into function that will read 'access_size' * bytes from that pointer, make sure that it's within stack boundary * and all elements of stack are initialized. @@ -2169,7 +2263,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, { struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = func(env, reg); - int off, i, slot, spi; + int err, min_off, max_off, i, slot, spi; if (reg->type != PTR_TO_STACK) { /* Allow zero-byte read from NULL, regardless of pointer type */ @@ -2183,21 +2277,57 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, return -EACCES; } - /* Only allow fixed-offset stack reads */ - if (!tnum_is_const(reg->var_off)) { - char tn_buf[48]; + if (tnum_is_const(reg->var_off)) { + min_off = max_off = reg->var_off.value + reg->off; + err = __check_stack_boundary(env, regno, min_off, access_size, + zero_size_allowed); + if (err) + return err; + } else { + /* Variable offset is prohibited for unprivileged mode for + * simplicity since it requires corresponding support in + * Spectre masking for stack ALU. + * See also retrieve_ptr_limit(). + */ + if (!env->allow_ptr_leaks) { + char tn_buf[48]; - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "invalid variable stack read R%d var_off=%s\n", - regno, tn_buf); - return -EACCES; - } - off = reg->off + reg->var_off.value; - if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || - access_size < 0 || (access_size == 0 && !zero_size_allowed)) { - verbose(env, "invalid stack type R%d off=%d access_size=%d\n", - regno, off, access_size); - return -EACCES; + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "R%d indirect variable offset stack access prohibited for !root, var_off=%s\n", + regno, tn_buf); + return -EACCES; + } + /* Only initialized buffer on stack is allowed to be accessed + * with variable offset. With uninitialized buffer it's hard to + * guarantee that whole memory is marked as initialized on + * helper return since specific bounds are unknown what may + * cause uninitialized stack leaking. + */ + if (meta && meta->raw_mode) + meta = NULL; + + if (reg->smax_value >= BPF_MAX_VAR_OFF || + reg->smax_value <= -BPF_MAX_VAR_OFF) { + verbose(env, "R%d unbounded indirect variable offset stack access\n", + regno); + return -EACCES; + } + min_off = reg->smin_value + reg->off; + max_off = reg->smax_value + reg->off; + err = __check_stack_boundary(env, regno, min_off, access_size, + zero_size_allowed); + if (err) { + verbose(env, "R%d min value is outside of stack bound\n", + regno); + return err; + } + err = __check_stack_boundary(env, regno, max_off, access_size, + zero_size_allowed); + if (err) { + verbose(env, "R%d max value is outside of stack bound\n", + regno); + return err; + } } if (meta && meta->raw_mode) { @@ -2206,10 +2336,10 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, return 0; } - for (i = 0; i < access_size; i++) { + for (i = min_off; i < max_off + access_size; i++) { u8 *stype; - slot = -(off + i) - 1; + slot = -i - 1; spi = slot / BPF_REG_SIZE; if (state->allocated_stack <= slot) goto err; @@ -2222,8 +2352,16 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, goto mark; } err: - verbose(env, "invalid indirect read from stack off %d+%d size %d\n", - off, i, access_size); + if (tnum_is_const(reg->var_off)) { + verbose(env, "invalid indirect read from stack off %d+%d size %d\n", + min_off, i - min_off, access_size); + } else { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "invalid indirect read from stack var_off %s+%d size %d\n", + tn_buf, i - min_off, access_size); + } return -EACCES; mark: /* reading any byte out of 8-byte 'spill_slot' will cause @@ -2232,7 +2370,7 @@ mark: mark_reg_read(env, &state->stack[spi].spilled_ptr, state->stack[spi].spilled_ptr.parent); } - return update_stack_depth(env, state, off); + return update_stack_depth(env, state, min_off); } static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, @@ -2247,6 +2385,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_packet_access(env, regno, reg->off, access_size, zero_size_allowed); case PTR_TO_MAP_VALUE: + if (check_map_access_type(env, regno, reg->off, access_size, + meta && meta->raw_mode ? BPF_WRITE : + BPF_READ)) + return -EACCES; return check_map_access(env, regno, reg->off, access_size, zero_size_allowed); default: /* scalar_value|ptr_to_stack or invalid ptr */ @@ -2353,6 +2495,22 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type) type == ARG_CONST_SIZE_OR_ZERO; } +static bool arg_type_is_int_ptr(enum bpf_arg_type type) +{ + return type == ARG_PTR_TO_INT || + type == ARG_PTR_TO_LONG; +} + +static int int_ptr_type_to_size(enum bpf_arg_type type) +{ + if (type == ARG_PTR_TO_INT) + return sizeof(u32); + else if (type == ARG_PTR_TO_LONG) + return sizeof(u64); + + return -EINVAL; +} + static int check_func_arg(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) @@ -2385,10 +2543,15 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE || - arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { + arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE || + arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) { expected_type = PTR_TO_STACK; - if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE && - type != expected_type) + if (register_is_null(reg) && + arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) + /* final test in check_stack_boundary() */; + else if (!type_is_pkt_pointer(type) && + type != PTR_TO_MAP_VALUE && + type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_SIZE || arg_type == ARG_CONST_SIZE_OR_ZERO) { @@ -2420,6 +2583,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, } meta->ref_obj_id = reg->ref_obj_id; } + } else if (arg_type == ARG_PTR_TO_SOCKET) { + expected_type = PTR_TO_SOCKET; + if (type != expected_type) + goto err_type; } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { if (meta->func_id == BPF_FUNC_spin_lock) { if (process_spin_lock(env, regno, true)) @@ -2445,6 +2612,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, type != expected_type) goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; + } else if (arg_type_is_int_ptr(arg_type)) { + expected_type = PTR_TO_STACK; + if (!type_is_pkt_pointer(type) && + type != PTR_TO_MAP_VALUE && + type != expected_type) + goto err_type; } else { verbose(env, "unsupported arg_type %d\n", arg_type); return -EFAULT; @@ -2471,6 +2644,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, meta->map_ptr->key_size, false, NULL); } else if (arg_type == ARG_PTR_TO_MAP_VALUE || + (arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL && + !register_is_null(reg)) || arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity @@ -2526,6 +2701,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_helper_mem_access(env, regno - 1, reg->umax_value, zero_size_allowed, meta); + } else if (arg_type_is_int_ptr(arg_type)) { + int size = int_ptr_type_to_size(arg_type); + + err = check_helper_mem_access(env, regno, size, false, meta); + if (err) + return err; + err = check_ptr_alignment(env, reg, 0, size, true); } return err; @@ -2613,6 +2795,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_map_push_elem) goto error; break; + case BPF_MAP_TYPE_SK_STORAGE: + if (func_id != BPF_FUNC_sk_storage_get && + func_id != BPF_FUNC_sk_storage_delete) + goto error; + break; default: break; } @@ -2676,6 +2863,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, map->map_type != BPF_MAP_TYPE_STACK) goto error; break; + case BPF_FUNC_sk_storage_get: + case BPF_FUNC_sk_storage_delete: + if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) + goto error; + break; default: break; } @@ -2905,7 +3097,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* and go analyze first insn of the callee */ *insn_idx = target_insn; - if (env->log.level) { + if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "caller:\n"); print_verifier_state(env, caller); verbose(env, "callee:\n"); @@ -2945,7 +3137,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) return err; *insn_idx = callee->callsite + 1; - if (env->log.level) { + if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "returning from callee:\n"); print_verifier_state(env, callee); verbose(env, "to caller at %d:\n", *insn_idx); @@ -2979,6 +3171,7 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, int func_id, int insn_idx) { struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; + struct bpf_map *map = meta->map_ptr; if (func_id != BPF_FUNC_tail_call && func_id != BPF_FUNC_map_lookup_elem && @@ -2989,11 +3182,24 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, func_id != BPF_FUNC_map_peek_elem) return 0; - if (meta->map_ptr == NULL) { + if (map == NULL) { verbose(env, "kernel subsystem misconfigured verifier\n"); return -EINVAL; } + /* In case of read-only, some additional restrictions + * need to be applied in order to prevent altering the + * state of the map from program side. + */ + if ((map->map_flags & BPF_F_RDONLY_PROG) && + (func_id == BPF_FUNC_map_delete_elem || + func_id == BPF_FUNC_map_update_elem || + func_id == BPF_FUNC_map_push_elem || + func_id == BPF_FUNC_map_pop_elem)) { + verbose(env, "write into map forbidden\n"); + return -EACCES; + } + if (!BPF_MAP_PTR(aux->map_state)) bpf_map_ptr_store(aux, meta->map_ptr, meta->map_ptr->unpriv_array); @@ -3157,19 +3363,11 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; - if (is_acquire_function(func_id)) { - int id = acquire_reference_state(env, insn_idx); - - if (id < 0) - return id; - /* For mark_ptr_or_null_reg() */ - regs[BPF_REG_0].id = id; - /* For release_reference() */ - regs[BPF_REG_0].ref_obj_id = id; - } else { - /* For mark_ptr_or_null_reg() */ - regs[BPF_REG_0].id = ++env->id_gen; - } + regs[BPF_REG_0].id = ++env->id_gen; + } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL; + regs[BPF_REG_0].id = ++env->id_gen; } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; @@ -3180,9 +3378,19 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } - if (is_ptr_cast_function(func_id)) + if (is_ptr_cast_function(func_id)) { /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; + } else if (is_acquire_function(func_id)) { + int id = acquire_reference_state(env, insn_idx); + + if (id < 0) + return id; + /* For mark_ptr_or_null_reg() */ + regs[BPF_REG_0].id = id; + /* For release_reference() */ + regs[BPF_REG_0].ref_obj_id = id; + } do_refine_retval_range(regs, fn->ret_type, func_id, &meta); @@ -3282,6 +3490,9 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, switch (ptr_reg->type) { case PTR_TO_STACK: + /* Indirect variable offset stack access is prohibited in + * unprivileged mode so it's not handled here. + */ off = ptr_reg->off + ptr_reg->var_off.value; if (mask_to_left) *ptr_limit = MAX_BPF_STACK + off; @@ -4982,23 +5193,17 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, insn->dst_reg); return -EACCES; } - if (env->log.level) + if (env->log.level & BPF_LOG_LEVEL) print_verifier_state(env, this_branch->frame[this_branch->curframe]); return 0; } -/* return the map pointer stored inside BPF_LD_IMM64 instruction */ -static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) -{ - u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32; - - return (struct bpf_map *) (unsigned long) imm64; -} - /* verify BPF_LD_IMM64 instruction */ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) { + struct bpf_insn_aux_data *aux = cur_aux(env); struct bpf_reg_state *regs = cur_regs(env); + struct bpf_map *map; int err; if (BPF_SIZE(insn->code) != BPF_DW) { @@ -5022,11 +5227,22 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } - /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */ - BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD); + map = env->used_maps[aux->map_index]; + mark_reg_known_zero(env, regs, insn->dst_reg); + regs[insn->dst_reg].map_ptr = map; + + if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) { + regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; + regs[insn->dst_reg].off = aux->map_off; + if (map_value_has_spin_lock(map)) + regs[insn->dst_reg].id = ++env->id_gen; + } else if (insn->src_reg == BPF_PSEUDO_MAP_FD) { + regs[insn->dst_reg].type = CONST_PTR_TO_MAP; + } else { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } - regs[insn->dst_reg].type = CONST_PTR_TO_MAP; - regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn); return 0; } @@ -5150,6 +5366,7 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_CGROUP_SYSCTL: break; default: return 0; @@ -5220,10 +5437,6 @@ enum { #define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L) -static int *insn_stack; /* stack of insns to process */ -static int cur_stack; /* current stack index */ -static int *insn_state; - /* t, w, e - match pseudo-code above: * t - index of current instruction * w - next instruction @@ -5231,6 +5444,9 @@ static int *insn_state; */ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) { + int *insn_stack = env->cfg.insn_stack; + int *insn_state = env->cfg.insn_state; + if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) return 0; @@ -5251,9 +5467,9 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) /* tree-edge */ insn_state[t] = DISCOVERED | e; insn_state[w] = DISCOVERED; - if (cur_stack >= env->prog->len) + if (env->cfg.cur_stack >= env->prog->len) return -E2BIG; - insn_stack[cur_stack++] = w; + insn_stack[env->cfg.cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { verbose_linfo(env, t, "%d: ", t); @@ -5277,27 +5493,28 @@ static int check_cfg(struct bpf_verifier_env *env) { struct bpf_insn *insns = env->prog->insnsi; int insn_cnt = env->prog->len; + int *insn_stack, *insn_state; int ret = 0; int i, t; - insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) return -ENOMEM; - insn_stack = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_stack) { - kfree(insn_state); + kvfree(insn_state); return -ENOMEM; } insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */ insn_stack[0] = 0; /* 0 is the first instruction */ - cur_stack = 1; + env->cfg.cur_stack = 1; peek_stack: - if (cur_stack == 0) + if (env->cfg.cur_stack == 0) goto check_state; - t = insn_stack[cur_stack - 1]; + t = insn_stack[env->cfg.cur_stack - 1]; if (BPF_CLASS(insns[t].code) == BPF_JMP || BPF_CLASS(insns[t].code) == BPF_JMP32) { @@ -5366,7 +5583,7 @@ peek_stack: mark_explored: insn_state[t] = EXPLORED; - if (cur_stack-- <= 0) { + if (env->cfg.cur_stack-- <= 0) { verbose(env, "pop stack internal bug\n"); ret = -EFAULT; goto err_free; @@ -5384,8 +5601,9 @@ check_state: ret = 0; /* cfg looks good */ err_free: - kfree(insn_state); - kfree(insn_stack); + kvfree(insn_state); + kvfree(insn_stack); + env->cfg.insn_state = env->cfg.insn_stack = NULL; return ret; } @@ -6074,6 +6292,22 @@ static bool states_equal(struct bpf_verifier_env *env, return true; } +static int propagate_liveness_reg(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + struct bpf_reg_state *parent_reg) +{ + int err; + + if (parent_reg->live & REG_LIVE_READ || !(reg->live & REG_LIVE_READ)) + return 0; + + err = mark_reg_read(env, reg, parent_reg); + if (err) + return err; + + return 0; +} + /* A write screens off any subsequent reads; but write marks come from the * straight-line code between a state and its parent. When we arrive at an * equivalent state (jump target or such) we didn't arrive by the straight-line @@ -6085,8 +6319,9 @@ static int propagate_liveness(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate, struct bpf_verifier_state *vparent) { - int i, frame, err = 0; + struct bpf_reg_state *state_reg, *parent_reg; struct bpf_func_state *state, *parent; + int i, frame, err = 0; if (vparent->curframe != vstate->curframe) { WARN(1, "propagate_live: parent frame %d current frame %d\n", @@ -6096,30 +6331,27 @@ static int propagate_liveness(struct bpf_verifier_env *env, /* Propagate read liveness of registers... */ BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); for (frame = 0; frame <= vstate->curframe; frame++) { + parent = vparent->frame[frame]; + state = vstate->frame[frame]; + parent_reg = parent->regs; + state_reg = state->regs; /* We don't need to worry about FP liveness, it's read-only */ for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) { - if (vparent->frame[frame]->regs[i].live & REG_LIVE_READ) - continue; - if (vstate->frame[frame]->regs[i].live & REG_LIVE_READ) { - err = mark_reg_read(env, &vstate->frame[frame]->regs[i], - &vparent->frame[frame]->regs[i]); - if (err) - return err; - } + err = propagate_liveness_reg(env, &state_reg[i], + &parent_reg[i]); + if (err) + return err; } - } - /* ... and stack slots */ - for (frame = 0; frame <= vstate->curframe; frame++) { - state = vstate->frame[frame]; - parent = vparent->frame[frame]; + /* Propagate stack slots. */ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && i < parent->allocated_stack / BPF_REG_SIZE; i++) { - if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) - continue; - if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) - mark_reg_read(env, &state->stack[i].spilled_ptr, - &parent->stack[i].spilled_ptr); + parent_reg = &parent->stack[i].spilled_ptr; + state_reg = &state->stack[i].spilled_ptr; + err = propagate_liveness_reg(env, state_reg, + parent_reg); + if (err) + return err; } } return err; @@ -6128,11 +6360,13 @@ static int propagate_liveness(struct bpf_verifier_env *env, static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; - struct bpf_verifier_state_list *sl; + struct bpf_verifier_state_list *sl, **pprev; struct bpf_verifier_state *cur = env->cur_state, *new; int i, j, err, states_cnt = 0; - sl = env->explored_states[insn_idx]; + pprev = &env->explored_states[insn_idx]; + sl = *pprev; + if (!sl) /* this 'insn_idx' instruction wasn't marked, so we will not * be doing state search here @@ -6143,6 +6377,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) while (sl != STATE_LIST_MARK) { if (states_equal(env, &sl->state, cur)) { + sl->hit_cnt++; /* reached equivalent register/stack state, * prune the search. * Registers read by the continuation are read by us. @@ -6158,10 +6393,40 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return err; return 1; } - sl = sl->next; states_cnt++; + sl->miss_cnt++; + /* heuristic to determine whether this state is beneficial + * to keep checking from state equivalence point of view. + * Higher numbers increase max_states_per_insn and verification time, + * but do not meaningfully decrease insn_processed. + */ + if (sl->miss_cnt > sl->hit_cnt * 3 + 3) { + /* the state is unlikely to be useful. Remove it to + * speed up verification + */ + *pprev = sl->next; + if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) { + free_verifier_state(&sl->state, false); + kfree(sl); + env->peak_states--; + } else { + /* cannot free this state, since parentage chain may + * walk it later. Add it for free_list instead to + * be freed at the end of verification + */ + sl->next = env->free_list; + env->free_list = sl; + } + sl = *pprev; + continue; + } + pprev = &sl->next; + sl = *pprev; } + if (env->max_states_per_insn < states_cnt) + env->max_states_per_insn = states_cnt; + if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) return 0; @@ -6175,6 +6440,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); if (!new_sl) return -ENOMEM; + env->total_states++; + env->peak_states++; /* add new state to the head of linked list */ new = &new_sl->state; @@ -6259,8 +6526,7 @@ static int do_check(struct bpf_verifier_env *env) struct bpf_verifier_state *state; struct bpf_insn *insns = env->prog->insnsi; struct bpf_reg_state *regs; - int insn_cnt = env->prog->len, i; - int insn_processed = 0; + int insn_cnt = env->prog->len; bool do_print_state = false; env->prev_linfo = NULL; @@ -6295,10 +6561,10 @@ static int do_check(struct bpf_verifier_env *env) insn = &insns[env->insn_idx]; class = BPF_CLASS(insn->code); - if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { + if (++env->insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { verbose(env, "BPF program is too large. Processed %d insn\n", - insn_processed); + env->insn_processed); return -E2BIG; } @@ -6307,7 +6573,7 @@ static int do_check(struct bpf_verifier_env *env) return err; if (err == 1) { /* found equivalent state, can prune the search */ - if (env->log.level) { + if (env->log.level & BPF_LOG_LEVEL) { if (do_print_state) verbose(env, "\nfrom %d to %d%s: safe\n", env->prev_insn_idx, env->insn_idx, @@ -6325,8 +6591,9 @@ static int do_check(struct bpf_verifier_env *env) if (need_resched()) cond_resched(); - if (env->log.level > 1 || (env->log.level && do_print_state)) { - if (env->log.level > 1) + if (env->log.level & BPF_LOG_LEVEL2 || + (env->log.level & BPF_LOG_LEVEL && do_print_state)) { + if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "%d:", env->insn_idx); else verbose(env, "\nfrom %d to %d%s:", @@ -6337,7 +6604,7 @@ static int do_check(struct bpf_verifier_env *env) do_print_state = false; } - if (env->log.level) { + if (env->log.level & BPF_LOG_LEVEL) { const struct bpf_insn_cbs cbs = { .cb_print = verbose, .private_data = env, @@ -6602,16 +6869,6 @@ process_bpf_exit: env->insn_idx++; } - verbose(env, "processed %d insns (limit %d), stack depth ", - insn_processed, BPF_COMPLEXITY_LIMIT_INSNS); - for (i = 0; i < env->subprog_cnt; i++) { - u32 depth = env->subprog_info[i].stack_depth; - - verbose(env, "%d", depth); - if (i + 1 < env->subprog_cnt) - verbose(env, "+"); - } - verbose(env, "\n"); env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; return 0; } @@ -6709,8 +6966,10 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) } if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { + struct bpf_insn_aux_data *aux; struct bpf_map *map; struct fd f; + u64 addr; if (i == insn_cnt - 1 || insn[1].code != 0 || insn[1].dst_reg != 0 || insn[1].src_reg != 0 || @@ -6719,13 +6978,19 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) return -EINVAL; } - if (insn->src_reg == 0) + if (insn[0].src_reg == 0) /* valid generic load 64-bit imm */ goto next_insn; - if (insn[0].src_reg != BPF_PSEUDO_MAP_FD || - insn[1].imm != 0) { - verbose(env, "unrecognized bpf_ld_imm64 insn\n"); + /* In final convert_pseudo_ld_imm64() step, this is + * converted into regular 64-bit imm load insn. + */ + if ((insn[0].src_reg != BPF_PSEUDO_MAP_FD && + insn[0].src_reg != BPF_PSEUDO_MAP_VALUE) || + (insn[0].src_reg == BPF_PSEUDO_MAP_FD && + insn[1].imm != 0)) { + verbose(env, + "unrecognized bpf_ld_imm64 insn\n"); return -EINVAL; } @@ -6743,16 +7008,47 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) return err; } - /* store map pointer inside BPF_LD_IMM64 instruction */ - insn[0].imm = (u32) (unsigned long) map; - insn[1].imm = ((u64) (unsigned long) map) >> 32; + aux = &env->insn_aux_data[i]; + if (insn->src_reg == BPF_PSEUDO_MAP_FD) { + addr = (unsigned long)map; + } else { + u32 off = insn[1].imm; + + if (off >= BPF_MAX_VAR_OFF) { + verbose(env, "direct value offset of %u is not allowed\n", off); + fdput(f); + return -EINVAL; + } + + if (!map->ops->map_direct_value_addr) { + verbose(env, "no direct value access support for this map type\n"); + fdput(f); + return -EINVAL; + } + + err = map->ops->map_direct_value_addr(map, &addr, off); + if (err) { + verbose(env, "invalid access to map value pointer, value_size=%u off=%u\n", + map->value_size, off); + fdput(f); + return err; + } + + aux->map_off = off; + addr += off; + } + + insn[0].imm = (u32)addr; + insn[1].imm = addr >> 32; /* check whether we recorded this map already */ - for (j = 0; j < env->used_map_cnt; j++) + for (j = 0; j < env->used_map_cnt; j++) { if (env->used_maps[j] == map) { + aux->map_index = j; fdput(f); goto next_insn; } + } if (env->used_map_cnt >= MAX_USED_MAPS) { fdput(f); @@ -6769,6 +7065,8 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) fdput(f); return PTR_ERR(map); } + + aux->map_index = env->used_map_cnt; env->used_maps[env->used_map_cnt++] = map; if (bpf_map_is_cgroup_storage(map) && @@ -6874,8 +7172,13 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of struct bpf_prog *new_prog; new_prog = bpf_patch_insn_single(env->prog, off, patch, len); - if (!new_prog) + if (IS_ERR(new_prog)) { + if (PTR_ERR(new_prog) == -ERANGE) + verbose(env, + "insn %d cannot be patched due to 16-bit range\n", + env->insn_aux_data[off].orig_idx); return NULL; + } if (adjust_insn_aux_data(env, new_prog->len, off, len)) return NULL; adjust_subprog_starts(env, off, len); @@ -7296,7 +7599,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn->dst_reg, shift); insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg, - (1 << size * 8) - 1); + (1ULL << size * 8) - 1); } } @@ -7413,9 +7716,8 @@ static int jit_subprogs(struct bpf_verifier_env *env) insn->src_reg != BPF_PSEUDO_CALL) continue; subprog = insn->off; - insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) - func[subprog]->bpf_func - - __bpf_call_base; + insn->imm = BPF_CAST_CALL(func[subprog]->bpf_func) - + __bpf_call_base; } /* we use the aux data to keep a list of the start addresses @@ -7817,6 +8119,14 @@ static void free_states(struct bpf_verifier_env *env) struct bpf_verifier_state_list *sl, *sln; int i; + sl = env->free_list; + while (sl) { + sln = sl->next; + free_verifier_state(&sl->state, false); + kfree(sl); + sl = sln; + } + if (!env->explored_states) return; @@ -7832,12 +8142,37 @@ static void free_states(struct bpf_verifier_env *env) } } - kfree(env->explored_states); + kvfree(env->explored_states); +} + +static void print_verification_stats(struct bpf_verifier_env *env) +{ + int i; + + if (env->log.level & BPF_LOG_STATS) { + verbose(env, "verification time %lld usec\n", + div_u64(env->verification_time, 1000)); + verbose(env, "stack depth "); + for (i = 0; i < env->subprog_cnt; i++) { + u32 depth = env->subprog_info[i].stack_depth; + + verbose(env, "%d", depth); + if (i + 1 < env->subprog_cnt) + verbose(env, "+"); + } + verbose(env, "\n"); + } + verbose(env, "processed %d insns (limit %d) max_states_per_insn %d " + "total_states %d peak_states %d mark_read %d\n", + env->insn_processed, BPF_COMPLEXITY_LIMIT_INSNS, + env->max_states_per_insn, env->total_states, + env->peak_states, env->longest_mark_read_walk); } int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, union bpf_attr __user *uattr) { + u64 start_time = ktime_get_ns(); struct bpf_verifier_env *env; struct bpf_verifier_log *log; int i, len, ret = -EINVAL; @@ -7865,9 +8200,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->insn_aux_data[i].orig_idx = i; env->prog = *prog; env->ops = bpf_verifier_ops[env->prog->type]; + is_priv = capable(CAP_SYS_ADMIN); /* grab the mutex to protect few globals used by verifier */ - mutex_lock(&bpf_verifier_lock); + if (!is_priv) + mutex_lock(&bpf_verifier_lock); if (attr->log_level || attr->log_buf || attr->log_size) { /* user requested verbose verifier output @@ -7879,8 +8216,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, ret = -EINVAL; /* log attributes have to be sane */ - if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || - !log->level || !log->ubuf) + if (log->len_total < 128 || log->len_total > UINT_MAX >> 2 || + !log->level || !log->ubuf || log->level & ~BPF_LOG_MASK) goto err_unlock; } @@ -7890,7 +8227,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (attr->prog_flags & BPF_F_ANY_ALIGNMENT) env->strict_alignment = false; - is_priv = capable(CAP_SYS_ADMIN); env->allow_ptr_leaks = is_priv; ret = replace_map_fd_with_map_ptr(env); @@ -7903,7 +8239,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, goto skip_full_check; } - env->explored_states = kcalloc(env->prog->len, + env->explored_states = kvcalloc(env->prog->len, sizeof(struct bpf_verifier_state_list *), GFP_USER); ret = -ENOMEM; @@ -7961,6 +8297,9 @@ skip_full_check: if (ret == 0) ret = fixup_call_args(env); + env->verification_time = ktime_get_ns() - start_time; + print_verification_stats(env); + if (log->level && bpf_verifier_log_full(log)) ret = -ENOSPC; if (log->level && !log->ubuf) { @@ -8000,7 +8339,8 @@ err_release_maps: release_maps(env); *prog = env->prog; err_unlock: - mutex_unlock(&bpf_verifier_lock); + if (!is_priv) + mutex_unlock(&bpf_verifier_lock); vfree(env->insn_aux_data); err_free_env: kfree(env); diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index bfcdae896122..5d7a76bfbbb7 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o +obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o freezer.o -obj-$(CONFIG_CGROUP_FREEZER) += freezer.o +obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o obj-$(CONFIG_CGROUP_RDMA) += rdma.o obj-$(CONFIG_CPUSETS) += cpuset.o diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 30e39f3932ad..809e34a3c017 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -28,12 +28,15 @@ extern void __init enable_debug_cgroup(void); #define TRACE_CGROUP_PATH(type, cgrp, ...) \ do { \ if (trace_cgroup_##type##_enabled()) { \ - spin_lock(&trace_cgroup_path_lock); \ + unsigned long flags; \ + spin_lock_irqsave(&trace_cgroup_path_lock, \ + flags); \ cgroup_path(cgrp, trace_cgroup_path, \ TRACE_CGROUP_PATH_LEN); \ trace_cgroup_##type(cgrp, trace_cgroup_path, \ ##__VA_ARGS__); \ - spin_unlock(&trace_cgroup_path_lock); \ + spin_unlock_irqrestore(&trace_cgroup_path_lock, \ + flags); \ } \ } while (0) @@ -240,6 +243,7 @@ int cgroup_rmdir(struct kernfs_node *kn); int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, struct kernfs_root *kf_root); +int __cgroup_task_count(const struct cgroup *cgrp); int cgroup_task_count(const struct cgroup *cgrp); /* diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index c126b34fd4ff..68ca5de7ec27 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -342,22 +342,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, return l; } -/** - * cgroup_task_count - count the number of tasks in a cgroup. - * @cgrp: the cgroup in question - */ -int cgroup_task_count(const struct cgroup *cgrp) -{ - int count = 0; - struct cgrp_cset_link *link; - - spin_lock_irq(&css_set_lock); - list_for_each_entry(link, &cgrp->cset_links, cset_link) - count += link->cset->nr_tasks; - spin_unlock_irq(&css_set_lock); - return count; -} - /* * Load a cgroup's pidarray with either procs' tgids or tasks' pids */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 3f2b4bde0f9c..217cec4e22c6 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -593,6 +593,39 @@ static void cgroup_get_live(struct cgroup *cgrp) css_get(&cgrp->self); } +/** + * __cgroup_task_count - count the number of tasks in a cgroup. The caller + * is responsible for taking the css_set_lock. + * @cgrp: the cgroup in question + */ +int __cgroup_task_count(const struct cgroup *cgrp) +{ + int count = 0; + struct cgrp_cset_link *link; + + lockdep_assert_held(&css_set_lock); + + list_for_each_entry(link, &cgrp->cset_links, cset_link) + count += link->cset->nr_tasks; + + return count; +} + +/** + * cgroup_task_count - count the number of tasks in a cgroup. + * @cgrp: the cgroup in question + */ +int cgroup_task_count(const struct cgroup *cgrp) +{ + int count; + + spin_lock_irq(&css_set_lock); + count = __cgroup_task_count(cgrp); + spin_unlock_irq(&css_set_lock); + + return count; +} + struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { struct cgroup *cgrp = of->kn->parent->priv; @@ -783,6 +816,8 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) break; cgroup1_check_for_release(cgrp); + TRACE_CGROUP_PATH(notify_populated, cgrp, + cgroup_is_populated(cgrp)); cgroup_file_notify(&cgrp->events_file); child = cgrp; @@ -2402,8 +2437,15 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) get_css_set(to_cset); to_cset->nr_tasks++; css_set_move_task(task, from_cset, to_cset, true); - put_css_set_locked(from_cset); from_cset->nr_tasks--; + /* + * If the source or destination cgroup is frozen, + * the task might require to change its state. + */ + cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp, + to_cset->dfl_cgrp); + put_css_set_locked(from_cset); + } } spin_unlock_irq(&css_set_lock); @@ -2602,7 +2644,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); if (!dst_cset) - goto err; + return -ENOMEM; WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); @@ -2634,9 +2676,6 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) } return 0; -err: - cgroup_migrate_finish(mgctx); - return -ENOMEM; } /** @@ -3447,8 +3486,11 @@ static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of, static int cgroup_events_show(struct seq_file *seq, void *v) { - seq_printf(seq, "populated %d\n", - cgroup_is_populated(seq_css(seq)->cgroup)); + struct cgroup *cgrp = seq_css(seq)->cgroup; + + seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp)); + seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags)); + return 0; } @@ -3498,17 +3540,118 @@ static int cpu_stat_show(struct seq_file *seq, void *v) #ifdef CONFIG_PSI static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { - return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO); + struct cgroup *cgroup = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + + return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { - return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM); + struct cgroup *cgroup = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + + return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { - return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU); + struct cgroup *cgroup = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + + return psi_show(seq, psi, PSI_CPU); +} + +static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, enum psi_res res) +{ + struct psi_trigger *new; + struct cgroup *cgrp; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENODEV; + + cgroup_get(cgrp); + cgroup_kn_unlock(of->kn); + + new = psi_trigger_create(&cgrp->psi, buf, nbytes, res); + if (IS_ERR(new)) { + cgroup_put(cgrp); + return PTR_ERR(new); + } + + psi_trigger_replace(&of->priv, new); + + cgroup_put(cgrp); + + return nbytes; +} + +static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cgroup_pressure_write(of, buf, nbytes, PSI_IO); +} + +static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cgroup_pressure_write(of, buf, nbytes, PSI_MEM); +} + +static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cgroup_pressure_write(of, buf, nbytes, PSI_CPU); +} + +static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, + poll_table *pt) +{ + return psi_trigger_poll(&of->priv, of->file, pt); +} + +static void cgroup_pressure_release(struct kernfs_open_file *of) +{ + psi_trigger_replace(&of->priv, NULL); +} +#endif /* CONFIG_PSI */ + +static int cgroup_freeze_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + seq_printf(seq, "%d\n", cgrp->freezer.freeze); + + return 0; +} + +static ssize_t cgroup_freeze_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup *cgrp; + ssize_t ret; + int freeze; + + ret = kstrtoint(strstrip(buf), 0, &freeze); + if (ret) + return ret; + + if (freeze < 0 || freeze > 1) + return -ERANGE; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENOENT; + + cgroup_freeze(cgrp, freeze); + + cgroup_kn_unlock(of->kn); + + return nbytes; } -#endif static int cgroup_file_open(struct kernfs_open_file *of) { @@ -4654,6 +4797,12 @@ static struct cftype cgroup_base_files[] = { .seq_show = cgroup_stat_show, }, { + .name = "cgroup.freeze", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_freeze_show, + .write = cgroup_freeze_write, + }, + { .name = "cpu.stat", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_stat_show, @@ -4661,20 +4810,26 @@ static struct cftype cgroup_base_files[] = { #ifdef CONFIG_PSI { .name = "io.pressure", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_io_pressure_show, + .write = cgroup_io_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, }, { .name = "memory.pressure", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_memory_pressure_show, + .write = cgroup_memory_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, }, { .name = "cpu.pressure", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_cpu_pressure_show, + .write = cgroup_cpu_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, }, -#endif +#endif /* CONFIG_PSI */ { } /* terminate */ }; @@ -4781,9 +4936,11 @@ static void css_release_work_fn(struct work_struct *work) if (cgroup_on_dfl(cgrp)) cgroup_rstat_flush(cgrp); + spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) tcgrp->nr_dying_descendants--; + spin_unlock_irq(&css_set_lock); cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; @@ -5001,12 +5158,31 @@ static struct cgroup *cgroup_create(struct cgroup *parent) if (ret) goto out_psi_free; + /* + * New cgroup inherits effective freeze counter, and + * if the parent has to be frozen, the child has too. + */ + cgrp->freezer.e_freeze = parent->freezer.e_freeze; + if (cgrp->freezer.e_freeze) + set_bit(CGRP_FROZEN, &cgrp->flags); + + spin_lock_irq(&css_set_lock); for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; - if (tcgrp != cgrp) + if (tcgrp != cgrp) { tcgrp->nr_descendants++; + + /* + * If the new cgroup is frozen, all ancestor cgroups + * get a new frozen descendant, but their state can't + * change because of this. + */ + if (cgrp->freezer.e_freeze) + tcgrp->freezer.nr_frozen_descendants++; + } } + spin_unlock_irq(&css_set_lock); if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -5291,10 +5467,18 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) if (parent && cgroup_is_threaded(cgrp)) parent->nr_threaded_children--; + spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) { tcgrp->nr_descendants--; tcgrp->nr_dying_descendants++; + /* + * If the dying cgroup is frozen, decrease frozen descendants + * counters of ancestor cgroups. + */ + if (test_bit(CGRP_FROZEN, &cgrp->flags)) + tcgrp->freezer.nr_frozen_descendants--; } + spin_unlock_irq(&css_set_lock); cgroup1_check_for_release(parent); @@ -5746,6 +5930,26 @@ void cgroup_post_fork(struct task_struct *child) cset->nr_tasks++; css_set_move_task(child, NULL, cset, false); } + + /* + * If the cgroup has to be frozen, the new task has too. + * Let's set the JOBCTL_TRAP_FREEZE jobctl bit to get + * the task into the frozen state. + */ + if (unlikely(cgroup_task_freeze(child))) { + spin_lock(&child->sighand->siglock); + WARN_ON_ONCE(child->frozen); + child->jobctl |= JOBCTL_TRAP_FREEZE; + spin_unlock(&child->sighand->siglock); + + /* + * Calling cgroup_update_frozen() isn't required here, + * because it will be called anyway a bit later + * from do_freezer_trap(). So we avoid cgroup's + * transient switch from the frozen state and back. + */ + } + spin_unlock_irq(&css_set_lock); } @@ -5794,6 +5998,11 @@ void cgroup_exit(struct task_struct *tsk) spin_lock_irq(&css_set_lock); css_set_move_task(tsk, cset, NULL, false); cset->nr_tasks--; + + WARN_ON_ONCE(cgroup_task_frozen(tsk)); + if (unlikely(cgroup_task_freeze(tsk))) + cgroup_update_frozen(task_dfl_cgroup(tsk)); + spin_unlock_irq(&css_set_lock); } else { get_css_set(cset); diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 5f1b87330bee..80aa3f027ac3 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -64,8 +64,8 @@ static int current_css_set_read(struct seq_file *seq, void *v) css = cset->subsys[ss->id]; if (!css) continue; - seq_printf(seq, "%2d: %-4s\t- %lx[%d]\n", ss->id, ss->name, - (unsigned long)css, css->id); + seq_printf(seq, "%2d: %-4s\t- %p[%d]\n", ss->id, ss->name, + css, css->id); } rcu_read_unlock(); spin_unlock_irq(&css_set_lock); @@ -224,8 +224,8 @@ static int cgroup_subsys_states_read(struct seq_file *seq, void *v) if (css->parent) snprintf(pbuf, sizeof(pbuf) - 1, " P=%d", css->parent->id); - seq_printf(seq, "%2d: %-4s\t- %lx[%d] %d%s\n", ss->id, ss->name, - (unsigned long)css, css->id, + seq_printf(seq, "%2d: %-4s\t- %p[%d] %d%s\n", ss->id, ss->name, + css, css->id, atomic_read(&css->online_cnt), pbuf); } diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index 08236798d173..8cf010680678 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -1,481 +1,314 @@ -/* - * cgroup_freezer.c - control group freezer subsystem - * - * Copyright IBM Corporation, 2007 - * - * Author : Cedric Le Goater <clg@fr.ibm.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - */ - -#include <linux/export.h> -#include <linux/slab.h> +//SPDX-License-Identifier: GPL-2.0 #include <linux/cgroup.h> -#include <linux/fs.h> -#include <linux/uaccess.h> -#include <linux/freezer.h> -#include <linux/seq_file.h> -#include <linux/mutex.h> - -/* - * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is - * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared - * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING - * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of - * its ancestors has FREEZING_SELF set. - */ -enum freezer_state_flags { - CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */ - CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */ - CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */ - CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */ +#include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/sched/signal.h> - /* mask for all FREEZING flags */ - CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT, -}; +#include "cgroup-internal.h" -struct freezer { - struct cgroup_subsys_state css; - unsigned int state; -}; +#include <trace/events/cgroup.h> -static DEFINE_MUTEX(freezer_mutex); - -static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) +/* + * Propagate the cgroup frozen state upwards by the cgroup tree. + */ +static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen) { - return css ? container_of(css, struct freezer, css) : NULL; -} + int desc = 1; -static inline struct freezer *task_freezer(struct task_struct *task) -{ - return css_freezer(task_css(task, freezer_cgrp_id)); + /* + * If the new state is frozen, some freezing ancestor cgroups may change + * their state too, depending on if all their descendants are frozen. + * + * Otherwise, all ancestor cgroups are forced into the non-frozen state. + */ + while ((cgrp = cgroup_parent(cgrp))) { + if (frozen) { + cgrp->freezer.nr_frozen_descendants += desc; + if (!test_bit(CGRP_FROZEN, &cgrp->flags) && + test_bit(CGRP_FREEZE, &cgrp->flags) && + cgrp->freezer.nr_frozen_descendants == + cgrp->nr_descendants) { + set_bit(CGRP_FROZEN, &cgrp->flags); + cgroup_file_notify(&cgrp->events_file); + TRACE_CGROUP_PATH(notify_frozen, cgrp, 1); + desc++; + } + } else { + cgrp->freezer.nr_frozen_descendants -= desc; + if (test_bit(CGRP_FROZEN, &cgrp->flags)) { + clear_bit(CGRP_FROZEN, &cgrp->flags); + cgroup_file_notify(&cgrp->events_file); + TRACE_CGROUP_PATH(notify_frozen, cgrp, 0); + desc++; + } + } + } } -static struct freezer *parent_freezer(struct freezer *freezer) +/* + * Revisit the cgroup frozen state. + * Checks if the cgroup is really frozen and perform all state transitions. + */ +void cgroup_update_frozen(struct cgroup *cgrp) { - return css_freezer(freezer->css.parent); -} + bool frozen; -bool cgroup_freezing(struct task_struct *task) -{ - bool ret; + lockdep_assert_held(&css_set_lock); - rcu_read_lock(); - ret = task_freezer(task)->state & CGROUP_FREEZING; - rcu_read_unlock(); + /* + * If the cgroup has to be frozen (CGRP_FREEZE bit set), + * and all tasks are frozen and/or stopped, let's consider + * the cgroup frozen. Otherwise it's not frozen. + */ + frozen = test_bit(CGRP_FREEZE, &cgrp->flags) && + cgrp->freezer.nr_frozen_tasks == __cgroup_task_count(cgrp); - return ret; -} + if (frozen) { + /* Already there? */ + if (test_bit(CGRP_FROZEN, &cgrp->flags)) + return; -static const char *freezer_state_strs(unsigned int state) -{ - if (state & CGROUP_FROZEN) - return "FROZEN"; - if (state & CGROUP_FREEZING) - return "FREEZING"; - return "THAWED"; -}; - -static struct cgroup_subsys_state * -freezer_css_alloc(struct cgroup_subsys_state *parent_css) -{ - struct freezer *freezer; + set_bit(CGRP_FROZEN, &cgrp->flags); + } else { + /* Already there? */ + if (!test_bit(CGRP_FROZEN, &cgrp->flags)) + return; - freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL); - if (!freezer) - return ERR_PTR(-ENOMEM); + clear_bit(CGRP_FROZEN, &cgrp->flags); + } + cgroup_file_notify(&cgrp->events_file); + TRACE_CGROUP_PATH(notify_frozen, cgrp, frozen); - return &freezer->css; + /* Update the state of ancestor cgroups. */ + cgroup_propagate_frozen(cgrp, frozen); } -/** - * freezer_css_online - commit creation of a freezer css - * @css: css being created - * - * We're committing to creation of @css. Mark it online and inherit - * parent's freezing state while holding both parent's and our - * freezer->lock. +/* + * Increment cgroup's nr_frozen_tasks. */ -static int freezer_css_online(struct cgroup_subsys_state *css) +static void cgroup_inc_frozen_cnt(struct cgroup *cgrp) { - struct freezer *freezer = css_freezer(css); - struct freezer *parent = parent_freezer(freezer); - - mutex_lock(&freezer_mutex); - - freezer->state |= CGROUP_FREEZER_ONLINE; - - if (parent && (parent->state & CGROUP_FREEZING)) { - freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; - atomic_inc(&system_freezing_cnt); - } - - mutex_unlock(&freezer_mutex); - return 0; + cgrp->freezer.nr_frozen_tasks++; } -/** - * freezer_css_offline - initiate destruction of a freezer css - * @css: css being destroyed - * - * @css is going away. Mark it dead and decrement system_freezing_count if - * it was holding one. +/* + * Decrement cgroup's nr_frozen_tasks. */ -static void freezer_css_offline(struct cgroup_subsys_state *css) +static void cgroup_dec_frozen_cnt(struct cgroup *cgrp) { - struct freezer *freezer = css_freezer(css); - - mutex_lock(&freezer_mutex); - - if (freezer->state & CGROUP_FREEZING) - atomic_dec(&system_freezing_cnt); - - freezer->state = 0; - - mutex_unlock(&freezer_mutex); + cgrp->freezer.nr_frozen_tasks--; + WARN_ON_ONCE(cgrp->freezer.nr_frozen_tasks < 0); } -static void freezer_css_free(struct cgroup_subsys_state *css) +/* + * Enter frozen/stopped state, if not yet there. Update cgroup's counters, + * and revisit the state of the cgroup, if necessary. + */ +void cgroup_enter_frozen(void) { - kfree(css_freezer(css)); + struct cgroup *cgrp; + + if (current->frozen) + return; + + spin_lock_irq(&css_set_lock); + current->frozen = true; + cgrp = task_dfl_cgroup(current); + cgroup_inc_frozen_cnt(cgrp); + cgroup_update_frozen(cgrp); + spin_unlock_irq(&css_set_lock); } /* - * Tasks can be migrated into a different freezer anytime regardless of its - * current state. freezer_attach() is responsible for making new tasks - * conform to the current state. + * Conditionally leave frozen/stopped state. Update cgroup's counters, + * and revisit the state of the cgroup, if necessary. * - * Freezer state changes and task migration are synchronized via - * @freezer->lock. freezer_attach() makes the new tasks conform to the - * current state and all following state changes can see the new tasks. + * If always_leave is not set, and the cgroup is freezing, + * we're racing with the cgroup freezing. In this case, we don't + * drop the frozen counter to avoid a transient switch to + * the unfrozen state. */ -static void freezer_attach(struct cgroup_taskset *tset) +void cgroup_leave_frozen(bool always_leave) { - struct task_struct *task; - struct cgroup_subsys_state *new_css; - - mutex_lock(&freezer_mutex); - - /* - * Make the new tasks conform to the current state of @new_css. - * For simplicity, when migrating any task to a FROZEN cgroup, we - * revert it to FREEZING and let update_if_frozen() determine the - * correct state later. - * - * Tasks in @tset are on @new_css but may not conform to its - * current state before executing the following - !frozen tasks may - * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. - */ - cgroup_taskset_for_each(task, new_css, tset) { - struct freezer *freezer = css_freezer(new_css); - - if (!(freezer->state & CGROUP_FREEZING)) { - __thaw_task(task); - } else { - freeze_task(task); - /* clear FROZEN and propagate upwards */ - while (freezer && (freezer->state & CGROUP_FROZEN)) { - freezer->state &= ~CGROUP_FROZEN; - freezer = parent_freezer(freezer); - } - } + struct cgroup *cgrp; + + spin_lock_irq(&css_set_lock); + cgrp = task_dfl_cgroup(current); + if (always_leave || !test_bit(CGRP_FREEZE, &cgrp->flags)) { + cgroup_dec_frozen_cnt(cgrp); + cgroup_update_frozen(cgrp); + WARN_ON_ONCE(!current->frozen); + current->frozen = false; + } else if (!(current->jobctl & JOBCTL_TRAP_FREEZE)) { + spin_lock(¤t->sighand->siglock); + current->jobctl |= JOBCTL_TRAP_FREEZE; + set_thread_flag(TIF_SIGPENDING); + spin_unlock(¤t->sighand->siglock); } - - mutex_unlock(&freezer_mutex); + spin_unlock_irq(&css_set_lock); } -/** - * freezer_fork - cgroup post fork callback - * @task: a task which has just been forked - * - * @task has just been created and should conform to the current state of - * the cgroup_freezer it belongs to. This function may race against - * freezer_attach(). Losing to freezer_attach() means that we don't have - * to do anything as freezer_attach() will put @task into the appropriate - * state. +/* + * Freeze or unfreeze the task by setting or clearing the JOBCTL_TRAP_FREEZE + * jobctl bit. */ -static void freezer_fork(struct task_struct *task) +static void cgroup_freeze_task(struct task_struct *task, bool freeze) { - struct freezer *freezer; + unsigned long flags; - /* - * The root cgroup is non-freezable, so we can skip locking the - * freezer. This is safe regardless of race with task migration. - * If we didn't race or won, skipping is obviously the right thing - * to do. If we lost and root is the new cgroup, noop is still the - * right thing to do. - */ - if (task_css_is_root(task, freezer_cgrp_id)) + /* If the task is about to die, don't bother with freezing it. */ + if (!lock_task_sighand(task, &flags)) return; - mutex_lock(&freezer_mutex); - rcu_read_lock(); - - freezer = task_freezer(task); - if (freezer->state & CGROUP_FREEZING) - freeze_task(task); + if (freeze) { + task->jobctl |= JOBCTL_TRAP_FREEZE; + signal_wake_up(task, false); + } else { + task->jobctl &= ~JOBCTL_TRAP_FREEZE; + wake_up_process(task); + } - rcu_read_unlock(); - mutex_unlock(&freezer_mutex); + unlock_task_sighand(task, &flags); } -/** - * update_if_frozen - update whether a cgroup finished freezing - * @css: css of interest - * - * Once FREEZING is initiated, transition to FROZEN is lazily updated by - * calling this function. If the current state is FREEZING but not FROZEN, - * this function checks whether all tasks of this cgroup and the descendant - * cgroups finished freezing and, if so, sets FROZEN. - * - * The caller is responsible for grabbing RCU read lock and calling - * update_if_frozen() on all descendants prior to invoking this function. - * - * Task states and freezer state might disagree while tasks are being - * migrated into or out of @css, so we can't verify task states against - * @freezer state here. See freezer_attach() for details. +/* + * Freeze or unfreeze all tasks in the given cgroup. */ -static void update_if_frozen(struct cgroup_subsys_state *css) +static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) { - struct freezer *freezer = css_freezer(css); - struct cgroup_subsys_state *pos; struct css_task_iter it; struct task_struct *task; - lockdep_assert_held(&freezer_mutex); - - if (!(freezer->state & CGROUP_FREEZING) || - (freezer->state & CGROUP_FROZEN)) - return; + lockdep_assert_held(&cgroup_mutex); - /* are all (live) children frozen? */ - rcu_read_lock(); - css_for_each_child(pos, css) { - struct freezer *child = css_freezer(pos); - - if ((child->state & CGROUP_FREEZER_ONLINE) && - !(child->state & CGROUP_FROZEN)) { - rcu_read_unlock(); - return; - } - } - rcu_read_unlock(); + spin_lock_irq(&css_set_lock); + if (freeze) + set_bit(CGRP_FREEZE, &cgrp->flags); + else + clear_bit(CGRP_FREEZE, &cgrp->flags); + spin_unlock_irq(&css_set_lock); - /* are all tasks frozen? */ - css_task_iter_start(css, 0, &it); + if (freeze) + TRACE_CGROUP_PATH(freeze, cgrp); + else + TRACE_CGROUP_PATH(unfreeze, cgrp); + css_task_iter_start(&cgrp->self, 0, &it); while ((task = css_task_iter_next(&it))) { - if (freezing(task)) { - /* - * freezer_should_skip() indicates that the task - * should be skipped when determining freezing - * completion. Consider it frozen in addition to - * the usual frozen condition. - */ - if (!frozen(task) && !freezer_should_skip(task)) - goto out_iter_end; - } - } - - freezer->state |= CGROUP_FROZEN; -out_iter_end: - css_task_iter_end(&it); -} - -static int freezer_read(struct seq_file *m, void *v) -{ - struct cgroup_subsys_state *css = seq_css(m), *pos; - - mutex_lock(&freezer_mutex); - rcu_read_lock(); - - /* update states bottom-up */ - css_for_each_descendant_post(pos, css) { - if (!css_tryget_online(pos)) + /* + * Ignore kernel threads here. Freezing cgroups containing + * kthreads isn't supported. + */ + if (task->flags & PF_KTHREAD) continue; - rcu_read_unlock(); - - update_if_frozen(pos); - - rcu_read_lock(); - css_put(pos); + cgroup_freeze_task(task, freeze); } - - rcu_read_unlock(); - mutex_unlock(&freezer_mutex); - - seq_puts(m, freezer_state_strs(css_freezer(css)->state)); - seq_putc(m, '\n'); - return 0; -} - -static void freeze_cgroup(struct freezer *freezer) -{ - struct css_task_iter it; - struct task_struct *task; - - css_task_iter_start(&freezer->css, 0, &it); - while ((task = css_task_iter_next(&it))) - freeze_task(task); css_task_iter_end(&it); -} -static void unfreeze_cgroup(struct freezer *freezer) -{ - struct css_task_iter it; - struct task_struct *task; - - css_task_iter_start(&freezer->css, 0, &it); - while ((task = css_task_iter_next(&it))) - __thaw_task(task); - css_task_iter_end(&it); + /* + * Cgroup state should be revisited here to cover empty leaf cgroups + * and cgroups which descendants are already in the desired state. + */ + spin_lock_irq(&css_set_lock); + if (cgrp->nr_descendants == cgrp->freezer.nr_frozen_descendants) + cgroup_update_frozen(cgrp); + spin_unlock_irq(&css_set_lock); } -/** - * freezer_apply_state - apply state change to a single cgroup_freezer - * @freezer: freezer to apply state change to - * @freeze: whether to freeze or unfreeze - * @state: CGROUP_FREEZING_* flag to set or clear - * - * Set or clear @state on @cgroup according to @freeze, and perform - * freezing or thawing as necessary. +/* + * Adjust the task state (freeze or unfreeze) and revisit the state of + * source and destination cgroups. */ -static void freezer_apply_state(struct freezer *freezer, bool freeze, - unsigned int state) +void cgroup_freezer_migrate_task(struct task_struct *task, + struct cgroup *src, struct cgroup *dst) { - /* also synchronizes against task migration, see freezer_attach() */ - lockdep_assert_held(&freezer_mutex); + lockdep_assert_held(&css_set_lock); - if (!(freezer->state & CGROUP_FREEZER_ONLINE)) + /* + * Kernel threads are not supposed to be frozen at all. + */ + if (task->flags & PF_KTHREAD) return; - if (freeze) { - if (!(freezer->state & CGROUP_FREEZING)) - atomic_inc(&system_freezing_cnt); - freezer->state |= state; - freeze_cgroup(freezer); - } else { - bool was_freezing = freezer->state & CGROUP_FREEZING; - - freezer->state &= ~state; - - if (!(freezer->state & CGROUP_FREEZING)) { - if (was_freezing) - atomic_dec(&system_freezing_cnt); - freezer->state &= ~CGROUP_FROZEN; - unfreeze_cgroup(freezer); - } + /* + * Adjust counters of freezing and frozen tasks. + * Note, that if the task is frozen, but the destination cgroup is not + * frozen, we bump both counters to keep them balanced. + */ + if (task->frozen) { + cgroup_inc_frozen_cnt(dst); + cgroup_dec_frozen_cnt(src); } -} - -/** - * freezer_change_state - change the freezing state of a cgroup_freezer - * @freezer: freezer of interest - * @freeze: whether to freeze or thaw - * - * Freeze or thaw @freezer according to @freeze. The operations are - * recursive - all descendants of @freezer will be affected. - */ -static void freezer_change_state(struct freezer *freezer, bool freeze) -{ - struct cgroup_subsys_state *pos; + cgroup_update_frozen(dst); + cgroup_update_frozen(src); /* - * Update all its descendants in pre-order traversal. Each - * descendant will try to inherit its parent's FREEZING state as - * CGROUP_FREEZING_PARENT. + * Force the task to the desired state. */ - mutex_lock(&freezer_mutex); - rcu_read_lock(); - css_for_each_descendant_pre(pos, &freezer->css) { - struct freezer *pos_f = css_freezer(pos); - struct freezer *parent = parent_freezer(pos_f); - - if (!css_tryget_online(pos)) - continue; - rcu_read_unlock(); - - if (pos_f == freezer) - freezer_apply_state(pos_f, freeze, - CGROUP_FREEZING_SELF); - else - freezer_apply_state(pos_f, - parent->state & CGROUP_FREEZING, - CGROUP_FREEZING_PARENT); - - rcu_read_lock(); - css_put(pos); - } - rcu_read_unlock(); - mutex_unlock(&freezer_mutex); + cgroup_freeze_task(task, test_bit(CGRP_FREEZE, &dst->flags)); } -static ssize_t freezer_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) +void cgroup_freeze(struct cgroup *cgrp, bool freeze) { - bool freeze; + struct cgroup_subsys_state *css; + struct cgroup *dsct; + bool applied = false; - buf = strstrip(buf); + lockdep_assert_held(&cgroup_mutex); - if (strcmp(buf, freezer_state_strs(0)) == 0) - freeze = false; - else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) - freeze = true; - else - return -EINVAL; + /* + * Nothing changed? Just exit. + */ + if (cgrp->freezer.freeze == freeze) + return; - freezer_change_state(css_freezer(of_css(of)), freeze); - return nbytes; -} + cgrp->freezer.freeze = freeze; -static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - struct freezer *freezer = css_freezer(css); + /* + * Propagate changes downwards the cgroup tree. + */ + css_for_each_descendant_pre(css, &cgrp->self) { + dsct = css->cgroup; - return (bool)(freezer->state & CGROUP_FREEZING_SELF); -} + if (cgroup_is_dead(dsct)) + continue; -static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - struct freezer *freezer = css_freezer(css); + if (freeze) { + dsct->freezer.e_freeze++; + /* + * Already frozen because of ancestor's settings? + */ + if (dsct->freezer.e_freeze > 1) + continue; + } else { + dsct->freezer.e_freeze--; + /* + * Still frozen because of ancestor's settings? + */ + if (dsct->freezer.e_freeze > 0) + continue; - return (bool)(freezer->state & CGROUP_FREEZING_PARENT); -} + WARN_ON_ONCE(dsct->freezer.e_freeze < 0); + } + + /* + * Do change actual state: freeze or unfreeze. + */ + cgroup_do_freeze(dsct, freeze); + applied = true; + } -static struct cftype files[] = { - { - .name = "state", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = freezer_read, - .write = freezer_write, - }, - { - .name = "self_freezing", - .flags = CFTYPE_NOT_ON_ROOT, - .read_u64 = freezer_self_freezing_read, - }, - { - .name = "parent_freezing", - .flags = CFTYPE_NOT_ON_ROOT, - .read_u64 = freezer_parent_freezing_read, - }, - { } /* terminate */ -}; - -struct cgroup_subsys freezer_cgrp_subsys = { - .css_alloc = freezer_css_alloc, - .css_online = freezer_css_online, - .css_offline = freezer_css_offline, - .css_free = freezer_css_free, - .attach = freezer_attach, - .fork = freezer_fork, - .legacy_cftypes = files, -}; + /* + * Even if the actual state hasn't changed, let's notify a user. + * The state can be enforced by an ancestor cgroup: the cgroup + * can already be in the desired state or it can be locked in the + * opposite state, so that the transition will never happen. + * In both cases it's better to notify a user, that there is + * nothing to wait for. + */ + if (!applied) { + TRACE_CGROUP_PATH(notify_frozen, cgrp, + test_bit(CGRP_FROZEN, &cgrp->flags)); + cgroup_file_notify(&cgrp->events_file); + } +} diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c new file mode 100644 index 000000000000..08236798d173 --- /dev/null +++ b/kernel/cgroup/legacy_freezer.c @@ -0,0 +1,481 @@ +/* + * cgroup_freezer.c - control group freezer subsystem + * + * Copyright IBM Corporation, 2007 + * + * Author : Cedric Le Goater <clg@fr.ibm.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include <linux/export.h> +#include <linux/slab.h> +#include <linux/cgroup.h> +#include <linux/fs.h> +#include <linux/uaccess.h> +#include <linux/freezer.h> +#include <linux/seq_file.h> +#include <linux/mutex.h> + +/* + * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is + * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared + * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING + * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of + * its ancestors has FREEZING_SELF set. + */ +enum freezer_state_flags { + CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */ + CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */ + CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */ + CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */ + + /* mask for all FREEZING flags */ + CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT, +}; + +struct freezer { + struct cgroup_subsys_state css; + unsigned int state; +}; + +static DEFINE_MUTEX(freezer_mutex); + +static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct freezer, css) : NULL; +} + +static inline struct freezer *task_freezer(struct task_struct *task) +{ + return css_freezer(task_css(task, freezer_cgrp_id)); +} + +static struct freezer *parent_freezer(struct freezer *freezer) +{ + return css_freezer(freezer->css.parent); +} + +bool cgroup_freezing(struct task_struct *task) +{ + bool ret; + + rcu_read_lock(); + ret = task_freezer(task)->state & CGROUP_FREEZING; + rcu_read_unlock(); + + return ret; +} + +static const char *freezer_state_strs(unsigned int state) +{ + if (state & CGROUP_FROZEN) + return "FROZEN"; + if (state & CGROUP_FREEZING) + return "FREEZING"; + return "THAWED"; +}; + +static struct cgroup_subsys_state * +freezer_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct freezer *freezer; + + freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL); + if (!freezer) + return ERR_PTR(-ENOMEM); + + return &freezer->css; +} + +/** + * freezer_css_online - commit creation of a freezer css + * @css: css being created + * + * We're committing to creation of @css. Mark it online and inherit + * parent's freezing state while holding both parent's and our + * freezer->lock. + */ +static int freezer_css_online(struct cgroup_subsys_state *css) +{ + struct freezer *freezer = css_freezer(css); + struct freezer *parent = parent_freezer(freezer); + + mutex_lock(&freezer_mutex); + + freezer->state |= CGROUP_FREEZER_ONLINE; + + if (parent && (parent->state & CGROUP_FREEZING)) { + freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; + atomic_inc(&system_freezing_cnt); + } + + mutex_unlock(&freezer_mutex); + return 0; +} + +/** + * freezer_css_offline - initiate destruction of a freezer css + * @css: css being destroyed + * + * @css is going away. Mark it dead and decrement system_freezing_count if + * it was holding one. + */ +static void freezer_css_offline(struct cgroup_subsys_state *css) +{ + struct freezer *freezer = css_freezer(css); + + mutex_lock(&freezer_mutex); + + if (freezer->state & CGROUP_FREEZING) + atomic_dec(&system_freezing_cnt); + + freezer->state = 0; + + mutex_unlock(&freezer_mutex); +} + +static void freezer_css_free(struct cgroup_subsys_state *css) +{ + kfree(css_freezer(css)); +} + +/* + * Tasks can be migrated into a different freezer anytime regardless of its + * current state. freezer_attach() is responsible for making new tasks + * conform to the current state. + * + * Freezer state changes and task migration are synchronized via + * @freezer->lock. freezer_attach() makes the new tasks conform to the + * current state and all following state changes can see the new tasks. + */ +static void freezer_attach(struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct cgroup_subsys_state *new_css; + + mutex_lock(&freezer_mutex); + + /* + * Make the new tasks conform to the current state of @new_css. + * For simplicity, when migrating any task to a FROZEN cgroup, we + * revert it to FREEZING and let update_if_frozen() determine the + * correct state later. + * + * Tasks in @tset are on @new_css but may not conform to its + * current state before executing the following - !frozen tasks may + * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. + */ + cgroup_taskset_for_each(task, new_css, tset) { + struct freezer *freezer = css_freezer(new_css); + + if (!(freezer->state & CGROUP_FREEZING)) { + __thaw_task(task); + } else { + freeze_task(task); + /* clear FROZEN and propagate upwards */ + while (freezer && (freezer->state & CGROUP_FROZEN)) { + freezer->state &= ~CGROUP_FROZEN; + freezer = parent_freezer(freezer); + } + } + } + + mutex_unlock(&freezer_mutex); +} + +/** + * freezer_fork - cgroup post fork callback + * @task: a task which has just been forked + * + * @task has just been created and should conform to the current state of + * the cgroup_freezer it belongs to. This function may race against + * freezer_attach(). Losing to freezer_attach() means that we don't have + * to do anything as freezer_attach() will put @task into the appropriate + * state. + */ +static void freezer_fork(struct task_struct *task) +{ + struct freezer *freezer; + + /* + * The root cgroup is non-freezable, so we can skip locking the + * freezer. This is safe regardless of race with task migration. + * If we didn't race or won, skipping is obviously the right thing + * to do. If we lost and root is the new cgroup, noop is still the + * right thing to do. + */ + if (task_css_is_root(task, freezer_cgrp_id)) + return; + + mutex_lock(&freezer_mutex); + rcu_read_lock(); + + freezer = task_freezer(task); + if (freezer->state & CGROUP_FREEZING) + freeze_task(task); + + rcu_read_unlock(); + mutex_unlock(&freezer_mutex); +} + +/** + * update_if_frozen - update whether a cgroup finished freezing + * @css: css of interest + * + * Once FREEZING is initiated, transition to FROZEN is lazily updated by + * calling this function. If the current state is FREEZING but not FROZEN, + * this function checks whether all tasks of this cgroup and the descendant + * cgroups finished freezing and, if so, sets FROZEN. + * + * The caller is responsible for grabbing RCU read lock and calling + * update_if_frozen() on all descendants prior to invoking this function. + * + * Task states and freezer state might disagree while tasks are being + * migrated into or out of @css, so we can't verify task states against + * @freezer state here. See freezer_attach() for details. + */ +static void update_if_frozen(struct cgroup_subsys_state *css) +{ + struct freezer *freezer = css_freezer(css); + struct cgroup_subsys_state *pos; + struct css_task_iter it; + struct task_struct *task; + + lockdep_assert_held(&freezer_mutex); + + if (!(freezer->state & CGROUP_FREEZING) || + (freezer->state & CGROUP_FROZEN)) + return; + + /* are all (live) children frozen? */ + rcu_read_lock(); + css_for_each_child(pos, css) { + struct freezer *child = css_freezer(pos); + + if ((child->state & CGROUP_FREEZER_ONLINE) && + !(child->state & CGROUP_FROZEN)) { + rcu_read_unlock(); + return; + } + } + rcu_read_unlock(); + + /* are all tasks frozen? */ + css_task_iter_start(css, 0, &it); + + while ((task = css_task_iter_next(&it))) { + if (freezing(task)) { + /* + * freezer_should_skip() indicates that the task + * should be skipped when determining freezing + * completion. Consider it frozen in addition to + * the usual frozen condition. + */ + if (!frozen(task) && !freezer_should_skip(task)) + goto out_iter_end; + } + } + + freezer->state |= CGROUP_FROZEN; +out_iter_end: + css_task_iter_end(&it); +} + +static int freezer_read(struct seq_file *m, void *v) +{ + struct cgroup_subsys_state *css = seq_css(m), *pos; + + mutex_lock(&freezer_mutex); + rcu_read_lock(); + + /* update states bottom-up */ + css_for_each_descendant_post(pos, css) { + if (!css_tryget_online(pos)) + continue; + rcu_read_unlock(); + + update_if_frozen(pos); + + rcu_read_lock(); + css_put(pos); + } + + rcu_read_unlock(); + mutex_unlock(&freezer_mutex); + + seq_puts(m, freezer_state_strs(css_freezer(css)->state)); + seq_putc(m, '\n'); + return 0; +} + +static void freeze_cgroup(struct freezer *freezer) +{ + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&freezer->css, 0, &it); + while ((task = css_task_iter_next(&it))) + freeze_task(task); + css_task_iter_end(&it); +} + +static void unfreeze_cgroup(struct freezer *freezer) +{ + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&freezer->css, 0, &it); + while ((task = css_task_iter_next(&it))) + __thaw_task(task); + css_task_iter_end(&it); +} + +/** + * freezer_apply_state - apply state change to a single cgroup_freezer + * @freezer: freezer to apply state change to + * @freeze: whether to freeze or unfreeze + * @state: CGROUP_FREEZING_* flag to set or clear + * + * Set or clear @state on @cgroup according to @freeze, and perform + * freezing or thawing as necessary. + */ +static void freezer_apply_state(struct freezer *freezer, bool freeze, + unsigned int state) +{ + /* also synchronizes against task migration, see freezer_attach() */ + lockdep_assert_held(&freezer_mutex); + + if (!(freezer->state & CGROUP_FREEZER_ONLINE)) + return; + + if (freeze) { + if (!(freezer->state & CGROUP_FREEZING)) + atomic_inc(&system_freezing_cnt); + freezer->state |= state; + freeze_cgroup(freezer); + } else { + bool was_freezing = freezer->state & CGROUP_FREEZING; + + freezer->state &= ~state; + + if (!(freezer->state & CGROUP_FREEZING)) { + if (was_freezing) + atomic_dec(&system_freezing_cnt); + freezer->state &= ~CGROUP_FROZEN; + unfreeze_cgroup(freezer); + } + } +} + +/** + * freezer_change_state - change the freezing state of a cgroup_freezer + * @freezer: freezer of interest + * @freeze: whether to freeze or thaw + * + * Freeze or thaw @freezer according to @freeze. The operations are + * recursive - all descendants of @freezer will be affected. + */ +static void freezer_change_state(struct freezer *freezer, bool freeze) +{ + struct cgroup_subsys_state *pos; + + /* + * Update all its descendants in pre-order traversal. Each + * descendant will try to inherit its parent's FREEZING state as + * CGROUP_FREEZING_PARENT. + */ + mutex_lock(&freezer_mutex); + rcu_read_lock(); + css_for_each_descendant_pre(pos, &freezer->css) { + struct freezer *pos_f = css_freezer(pos); + struct freezer *parent = parent_freezer(pos_f); + + if (!css_tryget_online(pos)) + continue; + rcu_read_unlock(); + + if (pos_f == freezer) + freezer_apply_state(pos_f, freeze, + CGROUP_FREEZING_SELF); + else + freezer_apply_state(pos_f, + parent->state & CGROUP_FREEZING, + CGROUP_FREEZING_PARENT); + + rcu_read_lock(); + css_put(pos); + } + rcu_read_unlock(); + mutex_unlock(&freezer_mutex); +} + +static ssize_t freezer_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + bool freeze; + + buf = strstrip(buf); + + if (strcmp(buf, freezer_state_strs(0)) == 0) + freeze = false; + else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) + freeze = true; + else + return -EINVAL; + + freezer_change_state(css_freezer(of_css(of)), freeze); + return nbytes; +} + +static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct freezer *freezer = css_freezer(css); + + return (bool)(freezer->state & CGROUP_FREEZING_SELF); +} + +static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct freezer *freezer = css_freezer(css); + + return (bool)(freezer->state & CGROUP_FREEZING_PARENT); +} + +static struct cftype files[] = { + { + .name = "state", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = freezer_read, + .write = freezer_write, + }, + { + .name = "self_freezing", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = freezer_self_freezing_read, + }, + { + .name = "parent_freezing", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = freezer_parent_freezing_read, + }, + { } /* terminate */ +}; + +struct cgroup_subsys freezer_cgrp_subsys = { + .css_alloc = freezer_css_alloc, + .css_online = freezer_css_online, + .css_offline = freezer_css_offline, + .css_free = freezer_css_free, + .attach = freezer_attach, + .fork = freezer_fork, + .legacy_cftypes = files, +}; diff --git a/kernel/compat.c b/kernel/compat.c index d8a36c6ad7c9..b5f7063c0db6 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -346,8 +346,11 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat) return -EFAULT; switch (_NSIG_WORDS) { case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 ); + /* fall through */ case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 ); + /* fall through */ case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 ); + /* fall through */ case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 ); } #else diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 7510dc687c0d..4b280fc7dd67 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -1033,13 +1033,14 @@ int gdb_serial_stub(struct kgdb_state *ks) return DBG_PASS_EVENT; } #endif + /* Fall through */ case 'C': /* Exception passing */ tmp = gdb_cmd_exception_pass(ks); if (tmp > 0) goto default_handle; if (tmp == 0) break; - /* Fall through on tmp < 0 */ + /* Fall through - on tmp < 0 */ case 'c': /* Continue packet */ case 's': /* Single step packet */ if (kgdb_contthread && kgdb_contthread != current) { @@ -1048,7 +1049,7 @@ int gdb_serial_stub(struct kgdb_state *ks) break; } dbg_activate_sw_breakpoints(); - /* Fall through to default processing */ + /* Fall through - to default processing */ default: default_handle: error = kgdb_arch_handle_exception(ks->ex_vector, @@ -1094,10 +1095,10 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd) return error; case 's': case 'c': - strcpy(remcom_in_buffer, cmd); + strscpy(remcom_in_buffer, cmd, sizeof(remcom_in_buffer)); return 0; case '$': - strcpy(remcom_in_buffer, cmd); + strscpy(remcom_in_buffer, cmd, sizeof(remcom_in_buffer)); gdbstub_use_prev_in_buf = strlen(remcom_in_buffer); gdbstub_prev_in_buf_pos = 0; return 0; diff --git a/kernel/debug/kdb/Makefile b/kernel/debug/kdb/Makefile index d4fc58f4b88d..efac857c5511 100644 --- a/kernel/debug/kdb/Makefile +++ b/kernel/debug/kdb/Makefile @@ -6,7 +6,6 @@ # Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. # -CCVERSION := $(shell $(CC) -v 2>&1 | sed -ne '$$p') obj-y := kdb_io.o kdb_main.o kdb_support.o kdb_bt.o gen-kdb_cmds.o kdb_bp.o kdb_debugger.o obj-$(CONFIG_KDB_KEYBOARD) += kdb_keyboard.o diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 6a4b41484afe..3a5184eb6977 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -446,7 +446,7 @@ poll_again: char *kdb_getstr(char *buffer, size_t bufsize, const char *prompt) { if (prompt && kdb_prompt_str != prompt) - strncpy(kdb_prompt_str, prompt, CMD_BUFLEN); + strscpy(kdb_prompt_str, prompt, CMD_BUFLEN); kdb_printf(kdb_prompt_str); kdb_nextline = 1; /* Prompt and input resets line number */ return kdb_read(buffer, bufsize); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 82a3b32a7cfc..9ecfa37c7fbf 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2522,7 +2522,6 @@ static int kdb_summary(int argc, const char **argv) kdb_printf("machine %s\n", init_uts_ns.name.machine); kdb_printf("nodename %s\n", init_uts_ns.name.nodename); kdb_printf("domainname %s\n", init_uts_ns.name.domainname); - kdb_printf("ccversion %s\n", __stringify(CCVERSION)); now = __ktime_get_real_seconds(); time64_to_tm(now, 0, &tm); @@ -2584,7 +2583,7 @@ static int kdb_per_cpu(int argc, const char **argv) diag = kdbgetularg(argv[3], &whichcpu); if (diag) return diag; - if (!cpu_online(whichcpu)) { + if (whichcpu >= nr_cpu_ids || !cpu_online(whichcpu)) { kdb_printf("cpu %ld is not online\n", whichcpu); return KDB_BADCPUNUM; } diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 50bf9b119bad..b8e6306e7e13 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -192,7 +192,7 @@ int kallsyms_symbol_complete(char *prefix_name, int max_len) while ((name = kdb_walk_kallsyms(&pos))) { if (strncmp(name, prefix_name, prefix_len) == 0) { - strcpy(ks_namebuf, name); + strscpy(ks_namebuf, name, sizeof(ks_namebuf)); /* Work out the longest name that matches the prefix */ if (++number == 1) { prev_len = min_t(int, max_len-1, diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index a06ba3013b3b..83d711f8d665 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -38,6 +38,9 @@ config ARCH_HAS_SYNC_DMA_FOR_CPU config ARCH_HAS_SYNC_DMA_FOR_CPU_ALL bool +config ARCH_HAS_DMA_PREP_COHERENT + bool + config ARCH_HAS_DMA_COHERENT_TO_PFN bool @@ -57,6 +60,7 @@ config SWIOTLB config DMA_REMAP depends on MMU + select GENERIC_ALLOCATOR bool config DMA_DIRECT_REMAP diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index fcdb23e8d2fc..2c2772e9702a 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -311,7 +311,7 @@ static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr, size_t size) { return swiotlb_force != SWIOTLB_FORCE && - (!dev || dma_capable(dev, dma_addr, size)); + dma_capable(dev, dma_addr, size); } dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index c000906348c9..f7afdadb6770 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -238,17 +238,13 @@ u64 dma_get_required_mask(struct device *dev) } EXPORT_SYMBOL_GPL(dma_get_required_mask); -#ifndef arch_dma_alloc_attrs -#define arch_dma_alloc_attrs(dev) (true) -#endif - void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); void *cpu_addr; - WARN_ON_ONCE(dev && !dev->coherent_dma_mask); + WARN_ON_ONCE(!dev->coherent_dma_mask); if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr)) return cpu_addr; @@ -256,9 +252,6 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, /* let the implementation decide on the zone to allocate from: */ flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM); - if (!arch_dma_alloc_attrs(&dev)) - return NULL; - if (dma_is_direct(ops)) cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs); else if (ops->alloc) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 53012db1e53c..6f7619c1f877 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -452,6 +452,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, unsigned long mask; unsigned long offset_slots; unsigned long max_slots; + unsigned long tmp_io_tlb_used; if (no_iotlb_memory) panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); @@ -538,9 +539,12 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, } while (index != wrap); not_found: + tmp_io_tlb_used = io_tlb_used; + spin_unlock_irqrestore(&io_tlb_lock, flags); if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) - dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes)\n", size); + dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", + size, io_tlb_nslabs, tmp_io_tlb_used); return DMA_MAPPING_ERROR; found: io_tlb_used += nslots; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e6a0d6be87e3..78f61bfc6b79 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -161,7 +161,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct mmu_notifier_range range; struct mem_cgroup *memcg; - mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, + addr + PAGE_SIZE); VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); @@ -2028,7 +2029,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) if (uc->handler) { rc = uc->handler(uc, regs); WARN(rc & ~UPROBE_HANDLER_MASK, - "bad rc=0x%x from %pf()\n", rc, uc->handler); + "bad rc=0x%x from %ps()\n", rc, uc->handler); } if (uc->ret_handler) diff --git a/kernel/exit.c b/kernel/exit.c index 2166c2d92ddc..8361a560cd1d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -422,7 +422,7 @@ retry: * freed task structure. */ if (atomic_read(&mm->mm_users) <= 1) { - mm->owner = NULL; + WRITE_ONCE(mm->owner, NULL); return; } @@ -462,7 +462,7 @@ retry: * most likely racing with swapoff (try_to_unuse()) or /proc or * ptrace or page migration (get_task_mm()). Mark owner as NULL. */ - mm->owner = NULL; + WRITE_ONCE(mm->owner, NULL); return; assign_new_owner: @@ -483,7 +483,7 @@ assign_new_owner: put_task_struct(c); goto retry; } - mm->owner = c; + WRITE_ONCE(mm->owner, c); task_unlock(c); put_task_struct(c); } diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 17f75b545f66..feb80712b913 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -210,7 +210,7 @@ static int fei_seq_show(struct seq_file *m, void *v) { struct fei_attr *attr = list_entry(v, struct fei_attr, list); - seq_printf(m, "%pf\n", attr->kp.addr); + seq_printf(m, "%ps\n", attr->kp.addr); return 0; } diff --git a/kernel/fork.c b/kernel/fork.c index fbe9dfcd8680..b4cba953040a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -11,6 +11,7 @@ * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' */ +#include <linux/anon_inodes.h> #include <linux/slab.h> #include <linux/sched/autogroup.h> #include <linux/sched/mm.h> @@ -21,6 +22,7 @@ #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> +#include <linux/seq_file.h> #include <linux/rtmutex.h> #include <linux/init.h> #include <linux/unistd.h> @@ -953,6 +955,15 @@ static void mm_init_aio(struct mm_struct *mm) #endif } +static __always_inline void mm_clear_owner(struct mm_struct *mm, + struct task_struct *p) +{ +#ifdef CONFIG_MEMCG + if (mm->owner == p) + WRITE_ONCE(mm->owner, NULL); +#endif +} + static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) { #ifdef CONFIG_MEMCG @@ -1223,7 +1234,9 @@ static int wait_for_vfork_done(struct task_struct *child, int killed; freezer_do_not_count(); + cgroup_enter_frozen(); killed = wait_for_completion_killable(vfork); + cgroup_leave_frozen(false); freezer_count(); if (killed) { @@ -1339,6 +1352,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk, free_pt: /* don't put binfmt in mmput, we haven't got module yet */ mm->binfmt = NULL; + mm_init_owner(mm, NULL); mmput(mm); fail_nomem: @@ -1670,6 +1684,73 @@ static inline void rcu_copy_process(struct task_struct *p) #endif /* #ifdef CONFIG_TASKS_RCU */ } +static int pidfd_release(struct inode *inode, struct file *file) +{ + struct pid *pid = file->private_data; + + file->private_data = NULL; + put_pid(pid); + return 0; +} + +#ifdef CONFIG_PROC_FS +static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct pid_namespace *ns = proc_pid_ns(file_inode(m->file)); + struct pid *pid = f->private_data; + + seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns)); + seq_putc(m, '\n'); +} +#endif + +const struct file_operations pidfd_fops = { + .release = pidfd_release, +#ifdef CONFIG_PROC_FS + .show_fdinfo = pidfd_show_fdinfo, +#endif +}; + +/** + * pidfd_create() - Create a new pid file descriptor. + * + * @pid: struct pid that the pidfd will reference + * + * This creates a new pid file descriptor with the O_CLOEXEC flag set. + * + * Note, that this function can only be called after the fd table has + * been unshared to avoid leaking the pidfd to the new process. + * + * Return: On success, a cloexec pidfd is returned. + * On error, a negative errno number will be returned. + */ +static int pidfd_create(struct pid *pid) +{ + int fd; + + fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), + O_RDWR | O_CLOEXEC); + if (fd < 0) + put_pid(pid); + + return fd; +} + +static void __delayed_free_task(struct rcu_head *rhp) +{ + struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + + free_task(tsk); +} + +static __always_inline void delayed_free_task(struct task_struct *tsk) +{ + if (IS_ENABLED(CONFIG_MEMCG)) + call_rcu(&tsk->rcu, __delayed_free_task); + else + free_task(tsk); +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -1682,13 +1763,14 @@ static __latent_entropy struct task_struct *copy_process( unsigned long clone_flags, unsigned long stack_start, unsigned long stack_size, + int __user *parent_tidptr, int __user *child_tidptr, struct pid *pid, int trace, unsigned long tls, int node) { - int retval; + int pidfd = -1, retval; struct task_struct *p; struct multiprocess_signals delayed; @@ -1738,6 +1820,31 @@ static __latent_entropy struct task_struct *copy_process( return ERR_PTR(-EINVAL); } + if (clone_flags & CLONE_PIDFD) { + int reserved; + + /* + * - CLONE_PARENT_SETTID is useless for pidfds and also + * parent_tidptr is used to return pidfds. + * - CLONE_DETACHED is blocked so that we can potentially + * reuse it later for CLONE_PIDFD. + * - CLONE_THREAD is blocked until someone really needs it. + */ + if (clone_flags & + (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD)) + return ERR_PTR(-EINVAL); + + /* + * Verify that parent_tidptr is sane so we can potentially + * reuse it later. + */ + if (get_user(reserved, parent_tidptr)) + return ERR_PTR(-EFAULT); + + if (reserved != 0) + return ERR_PTR(-EINVAL); + } + /* * Force any signals received before this point to be delivered * before the fork happens. Collect up signals sent to multiple @@ -1944,6 +2051,22 @@ static __latent_entropy struct task_struct *copy_process( } } + /* + * This has to happen after we've potentially unshared the file + * descriptor table (so that the pidfd doesn't leak into the child + * if the fd table isn't shared). + */ + if (clone_flags & CLONE_PIDFD) { + retval = pidfd_create(pid); + if (retval < 0) + goto bad_fork_free_pid; + + pidfd = retval; + retval = put_user(pidfd, parent_tidptr); + if (retval) + goto bad_fork_put_pidfd; + } + #ifdef CONFIG_BLOCK p->plug = NULL; #endif @@ -1970,7 +2093,7 @@ static __latent_entropy struct task_struct *copy_process( #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); #endif - clear_all_latency_tracing(p); + clear_tsk_latency_tracing(p); /* ok, now we should be set up.. */ p->pid = pid_nr(pid); @@ -2004,7 +2127,7 @@ static __latent_entropy struct task_struct *copy_process( */ retval = cgroup_can_fork(p); if (retval) - goto bad_fork_free_pid; + goto bad_fork_cgroup_threadgroup_change_end; /* * From this point on we must avoid any synchronous user-space @@ -2119,8 +2242,12 @@ bad_fork_cancel_cgroup: spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); cgroup_cancel_fork(p); -bad_fork_free_pid: +bad_fork_cgroup_threadgroup_change_end: cgroup_threadgroup_change_end(current); +bad_fork_put_pidfd: + if (clone_flags & CLONE_PIDFD) + ksys_close(pidfd); +bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_thread: @@ -2131,8 +2258,10 @@ bad_fork_cleanup_io: bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: - if (p->mm) + if (p->mm) { + mm_clear_owner(p->mm, p); mmput(p->mm); + } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) free_signal_struct(p->signal); @@ -2163,7 +2292,7 @@ bad_fork_cleanup_count: bad_fork_free: p->state = TASK_DEAD; put_task_stack(p); - free_task(p); + delayed_free_task(p); fork_out: spin_lock_irq(¤t->sighand->siglock); hlist_del_init(&delayed.node); @@ -2184,7 +2313,7 @@ static inline void init_idle_pids(struct task_struct *idle) struct task_struct *fork_idle(int cpu) { struct task_struct *task; - task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0, + task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0, cpu_to_node(cpu)); if (!IS_ERR(task)) { init_idle_pids(task); @@ -2236,7 +2365,7 @@ long _do_fork(unsigned long clone_flags, trace = 0; } - p = copy_process(clone_flags, stack_start, stack_size, + p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr, child_tidptr, NULL, trace, tls, NUMA_NO_NODE); add_latent_entropy(); diff --git a/kernel/futex.c b/kernel/futex.c index 6262f1534ac9..2268b97d5439 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -543,7 +543,7 @@ again: if (unlikely(should_fail_futex(fshared))) return -EFAULT; - err = get_user_pages_fast(address, 1, 1, &page); + err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); /* * If write access is not required (eg. FUTEX_WAIT), try * and get read-only access. diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 1e3823fa799b..f71c1adcff31 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -53,6 +53,7 @@ config GCOV_PROFILE_ALL choice prompt "Specify GCOV format" depends on GCOV_KERNEL + depends on CC_IS_GCC ---help--- The gcov format is usually determined by the GCC version, and the default is chosen according to your GCC version. However, there are @@ -62,7 +63,7 @@ choice config GCOV_FORMAT_3_4 bool "GCC 3.4 format" - depends on CC_IS_GCC && GCC_VERSION < 40700 + depends on GCC_VERSION < 40700 ---help--- Select this option to use the format defined by GCC 3.4. diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index ff06d64df397..d66a74b0f100 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile @@ -2,5 +2,6 @@ ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' obj-y := base.o fs.o -obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_3_4.o -obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_4_7.o +obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_base.o gcc_3_4.o +obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_base.o gcc_4_7.o +obj-$(CONFIG_CC_IS_CLANG) += clang.o diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index 9c7c8d5c18f2..0ffe9f194080 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -22,88 +22,8 @@ #include <linux/sched.h> #include "gcov.h" -static int gcov_events_enabled; -static DEFINE_MUTEX(gcov_lock); - -/* - * __gcov_init is called by gcc-generated constructor code for each object - * file compiled with -fprofile-arcs. - */ -void __gcov_init(struct gcov_info *info) -{ - static unsigned int gcov_version; - - mutex_lock(&gcov_lock); - if (gcov_version == 0) { - gcov_version = gcov_info_version(info); - /* - * Printing gcc's version magic may prove useful for debugging - * incompatibility reports. - */ - pr_info("version magic: 0x%x\n", gcov_version); - } - /* - * Add new profiling data structure to list and inform event - * listener. - */ - gcov_info_link(info); - if (gcov_events_enabled) - gcov_event(GCOV_ADD, info); - mutex_unlock(&gcov_lock); -} -EXPORT_SYMBOL(__gcov_init); - -/* - * These functions may be referenced by gcc-generated profiling code but serve - * no function for kernel profiling. - */ -void __gcov_flush(void) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_flush); - -void __gcov_merge_add(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_add); - -void __gcov_merge_single(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_single); - -void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_delta); - -void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_ior); - -void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_time_profile); - -void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_icall_topn); - -void __gcov_exit(void) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_exit); +int gcov_events_enabled; +DEFINE_MUTEX(gcov_lock); /** * gcov_enable_events - enable event reporting through gcov_event() @@ -144,7 +64,7 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event, /* Remove entries located in module from linked list. */ while ((info = gcov_info_next(info))) { - if (within_module((unsigned long)info, mod)) { + if (gcov_info_within_module(info, mod)) { gcov_info_unlink(prev, info); if (gcov_events_enabled) gcov_event(GCOV_REMOVE, info); diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c new file mode 100644 index 000000000000..c94b820a1b62 --- /dev/null +++ b/kernel/gcov/clang.c @@ -0,0 +1,581 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 Google, Inc. + * modified from kernel/gcov/gcc_4_7.c + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * + * LLVM uses profiling data that's deliberately similar to GCC, but has a + * very different way of exporting that data. LLVM calls llvm_gcov_init() once + * per module, and provides a couple of callbacks that we can use to ask for + * more data. + * + * We care about the "writeout" callback, which in turn calls back into + * compiler-rt/this module to dump all the gathered coverage data to disk: + * + * llvm_gcda_start_file() + * llvm_gcda_emit_function() + * llvm_gcda_emit_arcs() + * llvm_gcda_emit_function() + * llvm_gcda_emit_arcs() + * [... repeats for each function ...] + * llvm_gcda_summary_info() + * llvm_gcda_end_file() + * + * This design is much more stateless and unstructured than gcc's, and is + * intended to run at process exit. This forces us to keep some local state + * about which module we're dealing with at the moment. On the other hand, it + * also means we don't depend as much on how LLVM represents profiling data + * internally. + * + * See LLVM's lib/Transforms/Instrumentation/GCOVProfiling.cpp for more + * details on how this works, particularly GCOVProfiler::emitProfileArcs(), + * GCOVProfiler::insertCounterWriteout(), and + * GCOVProfiler::insertFlush(). + */ + +#define pr_fmt(fmt) "gcov: " fmt + +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/printk.h> +#include <linux/ratelimit.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include "gcov.h" + +typedef void (*llvm_gcov_callback)(void); + +struct gcov_info { + struct list_head head; + + const char *filename; + unsigned int version; + u32 checksum; + + struct list_head functions; +}; + +struct gcov_fn_info { + struct list_head head; + + u32 ident; + u32 checksum; + u8 use_extra_checksum; + u32 cfg_checksum; + + u32 num_counters; + u64 *counters; + const char *function_name; +}; + +static struct gcov_info *current_info; + +static LIST_HEAD(clang_gcov_list); + +void llvm_gcov_init(llvm_gcov_callback writeout, llvm_gcov_callback flush) +{ + struct gcov_info *info = kzalloc(sizeof(*info), GFP_KERNEL); + + if (!info) + return; + + INIT_LIST_HEAD(&info->head); + INIT_LIST_HEAD(&info->functions); + + mutex_lock(&gcov_lock); + + list_add_tail(&info->head, &clang_gcov_list); + current_info = info; + writeout(); + current_info = NULL; + if (gcov_events_enabled) + gcov_event(GCOV_ADD, info); + + mutex_unlock(&gcov_lock); +} +EXPORT_SYMBOL(llvm_gcov_init); + +void llvm_gcda_start_file(const char *orig_filename, const char version[4], + u32 checksum) +{ + current_info->filename = orig_filename; + memcpy(¤t_info->version, version, sizeof(current_info->version)); + current_info->checksum = checksum; +} +EXPORT_SYMBOL(llvm_gcda_start_file); + +void llvm_gcda_emit_function(u32 ident, const char *function_name, + u32 func_checksum, u8 use_extra_checksum, u32 cfg_checksum) +{ + struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL); + + if (!info) + return; + + INIT_LIST_HEAD(&info->head); + info->ident = ident; + info->checksum = func_checksum; + info->use_extra_checksum = use_extra_checksum; + info->cfg_checksum = cfg_checksum; + if (function_name) + info->function_name = kstrdup(function_name, GFP_KERNEL); + + list_add_tail(&info->head, ¤t_info->functions); +} +EXPORT_SYMBOL(llvm_gcda_emit_function); + +void llvm_gcda_emit_arcs(u32 num_counters, u64 *counters) +{ + struct gcov_fn_info *info = list_last_entry(¤t_info->functions, + struct gcov_fn_info, head); + + info->num_counters = num_counters; + info->counters = counters; +} +EXPORT_SYMBOL(llvm_gcda_emit_arcs); + +void llvm_gcda_summary_info(void) +{ +} +EXPORT_SYMBOL(llvm_gcda_summary_info); + +void llvm_gcda_end_file(void) +{ +} +EXPORT_SYMBOL(llvm_gcda_end_file); + +/** + * gcov_info_filename - return info filename + * @info: profiling data set + */ +const char *gcov_info_filename(struct gcov_info *info) +{ + return info->filename; +} + +/** + * gcov_info_version - return info version + * @info: profiling data set + */ +unsigned int gcov_info_version(struct gcov_info *info) +{ + return info->version; +} + +/** + * gcov_info_next - return next profiling data set + * @info: profiling data set + * + * Returns next gcov_info following @info or first gcov_info in the chain if + * @info is %NULL. + */ +struct gcov_info *gcov_info_next(struct gcov_info *info) +{ + if (!info) + return list_first_entry_or_null(&clang_gcov_list, + struct gcov_info, head); + if (list_is_last(&info->head, &clang_gcov_list)) + return NULL; + return list_next_entry(info, head); +} + +/** + * gcov_info_link - link/add profiling data set to the list + * @info: profiling data set + */ +void gcov_info_link(struct gcov_info *info) +{ + list_add_tail(&info->head, &clang_gcov_list); +} + +/** + * gcov_info_unlink - unlink/remove profiling data set from the list + * @prev: previous profiling data set + * @info: profiling data set + */ +void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) +{ + /* Generic code unlinks while iterating. */ + __list_del_entry(&info->head); +} + +/** + * gcov_info_within_module - check if a profiling data set belongs to a module + * @info: profiling data set + * @mod: module + * + * Returns true if profiling data belongs module, false otherwise. + */ +bool gcov_info_within_module(struct gcov_info *info, struct module *mod) +{ + return within_module((unsigned long)info->filename, mod); +} + +/* Symbolic links to be created for each profiling data file. */ +const struct gcov_link gcov_link[] = { + { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ + { 0, NULL}, +}; + +/** + * gcov_info_reset - reset profiling data to zero + * @info: profiling data set + */ +void gcov_info_reset(struct gcov_info *info) +{ + struct gcov_fn_info *fn; + + list_for_each_entry(fn, &info->functions, head) + memset(fn->counters, 0, + sizeof(fn->counters[0]) * fn->num_counters); +} + +/** + * gcov_info_is_compatible - check if profiling data can be added + * @info1: first profiling data set + * @info2: second profiling data set + * + * Returns non-zero if profiling data can be added, zero otherwise. + */ +int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2) +{ + struct gcov_fn_info *fn_ptr1 = list_first_entry_or_null( + &info1->functions, struct gcov_fn_info, head); + struct gcov_fn_info *fn_ptr2 = list_first_entry_or_null( + &info2->functions, struct gcov_fn_info, head); + + if (info1->checksum != info2->checksum) + return false; + if (!fn_ptr1) + return fn_ptr1 == fn_ptr2; + while (!list_is_last(&fn_ptr1->head, &info1->functions) && + !list_is_last(&fn_ptr2->head, &info2->functions)) { + if (fn_ptr1->checksum != fn_ptr2->checksum) + return false; + if (fn_ptr1->use_extra_checksum != fn_ptr2->use_extra_checksum) + return false; + if (fn_ptr1->use_extra_checksum && + fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum) + return false; + fn_ptr1 = list_next_entry(fn_ptr1, head); + fn_ptr2 = list_next_entry(fn_ptr2, head); + } + return list_is_last(&fn_ptr1->head, &info1->functions) && + list_is_last(&fn_ptr2->head, &info2->functions); +} + +/** + * gcov_info_add - add up profiling data + * @dest: profiling data set to which data is added + * @source: profiling data set which is added + * + * Adds profiling counts of @source to @dest. + */ +void gcov_info_add(struct gcov_info *dst, struct gcov_info *src) +{ + struct gcov_fn_info *dfn_ptr; + struct gcov_fn_info *sfn_ptr = list_first_entry_or_null(&src->functions, + struct gcov_fn_info, head); + + list_for_each_entry(dfn_ptr, &dst->functions, head) { + u32 i; + + for (i = 0; i < sfn_ptr->num_counters; i++) + dfn_ptr->counters[i] += sfn_ptr->counters[i]; + } +} + +static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn) +{ + size_t cv_size; /* counter values size */ + struct gcov_fn_info *fn_dup = kmemdup(fn, sizeof(*fn), + GFP_KERNEL); + if (!fn_dup) + return NULL; + INIT_LIST_HEAD(&fn_dup->head); + + fn_dup->function_name = kstrdup(fn->function_name, GFP_KERNEL); + if (!fn_dup->function_name) + goto err_name; + + cv_size = fn->num_counters * sizeof(fn->counters[0]); + fn_dup->counters = vmalloc(cv_size); + if (!fn_dup->counters) + goto err_counters; + memcpy(fn_dup->counters, fn->counters, cv_size); + + return fn_dup; + +err_counters: + kfree(fn_dup->function_name); +err_name: + kfree(fn_dup); + return NULL; +} + +/** + * gcov_info_dup - duplicate profiling data set + * @info: profiling data set to duplicate + * + * Return newly allocated duplicate on success, %NULL on error. + */ +struct gcov_info *gcov_info_dup(struct gcov_info *info) +{ + struct gcov_info *dup; + struct gcov_fn_info *fn; + + dup = kmemdup(info, sizeof(*dup), GFP_KERNEL); + if (!dup) + return NULL; + INIT_LIST_HEAD(&dup->head); + INIT_LIST_HEAD(&dup->functions); + dup->filename = kstrdup(info->filename, GFP_KERNEL); + if (!dup->filename) + goto err; + + list_for_each_entry(fn, &info->functions, head) { + struct gcov_fn_info *fn_dup = gcov_fn_info_dup(fn); + + if (!fn_dup) + goto err; + list_add_tail(&fn_dup->head, &dup->functions); + } + + return dup; + +err: + gcov_info_free(dup); + return NULL; +} + +/** + * gcov_info_free - release memory for profiling data set duplicate + * @info: profiling data set duplicate to free + */ +void gcov_info_free(struct gcov_info *info) +{ + struct gcov_fn_info *fn, *tmp; + + list_for_each_entry_safe(fn, tmp, &info->functions, head) { + kfree(fn->function_name); + vfree(fn->counters); + list_del(&fn->head); + kfree(fn); + } + kfree(info->filename); + kfree(info); +} + +#define ITER_STRIDE PAGE_SIZE + +/** + * struct gcov_iterator - specifies current file position in logical records + * @info: associated profiling data + * @buffer: buffer containing file data + * @size: size of buffer + * @pos: current position in file + */ +struct gcov_iterator { + struct gcov_info *info; + void *buffer; + size_t size; + loff_t pos; +}; + +/** + * store_gcov_u32 - store 32 bit number in gcov format to buffer + * @buffer: target buffer or NULL + * @off: offset into the buffer + * @v: value to be stored + * + * Number format defined by gcc: numbers are recorded in the 32 bit + * unsigned binary form of the endianness of the machine generating the + * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't + * store anything. + */ +static size_t store_gcov_u32(void *buffer, size_t off, u32 v) +{ + u32 *data; + + if (buffer) { + data = buffer + off; + *data = v; + } + + return sizeof(*data); +} + +/** + * store_gcov_u64 - store 64 bit number in gcov format to buffer + * @buffer: target buffer or NULL + * @off: offset into the buffer + * @v: value to be stored + * + * Number format defined by gcc: numbers are recorded in the 32 bit + * unsigned binary form of the endianness of the machine generating the + * file. 64 bit numbers are stored as two 32 bit numbers, the low part + * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store + * anything. + */ +static size_t store_gcov_u64(void *buffer, size_t off, u64 v) +{ + u32 *data; + + if (buffer) { + data = buffer + off; + + data[0] = (v & 0xffffffffUL); + data[1] = (v >> 32); + } + + return sizeof(*data) * 2; +} + +/** + * convert_to_gcda - convert profiling data set to gcda file format + * @buffer: the buffer to store file data or %NULL if no data should be stored + * @info: profiling data set to be converted + * + * Returns the number of bytes that were/would have been stored into the buffer. + */ +static size_t convert_to_gcda(char *buffer, struct gcov_info *info) +{ + struct gcov_fn_info *fi_ptr; + size_t pos = 0; + + /* File header. */ + pos += store_gcov_u32(buffer, pos, GCOV_DATA_MAGIC); + pos += store_gcov_u32(buffer, pos, info->version); + pos += store_gcov_u32(buffer, pos, info->checksum); + + list_for_each_entry(fi_ptr, &info->functions, head) { + u32 i; + u32 len = 2; + + if (fi_ptr->use_extra_checksum) + len++; + + pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION); + pos += store_gcov_u32(buffer, pos, len); + pos += store_gcov_u32(buffer, pos, fi_ptr->ident); + pos += store_gcov_u32(buffer, pos, fi_ptr->checksum); + if (fi_ptr->use_extra_checksum) + pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum); + + pos += store_gcov_u32(buffer, pos, GCOV_TAG_COUNTER_BASE); + pos += store_gcov_u32(buffer, pos, fi_ptr->num_counters * 2); + for (i = 0; i < fi_ptr->num_counters; i++) + pos += store_gcov_u64(buffer, pos, fi_ptr->counters[i]); + } + + return pos; +} + +/** + * gcov_iter_new - allocate and initialize profiling data iterator + * @info: profiling data set to be iterated + * + * Return file iterator on success, %NULL otherwise. + */ +struct gcov_iterator *gcov_iter_new(struct gcov_info *info) +{ + struct gcov_iterator *iter; + + iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL); + if (!iter) + goto err_free; + + iter->info = info; + /* Dry-run to get the actual buffer size. */ + iter->size = convert_to_gcda(NULL, info); + iter->buffer = vmalloc(iter->size); + if (!iter->buffer) + goto err_free; + + convert_to_gcda(iter->buffer, info); + + return iter; + +err_free: + kfree(iter); + return NULL; +} + + +/** + * gcov_iter_get_info - return profiling data set for given file iterator + * @iter: file iterator + */ +void gcov_iter_free(struct gcov_iterator *iter) +{ + vfree(iter->buffer); + kfree(iter); +} + +/** + * gcov_iter_get_info - return profiling data set for given file iterator + * @iter: file iterator + */ +struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter) +{ + return iter->info; +} + +/** + * gcov_iter_start - reset file iterator to starting position + * @iter: file iterator + */ +void gcov_iter_start(struct gcov_iterator *iter) +{ + iter->pos = 0; +} + +/** + * gcov_iter_next - advance file iterator to next logical record + * @iter: file iterator + * + * Return zero if new position is valid, non-zero if iterator has reached end. + */ +int gcov_iter_next(struct gcov_iterator *iter) +{ + if (iter->pos < iter->size) + iter->pos += ITER_STRIDE; + + if (iter->pos >= iter->size) + return -EINVAL; + + return 0; +} + +/** + * gcov_iter_write - write data for current pos to seq_file + * @iter: file iterator + * @seq: seq_file handle + * + * Return zero on success, non-zero otherwise. + */ +int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq) +{ + size_t len; + + if (iter->pos >= iter->size) + return -EINVAL; + + len = ITER_STRIDE; + if (iter->pos + len > iter->size) + len = iter->size - iter->pos; + + seq_write(seq, iter->buffer + iter->pos, len); + + return 0; +} diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c index 2dddecbdbe6e..801ee4b0b969 100644 --- a/kernel/gcov/gcc_3_4.c +++ b/kernel/gcov/gcc_3_4.c @@ -137,6 +137,18 @@ void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) gcov_info_head = info->next; } +/** + * gcov_info_within_module - check if a profiling data set belongs to a module + * @info: profiling data set + * @mod: module + * + * Returns true if profiling data belongs module, false otherwise. + */ +bool gcov_info_within_module(struct gcov_info *info, struct module *mod) +{ + return within_module((unsigned long)info, mod); +} + /* Symbolic links to be created for each profiling data file. */ const struct gcov_link gcov_link[] = { { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index ca5e5c0ef853..ec37563674d6 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -150,6 +150,18 @@ void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) gcov_info_head = info->next; } +/** + * gcov_info_within_module - check if a profiling data set belongs to a module + * @info: profiling data set + * @mod: module + * + * Returns true if profiling data belongs module, false otherwise. + */ +bool gcov_info_within_module(struct gcov_info *info, struct module *mod) +{ + return within_module((unsigned long)info, mod); +} + /* Symbolic links to be created for each profiling data file. */ const struct gcov_link gcov_link[] = { { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ diff --git a/kernel/gcov/gcc_base.c b/kernel/gcov/gcc_base.c new file mode 100644 index 000000000000..3cf736b9f880 --- /dev/null +++ b/kernel/gcov/gcc_base.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/mutex.h> +#include "gcov.h" + +/* + * __gcov_init is called by gcc-generated constructor code for each object + * file compiled with -fprofile-arcs. + */ +void __gcov_init(struct gcov_info *info) +{ + static unsigned int gcov_version; + + mutex_lock(&gcov_lock); + if (gcov_version == 0) { + gcov_version = gcov_info_version(info); + /* + * Printing gcc's version magic may prove useful for debugging + * incompatibility reports. + */ + pr_info("version magic: 0x%x\n", gcov_version); + } + /* + * Add new profiling data structure to list and inform event + * listener. + */ + gcov_info_link(info); + if (gcov_events_enabled) + gcov_event(GCOV_ADD, info); + mutex_unlock(&gcov_lock); +} +EXPORT_SYMBOL(__gcov_init); + +/* + * These functions may be referenced by gcc-generated profiling code but serve + * no function for kernel profiling. + */ +void __gcov_flush(void) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_flush); + +void __gcov_merge_add(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_add); + +void __gcov_merge_single(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_single); + +void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_delta); + +void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_ior); + +void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_time_profile); + +void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_icall_topn); + +void __gcov_exit(void) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_exit); diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h index de118ad4a024..6ab2c1808c9d 100644 --- a/kernel/gcov/gcov.h +++ b/kernel/gcov/gcov.h @@ -15,6 +15,7 @@ #ifndef GCOV_H #define GCOV_H GCOV_H +#include <linux/module.h> #include <linux/types.h> /* @@ -46,6 +47,7 @@ unsigned int gcov_info_version(struct gcov_info *info); struct gcov_info *gcov_info_next(struct gcov_info *info); void gcov_info_link(struct gcov_info *info); void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info); +bool gcov_info_within_module(struct gcov_info *info, struct module *mod); /* Base interface. */ enum gcov_action { @@ -83,4 +85,7 @@ struct gcov_link { }; extern const struct gcov_link gcov_link[]; +extern int gcov_events_enabled; +extern struct mutex gcov_lock; + #endif /* GCOV_H */ diff --git a/kernel/gen_ikh_data.sh b/kernel/gen_ikh_data.sh new file mode 100755 index 000000000000..591a94f7b387 --- /dev/null +++ b/kernel/gen_ikh_data.sh @@ -0,0 +1,89 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This script generates an archive consisting of kernel headers +# for CONFIG_IKHEADERS_PROC. +set -e +spath="$(dirname "$(readlink -f "$0")")" +kroot="$spath/.." +outdir="$(pwd)" +tarfile=$1 +cpio_dir=$outdir/$tarfile.tmp + +# Script filename relative to the kernel source root +# We add it to the archive because it is small and any changes +# to this script will also cause a rebuild of the archive. +sfile="$(realpath --relative-to $kroot "$(readlink -f "$0")")" + +src_file_list=" +include/ +arch/$SRCARCH/include/ +$sfile +" + +obj_file_list=" +include/ +arch/$SRCARCH/include/ +" + +# Support incremental builds by skipping archive generation +# if timestamps of files being archived are not changed. + +# This block is useful for debugging the incremental builds. +# Uncomment it for debugging. +# iter=1 +# if [ ! -f /tmp/iter ]; then echo 1 > /tmp/iter; +# else; iter=$(($(cat /tmp/iter) + 1)); fi +# find $src_file_list -type f | xargs ls -lR > /tmp/src-ls-$iter +# find $obj_file_list -type f | xargs ls -lR > /tmp/obj-ls-$iter + +# include/generated/compile.h is ignored because it is touched even when none +# of the source files changed. This causes pointless regeneration, so let us +# ignore them for md5 calculation. +pushd $kroot > /dev/null +src_files_md5="$(find $src_file_list -type f | + grep -v "include/generated/compile.h" | + xargs ls -lR | md5sum | cut -d ' ' -f1)" +popd > /dev/null +obj_files_md5="$(find $obj_file_list -type f | + grep -v "include/generated/compile.h" | + xargs ls -lR | md5sum | cut -d ' ' -f1)" + +if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi +if [ -f kernel/kheaders.md5 ] && + [ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] && + [ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] && + [ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then + exit +fi + +if [ "${quiet}" != "silent_" ]; then + echo " GEN $tarfile" +fi + +rm -rf $cpio_dir +mkdir $cpio_dir + +pushd $kroot > /dev/null +for f in $src_file_list; + do find "$f" ! -name "*.cmd" ! -name ".*"; +done | cpio --quiet -pd $cpio_dir +popd > /dev/null + +# The second CPIO can complain if files already exist which can +# happen with out of tree builds. Just silence CPIO for now. +for f in $obj_file_list; + do find "$f" ! -name "*.cmd" ! -name ".*"; +done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 + +# Remove comments except SDPX lines +find $cpio_dir -type f -print0 | + xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;' + +tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null + +echo "$src_files_md5" > kernel/kheaders.md5 +echo "$obj_files_md5" >> kernel/kheaders.md5 +echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5 + +rm -rf $cpio_dir diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 516c00a5e867..c1eccd4f6520 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -152,7 +152,7 @@ static int irq_debug_show(struct seq_file *m, void *p) raw_spin_lock_irq(&desc->lock); data = irq_desc_get_irq_data(desc); - seq_printf(m, "handler: %pf\n", desc->handle_irq); + seq_printf(m, "handler: %ps\n", desc->handle_irq); seq_printf(m, "device: %s\n", desc->dev_name); seq_printf(m, "status: 0x%08x\n", desc->status_use_accessors); irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states, diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 6df5ddfdb0f8..a4ace611f47f 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -149,7 +149,7 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags res = action->handler(irq, action->dev_id); trace_irq_handler_exit(irq, action, res); - if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n", + if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pS enabled interrupts\n", irq, action->handler)) local_irq_disable(); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 9f8a709337cf..c52b737ab8e3 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -275,11 +275,12 @@ static struct attribute *irq_attrs[] = { &actions_attr.attr, NULL }; +ATTRIBUTE_GROUPS(irq); static struct kobj_type irq_kobj_type = { .release = irq_kobj_release, .sysfs_ops = &kobj_sysfs_ops, - .default_attrs = irq_attrs, + .default_groups = irq_groups, }; static void irq_sysfs_add(int irq, struct irq_desc *desc) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 53a081392115..78f3ddeb7fe4 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -781,7 +781,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) ret = 0; break; default: - pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n", + pr_err("Setting trigger mode %lu for irq %u failed (%pS)\n", flags, irq_desc_get_irq(desc), chip->irq_set_type); } if (unmask) diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 6d2fa6914b30..2ed97a7c9b2a 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -212,9 +212,9 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) */ raw_spin_lock_irqsave(&desc->lock, flags); for_each_action_of_desc(desc, action) { - printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler); + printk(KERN_ERR "[<%p>] %ps", action->handler, action->handler); if (action->thread_fn) - printk(KERN_CONT " threaded [<%p>] %pf", + printk(KERN_CONT " threaded [<%p>] %ps", action->thread_fn, action->thread_fn); printk(KERN_CONT "\n"); } diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index f1d0e00a3971..072b6ee55e3f 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -500,13 +500,7 @@ static int locate_mem_hole_callback(struct resource *res, void *arg) return locate_mem_hole_bottom_up(start, end, kbuf); } -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK -static int kexec_walk_memblock(struct kexec_buf *kbuf, - int (*func)(struct resource *, void *)) -{ - return 0; -} -#else +#ifdef CONFIG_ARCH_KEEP_MEMBLOCK static int kexec_walk_memblock(struct kexec_buf *kbuf, int (*func)(struct resource *, void *)) { @@ -550,6 +544,12 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf, return ret; } +#else +static int kexec_walk_memblock(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) +{ + return 0; +} #endif /** @@ -589,7 +589,7 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN) return 0; - if (IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK)) + if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) ret = kexec_walk_resources(kbuf, locate_mem_hole_callback); else ret = kexec_walk_memblock(kbuf, locate_mem_hole_callback); @@ -688,7 +688,6 @@ static int kexec_calculate_store_digests(struct kimage *image) goto out_free_desc; desc->tfm = tfm; - desc->flags = 0; ret = crypto_shash_init(desc); if (ret < 0) diff --git a/kernel/kheaders.c b/kernel/kheaders.c new file mode 100644 index 000000000000..70ae6052920d --- /dev/null +++ b/kernel/kheaders.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Provide kernel headers useful to build tracing programs + * such as for running eBPF tracing tools. + * + * (Borrowed code from kernel/configs.c) + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/init.h> +#include <linux/uaccess.h> + +/* + * Define kernel_headers_data and kernel_headers_data_end, within which the + * compressed kernel headers are stored. The file is first compressed with xz. + */ + +asm ( +" .pushsection .rodata, \"a\" \n" +" .global kernel_headers_data \n" +"kernel_headers_data: \n" +" .incbin \"kernel/kheaders_data.tar.xz\" \n" +" .global kernel_headers_data_end \n" +"kernel_headers_data_end: \n" +" .popsection \n" +); + +extern char kernel_headers_data; +extern char kernel_headers_data_end; + +static ssize_t +ikheaders_read_current(struct file *file, char __user *buf, + size_t len, loff_t *offset) +{ + return simple_read_from_buffer(buf, len, offset, + &kernel_headers_data, + &kernel_headers_data_end - + &kernel_headers_data); +} + +static const struct file_operations ikheaders_file_ops = { + .read = ikheaders_read_current, + .llseek = default_llseek, +}; + +static int __init ikheaders_init(void) +{ + struct proc_dir_entry *entry; + + /* create the current headers file */ + entry = proc_create("kheaders.tar.xz", S_IRUGO, NULL, + &ikheaders_file_ops); + if (!entry) + return -ENOMEM; + + proc_set_size(entry, + &kernel_headers_data_end - + &kernel_headers_data); + return 0; +} + +static void __exit ikheaders_cleanup(void) +{ + remove_proc_entry("kheaders.tar.xz", NULL); +} + +module_init(ikheaders_init); +module_exit(ikheaders_cleanup); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Joel Fernandes"); +MODULE_DESCRIPTION("Echo the kernel header artifacts used to build the kernel"); diff --git a/kernel/kthread.c b/kernel/kthread.c index 5942eeafb9ac..be4e8795561a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -11,6 +11,7 @@ #include <linux/kthread.h> #include <linux/completion.h> #include <linux/err.h> +#include <linux/cgroup.h> #include <linux/cpuset.h> #include <linux/unistd.h> #include <linux/file.h> diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 99a5b5f46dc5..871734ea2f04 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -67,13 +67,10 @@ static struct latency_record latency_record[MAXLR]; int latencytop_enabled; -void clear_all_latency_tracing(struct task_struct *p) +void clear_tsk_latency_tracing(struct task_struct *p) { unsigned long flags; - if (!latencytop_enabled) - return; - raw_spin_lock_irqsave(&latency_lock, flags); memset(&p->latency_record, 0, sizeof(p->latency_record)); p->latency_record_count = 0; @@ -96,9 +93,6 @@ account_global_scheduler_latency(struct task_struct *tsk, int firstnonnull = MAXLR + 1; int i; - if (!latencytop_enabled) - return; - /* skip kernel threads for now */ if (!tsk->mm) return; diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index eb0ee10a1981..91cd519756d3 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -419,6 +419,7 @@ static struct attribute *klp_patch_attrs[] = { &force_kobj_attr.attr, NULL }; +ATTRIBUTE_GROUPS(klp_patch); static void klp_free_object_dynamic(struct klp_object *obj) { @@ -426,7 +427,13 @@ static void klp_free_object_dynamic(struct klp_object *obj) kfree(obj); } -static struct klp_object *klp_alloc_object_dynamic(const char *name) +static void klp_init_func_early(struct klp_object *obj, + struct klp_func *func); +static void klp_init_object_early(struct klp_patch *patch, + struct klp_object *obj); + +static struct klp_object *klp_alloc_object_dynamic(const char *name, + struct klp_patch *patch) { struct klp_object *obj; @@ -442,7 +449,7 @@ static struct klp_object *klp_alloc_object_dynamic(const char *name) } } - INIT_LIST_HEAD(&obj->func_list); + klp_init_object_early(patch, obj); obj->dynamic = true; return obj; @@ -471,6 +478,7 @@ static struct klp_func *klp_alloc_func_nop(struct klp_func *old_func, } } + klp_init_func_early(obj, func); /* * func->new_func is same as func->old_func. These addresses are * set when the object is loaded, see klp_init_object_loaded(). @@ -490,11 +498,9 @@ static int klp_add_object_nops(struct klp_patch *patch, obj = klp_find_object(patch, old_obj); if (!obj) { - obj = klp_alloc_object_dynamic(old_obj->name); + obj = klp_alloc_object_dynamic(old_obj->name, patch); if (!obj) return -ENOMEM; - - list_add_tail(&obj->node, &patch->obj_list); } klp_for_each_func(old_obj, old_func) { @@ -505,8 +511,6 @@ static int klp_add_object_nops(struct klp_patch *patch, func = klp_alloc_func_nop(old_func, obj); if (!func) return -ENOMEM; - - list_add_tail(&func->node, &obj->func_list); } return 0; @@ -546,7 +550,7 @@ static void klp_kobj_release_patch(struct kobject *kobj) static struct kobj_type klp_ktype_patch = { .release = klp_kobj_release_patch, .sysfs_ops = &kobj_sysfs_ops, - .default_attrs = klp_patch_attrs, + .default_groups = klp_patch_groups, }; static void klp_kobj_release_object(struct kobject *kobj) @@ -588,13 +592,7 @@ static void __klp_free_funcs(struct klp_object *obj, bool nops_only) continue; list_del(&func->node); - - /* Might be called from klp_init_patch() error path. */ - if (func->kobj_added) { - kobject_put(&func->kobj); - } else if (func->nop) { - klp_free_func_nop(func); - } + kobject_put(&func->kobj); } } @@ -624,13 +622,7 @@ static void __klp_free_objects(struct klp_patch *patch, bool nops_only) continue; list_del(&obj->node); - - /* Might be called from klp_init_patch() error path. */ - if (obj->kobj_added) { - kobject_put(&obj->kobj); - } else if (obj->dynamic) { - klp_free_object_dynamic(obj); - } + kobject_put(&obj->kobj); } } @@ -675,10 +667,8 @@ static void klp_free_patch_finish(struct klp_patch *patch) * this is called when the patch gets disabled and it * cannot get enabled again. */ - if (patch->kobj_added) { - kobject_put(&patch->kobj); - wait_for_completion(&patch->finish); - } + kobject_put(&patch->kobj); + wait_for_completion(&patch->finish); /* Put the module after the last access to struct klp_patch. */ if (!patch->forced) @@ -700,8 +690,6 @@ static void klp_free_patch_work_fn(struct work_struct *work) static int klp_init_func(struct klp_object *obj, struct klp_func *func) { - int ret; - if (!func->old_name) return -EINVAL; @@ -724,13 +712,9 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) * object. If the user selects 0 for old_sympos, then 1 will be used * since a unique symbol will be the first occurrence. */ - ret = kobject_init_and_add(&func->kobj, &klp_ktype_func, - &obj->kobj, "%s,%lu", func->old_name, - func->old_sympos ? func->old_sympos : 1); - if (!ret) - func->kobj_added = true; - - return ret; + return kobject_add(&func->kobj, &obj->kobj, "%s,%lu", + func->old_name, + func->old_sympos ? func->old_sympos : 1); } /* Arches may override this to finish any remaining arch-specific tasks */ @@ -801,11 +785,9 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) klp_find_object_module(obj); name = klp_is_module(obj) ? obj->name : "vmlinux"; - ret = kobject_init_and_add(&obj->kobj, &klp_ktype_object, - &patch->kobj, "%s", name); + ret = kobject_add(&obj->kobj, &patch->kobj, "%s", name); if (ret) return ret; - obj->kobj_added = true; klp_for_each_func(obj, func) { ret = klp_init_func(obj, func); @@ -819,6 +801,21 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) return ret; } +static void klp_init_func_early(struct klp_object *obj, + struct klp_func *func) +{ + kobject_init(&func->kobj, &klp_ktype_func); + list_add_tail(&func->node, &obj->func_list); +} + +static void klp_init_object_early(struct klp_patch *patch, + struct klp_object *obj) +{ + INIT_LIST_HEAD(&obj->func_list); + kobject_init(&obj->kobj, &klp_ktype_object); + list_add_tail(&obj->node, &patch->obj_list); +} + static int klp_init_patch_early(struct klp_patch *patch) { struct klp_object *obj; @@ -829,7 +826,7 @@ static int klp_init_patch_early(struct klp_patch *patch) INIT_LIST_HEAD(&patch->list); INIT_LIST_HEAD(&patch->obj_list); - patch->kobj_added = false; + kobject_init(&patch->kobj, &klp_ktype_patch); patch->enabled = false; patch->forced = false; INIT_WORK(&patch->free_work, klp_free_patch_work_fn); @@ -839,13 +836,10 @@ static int klp_init_patch_early(struct klp_patch *patch) if (!obj->funcs) return -EINVAL; - INIT_LIST_HEAD(&obj->func_list); - obj->kobj_added = false; - list_add_tail(&obj->node, &patch->obj_list); + klp_init_object_early(patch, obj); klp_for_each_func_static(obj, func) { - func->kobj_added = false; - list_add_tail(&func->node, &obj->func_list); + klp_init_func_early(obj, func); } } @@ -860,11 +854,9 @@ static int klp_init_patch(struct klp_patch *patch) struct klp_object *obj; int ret; - ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch, - klp_root_kobj, "%s", patch->mod->name); + ret = kobject_add(&patch->kobj, klp_root_kobj, "%s", patch->mod->name); if (ret) return ret; - patch->kobj_added = true; if (patch->replace) { ret = klp_add_nops(patch); @@ -926,9 +918,6 @@ static int __klp_enable_patch(struct klp_patch *patch) if (WARN_ON(patch->enabled)) return -EINVAL; - if (!patch->kobj_added) - return -EINVAL; - pr_notice("enabling patch '%s'\n", patch->mod->name); klp_init_transition(patch, KLP_PATCHED); @@ -1003,11 +992,10 @@ int klp_enable_patch(struct klp_patch *patch) return -ENODEV; if (!klp_have_reliable_stack()) { - pr_err("This architecture doesn't have support for the livepatch consistency model.\n"); - return -EOPNOTSUPP; + pr_warn("This architecture doesn't have support for the livepatch consistency model.\n"); + pr_warn("The livepatch transition may never complete.\n"); } - mutex_lock(&klp_mutex); ret = klp_init_patch_early(patch); @@ -1220,14 +1208,6 @@ void klp_module_going(struct module *mod) static int __init klp_init(void) { - int ret; - - ret = klp_check_compiler_support(); - if (ret) { - pr_info("Your compiler is too old; turning off.\n"); - return -EINVAL; - } - klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj); if (!klp_root_kobj) return -ENOMEM; diff --git a/kernel/memremap.c b/kernel/memremap.c index a856cb5ff192..1490e63f69a9 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -45,7 +45,6 @@ vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, */ return devmem->page_fault(vma, addr, page, flags, pmdp); } -EXPORT_SYMBOL(device_private_entry_fault); #endif /* CONFIG_DEVICE_PRIVATE */ static void pgmap_array_delete(struct resource *res) @@ -148,6 +147,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) &pgmap->altmap : NULL; struct resource *res = &pgmap->res; struct dev_pagemap *conflict_pgmap; + struct mhp_restrictions restrictions = { + /* + * We do not want any optional features only our own memmap + */ + .altmap = altmap, + }; pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; @@ -214,7 +219,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) */ if (pgmap->type == MEMORY_DEVICE_PRIVATE) { error = add_pages(nid, align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT, NULL, false); + align_size >> PAGE_SHIFT, &restrictions); } else { error = kasan_add_zero_shadow(__va(align_start), align_size); if (error) { @@ -222,8 +227,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) goto err_kasan; } - error = arch_add_memory(nid, align_start, align_size, altmap, - false); + error = arch_add_memory(nid, align_start, align_size, + &restrictions); } if (!error) { diff --git a/kernel/module-internal.h b/kernel/module-internal.h index 79c9be2dbbe9..d354341f8cc0 100644 --- a/kernel/module-internal.h +++ b/kernel/module-internal.h @@ -20,7 +20,7 @@ struct load_info { unsigned long len; Elf_Shdr *sechdrs; char *secstrings, *strtab; - unsigned long symoffs, stroffs; + unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs; struct _ddebug *debug; unsigned int num_debug; bool sig_ok; diff --git a/kernel/module.c b/kernel/module.c index a9020bdd4cf6..6e6712b3aaf5 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -290,6 +290,11 @@ bool is_module_sig_enforced(void) } EXPORT_SYMBOL(is_module_sig_enforced); +void set_module_sig_enforced(void) +{ + sig_enforce = true; +} + /* Block module loading/unloading? */ int modules_disabled = 0; core_param(nomodule, modules_disabled, bint, 0); @@ -2637,6 +2642,8 @@ static void layout_symtab(struct module *mod, struct load_info *info) info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1); info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym); mod->core_layout.size += strtab_size; + info->core_typeoffs = mod->core_layout.size; + mod->core_layout.size += ndst * sizeof(char); mod->core_layout.size = debug_align(mod->core_layout.size); /* Put string table section at end of init part of module. */ @@ -2650,6 +2657,8 @@ static void layout_symtab(struct module *mod, struct load_info *info) __alignof__(struct mod_kallsyms)); info->mod_kallsyms_init_off = mod->init_layout.size; mod->init_layout.size += sizeof(struct mod_kallsyms); + info->init_typeoffs = mod->init_layout.size; + mod->init_layout.size += nsrc * sizeof(char); mod->init_layout.size = debug_align(mod->init_layout.size); } @@ -2673,20 +2682,23 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym); /* Make sure we get permanent strtab: don't use info->strtab. */ mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr; + mod->kallsyms->typetab = mod->init_layout.base + info->init_typeoffs; - /* Set types up while we still have access to sections. */ - for (i = 0; i < mod->kallsyms->num_symtab; i++) - mod->kallsyms->symtab[i].st_size - = elf_type(&mod->kallsyms->symtab[i], info); - - /* Now populate the cut down core kallsyms for after init. */ + /* + * Now populate the cut down core kallsyms for after init + * and set types up while we still have access to sections. + */ mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs; mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs; + mod->core_kallsyms.typetab = mod->core_layout.base + info->core_typeoffs; src = mod->kallsyms->symtab; for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) { + mod->kallsyms->typetab[i] = elf_type(src + i, info); if (i == 0 || is_livepatch_module(mod) || is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, info->index.pcpu)) { + mod->core_kallsyms.typetab[ndst] = + mod->kallsyms->typetab[i]; dst[ndst] = src[i]; dst[ndst++].st_name = s - mod->core_kallsyms.strtab; s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name], @@ -4086,7 +4098,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, const Elf_Sym *sym = &kallsyms->symtab[symnum]; *value = kallsyms_symbol_value(sym); - *type = sym->st_size; + *type = kallsyms->typetab[symnum]; strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN); strlcpy(module_name, mod->name, MODULE_NAME_LEN); *exported = is_exported(name, *value, mod); diff --git a/kernel/notifier.c b/kernel/notifier.c index 6196af8a8223..bfc95b3e4235 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -22,6 +22,7 @@ static int notifier_chain_register(struct notifier_block **nl, struct notifier_block *n) { while ((*nl) != NULL) { + WARN_ONCE(((*nl) == n), "double register detected"); if (n->priority > (*nl)->priority) break; nl = &((*nl)->next); diff --git a/kernel/padata.c b/kernel/padata.c index 3e2633ae3bca..2d2fddbb7a4c 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -957,6 +957,7 @@ static struct attribute *padata_default_attrs[] = { ¶llel_cpumask_attr.attr, NULL, }; +ATTRIBUTE_GROUPS(padata_default); static ssize_t padata_sysfs_show(struct kobject *kobj, struct attribute *attr, char *buf) @@ -995,7 +996,7 @@ static const struct sysfs_ops padata_sysfs_ops = { static struct kobj_type padata_attr_type = { .sysfs_ops = &padata_sysfs_ops, - .default_attrs = padata_default_attrs, + .default_groups = padata_default_groups, .release = padata_sysfs_release, }; diff --git a/kernel/panic.c b/kernel/panic.c index c1fcaad337b7..8779d64bace0 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -306,6 +306,8 @@ void panic(const char *fmt, ...) * shutting down. But if there is a chance of * rebooting the system it will be rebooted. */ + if (panic_reboot_mode != REBOOT_UNDEFINED) + reboot_mode = panic_reboot_mode; emergency_restart(); } #ifdef __sparc__ @@ -321,6 +323,9 @@ void panic(const char *fmt, ...) disabled_wait(); #endif pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf); + + /* Do not scroll important messages printed above */ + suppress_printk = 1; local_irq_enable(); for (i = 0; ; i += PANIC_TIMER_STEP) { touch_softlockup_watchdog(); diff --git a/kernel/pid.c b/kernel/pid.c index 20881598bdfa..89548d35eefb 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -32,7 +32,6 @@ #include <linux/init.h> #include <linux/rculist.h> #include <linux/memblock.h> -#include <linux/hash.h> #include <linux/pid_namespace.h> #include <linux/init_task.h> #include <linux/syscalls.h> diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 02ca827b8fac..17102fd4c136 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -86,6 +86,12 @@ static DEFINE_SEMAPHORE(console_sem); struct console *console_drivers; EXPORT_SYMBOL_GPL(console_drivers); +/* + * System may need to suppress printk message under certain + * circumstances, like after kernel panic happens. + */ +int __read_mostly suppress_printk; + #ifdef CONFIG_LOCKDEP static struct lockdep_map console_lock_dep_map = { .name = "console_lock" @@ -1943,6 +1949,10 @@ asmlinkage int vprintk_emit(int facility, int level, unsigned long flags; u64 curr_log_seq; + /* Suppress unimportant messages after panic happens */ + if (unlikely(suppress_printk)) + return 0; + if (level == LOGLEVEL_SCHED) { level = LOGLEVEL_DEFAULT; in_sched = true; diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 4b58c907b4b7..390aab20115e 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -11,11 +11,6 @@ #define __LINUX_RCU_H #include <trace/events/rcu.h> -#ifdef CONFIG_RCU_TRACE -#define RCU_TRACE(stmt) stmt -#else /* #ifdef CONFIG_RCU_TRACE */ -#define RCU_TRACE(stmt) -#endif /* #else #ifdef CONFIG_RCU_TRACE */ /* Offset to allow distinguishing irq vs. task-based idle entry/exit. */ #define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1) @@ -216,12 +211,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) rcu_lock_acquire(&rcu_callback_map); if (__is_kfree_rcu_offset(offset)) { - RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset);) + trace_rcu_invoke_kfree_callback(rn, head, offset); kfree((void *)head - offset); rcu_lock_release(&rcu_callback_map); return true; } else { - RCU_TRACE(trace_rcu_invoke_callback(rn, head);) + trace_rcu_invoke_callback(rn, head); f = head->func; WRITE_ONCE(head->func, (rcu_callback_t)0L); f(head); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ec77ec336f58..980ca3ca643f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1969,14 +1969,14 @@ rcu_check_quiescent_state(struct rcu_data *rdp) */ int rcutree_dying_cpu(unsigned int cpu) { - RCU_TRACE(bool blkd;) - RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(&rcu_data);) - RCU_TRACE(struct rcu_node *rnp = rdp->mynode;) + bool blkd; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + struct rcu_node *rnp = rdp->mynode; if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) return 0; - RCU_TRACE(blkd = !!(rnp->qsmask & rdp->grpmask);) + blkd = !!(rnp->qsmask & rdp->grpmask); trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, blkd ? TPS("cpuofl") : TPS("cpuofl-bgp")); return 0; @@ -2392,7 +2392,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) * Use rcu:rcu_callback trace event to find the previous * time callback was passed to __call_rcu(). */ - WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pF()!!!\n", + WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pS()!!!\n", head, head->func); WRITE_ONCE(head->func, rcu_leak_callback); return; diff --git a/kernel/reboot.c b/kernel/reboot.c index e1b79b6a2735..b9e79e8c7226 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -31,6 +31,7 @@ EXPORT_SYMBOL(cad_pid); #define DEFAULT_REBOOT_MODE #endif enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; +enum reboot_mode panic_reboot_mode = REBOOT_UNDEFINED; /* * This variable is used privately to keep track of whether or not @@ -519,6 +520,8 @@ EXPORT_SYMBOL_GPL(orderly_reboot); static int __init reboot_setup(char *str) { for (;;) { + enum reboot_mode *mode; + /* * Having anything passed on the command line via * reboot= will cause us to disable DMI checking @@ -526,17 +529,24 @@ static int __init reboot_setup(char *str) */ reboot_default = 0; + if (!strncmp(str, "panic_", 6)) { + mode = &panic_reboot_mode; + str += 6; + } else { + mode = &reboot_mode; + } + switch (*str) { case 'w': - reboot_mode = REBOOT_WARM; + *mode = REBOOT_WARM; break; case 'c': - reboot_mode = REBOOT_COLD; + *mode = REBOOT_COLD; break; case 'h': - reboot_mode = REBOOT_HARD; + *mode = REBOOT_HARD; break; case 's': @@ -553,11 +563,11 @@ static int __init reboot_setup(char *str) if (rc) return rc; } else - reboot_mode = REBOOT_SOFT; + *mode = REBOOT_SOFT; break; } case 'g': - reboot_mode = REBOOT_GPIO; + *mode = REBOOT_GPIO; break; case 'b': diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 5403479073b0..962cf343f798 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -600,13 +600,14 @@ rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us); -static struct attribute *sugov_attributes[] = { +static struct attribute *sugov_attrs[] = { &rate_limit_us.attr, NULL }; +ATTRIBUTE_GROUPS(sugov); static struct kobj_type sugov_tunables_ktype = { - .default_attrs = sugov_attributes, + .default_groups = sugov_groups, .sysfs_ops = &governor_sysfs_ops, }; diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 0e97ca9306ef..7acc632c3b82 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -4,6 +4,9 @@ * Copyright (c) 2018 Facebook, Inc. * Author: Johannes Weiner <hannes@cmpxchg.org> * + * Polling support by Suren Baghdasaryan <surenb@google.com> + * Copyright (c) 2018 Google, Inc. + * * When CPU, memory and IO are contended, tasks experience delays that * reduce throughput and introduce latencies into the workload. Memory * and IO contention, in addition, can cause a full loss of forward @@ -129,9 +132,13 @@ #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/seqlock.h> +#include <linux/uaccess.h> #include <linux/cgroup.h> #include <linux/module.h> #include <linux/sched.h> +#include <linux/ctype.h> +#include <linux/file.h> +#include <linux/poll.h> #include <linux/psi.h> #include "sched.h" @@ -140,9 +147,9 @@ static int psi_bug __read_mostly; DEFINE_STATIC_KEY_FALSE(psi_disabled); #ifdef CONFIG_PSI_DEFAULT_DISABLED -bool psi_enable; +static bool psi_enable; #else -bool psi_enable = true; +static bool psi_enable = true; #endif static int __init setup_psi(char *str) { @@ -156,16 +163,21 @@ __setup("psi=", setup_psi); #define EXP_60s 1981 /* 1/exp(2s/60s) */ #define EXP_300s 2034 /* 1/exp(2s/300s) */ +/* PSI trigger definitions */ +#define WINDOW_MIN_US 500000 /* Min window size is 500ms */ +#define WINDOW_MAX_US 10000000 /* Max window size is 10s */ +#define UPDATES_PER_WINDOW 10 /* 10 updates per window */ + /* Sampling frequency in nanoseconds */ static u64 psi_period __read_mostly; /* System-level pressure and stall tracking */ static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu); -static struct psi_group psi_system = { +struct psi_group psi_system = { .pcpu = &system_group_pcpu, }; -static void psi_update_work(struct work_struct *work); +static void psi_avgs_work(struct work_struct *work); static void group_init(struct psi_group *group) { @@ -173,9 +185,20 @@ static void group_init(struct psi_group *group) for_each_possible_cpu(cpu) seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); - group->next_update = sched_clock() + psi_period; - INIT_DELAYED_WORK(&group->clock_work, psi_update_work); - mutex_init(&group->stat_lock); + group->avg_next_update = sched_clock() + psi_period; + INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); + mutex_init(&group->avgs_lock); + /* Init trigger-related members */ + atomic_set(&group->poll_scheduled, 0); + mutex_init(&group->trigger_lock); + INIT_LIST_HEAD(&group->triggers); + memset(group->nr_triggers, 0, sizeof(group->nr_triggers)); + group->poll_states = 0; + group->poll_min_period = U32_MAX; + memset(group->polling_total, 0, sizeof(group->polling_total)); + group->polling_next_update = ULLONG_MAX; + group->polling_until = 0; + rcu_assign_pointer(group->poll_kworker, NULL); } void __init psi_init(void) @@ -210,20 +233,24 @@ static bool test_state(unsigned int *tasks, enum psi_states state) } } -static void get_recent_times(struct psi_group *group, int cpu, u32 *times) +static void get_recent_times(struct psi_group *group, int cpu, + enum psi_aggregators aggregator, u32 *times, + u32 *pchanged_states) { struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); - unsigned int tasks[NR_PSI_TASK_COUNTS]; u64 now, state_start; + enum psi_states s; unsigned int seq; - int s; + u32 state_mask; + + *pchanged_states = 0; /* Snapshot a coherent view of the CPU state */ do { seq = read_seqcount_begin(&groupc->seq); now = cpu_clock(cpu); memcpy(times, groupc->times, sizeof(groupc->times)); - memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); + state_mask = groupc->state_mask; state_start = groupc->state_start; } while (read_seqcount_retry(&groupc->seq, seq)); @@ -239,13 +266,15 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times) * (u32) and our reported pressure close to what's * actually happening. */ - if (test_state(tasks, s)) + if (state_mask & (1 << s)) times[s] += now - state_start; - delta = times[s] - groupc->times_prev[s]; - groupc->times_prev[s] = times[s]; + delta = times[s] - groupc->times_prev[aggregator][s]; + groupc->times_prev[aggregator][s] = times[s]; times[s] = delta; + if (delta) + *pchanged_states |= (1 << s); } } @@ -269,17 +298,16 @@ static void calc_avgs(unsigned long avg[3], int missed_periods, avg[2] = calc_load(avg[2], EXP_300s, pct); } -static bool update_stats(struct psi_group *group) +static void collect_percpu_times(struct psi_group *group, + enum psi_aggregators aggregator, + u32 *pchanged_states) { u64 deltas[NR_PSI_STATES - 1] = { 0, }; - unsigned long missed_periods = 0; unsigned long nonidle_total = 0; - u64 now, expires, period; + u32 changed_states = 0; int cpu; int s; - mutex_lock(&group->stat_lock); - /* * Collect the per-cpu time buckets and average them into a * single time sample that is normalized to wallclock time. @@ -291,8 +319,11 @@ static bool update_stats(struct psi_group *group) for_each_possible_cpu(cpu) { u32 times[NR_PSI_STATES]; u32 nonidle; + u32 cpu_changed_states; - get_recent_times(group, cpu, times); + get_recent_times(group, cpu, aggregator, times, + &cpu_changed_states); + changed_states |= cpu_changed_states; nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]); nonidle_total += nonidle; @@ -315,13 +346,22 @@ static bool update_stats(struct psi_group *group) /* total= */ for (s = 0; s < NR_PSI_STATES - 1; s++) - group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL)); + group->total[aggregator][s] += + div_u64(deltas[s], max(nonidle_total, 1UL)); + + if (pchanged_states) + *pchanged_states = changed_states; +} + +static u64 update_averages(struct psi_group *group, u64 now) +{ + unsigned long missed_periods = 0; + u64 expires, period; + u64 avg_next_update; + int s; /* avgX= */ - now = sched_clock(); - expires = group->next_update; - if (now < expires) - goto out; + expires = group->avg_next_update; if (now - expires >= psi_period) missed_periods = div_u64(now - expires, psi_period); @@ -332,14 +372,14 @@ static bool update_stats(struct psi_group *group) * But the deltas we sample out of the per-cpu buckets above * are based on the actual time elapsing between clock ticks. */ - group->next_update = expires + ((1 + missed_periods) * psi_period); - period = now - (group->last_update + (missed_periods * psi_period)); - group->last_update = now; + avg_next_update = expires + ((1 + missed_periods) * psi_period); + period = now - (group->avg_last_update + (missed_periods * psi_period)); + group->avg_last_update = now; for (s = 0; s < NR_PSI_STATES - 1; s++) { u32 sample; - sample = group->total[s] - group->total_prev[s]; + sample = group->total[PSI_AVGS][s] - group->avg_total[s]; /* * Due to the lockless sampling of the time buckets, * recorded time deltas can slip into the next period, @@ -359,23 +399,30 @@ static bool update_stats(struct psi_group *group) */ if (sample > period) sample = period; - group->total_prev[s] += sample; + group->avg_total[s] += sample; calc_avgs(group->avg[s], missed_periods, sample, period); } -out: - mutex_unlock(&group->stat_lock); - return nonidle_total; + + return avg_next_update; } -static void psi_update_work(struct work_struct *work) +static void psi_avgs_work(struct work_struct *work) { struct delayed_work *dwork; struct psi_group *group; + u32 changed_states; bool nonidle; + u64 now; dwork = to_delayed_work(work); - group = container_of(dwork, struct psi_group, clock_work); + group = container_of(dwork, struct psi_group, avgs_work); + + mutex_lock(&group->avgs_lock); + now = sched_clock(); + + collect_percpu_times(group, PSI_AVGS, &changed_states); + nonidle = changed_states & (1 << PSI_NONIDLE); /* * If there is task activity, periodically fold the per-cpu * times and feed samples into the running averages. If things @@ -383,18 +430,196 @@ static void psi_update_work(struct work_struct *work) * Once restarted, we'll catch up the running averages in one * go - see calc_avgs() and missed_periods. */ - - nonidle = update_stats(group); + if (now >= group->avg_next_update) + group->avg_next_update = update_averages(group, now); if (nonidle) { - unsigned long delay = 0; - u64 now; + schedule_delayed_work(dwork, nsecs_to_jiffies( + group->avg_next_update - now) + 1); + } + + mutex_unlock(&group->avgs_lock); +} + +/* Trigger tracking window manupulations */ +static void window_reset(struct psi_window *win, u64 now, u64 value, + u64 prev_growth) +{ + win->start_time = now; + win->start_value = value; + win->prev_growth = prev_growth; +} + +/* + * PSI growth tracking window update and growth calculation routine. + * + * This approximates a sliding tracking window by interpolating + * partially elapsed windows using historical growth data from the + * previous intervals. This minimizes memory requirements (by not storing + * all the intermediate values in the previous window) and simplifies + * the calculations. It works well because PSI signal changes only in + * positive direction and over relatively small window sizes the growth + * is close to linear. + */ +static u64 window_update(struct psi_window *win, u64 now, u64 value) +{ + u64 elapsed; + u64 growth; + + elapsed = now - win->start_time; + growth = value - win->start_value; + /* + * After each tracking window passes win->start_value and + * win->start_time get reset and win->prev_growth stores + * the average per-window growth of the previous window. + * win->prev_growth is then used to interpolate additional + * growth from the previous window assuming it was linear. + */ + if (elapsed > win->size) + window_reset(win, now, value, growth); + else { + u32 remaining; + + remaining = win->size - elapsed; + growth += div_u64(win->prev_growth * remaining, win->size); + } + + return growth; +} + +static void init_triggers(struct psi_group *group, u64 now) +{ + struct psi_trigger *t; + + list_for_each_entry(t, &group->triggers, node) + window_reset(&t->win, now, + group->total[PSI_POLL][t->state], 0); + memcpy(group->polling_total, group->total[PSI_POLL], + sizeof(group->polling_total)); + group->polling_next_update = now + group->poll_min_period; +} + +static u64 update_triggers(struct psi_group *group, u64 now) +{ + struct psi_trigger *t; + bool new_stall = false; + u64 *total = group->total[PSI_POLL]; + + /* + * On subsequent updates, calculate growth deltas and let + * watchers know when their specified thresholds are exceeded. + */ + list_for_each_entry(t, &group->triggers, node) { + u64 growth; + + /* Check for stall activity */ + if (group->polling_total[t->state] == total[t->state]) + continue; + + /* + * Multiple triggers might be looking at the same state, + * remember to update group->polling_total[] once we've + * been through all of them. Also remember to extend the + * polling time if we see new stall activity. + */ + new_stall = true; + + /* Calculate growth since last update */ + growth = window_update(&t->win, now, total[t->state]); + if (growth < t->threshold) + continue; + + /* Limit event signaling to once per window */ + if (now < t->last_event_time + t->win.size) + continue; + + /* Generate an event */ + if (cmpxchg(&t->event, 0, 1) == 0) + wake_up_interruptible(&t->event_wait); + t->last_event_time = now; + } + + if (new_stall) + memcpy(group->polling_total, total, + sizeof(group->polling_total)); + + return now + group->poll_min_period; +} + +/* + * Schedule polling if it's not already scheduled. It's safe to call even from + * hotpath because even though kthread_queue_delayed_work takes worker->lock + * spinlock that spinlock is never contended due to poll_scheduled atomic + * preventing such competition. + */ +static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay) +{ + struct kthread_worker *kworker; + + /* Do not reschedule if already scheduled */ + if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0) + return; + + rcu_read_lock(); - now = sched_clock(); - if (group->next_update > now) - delay = nsecs_to_jiffies(group->next_update - now) + 1; - schedule_delayed_work(dwork, delay); + kworker = rcu_dereference(group->poll_kworker); + /* + * kworker might be NULL in case psi_trigger_destroy races with + * psi_task_change (hotpath) which can't use locks + */ + if (likely(kworker)) + kthread_queue_delayed_work(kworker, &group->poll_work, delay); + else + atomic_set(&group->poll_scheduled, 0); + + rcu_read_unlock(); +} + +static void psi_poll_work(struct kthread_work *work) +{ + struct kthread_delayed_work *dwork; + struct psi_group *group; + u32 changed_states; + u64 now; + + dwork = container_of(work, struct kthread_delayed_work, work); + group = container_of(dwork, struct psi_group, poll_work); + + atomic_set(&group->poll_scheduled, 0); + + mutex_lock(&group->trigger_lock); + + now = sched_clock(); + + collect_percpu_times(group, PSI_POLL, &changed_states); + + if (changed_states & group->poll_states) { + /* Initialize trigger windows when entering polling mode */ + if (now > group->polling_until) + init_triggers(group, now); + + /* + * Keep the monitor active for at least the duration of the + * minimum tracking window as long as monitor states are + * changing. + */ + group->polling_until = now + + group->poll_min_period * UPDATES_PER_WINDOW; + } + + if (now > group->polling_until) { + group->polling_next_update = ULLONG_MAX; + goto out; } + + if (now >= group->polling_next_update) + group->polling_next_update = update_triggers(group, now); + + psi_schedule_poll_work(group, + nsecs_to_jiffies(group->polling_next_update - now) + 1); + +out: + mutex_unlock(&group->trigger_lock); } static void record_times(struct psi_group_cpu *groupc, int cpu, @@ -407,15 +632,15 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, delta = now - groupc->state_start; groupc->state_start = now; - if (test_state(groupc->tasks, PSI_IO_SOME)) { + if (groupc->state_mask & (1 << PSI_IO_SOME)) { groupc->times[PSI_IO_SOME] += delta; - if (test_state(groupc->tasks, PSI_IO_FULL)) + if (groupc->state_mask & (1 << PSI_IO_FULL)) groupc->times[PSI_IO_FULL] += delta; } - if (test_state(groupc->tasks, PSI_MEM_SOME)) { + if (groupc->state_mask & (1 << PSI_MEM_SOME)) { groupc->times[PSI_MEM_SOME] += delta; - if (test_state(groupc->tasks, PSI_MEM_FULL)) + if (groupc->state_mask & (1 << PSI_MEM_FULL)) groupc->times[PSI_MEM_FULL] += delta; else if (memstall_tick) { u32 sample; @@ -436,18 +661,20 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, } } - if (test_state(groupc->tasks, PSI_CPU_SOME)) + if (groupc->state_mask & (1 << PSI_CPU_SOME)) groupc->times[PSI_CPU_SOME] += delta; - if (test_state(groupc->tasks, PSI_NONIDLE)) + if (groupc->state_mask & (1 << PSI_NONIDLE)) groupc->times[PSI_NONIDLE] += delta; } -static void psi_group_change(struct psi_group *group, int cpu, - unsigned int clear, unsigned int set) +static u32 psi_group_change(struct psi_group *group, int cpu, + unsigned int clear, unsigned int set) { struct psi_group_cpu *groupc; unsigned int t, m; + enum psi_states s; + u32 state_mask = 0; groupc = per_cpu_ptr(group->pcpu, cpu); @@ -480,7 +707,16 @@ static void psi_group_change(struct psi_group *group, int cpu, if (set & (1 << t)) groupc->tasks[t]++; + /* Calculate state mask representing active states */ + for (s = 0; s < NR_PSI_STATES; s++) { + if (test_state(groupc->tasks, s)) + state_mask |= (1 << s); + } + groupc->state_mask = state_mask; + write_seqcount_end(&groupc->seq); + + return state_mask; } static struct psi_group *iterate_groups(struct task_struct *task, void **iter) @@ -537,13 +773,17 @@ void psi_task_change(struct task_struct *task, int clear, int set) */ if (unlikely((clear & TSK_RUNNING) && (task->flags & PF_WQ_WORKER) && - wq_worker_last_func(task) == psi_update_work)) + wq_worker_last_func(task) == psi_avgs_work)) wake_clock = false; while ((group = iterate_groups(task, &iter))) { - psi_group_change(group, cpu, clear, set); - if (wake_clock && !delayed_work_pending(&group->clock_work)) - schedule_delayed_work(&group->clock_work, PSI_FREQ); + u32 state_mask = psi_group_change(group, cpu, clear, set); + + if (state_mask & group->poll_states) + psi_schedule_poll_work(group, 1); + + if (wake_clock && !delayed_work_pending(&group->avgs_work)) + schedule_delayed_work(&group->avgs_work, PSI_FREQ); } } @@ -640,8 +880,10 @@ void psi_cgroup_free(struct cgroup *cgroup) if (static_branch_likely(&psi_disabled)) return; - cancel_delayed_work_sync(&cgroup->psi.clock_work); + cancel_delayed_work_sync(&cgroup->psi.avgs_work); free_percpu(cgroup->psi.pcpu); + /* All triggers must be removed by now */ + WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n"); } /** @@ -697,11 +939,18 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) { int full; + u64 now; if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP; - update_stats(group); + /* Update averages before reporting them */ + mutex_lock(&group->avgs_lock); + now = sched_clock(); + collect_percpu_times(group, PSI_AVGS, NULL); + if (now >= group->avg_next_update) + group->avg_next_update = update_averages(group, now); + mutex_unlock(&group->avgs_lock); for (full = 0; full < 2 - (res == PSI_CPU); full++) { unsigned long avg[3]; @@ -710,7 +959,8 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) for (w = 0; w < 3; w++) avg[w] = group->avg[res * 2 + full][w]; - total = div_u64(group->total[res * 2 + full], NSEC_PER_USEC); + total = div_u64(group->total[PSI_AVGS][res * 2 + full], + NSEC_PER_USEC); seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", full ? "full" : "some", @@ -753,25 +1003,270 @@ static int psi_cpu_open(struct inode *inode, struct file *file) return single_open(file, psi_cpu_show, NULL); } +struct psi_trigger *psi_trigger_create(struct psi_group *group, + char *buf, size_t nbytes, enum psi_res res) +{ + struct psi_trigger *t; + enum psi_states state; + u32 threshold_us; + u32 window_us; + + if (static_branch_likely(&psi_disabled)) + return ERR_PTR(-EOPNOTSUPP); + + if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2) + state = PSI_IO_SOME + res * 2; + else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2) + state = PSI_IO_FULL + res * 2; + else + return ERR_PTR(-EINVAL); + + if (state >= PSI_NONIDLE) + return ERR_PTR(-EINVAL); + + if (window_us < WINDOW_MIN_US || + window_us > WINDOW_MAX_US) + return ERR_PTR(-EINVAL); + + /* Check threshold */ + if (threshold_us == 0 || threshold_us > window_us) + return ERR_PTR(-EINVAL); + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (!t) + return ERR_PTR(-ENOMEM); + + t->group = group; + t->state = state; + t->threshold = threshold_us * NSEC_PER_USEC; + t->win.size = window_us * NSEC_PER_USEC; + window_reset(&t->win, 0, 0, 0); + + t->event = 0; + t->last_event_time = 0; + init_waitqueue_head(&t->event_wait); + kref_init(&t->refcount); + + mutex_lock(&group->trigger_lock); + + if (!rcu_access_pointer(group->poll_kworker)) { + struct sched_param param = { + .sched_priority = MAX_RT_PRIO - 1, + }; + struct kthread_worker *kworker; + + kworker = kthread_create_worker(0, "psimon"); + if (IS_ERR(kworker)) { + kfree(t); + mutex_unlock(&group->trigger_lock); + return ERR_CAST(kworker); + } + sched_setscheduler(kworker->task, SCHED_FIFO, ¶m); + kthread_init_delayed_work(&group->poll_work, + psi_poll_work); + rcu_assign_pointer(group->poll_kworker, kworker); + } + + list_add(&t->node, &group->triggers); + group->poll_min_period = min(group->poll_min_period, + div_u64(t->win.size, UPDATES_PER_WINDOW)); + group->nr_triggers[t->state]++; + group->poll_states |= (1 << t->state); + + mutex_unlock(&group->trigger_lock); + + return t; +} + +static void psi_trigger_destroy(struct kref *ref) +{ + struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount); + struct psi_group *group = t->group; + struct kthread_worker *kworker_to_destroy = NULL; + + if (static_branch_likely(&psi_disabled)) + return; + + /* + * Wakeup waiters to stop polling. Can happen if cgroup is deleted + * from under a polling process. + */ + wake_up_interruptible(&t->event_wait); + + mutex_lock(&group->trigger_lock); + + if (!list_empty(&t->node)) { + struct psi_trigger *tmp; + u64 period = ULLONG_MAX; + + list_del(&t->node); + group->nr_triggers[t->state]--; + if (!group->nr_triggers[t->state]) + group->poll_states &= ~(1 << t->state); + /* reset min update period for the remaining triggers */ + list_for_each_entry(tmp, &group->triggers, node) + period = min(period, div_u64(tmp->win.size, + UPDATES_PER_WINDOW)); + group->poll_min_period = period; + /* Destroy poll_kworker when the last trigger is destroyed */ + if (group->poll_states == 0) { + group->polling_until = 0; + kworker_to_destroy = rcu_dereference_protected( + group->poll_kworker, + lockdep_is_held(&group->trigger_lock)); + rcu_assign_pointer(group->poll_kworker, NULL); + } + } + + mutex_unlock(&group->trigger_lock); + + /* + * Wait for both *trigger_ptr from psi_trigger_replace and + * poll_kworker RCUs to complete their read-side critical sections + * before destroying the trigger and optionally the poll_kworker + */ + synchronize_rcu(); + /* + * Destroy the kworker after releasing trigger_lock to prevent a + * deadlock while waiting for psi_poll_work to acquire trigger_lock + */ + if (kworker_to_destroy) { + kthread_cancel_delayed_work_sync(&group->poll_work); + kthread_destroy_worker(kworker_to_destroy); + } + kfree(t); +} + +void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new) +{ + struct psi_trigger *old = *trigger_ptr; + + if (static_branch_likely(&psi_disabled)) + return; + + rcu_assign_pointer(*trigger_ptr, new); + if (old) + kref_put(&old->refcount, psi_trigger_destroy); +} + +__poll_t psi_trigger_poll(void **trigger_ptr, + struct file *file, poll_table *wait) +{ + __poll_t ret = DEFAULT_POLLMASK; + struct psi_trigger *t; + + if (static_branch_likely(&psi_disabled)) + return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; + + rcu_read_lock(); + + t = rcu_dereference(*(void __rcu __force **)trigger_ptr); + if (!t) { + rcu_read_unlock(); + return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; + } + kref_get(&t->refcount); + + rcu_read_unlock(); + + poll_wait(file, &t->event_wait, wait); + + if (cmpxchg(&t->event, 1, 0) == 1) + ret |= EPOLLPRI; + + kref_put(&t->refcount, psi_trigger_destroy); + + return ret; +} + +static ssize_t psi_write(struct file *file, const char __user *user_buf, + size_t nbytes, enum psi_res res) +{ + char buf[32]; + size_t buf_size; + struct seq_file *seq; + struct psi_trigger *new; + + if (static_branch_likely(&psi_disabled)) + return -EOPNOTSUPP; + + buf_size = min(nbytes, (sizeof(buf) - 1)); + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + buf[buf_size - 1] = '\0'; + + new = psi_trigger_create(&psi_system, buf, nbytes, res); + if (IS_ERR(new)) + return PTR_ERR(new); + + seq = file->private_data; + /* Take seq->lock to protect seq->private from concurrent writes */ + mutex_lock(&seq->lock); + psi_trigger_replace(&seq->private, new); + mutex_unlock(&seq->lock); + + return nbytes; +} + +static ssize_t psi_io_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_IO); +} + +static ssize_t psi_memory_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_MEM); +} + +static ssize_t psi_cpu_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_CPU); +} + +static __poll_t psi_fop_poll(struct file *file, poll_table *wait) +{ + struct seq_file *seq = file->private_data; + + return psi_trigger_poll(&seq->private, file, wait); +} + +static int psi_fop_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + + psi_trigger_replace(&seq->private, NULL); + return single_release(inode, file); +} + static const struct file_operations psi_io_fops = { .open = psi_io_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .write = psi_io_write, + .poll = psi_fop_poll, + .release = psi_fop_release, }; static const struct file_operations psi_memory_fops = { .open = psi_memory_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .write = psi_memory_write, + .poll = psi_fop_poll, + .release = psi_fop_release, }; static const struct file_operations psi_cpu_fops = { .open = psi_cpu_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .write = psi_cpu_write, + .poll = psi_fop_poll, + .release = psi_fop_release, }; static int __init psi_proc_init(void) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 3582eeb59893..811b4a86cdf6 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -148,7 +148,7 @@ static void populate_seccomp_data(struct seccomp_data *sd) unsigned long args[6]; sd->nr = syscall_get_nr(task, regs); - sd->arch = syscall_get_arch(); + sd->arch = syscall_get_arch(task); syscall_get_arguments(task, regs, args); sd->args[0] = args[0]; sd->args[1] = args[1]; @@ -331,7 +331,7 @@ static int is_ancestor(struct seccomp_filter *parent, * Expects sighand and cred_guard_mutex locks to be held. * * Returns 0 on success, -ve on error, or the pid of a thread which was - * either not in the correct seccomp mode or it did not have an ancestral + * either not in the correct seccomp mode or did not have an ancestral * seccomp filter. */ static inline pid_t seccomp_can_sync_threads(void) @@ -594,7 +594,7 @@ static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason info->si_code = SYS_SECCOMP; info->si_call_addr = (void __user *)KSTK_EIP(current); info->si_errno = reason; - info->si_arch = syscall_get_arch(); + info->si_arch = syscall_get_arch(current); info->si_syscall = syscall; } diff --git a/kernel/signal.c b/kernel/signal.c index 227ba170298e..c4dd66436fc5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -43,6 +43,7 @@ #include <linux/compiler.h> #include <linux/posix-timers.h> #include <linux/livepatch.h> +#include <linux/cgroup.h> #define CREATE_TRACE_POINTS #include <trace/events/signal.h> @@ -146,9 +147,10 @@ static inline bool has_pending_signals(sigset_t *signal, sigset_t *blocked) static bool recalc_sigpending_tsk(struct task_struct *t) { - if ((t->jobctl & JOBCTL_PENDING_MASK) || + if ((t->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) || PENDING(&t->pending, &t->blocked) || - PENDING(&t->signal->shared_pending, &t->blocked)) { + PENDING(&t->signal->shared_pending, &t->blocked) || + cgroup_task_frozen(t)) { set_tsk_thread_flag(t, TIF_SIGPENDING); return true; } @@ -838,6 +840,7 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info, */ if (!sid || sid == task_session(current)) break; + /* fall through */ default: return -EPERM; } @@ -2108,6 +2111,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t preempt_disable(); read_unlock(&tasklist_lock); preempt_enable_no_resched(); + cgroup_enter_frozen(); freezable_schedule(); } else { /* @@ -2286,6 +2290,7 @@ static bool do_signal_stop(int signr) } /* Now we don't run again until woken by SIGCONT or SIGKILL */ + cgroup_enter_frozen(); freezable_schedule(); return true; } else { @@ -2332,6 +2337,43 @@ static void do_jobctl_trap(void) } } +/** + * do_freezer_trap - handle the freezer jobctl trap + * + * Puts the task into frozen state, if only the task is not about to quit. + * In this case it drops JOBCTL_TRAP_FREEZE. + * + * CONTEXT: + * Must be called with @current->sighand->siglock held, + * which is always released before returning. + */ +static void do_freezer_trap(void) + __releases(¤t->sighand->siglock) +{ + /* + * If there are other trap bits pending except JOBCTL_TRAP_FREEZE, + * let's make another loop to give it a chance to be handled. + * In any case, we'll return back. + */ + if ((current->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) != + JOBCTL_TRAP_FREEZE) { + spin_unlock_irq(¤t->sighand->siglock); + return; + } + + /* + * Now we're sure that there is no pending fatal signal and no + * pending traps. Clear TIF_SIGPENDING to not get out of schedule() + * immediately (if there is a non-fatal signal pending), and + * put the task into sleep. + */ + __set_current_state(TASK_INTERRUPTIBLE); + clear_thread_flag(TIF_SIGPENDING); + spin_unlock_irq(¤t->sighand->siglock); + cgroup_enter_frozen(); + freezable_schedule(); +} + static int ptrace_signal(int signr, kernel_siginfo_t *info) { /* @@ -2452,9 +2494,24 @@ relock: do_signal_stop(0)) goto relock; - if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) { - do_jobctl_trap(); + if (unlikely(current->jobctl & + (JOBCTL_TRAP_MASK | JOBCTL_TRAP_FREEZE))) { + if (current->jobctl & JOBCTL_TRAP_MASK) { + do_jobctl_trap(); + spin_unlock_irq(&sighand->siglock); + } else if (current->jobctl & JOBCTL_TRAP_FREEZE) + do_freezer_trap(); + + goto relock; + } + + /* + * If the task is leaving the frozen state, let's update + * cgroup counters and reset the frozen bit. + */ + if (unlikely(cgroup_task_frozen(current))) { spin_unlock_irq(&sighand->siglock); + cgroup_leave_frozen(false); goto relock; } @@ -2550,6 +2607,8 @@ relock: fatal: spin_unlock_irq(&sighand->siglock); + if (unlikely(cgroup_task_frozen(current))) + cgroup_leave_frozen(true); /* * Anything else is fatal, maybe with a core dump. @@ -3513,7 +3572,6 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) return kill_something_info(sig, &info, pid); } -#ifdef CONFIG_PROC_FS /* * Verify that the signaler and signalee either are in the same pid namespace * or that the signaler's pid namespace is an ancestor of the signalee's pid @@ -3550,6 +3608,14 @@ static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info) return copy_siginfo_from_user(kinfo, info); } +static struct pid *pidfd_to_pid(const struct file *file) +{ + if (file->f_op == &pidfd_fops) + return file->private_data; + + return tgid_pidfd_to_pid(file); +} + /** * sys_pidfd_send_signal - send a signal to a process through a task file * descriptor @@ -3586,7 +3652,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, return -EBADF; /* Is this a pidfd? */ - pid = tgid_pidfd_to_pid(f.file); + pid = pidfd_to_pid(f.file); if (IS_ERR(pid)) { ret = PTR_ERR(pid); goto err; @@ -3620,7 +3686,6 @@ err: fdput(f); return ret; } -#endif /* CONFIG_PROC_FS */ static int do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 067cb83f37ea..7231fb5953fc 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -513,7 +513,7 @@ repeat: } preempt_count_dec(); WARN_ONCE(preempt_count(), - "cpu_stop: %pf(%p) leaked preempt count\n", fn, arg); + "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg); goto repeat; } } diff --git a/kernel/sys.c b/kernel/sys.c index 12df0e5434b8..bdbfe8d37418 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1924,7 +1924,7 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map) ((unsigned long)prctl_map->__m1 __op \ (unsigned long)prctl_map->__m2) ? 0 : -EINVAL error = __prctl_check_order(start_code, <, end_code); - error |= __prctl_check_order(start_data, <, end_data); + error |= __prctl_check_order(start_data,<=, end_data); error |= __prctl_check_order(start_brk, <=, brk); error |= __prctl_check_order(arg_start, <=, arg_end); error |= __prctl_check_order(env_start, <=, env_end); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index d21f4befaea4..4d9ae5ea6caf 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -167,9 +167,6 @@ COND_SYSCALL(syslog); /* kernel/sched/core.c */ -/* kernel/signal.c */ -COND_SYSCALL(pidfd_send_signal); - /* kernel/sys.c */ COND_SYSCALL(setregid); COND_SYSCALL(setgid); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c9ec050bcf46..943c89178e3d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -66,6 +66,7 @@ #include <linux/kexec.h> #include <linux/bpf.h> #include <linux/mount.h> +#include <linux/userfaultfd_k.h> #include "../lib/kstrtox.h" @@ -1720,6 +1721,17 @@ static struct ctl_table vm_table[] = { .extra2 = (void *)&mmap_rnd_compat_bits_max, }, #endif +#ifdef CONFIG_USERFAULTFD + { + .procname = "unprivileged_userfaultfd", + .data = &sysctl_unprivileged_userfaultfd, + .maxlen = sizeof(sysctl_unprivileged_userfaultfd), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif { } }; @@ -2874,8 +2886,10 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (neg) continue; val = convmul * val / convdiv; - if ((min && val < *min) || (max && val > *max)) - continue; + if ((min && val < *min) || (max && val > *max)) { + err = -EINVAL; + break; + } *i = val; } else { val = convdiv * (*i) / convmul; @@ -3158,17 +3172,19 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, if (write) { char *kbuf, *p; + size_t skipped = 0; - if (left > PAGE_SIZE - 1) + if (left > PAGE_SIZE - 1) { left = PAGE_SIZE - 1; + /* How much of the buffer we'll skip this pass */ + skipped = *lenp - left; + } p = kbuf = memdup_user_nul(buffer, left); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); - tmp_bitmap = kcalloc(BITS_TO_LONGS(bitmap_len), - sizeof(unsigned long), - GFP_KERNEL); + tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL); if (!tmp_bitmap) { kfree(kbuf); return -ENOMEM; @@ -3177,9 +3193,22 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, while (!err && left) { unsigned long val_a, val_b; bool neg; + size_t saved_left; + /* In case we stop parsing mid-number, we can reset */ + saved_left = left; err = proc_get_long(&p, &left, &val_a, &neg, tr_a, sizeof(tr_a), &c); + /* + * If we consumed the entirety of a truncated buffer or + * only one char is left (may be a "-"), then stop here, + * reset, & come back for more. + */ + if ((left <= 1) && skipped) { + left = saved_left; + break; + } + if (err) break; if (val_a >= bitmap_len || neg) { @@ -3197,6 +3226,15 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, err = proc_get_long(&p, &left, &val_b, &neg, tr_b, sizeof(tr_b), &c); + /* + * If we consumed all of a truncated buffer or + * then stop here, reset, & come back for more. + */ + if (!left && skipped) { + left = saved_left; + break; + } + if (err) break; if (val_b >= bitmap_len || neg || @@ -3215,6 +3253,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, proc_skip_char(&p, &left, '\n'); } kfree(kbuf); + left += skipped; } else { unsigned long bit_a, bit_b = 0; @@ -3259,7 +3298,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, *ppos += *lenp; } - kfree(tmp_bitmap); + bitmap_free(tmp_bitmap); return err; } @@ -3326,6 +3365,11 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, return -ENOSYS; } +int proc_do_large_bitmap(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} #endif /* CONFIG_PROC_SYSCTL */ @@ -3366,3 +3410,4 @@ EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); EXPORT_SYMBOL(proc_doulongvec_minmax); EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); +EXPORT_SYMBOL(proc_do_large_bitmap); diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 4e62a4a8fa91..5f852b8f59f7 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -375,7 +375,7 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) ? TASKSTATS_TYPE_AGGR_PID : TASKSTATS_TYPE_AGGR_TGID; - na = nla_nest_start(skb, aggr); + na = nla_nest_start_noflag(skb, aggr); if (!na) goto err; @@ -649,17 +649,41 @@ err: static const struct genl_ops taskstats_ops[] = { { .cmd = TASKSTATS_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = taskstats_user_cmd, - .policy = taskstats_cmd_get_policy, - .flags = GENL_ADMIN_PERM, + /* policy enforced later */ + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_HASPOL, }, { .cmd = CGROUPSTATS_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = cgroupstats_user_cmd, - .policy = cgroupstats_cmd_get_policy, + /* policy enforced later */ + .flags = GENL_CMD_CAP_HASPOL, }, }; +static int taskstats_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, + struct genl_info *info) +{ + const struct nla_policy *policy = NULL; + + switch (ops->cmd) { + case TASKSTATS_CMD_GET: + policy = taskstats_cmd_get_policy; + break; + case CGROUPSTATS_CMD_GET: + policy = cgroupstats_cmd_get_policy; + break; + default: + return -EINVAL; + } + + return nlmsg_validate_deprecated(info->nlhdr, GENL_HDRLEN, + TASKSTATS_CMD_ATTR_MAX, policy, + info->extack); +} + static struct genl_family family __ro_after_init = { .name = TASKSTATS_GENL_NAME, .version = TASKSTATS_GENL_VERSION, @@ -667,6 +691,7 @@ static struct genl_family family __ro_after_init = { .module = THIS_MODULE, .ops = taskstats_ops, .n_ops = ARRAY_SIZE(taskstats_ops), + .pre_doit = taskstats_pre_doit, }; /* Needed early in initialization */ diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 92a90014a925..ac5555e25733 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -17,6 +17,7 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/rtc.h> +#include <linux/audit.h> #include "ntp_internal.h" #include "timekeeping_internal.h" @@ -709,7 +710,7 @@ static inline void process_adjtimex_modes(const struct __kernel_timex *txc, * kernel time-keeping variables. used by xntpd. */ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, - s32 *time_tai) + s32 *time_tai, struct audit_ntp_data *ad) { int result; @@ -720,14 +721,29 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, /* adjtime() is independent from ntp_adjtime() */ time_adjust = txc->offset; ntp_update_frequency(); + + audit_ntp_set_old(ad, AUDIT_NTP_ADJUST, save_adjust); + audit_ntp_set_new(ad, AUDIT_NTP_ADJUST, time_adjust); } txc->offset = save_adjust; } else { - /* If there are input parameters, then process them: */ - if (txc->modes) + if (txc->modes) { + audit_ntp_set_old(ad, AUDIT_NTP_OFFSET, time_offset); + audit_ntp_set_old(ad, AUDIT_NTP_FREQ, time_freq); + audit_ntp_set_old(ad, AUDIT_NTP_STATUS, time_status); + audit_ntp_set_old(ad, AUDIT_NTP_TAI, *time_tai); + audit_ntp_set_old(ad, AUDIT_NTP_TICK, tick_usec); + process_adjtimex_modes(txc, time_tai); + audit_ntp_set_new(ad, AUDIT_NTP_OFFSET, time_offset); + audit_ntp_set_new(ad, AUDIT_NTP_FREQ, time_freq); + audit_ntp_set_new(ad, AUDIT_NTP_STATUS, time_status); + audit_ntp_set_new(ad, AUDIT_NTP_TAI, *time_tai); + audit_ntp_set_new(ad, AUDIT_NTP_TICK, tick_usec); + } + txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, NTP_SCALE_SHIFT); if (!(time_status & STA_NANO)) diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index 40e6122e634e..908ecaa65fc3 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -8,6 +8,8 @@ extern void ntp_clear(void); extern u64 ntp_tick_length(void); extern ktime_t ntp_get_next_leap(void); extern int second_overflow(time64_t secs); -extern int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, s32 *time_tai); +extern int __do_adjtimex(struct __kernel_timex *txc, + const struct timespec64 *ts, + s32 *time_tai, struct audit_ntp_data *ad); extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts); #endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 968e4b07918e..142b07619918 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -231,7 +231,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) enable_sched_clock_irqtime(); - pr_debug("Registered %pF as sched_clock source\n", read); + pr_debug("Registered %pS as sched_clock source\n", read); } void __init generic_sched_clock_init(void) diff --git a/kernel/time/time.c b/kernel/time/time.c index 86656bbac232..7f7d6914ddd5 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -783,6 +783,16 @@ u64 jiffies64_to_nsecs(u64 j) } EXPORT_SYMBOL(jiffies64_to_nsecs); +u64 jiffies64_to_msecs(const u64 j) +{ +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + return (MSEC_PER_SEC / HZ) * j; +#else + return div_u64(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN); +#endif +} +EXPORT_SYMBOL(jiffies64_to_msecs); + /** * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64 * diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5716e28bfa3c..85f5912d8f70 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -21,6 +21,7 @@ #include <linux/stop_machine.h> #include <linux/pvclock_gtod.h> #include <linux/compiler.h> +#include <linux/audit.h> #include "tick-internal.h" #include "ntp_internal.h" @@ -1250,6 +1251,9 @@ out: /* signal hrtimers about time change */ clock_was_set(); + if (!ret) + audit_tk_injoffset(ts_delta); + return ret; } EXPORT_SYMBOL(do_settimeofday64); @@ -2303,6 +2307,7 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc) int do_adjtimex(struct __kernel_timex *txc) { struct timekeeper *tk = &tk_core.timekeeper; + struct audit_ntp_data ad; unsigned long flags; struct timespec64 ts; s32 orig_tai, tai; @@ -2322,15 +2327,19 @@ int do_adjtimex(struct __kernel_timex *txc) ret = timekeeping_inject_offset(&delta); if (ret) return ret; + + audit_tk_injoffset(delta); } + audit_ntp_init(&ad); + ktime_get_real_ts64(&ts); raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&tk_core.seq); orig_tai = tai = tk->tai_offset; - ret = __do_adjtimex(txc, &ts, &tai); + ret = __do_adjtimex(txc, &ts, &tai, &ad); if (tai != orig_tai) { __timekeeping_set_tai_offset(tk, tai); @@ -2341,6 +2350,8 @@ int do_adjtimex(struct __kernel_timex *txc) write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + audit_ntp_log(&ad); + /* Update the multiplier immediately if frequency was set directly */ if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) timekeeping_advance(TK_ADV_FREQ); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a9b1bbc2d88d..343c7ba33b1c 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1325,7 +1325,7 @@ static void call_timer_fn(struct timer_list *timer, lock_map_release(&lockdep_map); if (count != preempt_count()) { - WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", + WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n", fn, count, preempt_count()); /* * Restore the preempt count. That gives us a decent diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8bd1d6d001d7..5d965cef6c77 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -774,13 +774,6 @@ config TRACE_EVAL_MAP_FILE If unsure, say N -config TRACING_EVENTS_GPIO - bool "Trace gpio events" - depends on GPIOLIB - default y - help - Enable tracing events for gpio subsystem - config GCOV_PROFILE_FTRACE bool "Enable GCOV profiling on ftrace subsystem" depends on GCOV_KERNEL diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 94b0e37d90ef..b496ffdf5f36 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -577,6 +577,12 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_update_elem_proto; case BPF_FUNC_map_delete_elem: return &bpf_map_delete_elem_proto; + case BPF_FUNC_map_push_elem: + return &bpf_map_push_elem_proto; + case BPF_FUNC_map_pop_elem: + return &bpf_map_pop_elem_proto; + case BPF_FUNC_map_peek_elem: + return &bpf_map_peek_elem_proto; case BPF_FUNC_probe_read: return &bpf_probe_read_proto; case BPF_FUNC_ktime_get_ns: @@ -917,6 +923,27 @@ const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { const struct bpf_prog_ops raw_tracepoint_prog_ops = { }; +static bool raw_tp_writable_prog_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off == 0) { + if (size != sizeof(u64) || type != BPF_READ) + return false; + info->reg_type = PTR_TO_TP_BUFFER; + } + return raw_tp_prog_is_valid_access(off, size, type, prog, info); +} + +const struct bpf_verifier_ops raw_tracepoint_writable_verifier_ops = { + .get_func_proto = raw_tp_prog_func_proto, + .is_valid_access = raw_tp_writable_prog_is_valid_access, +}; + +const struct bpf_prog_ops raw_tracepoint_writable_prog_ops = { +}; + static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) @@ -1206,6 +1233,9 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog * if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64)) return -EINVAL; + if (prog->aux->max_tp_access > btp->writable_size) + return -EINVAL; + return tracepoint_probe_register(tp, (void *)btp->bpf_func, prog); } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b920358dd8f7..a12aff849c04 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -70,12 +70,8 @@ #define INIT_OPS_HASH(opsname) \ .func_hash = &opsname.local_hash, \ .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), -#define ASSIGN_OPS_HASH(opsname, val) \ - .func_hash = val, \ - .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), #else #define INIT_OPS_HASH(opsname) -#define ASSIGN_OPS_HASH(opsname, val) #endif enum { @@ -3880,7 +3876,7 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, static bool module_exists(const char *module) { /* All modules have the symbol __this_module */ - const char this_mod[] = "__this_module"; + static const char this_mod[] = "__this_module"; char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2]; unsigned long val; int n; @@ -6265,6 +6261,9 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, preempt_disable_notrace(); do_for_each_ftrace_op(op, ftrace_ops_list) { + /* Stub functions don't need to be called nor tested */ + if (op->flags & FTRACE_OPS_FL_STUB) + continue; /* * Check the following for each ops before calling their func: * if RCU flag is set, then rcu_is_watching() must be true diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 4ee8d8aa3d0f..05b0b3139ebc 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4979,7 +4979,7 @@ static __init int rb_write_something(struct rb_test_data *data, bool nested) cnt = data->cnt + (nested ? 27 : 0); /* Multiply cnt by ~e, to make some unique increment */ - size = (data->cnt * 68 / 25) % (sizeof(rb_string) - 1); + size = (cnt * 68 / 25) % (sizeof(rb_string) - 1); len = size + sizeof(struct rb_item); diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index ffba6789c0e2..0564f6db0561 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -362,7 +362,7 @@ static void ring_buffer_producer(void) hit--; /* make it non zero */ } - /* Caculate the average time in nanosecs */ + /* Calculate the average time in nanosecs */ avg = NSEC_PER_MSEC / (hit + missed); trace_printk("%ld ns per entry\n", avg); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ec439999f387..2c92b3d9ea30 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1727,6 +1727,10 @@ static __init int init_trace_selftests(void) pr_info("Running postponed tracer tests:\n"); list_for_each_entry_safe(p, n, &postponed_selftests, list) { + /* This loop can take minutes when sanitizers are enabled, so + * lets make sure we allow RCU processing. + */ + cond_resched(); ret = run_tracer_selftest(p->type); /* If the test fails, then warn and remove from available_tracers */ if (ret < 0) { @@ -3045,6 +3049,7 @@ void trace_printk_init_buffers(void) if (global_trace.trace_buffer.buffer) tracing_start_cmdline_record(); } +EXPORT_SYMBOL_GPL(trace_printk_init_buffers); void trace_printk_start_comm(void) { @@ -3205,6 +3210,7 @@ int trace_array_printk(struct trace_array *tr, va_end(ap); return ret; } +EXPORT_SYMBOL_GPL(trace_array_printk); __printf(3, 4) int trace_array_printk_buf(struct ring_buffer *buffer, @@ -3483,33 +3489,68 @@ static void s_stop(struct seq_file *m, void *p) } static void +get_total_entries_cpu(struct trace_buffer *buf, unsigned long *total, + unsigned long *entries, int cpu) +{ + unsigned long count; + + count = ring_buffer_entries_cpu(buf->buffer, cpu); + /* + * If this buffer has skipped entries, then we hold all + * entries for the trace and we need to ignore the + * ones before the time stamp. + */ + if (per_cpu_ptr(buf->data, cpu)->skipped_entries) { + count -= per_cpu_ptr(buf->data, cpu)->skipped_entries; + /* total is the same as the entries */ + *total = count; + } else + *total = count + + ring_buffer_overrun_cpu(buf->buffer, cpu); + *entries = count; +} + +static void get_total_entries(struct trace_buffer *buf, unsigned long *total, unsigned long *entries) { - unsigned long count; + unsigned long t, e; int cpu; *total = 0; *entries = 0; for_each_tracing_cpu(cpu) { - count = ring_buffer_entries_cpu(buf->buffer, cpu); - /* - * If this buffer has skipped entries, then we hold all - * entries for the trace and we need to ignore the - * ones before the time stamp. - */ - if (per_cpu_ptr(buf->data, cpu)->skipped_entries) { - count -= per_cpu_ptr(buf->data, cpu)->skipped_entries; - /* total is the same as the entries */ - *total += count; - } else - *total += count + - ring_buffer_overrun_cpu(buf->buffer, cpu); - *entries += count; + get_total_entries_cpu(buf, &t, &e, cpu); + *total += t; + *entries += e; } } +unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu) +{ + unsigned long total, entries; + + if (!tr) + tr = &global_trace; + + get_total_entries_cpu(&tr->trace_buffer, &total, &entries, cpu); + + return entries; +} + +unsigned long trace_total_entries(struct trace_array *tr) +{ + unsigned long total, entries; + + if (!tr) + tr = &global_trace; + + get_total_entries(&tr->trace_buffer, &total, &entries); + + return entries; +} + static void print_lat_help_header(struct seq_file *m) { seq_puts(m, "# _------=> CPU# \n" @@ -3548,25 +3589,18 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file unsigned int flags) { bool tgid = flags & TRACE_ITER_RECORD_TGID; - const char tgid_space[] = " "; - const char space[] = " "; + const char *space = " "; + int prec = tgid ? 10 : 2; print_event_info(buf, m); - seq_printf(m, "# %s _-----=> irqs-off\n", - tgid ? tgid_space : space); - seq_printf(m, "# %s / _----=> need-resched\n", - tgid ? tgid_space : space); - seq_printf(m, "# %s| / _---=> hardirq/softirq\n", - tgid ? tgid_space : space); - seq_printf(m, "# %s|| / _--=> preempt-depth\n", - tgid ? tgid_space : space); - seq_printf(m, "# %s||| / delay\n", - tgid ? tgid_space : space); - seq_printf(m, "# TASK-PID %sCPU# |||| TIMESTAMP FUNCTION\n", - tgid ? " TGID " : space); - seq_printf(m, "# | | %s | |||| | |\n", - tgid ? " | " : space); + seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space); + seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); + seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); + seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); + seq_printf(m, "# %.*s||| / delay\n", prec, space); + seq_printf(m, "# TASK-PID %.*sCPU# |||| TIMESTAMP FUNCTION\n", prec, " TGID "); + seq_printf(m, "# | | %.*s | |||| | |\n", prec, " | "); } void @@ -4692,6 +4726,7 @@ static const char readme_msg[] = " trace_pipe\t\t- A consuming read to see the contents of the buffer\n" " current_tracer\t- function and latency tracers\n" " available_tracers\t- list of configured tracers for current_tracer\n" + " error_log\t- error log for failed commands (that support it)\n" " buffer_size_kb\t- view and modify size of per cpu buffer\n" " buffer_total_size_kb - view total size of all cpu buffers\n\n" " trace_clock\t\t-change the clock used to order events\n" @@ -4712,7 +4747,7 @@ static const char readme_msg[] = " instances\t\t- Make sub-buffers with: mkdir instances/foo\n" "\t\t\t Remove sub-buffer with rmdir\n" " trace_options\t\t- Set format or modify how tracing happens\n" - "\t\t\t Disable an option by adding a suffix 'no' to the\n" + "\t\t\t Disable an option by prefixing 'no' to the\n" "\t\t\t option name\n" " saved_cmdlines_size\t- echo command number in here to store comm-pid list\n" #ifdef CONFIG_DYNAMIC_FTRACE @@ -6296,13 +6331,13 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, struct ring_buffer *buffer; struct print_entry *entry; unsigned long irq_flags; - const char faulted[] = "<faulted>"; ssize_t written; int size; int len; /* Used in tracing_mark_raw_write() as well */ -#define FAULTED_SIZE (sizeof(faulted) - 1) /* '\0' is already accounted for */ +#define FAULTED_STR "<faulted>" +#define FAULTED_SIZE (sizeof(FAULTED_STR) - 1) /* '\0' is already accounted for */ if (tracing_disabled) return -EINVAL; @@ -6334,7 +6369,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt); if (len) { - memcpy(&entry->buf, faulted, FAULTED_SIZE); + memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); cnt = FAULTED_SIZE; written = -EFAULT; } else @@ -6375,7 +6410,6 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, struct ring_buffer_event *event; struct ring_buffer *buffer; struct raw_data_entry *entry; - const char faulted[] = "<faulted>"; unsigned long irq_flags; ssize_t written; int size; @@ -6415,7 +6449,7 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, len = __copy_from_user_inatomic(&entry->id, ubuf, cnt); if (len) { entry->id = -1; - memcpy(&entry->buf, faulted, FAULTED_SIZE); + memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); written = -EFAULT; } else written = cnt; @@ -6868,6 +6902,238 @@ static const struct file_operations snapshot_raw_fops = { #endif /* CONFIG_TRACER_SNAPSHOT */ +#define TRACING_LOG_ERRS_MAX 8 +#define TRACING_LOG_LOC_MAX 128 + +#define CMD_PREFIX " Command: " + +struct err_info { + const char **errs; /* ptr to loc-specific array of err strings */ + u8 type; /* index into errs -> specific err string */ + u8 pos; /* MAX_FILTER_STR_VAL = 256 */ + u64 ts; +}; + +struct tracing_log_err { + struct list_head list; + struct err_info info; + char loc[TRACING_LOG_LOC_MAX]; /* err location */ + char cmd[MAX_FILTER_STR_VAL]; /* what caused err */ +}; + +static DEFINE_MUTEX(tracing_err_log_lock); + +struct tracing_log_err *get_tracing_log_err(struct trace_array *tr) +{ + struct tracing_log_err *err; + + if (tr->n_err_log_entries < TRACING_LOG_ERRS_MAX) { + err = kzalloc(sizeof(*err), GFP_KERNEL); + if (!err) + err = ERR_PTR(-ENOMEM); + tr->n_err_log_entries++; + + return err; + } + + err = list_first_entry(&tr->err_log, struct tracing_log_err, list); + list_del(&err->list); + + return err; +} + +/** + * err_pos - find the position of a string within a command for error careting + * @cmd: The tracing command that caused the error + * @str: The string to position the caret at within @cmd + * + * Finds the position of the first occurence of @str within @cmd. The + * return value can be passed to tracing_log_err() for caret placement + * within @cmd. + * + * Returns the index within @cmd of the first occurence of @str or 0 + * if @str was not found. + */ +unsigned int err_pos(char *cmd, const char *str) +{ + char *found; + + if (WARN_ON(!strlen(cmd))) + return 0; + + found = strstr(cmd, str); + if (found) + return found - cmd; + + return 0; +} + +/** + * tracing_log_err - write an error to the tracing error log + * @tr: The associated trace array for the error (NULL for top level array) + * @loc: A string describing where the error occurred + * @cmd: The tracing command that caused the error + * @errs: The array of loc-specific static error strings + * @type: The index into errs[], which produces the specific static err string + * @pos: The position the caret should be placed in the cmd + * + * Writes an error into tracing/error_log of the form: + * + * <loc>: error: <text> + * Command: <cmd> + * ^ + * + * tracing/error_log is a small log file containing the last + * TRACING_LOG_ERRS_MAX errors (8). Memory for errors isn't allocated + * unless there has been a tracing error, and the error log can be + * cleared and have its memory freed by writing the empty string in + * truncation mode to it i.e. echo > tracing/error_log. + * + * NOTE: the @errs array along with the @type param are used to + * produce a static error string - this string is not copied and saved + * when the error is logged - only a pointer to it is saved. See + * existing callers for examples of how static strings are typically + * defined for use with tracing_log_err(). + */ +void tracing_log_err(struct trace_array *tr, + const char *loc, const char *cmd, + const char **errs, u8 type, u8 pos) +{ + struct tracing_log_err *err; + + if (!tr) + tr = &global_trace; + + mutex_lock(&tracing_err_log_lock); + err = get_tracing_log_err(tr); + if (PTR_ERR(err) == -ENOMEM) { + mutex_unlock(&tracing_err_log_lock); + return; + } + + snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc); + snprintf(err->cmd, MAX_FILTER_STR_VAL,"\n" CMD_PREFIX "%s\n", cmd); + + err->info.errs = errs; + err->info.type = type; + err->info.pos = pos; + err->info.ts = local_clock(); + + list_add_tail(&err->list, &tr->err_log); + mutex_unlock(&tracing_err_log_lock); +} + +static void clear_tracing_err_log(struct trace_array *tr) +{ + struct tracing_log_err *err, *next; + + mutex_lock(&tracing_err_log_lock); + list_for_each_entry_safe(err, next, &tr->err_log, list) { + list_del(&err->list); + kfree(err); + } + + tr->n_err_log_entries = 0; + mutex_unlock(&tracing_err_log_lock); +} + +static void *tracing_err_log_seq_start(struct seq_file *m, loff_t *pos) +{ + struct trace_array *tr = m->private; + + mutex_lock(&tracing_err_log_lock); + + return seq_list_start(&tr->err_log, *pos); +} + +static void *tracing_err_log_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_array *tr = m->private; + + return seq_list_next(v, &tr->err_log, pos); +} + +static void tracing_err_log_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&tracing_err_log_lock); +} + +static void tracing_err_log_show_pos(struct seq_file *m, u8 pos) +{ + u8 i; + + for (i = 0; i < sizeof(CMD_PREFIX) - 1; i++) + seq_putc(m, ' '); + for (i = 0; i < pos; i++) + seq_putc(m, ' '); + seq_puts(m, "^\n"); +} + +static int tracing_err_log_seq_show(struct seq_file *m, void *v) +{ + struct tracing_log_err *err = v; + + if (err) { + const char *err_text = err->info.errs[err->info.type]; + u64 sec = err->info.ts; + u32 nsec; + + nsec = do_div(sec, NSEC_PER_SEC); + seq_printf(m, "[%5llu.%06u] %s%s", sec, nsec / 1000, + err->loc, err_text); + seq_printf(m, "%s", err->cmd); + tracing_err_log_show_pos(m, err->info.pos); + } + + return 0; +} + +static const struct seq_operations tracing_err_log_seq_ops = { + .start = tracing_err_log_seq_start, + .next = tracing_err_log_seq_next, + .stop = tracing_err_log_seq_stop, + .show = tracing_err_log_seq_show +}; + +static int tracing_err_log_open(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + int ret = 0; + + if (trace_array_get(tr) < 0) + return -ENODEV; + + /* If this file was opened for write, then erase contents */ + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) + clear_tracing_err_log(tr); + + if (file->f_mode & FMODE_READ) { + ret = seq_open(file, &tracing_err_log_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = tr; + } else { + trace_array_put(tr); + } + } + return ret; +} + +static ssize_t tracing_err_log_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *ppos) +{ + return count; +} + +static const struct file_operations tracing_err_log_fops = { + .open = tracing_err_log_open, + .write = tracing_err_log_write, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_release_generic_tr, +}; + static int tracing_buffers_open(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; @@ -8033,7 +8299,7 @@ static void update_tracer_options(struct trace_array *tr) mutex_unlock(&trace_types_lock); } -static int instance_mkdir(const char *name) +struct trace_array *trace_array_create(const char *name) { struct trace_array *tr; int ret; @@ -8072,6 +8338,7 @@ static int instance_mkdir(const char *name) INIT_LIST_HEAD(&tr->systems); INIT_LIST_HEAD(&tr->events); INIT_LIST_HEAD(&tr->hist_vars); + INIT_LIST_HEAD(&tr->err_log); if (allocate_trace_buffers(tr, trace_buf_size) < 0) goto out_free_tr; @@ -8097,7 +8364,7 @@ static int instance_mkdir(const char *name) mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); - return 0; + return tr; out_free_tr: free_trace_buffers(tr); @@ -8109,33 +8376,21 @@ static int instance_mkdir(const char *name) mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); - return ret; + return ERR_PTR(ret); +} +EXPORT_SYMBOL_GPL(trace_array_create); +static int instance_mkdir(const char *name) +{ + return PTR_ERR_OR_ZERO(trace_array_create(name)); } -static int instance_rmdir(const char *name) +static int __remove_instance(struct trace_array *tr) { - struct trace_array *tr; - int found = 0; - int ret; int i; - mutex_lock(&event_mutex); - mutex_lock(&trace_types_lock); - - ret = -ENODEV; - list_for_each_entry(tr, &ftrace_trace_arrays, list) { - if (tr->name && strcmp(tr->name, name) == 0) { - found = 1; - break; - } - } - if (!found) - goto out_unlock; - - ret = -EBUSY; if (tr->ref || (tr->current_trace && tr->current_trace->ref)) - goto out_unlock; + return -EBUSY; list_del(&tr->list); @@ -8161,10 +8416,46 @@ static int instance_rmdir(const char *name) free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); kfree(tr); + tr = NULL; - ret = 0; + return 0; +} + +int trace_array_destroy(struct trace_array *tr) +{ + int ret; + + if (!tr) + return -EINVAL; + + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + + ret = __remove_instance(tr); + + mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(trace_array_destroy); + +static int instance_rmdir(const char *name) +{ + struct trace_array *tr; + int ret; + + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + + ret = -ENODEV; + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr->name && strcmp(tr->name, name) == 0) { + ret = __remove_instance(tr); + break; + } + } - out_unlock: mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); @@ -8254,6 +8545,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) tr, &snapshot_fops); #endif + trace_create_file("error_log", 0644, d_tracer, + tr, &tracing_err_log_fops); + for_each_tracing_cpu(cpu) tracing_init_tracefs_percpu(tr, cpu); @@ -8839,6 +9133,7 @@ __init static int tracer_alloc_buffers(void) INIT_LIST_HEAD(&global_trace.systems); INIT_LIST_HEAD(&global_trace.events); INIT_LIST_HEAD(&global_trace.hist_vars); + INIT_LIST_HEAD(&global_trace.err_log); list_add(&global_trace.list, &ftrace_trace_arrays); apply_trace_boot_options(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 639047b259d7..1974ce818ddb 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -293,11 +293,13 @@ struct trace_array { int nr_topts; bool clear_trace; int buffer_percent; + unsigned int n_err_log_entries; struct tracer *current_trace; unsigned int trace_flags; unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE]; unsigned int flags; raw_spinlock_t start_lock; + struct list_head err_log; struct dentry *dir; struct dentry *options; struct dentry *percpu_dir; @@ -719,6 +721,9 @@ void trace_init_global_iter(struct trace_iterator *iter); void tracing_iter_reset(struct trace_iterator *iter, int cpu); +unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu); +unsigned long trace_total_entries(struct trace_array *tr); + void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, @@ -1545,7 +1550,8 @@ extern int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, extern void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s); extern int filter_assign_type(const char *type); -extern int create_event_filter(struct trace_event_call *call, +extern int create_event_filter(struct trace_array *tr, + struct trace_event_call *call, char *filter_str, bool set_str, struct event_filter **filterp); extern void free_event_filter(struct event_filter *filter); @@ -1876,6 +1882,11 @@ extern ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, size_t count, loff_t *ppos, int (*createfn)(int, char**)); +extern unsigned int err_pos(char *cmd, const char *str); +extern void tracing_log_err(struct trace_array *tr, + const char *loc, const char *cmd, + const char **errs, u8 type, u8 pos); + /* * Normal trace_printk() and friends allocates special buffers * to do the manipulation, as well as saves the print formats diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5b3b0c3c8a47..0ce3db67f556 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -832,6 +832,7 @@ static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) return ret; } +EXPORT_SYMBOL_GPL(ftrace_set_clr_event); /** * trace_set_clr_event - enable or disable an event @@ -1318,9 +1319,6 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) char buf[32]; int len; - if (*ppos) - return 0; - if (unlikely(!id)) return -ENODEV; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 05a66493a164..d3e59312ef40 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -66,7 +66,8 @@ static const char * ops[] = { OPS }; C(INVALID_FILTER, "Meaningless filter expression"), \ C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \ C(INVALID_VALUE, "Invalid value (did you forget quotes)?"), \ - C(NO_FILTER, "No filter found"), + C(ERRNO, "Error"), \ + C(NO_FILTER, "No filter found") #undef C #define C(a, b) FILT_ERR_##a @@ -76,7 +77,7 @@ enum { ERRORS }; #undef C #define C(a, b) b -static char *err_text[] = { ERRORS }; +static const char *err_text[] = { ERRORS }; /* Called after a '!' character but "!=" and "!~" are not "not"s */ static bool is_not(const char *str) @@ -919,7 +920,8 @@ static void remove_filter_string(struct event_filter *filter) filter->filter_string = NULL; } -static void append_filter_err(struct filter_parse_error *pe, +static void append_filter_err(struct trace_array *tr, + struct filter_parse_error *pe, struct event_filter *filter) { struct trace_seq *s; @@ -947,8 +949,14 @@ static void append_filter_err(struct filter_parse_error *pe, if (pe->lasterr > 0) { trace_seq_printf(s, "\n%*s", pos, "^"); trace_seq_printf(s, "\nparse_error: %s\n", err_text[pe->lasterr]); + tracing_log_err(tr, "event filter parse error", + filter->filter_string, err_text, + pe->lasterr, pe->lasterr_pos); } else { trace_seq_printf(s, "\nError: (%d)\n", pe->lasterr); + tracing_log_err(tr, "event filter parse error", + filter->filter_string, err_text, + FILT_ERR_ERRNO, 0); } trace_seq_putc(s, 0); buf = kmemdup_nul(s->buffer, s->seq.len, GFP_KERNEL); @@ -1214,30 +1222,30 @@ static int parse_pred(const char *str, void *data, * (perf doesn't use it) and grab everything. */ if (strcmp(field->name, "ip") != 0) { - parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i); - goto err_free; - } - pred->fn = filter_pred_none; - - /* - * Quotes are not required, but if they exist then we need - * to read them till we hit a matching one. - */ - if (str[i] == '\'' || str[i] == '"') - q = str[i]; - else - q = 0; - - for (i++; str[i]; i++) { - if (q && str[i] == q) - break; - if (!q && (str[i] == ')' || str[i] == '&' || - str[i] == '|')) - break; - } - /* Skip quotes */ - if (q) - s++; + parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i); + goto err_free; + } + pred->fn = filter_pred_none; + + /* + * Quotes are not required, but if they exist then we need + * to read them till we hit a matching one. + */ + if (str[i] == '\'' || str[i] == '"') + q = str[i]; + else + q = 0; + + for (i++; str[i]; i++) { + if (q && str[i] == q) + break; + if (!q && (str[i] == ')' || str[i] == '&' || + str[i] == '|')) + break; + } + /* Skip quotes */ + if (q) + s++; len = i - s; if (len >= MAX_FILTER_STR_VAL) { parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i); @@ -1600,7 +1608,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir, if (err) { filter_disable(file); parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0); - append_filter_err(pe, filter); + append_filter_err(tr, pe, filter); } else event_set_filtered_flag(file); @@ -1712,7 +1720,8 @@ static void create_filter_finish(struct filter_parse_error *pe) * information if @set_str is %true and the caller is responsible for * freeing it. */ -static int create_filter(struct trace_event_call *call, +static int create_filter(struct trace_array *tr, + struct trace_event_call *call, char *filter_string, bool set_str, struct event_filter **filterp) { @@ -1729,17 +1738,18 @@ static int create_filter(struct trace_event_call *call, err = process_preds(call, filter_string, *filterp, pe); if (err && set_str) - append_filter_err(pe, *filterp); + append_filter_err(tr, pe, *filterp); create_filter_finish(pe); return err; } -int create_event_filter(struct trace_event_call *call, +int create_event_filter(struct trace_array *tr, + struct trace_event_call *call, char *filter_str, bool set_str, struct event_filter **filterp) { - return create_filter(call, filter_str, set_str, filterp); + return create_filter(tr, call, filter_str, set_str, filterp); } /** @@ -1766,7 +1776,7 @@ static int create_system_filter(struct trace_subsystem_dir *dir, kfree((*filterp)->filter_string); (*filterp)->filter_string = NULL; } else { - append_filter_err(pe, *filterp); + append_filter_err(tr, pe, *filterp); } } create_filter_finish(pe); @@ -1797,7 +1807,7 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string) return 0; } - err = create_filter(call, filter_string, true, &filter); + err = create_filter(file->tr, call, filter_string, true, &filter); /* * Always swap the call filter with the new filter @@ -2053,7 +2063,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, if (event->filter) goto out_unlock; - err = create_filter(call, filter_str, false, &filter); + err = create_filter(NULL, call, filter_str, false, &filter); if (err) goto free_filter; @@ -2202,8 +2212,8 @@ static __init int ftrace_test_event_filter(void) struct test_filter_data_t *d = &test_filter_data[i]; int err; - err = create_filter(&event_ftrace_test_filter, d->filter, - false, &filter); + err = create_filter(NULL, &event_ftrace_test_filter, + d->filter, false, &filter); if (err) { printk(KERN_INFO "Failed to get filter for '%s', err %d\n", diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index a1d20421f4b0..7fca3457c705 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -22,6 +22,57 @@ #define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */ +#define ERRORS \ + C(NONE, "No error"), \ + C(DUPLICATE_VAR, "Variable already defined"), \ + C(VAR_NOT_UNIQUE, "Variable name not unique, need to use fully qualified name (subsys.event.var) for variable"), \ + C(TOO_MANY_VARS, "Too many variables defined"), \ + C(MALFORMED_ASSIGNMENT, "Malformed assignment"), \ + C(NAMED_MISMATCH, "Named hist trigger doesn't match existing named trigger (includes variables)"), \ + C(TRIGGER_EEXIST, "Hist trigger already exists"), \ + C(TRIGGER_ENOENT_CLEAR, "Can't clear or continue a nonexistent hist trigger"), \ + C(SET_CLOCK_FAIL, "Couldn't set trace_clock"), \ + C(BAD_FIELD_MODIFIER, "Invalid field modifier"), \ + C(TOO_MANY_SUBEXPR, "Too many subexpressions (3 max)"), \ + C(TIMESTAMP_MISMATCH, "Timestamp units in expression don't match"), \ + C(TOO_MANY_FIELD_VARS, "Too many field variables defined"), \ + C(EVENT_FILE_NOT_FOUND, "Event file not found"), \ + C(HIST_NOT_FOUND, "Matching event histogram not found"), \ + C(HIST_CREATE_FAIL, "Couldn't create histogram for field"), \ + C(SYNTH_VAR_NOT_FOUND, "Couldn't find synthetic variable"), \ + C(SYNTH_EVENT_NOT_FOUND,"Couldn't find synthetic event"), \ + C(SYNTH_TYPE_MISMATCH, "Param type doesn't match synthetic event field type"), \ + C(SYNTH_COUNT_MISMATCH, "Param count doesn't match synthetic event field count"), \ + C(FIELD_VAR_PARSE_FAIL, "Couldn't parse field variable"), \ + C(VAR_CREATE_FIND_FAIL, "Couldn't create or find variable"), \ + C(ONX_NOT_VAR, "For onmax(x) or onchange(x), x must be a variable"), \ + C(ONX_VAR_NOT_FOUND, "Couldn't find onmax or onchange variable"), \ + C(ONX_VAR_CREATE_FAIL, "Couldn't create onmax or onchange variable"), \ + C(FIELD_VAR_CREATE_FAIL,"Couldn't create field variable"), \ + C(TOO_MANY_PARAMS, "Too many action params"), \ + C(PARAM_NOT_FOUND, "Couldn't find param"), \ + C(INVALID_PARAM, "Invalid action param"), \ + C(ACTION_NOT_FOUND, "No action found"), \ + C(NO_SAVE_PARAMS, "No params found for save()"), \ + C(TOO_MANY_SAVE_ACTIONS,"Can't have more than one save() action per hist"), \ + C(ACTION_MISMATCH, "Handler doesn't support action"), \ + C(NO_CLOSING_PAREN, "No closing paren found"), \ + C(SUBSYS_NOT_FOUND, "Missing subsystem"), \ + C(INVALID_SUBSYS_EVENT, "Invalid subsystem or event name"), \ + C(INVALID_REF_KEY, "Using variable references as keys not supported"), \ + C(VAR_NOT_FOUND, "Couldn't find variable"), \ + C(FIELD_NOT_FOUND, "Couldn't find field"), + +#undef C +#define C(a, b) HIST_ERR_##a + +enum { ERRORS }; + +#undef C +#define C(a, b) b + +static const char *err_text[] = { ERRORS }; + struct hist_field; typedef u64 (*hist_field_fn_t) (struct hist_field *field, @@ -535,62 +586,49 @@ static struct track_data *track_data_alloc(unsigned int key_len, return data; } -static char last_hist_cmd[MAX_FILTER_STR_VAL]; -static char hist_err_str[MAX_FILTER_STR_VAL]; +static char last_cmd[MAX_FILTER_STR_VAL]; +static char last_cmd_loc[MAX_FILTER_STR_VAL]; -static void last_cmd_set(char *str) +static int errpos(char *str) { - if (!str) - return; - - strncpy(last_hist_cmd, str, MAX_FILTER_STR_VAL - 1); + return err_pos(last_cmd, str); } -static void hist_err(char *str, char *var) +static void last_cmd_set(struct trace_event_file *file, char *str) { - int maxlen = MAX_FILTER_STR_VAL - 1; + const char *system = NULL, *name = NULL; + struct trace_event_call *call; if (!str) return; - if (strlen(hist_err_str)) - return; + strncpy(last_cmd, str, MAX_FILTER_STR_VAL - 1); - if (!var) - var = ""; + if (file) { + call = file->event_call; - if (strlen(hist_err_str) + strlen(str) + strlen(var) > maxlen) - return; + system = call->class->system; + if (system) { + name = trace_event_name(call); + if (!name) + system = NULL; + } + } - strcat(hist_err_str, str); - strcat(hist_err_str, var); + if (system) + snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, "hist:%s:%s", system, name); } -static void hist_err_event(char *str, char *system, char *event, char *var) +static void hist_err(struct trace_array *tr, u8 err_type, u8 err_pos) { - char err[MAX_FILTER_STR_VAL]; - - if (system && var) - snprintf(err, MAX_FILTER_STR_VAL, "%s.%s.%s", system, event, var); - else if (system) - snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event); - else - strscpy(err, var, MAX_FILTER_STR_VAL); - - hist_err(str, err); + tracing_log_err(tr, last_cmd_loc, last_cmd, err_text, + err_type, err_pos); } static void hist_err_clear(void) { - hist_err_str[0] = '\0'; -} - -static bool have_hist_err(void) -{ - if (strlen(hist_err_str)) - return true; - - return false; + last_cmd[0] = '\0'; + last_cmd_loc[0] = '\0'; } struct synth_trace_event { @@ -1719,7 +1757,7 @@ static struct trace_event_file *find_var_file(struct trace_array *tr, if (find_var_field(var_hist_data, var_name)) { if (found) { - hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name); + hist_err(tr, HIST_ERR_VAR_NOT_UNIQUE, errpos(var_name)); return NULL; } @@ -1770,7 +1808,8 @@ find_match_var(struct hist_trigger_data *hist_data, char *var_name) hist_field = find_file_var(file, var_name); if (hist_field) { if (found) { - hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name); + hist_err(tr, HIST_ERR_VAR_NOT_UNIQUE, + errpos(var_name)); return ERR_PTR(-EINVAL); } @@ -2002,11 +2041,11 @@ static int parse_action(char *str, struct hist_trigger_attrs *attrs) attrs->n_actions++; ret = 0; } - return ret; } -static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) +static int parse_assignment(struct trace_array *tr, + char *str, struct hist_trigger_attrs *attrs) { int ret = 0; @@ -2062,7 +2101,7 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) char *assignment; if (attrs->n_assignments == TRACING_MAP_VARS_MAX) { - hist_err("Too many variables defined: ", str); + hist_err(tr, HIST_ERR_TOO_MANY_VARS, errpos(str)); ret = -EINVAL; goto out; } @@ -2079,7 +2118,8 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) return ret; } -static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) +static struct hist_trigger_attrs * +parse_hist_trigger_attrs(struct trace_array *tr, char *trigger_str) { struct hist_trigger_attrs *attrs; int ret = 0; @@ -2092,7 +2132,7 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) char *str = strsep(&trigger_str, ":"); if (strchr(str, '=')) { - ret = parse_assignment(str, attrs); + ret = parse_assignment(tr, str, attrs); if (ret) goto free; } else if (strcmp(str, "pause") == 0) @@ -2648,6 +2688,7 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data, char *var_name) { struct hist_field *var_field = NULL, *ref_field = NULL; + struct trace_array *tr = hist_data->event_file->tr; if (!is_var_ref(var_name)) return NULL; @@ -2660,8 +2701,7 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data, system, event_name); if (!ref_field) - hist_err_event("Couldn't find variable: $", - system, event_name, var_name); + hist_err(tr, HIST_ERR_VAR_NOT_FOUND, errpos(var_name)); return ref_field; } @@ -2672,6 +2712,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, { struct ftrace_event_field *field = NULL; char *field_name, *modifier, *str; + struct trace_array *tr = file->tr; modifier = str = kstrdup(field_str, GFP_KERNEL); if (!modifier) @@ -2695,7 +2736,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, else if (strcmp(modifier, "usecs") == 0) *flags |= HIST_FIELD_FL_TIMESTAMP_USECS; else { - hist_err("Invalid field modifier: ", modifier); + hist_err(tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier)); field = ERR_PTR(-EINVAL); goto out; } @@ -2711,7 +2752,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, else { field = trace_find_event_field(file->event_call, field_name); if (!field || !field->size) { - hist_err("Couldn't find field: ", field_name); + hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, errpos(field_name)); field = ERR_PTR(-EINVAL); goto out; } @@ -2773,7 +2814,8 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, s = local_field_var_ref(hist_data, ref_system, ref_event, ref_var); if (!s) { - hist_field = parse_var_ref(hist_data, ref_system, ref_event, ref_var); + hist_field = parse_var_ref(hist_data, ref_system, + ref_event, ref_var); if (hist_field) { if (var_name) { hist_field = create_alias(hist_data, hist_field, var_name); @@ -2822,7 +2864,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, /* we support only -(xxx) i.e. explicit parens required */ if (level > 3) { - hist_err("Too many subexpressions (3 max): ", str); + hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); ret = -EINVAL; goto free; } @@ -2877,7 +2919,8 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, return ERR_PTR(ret); } -static int check_expr_operands(struct hist_field *operand1, +static int check_expr_operands(struct trace_array *tr, + struct hist_field *operand1, struct hist_field *operand2) { unsigned long operand1_flags = operand1->flags; @@ -2905,7 +2948,7 @@ static int check_expr_operands(struct hist_field *operand1, if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) != (operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) { - hist_err("Timestamp units in expression don't match", NULL); + hist_err(tr, HIST_ERR_TIMESTAMP_MISMATCH, 0); return -EINVAL; } @@ -2923,7 +2966,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, char *sep, *operand1_str; if (level > 3) { - hist_err("Too many subexpressions (3 max): ", str); + hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); return ERR_PTR(-EINVAL); } @@ -2968,7 +3011,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, goto free; } - ret = check_expr_operands(operand1, operand2); + ret = check_expr_operands(file->tr, operand1, operand2); if (ret) goto free; @@ -3161,16 +3204,14 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, int ret; if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) { - hist_err_event("trace action: Too many field variables defined: ", - subsys_name, event_name, field_name); + hist_err(tr, HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name)); return ERR_PTR(-EINVAL); } file = event_file(tr, subsys_name, event_name); if (IS_ERR(file)) { - hist_err_event("trace action: Event file not found: ", - subsys_name, event_name, field_name); + hist_err(tr, HIST_ERR_EVENT_FILE_NOT_FOUND, errpos(field_name)); ret = PTR_ERR(file); return ERR_PTR(ret); } @@ -3183,8 +3224,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, */ hist_data = find_compatible_hist(target_hist_data, file); if (!hist_data) { - hist_err_event("trace action: Matching event histogram not found: ", - subsys_name, event_name, field_name); + hist_err(tr, HIST_ERR_HIST_NOT_FOUND, errpos(field_name)); return ERR_PTR(-EINVAL); } @@ -3245,8 +3285,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, kfree(cmd); kfree(var_hist->cmd); kfree(var_hist); - hist_err_event("trace action: Couldn't create histogram for field: ", - subsys_name, event_name, field_name); + hist_err(tr, HIST_ERR_HIST_CREATE_FAIL, errpos(field_name)); return ERR_PTR(ret); } @@ -3258,8 +3297,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, if (IS_ERR_OR_NULL(event_var)) { kfree(var_hist->cmd); kfree(var_hist); - hist_err_event("trace action: Couldn't find synthetic variable: ", - subsys_name, event_name, field_name); + hist_err(tr, HIST_ERR_SYNTH_VAR_NOT_FOUND, errpos(field_name)); return ERR_PTR(-EINVAL); } @@ -3392,25 +3430,26 @@ static struct field_var *create_field_var(struct hist_trigger_data *hist_data, { struct hist_field *val = NULL, *var = NULL; unsigned long flags = HIST_FIELD_FL_VAR; + struct trace_array *tr = file->tr; struct field_var *field_var; int ret = 0; if (hist_data->n_field_vars >= SYNTH_FIELDS_MAX) { - hist_err("Too many field variables defined: ", field_name); + hist_err(tr, HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name)); ret = -EINVAL; goto err; } val = parse_atom(hist_data, file, field_name, &flags, NULL); if (IS_ERR(val)) { - hist_err("Couldn't parse field variable: ", field_name); + hist_err(tr, HIST_ERR_FIELD_VAR_PARSE_FAIL, errpos(field_name)); ret = PTR_ERR(val); goto err; } var = create_var(hist_data, file, field_name, val->size, val->type); if (IS_ERR(var)) { - hist_err("Couldn't create or find variable: ", field_name); + hist_err(tr, HIST_ERR_VAR_CREATE_FIND_FAIL, errpos(field_name)); kfree(val); ret = PTR_ERR(var); goto err; @@ -3737,19 +3776,20 @@ static int track_data_create(struct hist_trigger_data *hist_data, { struct hist_field *var_field, *ref_field, *track_var = NULL; struct trace_event_file *file = hist_data->event_file; + struct trace_array *tr = file->tr; char *track_data_var_str; int ret = 0; track_data_var_str = data->track_data.var_str; if (track_data_var_str[0] != '$') { - hist_err("For onmax(x) or onchange(x), x must be a variable: ", track_data_var_str); + hist_err(tr, HIST_ERR_ONX_NOT_VAR, errpos(track_data_var_str)); return -EINVAL; } track_data_var_str++; var_field = find_target_event_var(hist_data, NULL, NULL, track_data_var_str); if (!var_field) { - hist_err("Couldn't find onmax or onchange variable: ", track_data_var_str); + hist_err(tr, HIST_ERR_ONX_VAR_NOT_FOUND, errpos(track_data_var_str)); return -EINVAL; } @@ -3762,7 +3802,7 @@ static int track_data_create(struct hist_trigger_data *hist_data, if (data->handler == HANDLER_ONMAX) track_var = create_var(hist_data, file, "__max", sizeof(u64), "u64"); if (IS_ERR(track_var)) { - hist_err("Couldn't create onmax variable: ", "__max"); + hist_err(tr, HIST_ERR_ONX_VAR_CREATE_FAIL, 0); ret = PTR_ERR(track_var); goto out; } @@ -3770,7 +3810,7 @@ static int track_data_create(struct hist_trigger_data *hist_data, if (data->handler == HANDLER_ONCHANGE) track_var = create_var(hist_data, file, "__change", sizeof(u64), "u64"); if (IS_ERR(track_var)) { - hist_err("Couldn't create onchange variable: ", "__change"); + hist_err(tr, HIST_ERR_ONX_VAR_CREATE_FAIL, 0); ret = PTR_ERR(track_var); goto out; } @@ -3781,7 +3821,8 @@ static int track_data_create(struct hist_trigger_data *hist_data, return ret; } -static int parse_action_params(char *params, struct action_data *data) +static int parse_action_params(struct trace_array *tr, char *params, + struct action_data *data) { char *param, *saved_param; bool first_param = true; @@ -3789,20 +3830,20 @@ static int parse_action_params(char *params, struct action_data *data) while (params) { if (data->n_params >= SYNTH_FIELDS_MAX) { - hist_err("Too many action params", ""); + hist_err(tr, HIST_ERR_TOO_MANY_PARAMS, 0); goto out; } param = strsep(¶ms, ","); if (!param) { - hist_err("No action param found", ""); + hist_err(tr, HIST_ERR_PARAM_NOT_FOUND, 0); ret = -EINVAL; goto out; } param = strstrip(param); if (strlen(param) < 2) { - hist_err("Invalid action param: ", param); + hist_err(tr, HIST_ERR_INVALID_PARAM, errpos(param)); ret = -EINVAL; goto out; } @@ -3826,7 +3867,7 @@ static int parse_action_params(char *params, struct action_data *data) return ret; } -static int action_parse(char *str, struct action_data *data, +static int action_parse(struct trace_array *tr, char *str, struct action_data *data, enum handler_id handler) { char *action_name; @@ -3834,14 +3875,14 @@ static int action_parse(char *str, struct action_data *data, strsep(&str, "."); if (!str) { - hist_err("action parsing: No action found", ""); + hist_err(tr, HIST_ERR_ACTION_NOT_FOUND, 0); ret = -EINVAL; goto out; } action_name = strsep(&str, "("); if (!action_name || !str) { - hist_err("action parsing: No action found", ""); + hist_err(tr, HIST_ERR_ACTION_NOT_FOUND, 0); ret = -EINVAL; goto out; } @@ -3850,12 +3891,12 @@ static int action_parse(char *str, struct action_data *data, char *params = strsep(&str, ")"); if (!params) { - hist_err("action parsing: No params found for %s", "save"); + hist_err(tr, HIST_ERR_NO_SAVE_PARAMS, 0); ret = -EINVAL; goto out; } - ret = parse_action_params(params, data); + ret = parse_action_params(tr, params, data); if (ret) goto out; @@ -3864,7 +3905,7 @@ static int action_parse(char *str, struct action_data *data, else if (handler == HANDLER_ONCHANGE) data->track_data.check_val = check_track_val_changed; else { - hist_err("action parsing: Handler doesn't support action: ", action_name); + hist_err(tr, HIST_ERR_ACTION_MISMATCH, errpos(action_name)); ret = -EINVAL; goto out; } @@ -3876,7 +3917,7 @@ static int action_parse(char *str, struct action_data *data, char *params = strsep(&str, ")"); if (!str) { - hist_err("action parsing: No closing paren found: %s", params); + hist_err(tr, HIST_ERR_NO_CLOSING_PAREN, errpos(params)); ret = -EINVAL; goto out; } @@ -3886,7 +3927,7 @@ static int action_parse(char *str, struct action_data *data, else if (handler == HANDLER_ONCHANGE) data->track_data.check_val = check_track_val_changed; else { - hist_err("action parsing: Handler doesn't support action: ", action_name); + hist_err(tr, HIST_ERR_ACTION_MISMATCH, errpos(action_name)); ret = -EINVAL; goto out; } @@ -3901,7 +3942,7 @@ static int action_parse(char *str, struct action_data *data, data->use_trace_keyword = true; if (params) { - ret = parse_action_params(params, data); + ret = parse_action_params(tr, params, data); if (ret) goto out; } @@ -3954,7 +3995,7 @@ static struct action_data *track_data_parse(struct hist_trigger_data *hist_data, goto free; } - ret = action_parse(str, data, handler); + ret = action_parse(hist_data->event_file->tr, str, data, handler); if (ret) goto free; out: @@ -4024,6 +4065,7 @@ trace_action_find_var(struct hist_trigger_data *hist_data, struct action_data *data, char *system, char *event, char *var) { + struct trace_array *tr = hist_data->event_file->tr; struct hist_field *hist_field; var++; /* skip '$' */ @@ -4039,7 +4081,7 @@ trace_action_find_var(struct hist_trigger_data *hist_data, } if (!hist_field) - hist_err_event("trace action: Couldn't find param: $", system, event, var); + hist_err(tr, HIST_ERR_PARAM_NOT_FOUND, errpos(var)); return hist_field; } @@ -4097,6 +4139,7 @@ trace_action_create_field_var(struct hist_trigger_data *hist_data, static int trace_action_create(struct hist_trigger_data *hist_data, struct action_data *data) { + struct trace_array *tr = hist_data->event_file->tr; char *event_name, *param, *system = NULL; struct hist_field *hist_field, *var_ref; unsigned int i, var_ref_idx; @@ -4114,7 +4157,7 @@ static int trace_action_create(struct hist_trigger_data *hist_data, event = find_synth_event(synth_event_name); if (!event) { - hist_err("trace action: Couldn't find synthetic event: ", synth_event_name); + hist_err(tr, HIST_ERR_SYNTH_EVENT_NOT_FOUND, errpos(synth_event_name)); return -EINVAL; } @@ -4175,15 +4218,14 @@ static int trace_action_create(struct hist_trigger_data *hist_data, continue; } - hist_err_event("trace action: Param type doesn't match synthetic event field type: ", - system, event_name, param); + hist_err(tr, HIST_ERR_SYNTH_TYPE_MISMATCH, errpos(param)); kfree(p); ret = -EINVAL; goto err; } if (field_pos != event->n_fields) { - hist_err("trace action: Param count doesn't match synthetic event field count: ", event->name); + hist_err(tr, HIST_ERR_SYNTH_COUNT_MISMATCH, errpos(event->name)); ret = -EINVAL; goto err; } @@ -4202,6 +4244,7 @@ static int action_create(struct hist_trigger_data *hist_data, struct action_data *data) { struct trace_event_file *file = hist_data->event_file; + struct trace_array *tr = file->tr; struct track_data *track_data; struct field_var *field_var; unsigned int i; @@ -4229,7 +4272,7 @@ static int action_create(struct hist_trigger_data *hist_data, if (data->action == ACTION_SAVE) { if (hist_data->n_save_vars) { ret = -EEXIST; - hist_err("save action: Can't have more than one save() action per hist", ""); + hist_err(tr, HIST_ERR_TOO_MANY_SAVE_ACTIONS, 0); goto out; } @@ -4242,7 +4285,8 @@ static int action_create(struct hist_trigger_data *hist_data, field_var = create_target_field_var(hist_data, NULL, NULL, param); if (IS_ERR(field_var)) { - hist_err("save action: Couldn't create field variable: ", param); + hist_err(tr, HIST_ERR_FIELD_VAR_CREATE_FAIL, + errpos(param)); ret = PTR_ERR(field_var); kfree(param); goto out; @@ -4276,19 +4320,18 @@ static struct action_data *onmatch_parse(struct trace_array *tr, char *str) match_event = strsep(&str, ")"); if (!match_event || !str) { - hist_err("onmatch: Missing closing paren: ", match_event); + hist_err(tr, HIST_ERR_NO_CLOSING_PAREN, errpos(match_event)); goto free; } match_event_system = strsep(&match_event, "."); if (!match_event) { - hist_err("onmatch: Missing subsystem for match event: ", match_event_system); + hist_err(tr, HIST_ERR_SUBSYS_NOT_FOUND, errpos(match_event_system)); goto free; } if (IS_ERR(event_file(tr, match_event_system, match_event))) { - hist_err_event("onmatch: Invalid subsystem or event name: ", - match_event_system, match_event, NULL); + hist_err(tr, HIST_ERR_INVALID_SUBSYS_EVENT, errpos(match_event)); goto free; } @@ -4304,7 +4347,7 @@ static struct action_data *onmatch_parse(struct trace_array *tr, char *str) goto free; } - ret = action_parse(str, data, HANDLER_ONMATCH); + ret = action_parse(tr, str, data, HANDLER_ONMATCH); if (ret) goto free; out: @@ -4373,13 +4416,14 @@ static int create_var_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, char *var_name, char *expr_str) { + struct trace_array *tr = hist_data->event_file->tr; unsigned long flags = 0; if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX)) return -EINVAL; if (find_var(hist_data, file, var_name) && !hist_data->remove) { - hist_err("Variable already defined: ", var_name); + hist_err(tr, HIST_ERR_DUPLICATE_VAR, errpos(var_name)); return -EINVAL; } @@ -4436,8 +4480,8 @@ static int create_key_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, char *field_str) { + struct trace_array *tr = hist_data->event_file->tr; struct hist_field *hist_field = NULL; - unsigned long flags = 0; unsigned int key_size; int ret = 0; @@ -4460,7 +4504,7 @@ static int create_key_field(struct hist_trigger_data *hist_data, } if (hist_field->flags & HIST_FIELD_FL_VAR_REF) { - hist_err("Using variable references as keys not supported: ", field_str); + hist_err(tr, HIST_ERR_INVALID_REF_KEY, errpos(field_str)); destroy_hist_field(hist_field, 0); ret = -EINVAL; goto out; @@ -4561,6 +4605,7 @@ static void free_var_defs(struct hist_trigger_data *hist_data) static int parse_var_defs(struct hist_trigger_data *hist_data) { + struct trace_array *tr = hist_data->event_file->tr; char *s, *str, *var_name, *field_str; unsigned int i, j, n_vars = 0; int ret = 0; @@ -4574,13 +4619,14 @@ static int parse_var_defs(struct hist_trigger_data *hist_data) var_name = strsep(&field_str, "="); if (!var_name || !field_str) { - hist_err("Malformed assignment: ", var_name); + hist_err(tr, HIST_ERR_MALFORMED_ASSIGNMENT, + errpos(var_name)); ret = -EINVAL; goto free; } if (n_vars == TRACING_MAP_VARS_MAX) { - hist_err("Too many variables defined: ", var_name); + hist_err(tr, HIST_ERR_TOO_MANY_VARS, errpos(var_name)); ret = -EINVAL; goto free; } @@ -5431,11 +5477,6 @@ static int hist_show(struct seq_file *m, void *v) hist_trigger_show(m, data, n++); } - if (have_hist_err()) { - seq_printf(m, "\nERROR: %s\n", hist_err_str); - seq_printf(m, " Last command: %s\n", last_hist_cmd); - } - out_unlock: mutex_unlock(&event_mutex); @@ -5800,6 +5841,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, { struct hist_trigger_data *hist_data = data->private_data; struct event_trigger_data *test, *named_data = NULL; + struct trace_array *tr = file->tr; int ret = 0; if (hist_data->attrs->name) { @@ -5807,7 +5849,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, if (named_data) { if (!hist_trigger_match(data, named_data, named_data, true)) { - hist_err("Named hist trigger doesn't match existing named trigger (includes variables): ", hist_data->attrs->name); + hist_err(tr, HIST_ERR_NAMED_MISMATCH, errpos(hist_data->attrs->name)); ret = -EINVAL; goto out; } @@ -5828,7 +5870,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, else if (hist_data->attrs->clear) hist_clear(test); else { - hist_err("Hist trigger already exists", NULL); + hist_err(tr, HIST_ERR_TRIGGER_EEXIST, 0); ret = -EEXIST; } goto out; @@ -5836,7 +5878,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, } new: if (hist_data->attrs->cont || hist_data->attrs->clear) { - hist_err("Can't clear or continue a nonexistent hist trigger", NULL); + hist_err(tr, HIST_ERR_TRIGGER_ENOENT_CLEAR, 0); ret = -ENOENT; goto out; } @@ -5861,7 +5903,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, ret = tracing_set_clock(file->tr, hist_data->attrs->clock); if (ret) { - hist_err("Couldn't set trace_clock: ", clock); + hist_err(tr, HIST_ERR_SET_CLOCK_FAIL, errpos(clock)); goto out; } @@ -6037,8 +6079,8 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, lockdep_assert_held(&event_mutex); if (glob && strlen(glob)) { - last_cmd_set(param); hist_err_clear(); + last_cmd_set(file, param); } if (!param) @@ -6079,7 +6121,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, trigger = strstrip(trigger); } - attrs = parse_hist_trigger_attrs(trigger); + attrs = parse_hist_trigger_attrs(file->tr, trigger); if (IS_ERR(attrs)) return PTR_ERR(attrs); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index cd12ecb66eb9..2a2912cb4533 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -731,7 +731,8 @@ int set_trigger_filter(char *filter_str, goto out; /* The filter is for the 'trigger' event, not the triggered event */ - ret = create_event_filter(file->event_call, filter_str, false, &filter); + ret = create_event_filter(file->tr, file->event_call, + filter_str, false, &filter); /* * If create_event_filter() fails, filter still needs to be freed. * Which the calling code will do with data->filter. diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 810d78a8d14c..6c1ae6b752d1 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -17,29 +17,25 @@ #include "trace.h" #include "trace_output.h" -static void ftrace_dump_buf(int skip_lines, long cpu_file) +static struct trace_iterator iter; +static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS]; + +static void ftrace_dump_buf(int skip_entries, long cpu_file) { - /* use static because iter can be a bit big for the stack */ - static struct trace_iterator iter; - static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS]; struct trace_array *tr; unsigned int old_userobj; int cnt = 0, cpu; - trace_init_global_iter(&iter); - iter.buffer_iter = buffer_iter; tr = iter.tr; - for_each_tracing_cpu(cpu) { - atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); - } - old_userobj = tr->trace_flags; /* don't look at user memory in panic mode */ tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ; kdb_printf("Dumping ftrace buffer:\n"); + if (skip_entries) + kdb_printf("(skipping %d entries)\n", skip_entries); /* reset all but tr, trace, and overruns */ memset(&iter.seq, 0, @@ -70,11 +66,11 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) kdb_printf("---------------------------------\n"); cnt++; - if (!skip_lines) { + if (!skip_entries) { print_trace_line(&iter); trace_printk_seq(&iter.seq); } else { - skip_lines--; + skip_entries--; } if (KDB_FLAG(CMD_INTERRUPT)) @@ -90,10 +86,6 @@ out: tr->trace_flags = old_userobj; for_each_tracing_cpu(cpu) { - atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); - } - - for_each_tracing_cpu(cpu) { if (iter.buffer_iter[cpu]) { ring_buffer_read_finish(iter.buffer_iter[cpu]); iter.buffer_iter[cpu] = NULL; @@ -106,17 +98,19 @@ out: */ static int kdb_ftdump(int argc, const char **argv) { - int skip_lines = 0; + int skip_entries = 0; long cpu_file; char *cp; + int cnt; + int cpu; if (argc > 2) return KDB_ARGCOUNT; if (argc) { - skip_lines = simple_strtol(argv[1], &cp, 0); + skip_entries = simple_strtol(argv[1], &cp, 0); if (*cp) - skip_lines = 0; + skip_entries = 0; } if (argc == 2) { @@ -129,7 +123,29 @@ static int kdb_ftdump(int argc, const char **argv) } kdb_trap_printk++; - ftrace_dump_buf(skip_lines, cpu_file); + + trace_init_global_iter(&iter); + iter.buffer_iter = buffer_iter; + + for_each_tracing_cpu(cpu) { + atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); + } + + /* A negative skip_entries means skip all but the last entries */ + if (skip_entries < 0) { + if (cpu_file == RING_BUFFER_ALL_CPUS) + cnt = trace_total_entries(NULL); + else + cnt = trace_total_entries_cpu(NULL, cpu_file); + skip_entries = max(cnt + skip_entries, 0); + } + + ftrace_dump_buf(skip_entries, cpu_file); + + for_each_tracing_cpu(cpu) { + atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); + } + kdb_trap_printk--; return 0; @@ -137,8 +153,9 @@ static int kdb_ftdump(int argc, const char **argv) static __init int kdb_ftrace_register(void) { - kdb_register_flags("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", - "Dump ftrace log", 0, KDB_ENABLE_ALWAYS_SAFE); + kdb_register_flags("ftdump", kdb_ftdump, "[skip_#entries] [cpu]", + "Dump ftrace log; -skip dumps last #entries", 0, + KDB_ENABLE_ALWAYS_SAFE); return 0; } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5d5129b05df7..7d736248a070 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -441,13 +441,8 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) else ret = register_kprobe(&tk->rp.kp); - if (ret == 0) { + if (ret == 0) tk->tp.flags |= TP_FLAG_REGISTERED; - } else if (ret == -EILSEQ) { - pr_warn("Probing address(0x%p) is not an instruction boundary.\n", - tk->rp.kp.addr); - ret = -EINVAL; - } return ret; } @@ -591,7 +586,7 @@ static int trace_kprobe_create(int argc, const char *argv[]) * Type of args: * FETCHARG:TYPE : use TYPE instead of unsigned long. */ - struct trace_kprobe *tk; + struct trace_kprobe *tk = NULL; int i, len, ret = 0; bool is_return = false; char *symbol = NULL, *tmp = NULL; @@ -615,44 +610,50 @@ static int trace_kprobe_create(int argc, const char *argv[]) if (argc < 2) return -ECANCELED; + trace_probe_log_init("trace_kprobe", argc, argv); + event = strchr(&argv[0][1], ':'); if (event) event++; if (isdigit(argv[0][1])) { if (!is_return) { - pr_info("Maxactive is not for kprobe"); - return -EINVAL; + trace_probe_log_err(1, MAXACT_NO_KPROBE); + goto parse_error; } if (event) len = event - &argv[0][1] - 1; else len = strlen(&argv[0][1]); - if (len > MAX_EVENT_NAME_LEN - 1) - return -E2BIG; + if (len > MAX_EVENT_NAME_LEN - 1) { + trace_probe_log_err(1, BAD_MAXACT); + goto parse_error; + } memcpy(buf, &argv[0][1], len); buf[len] = '\0'; ret = kstrtouint(buf, 0, &maxactive); if (ret || !maxactive) { - pr_info("Invalid maxactive number\n"); - return ret; + trace_probe_log_err(1, BAD_MAXACT); + goto parse_error; } /* kretprobes instances are iterated over via a list. The * maximum should stay reasonable. */ if (maxactive > KRETPROBE_MAXACTIVE_MAX) { - pr_info("Maxactive is too big (%d > %d).\n", - maxactive, KRETPROBE_MAXACTIVE_MAX); - return -E2BIG; + trace_probe_log_err(1, MAXACT_TOO_BIG); + goto parse_error; } } /* try to parse an address. if that fails, try to read the * input as a symbol. */ if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) { + trace_probe_log_set_index(1); /* Check whether uprobe event specified */ - if (strchr(argv[1], '/') && strchr(argv[1], ':')) - return -ECANCELED; + if (strchr(argv[1], '/') && strchr(argv[1], ':')) { + ret = -ECANCELED; + goto error; + } /* a symbol specified */ symbol = kstrdup(argv[1], GFP_KERNEL); if (!symbol) @@ -660,23 +661,23 @@ static int trace_kprobe_create(int argc, const char *argv[]) /* TODO: support .init module functions */ ret = traceprobe_split_symbol_offset(symbol, &offset); if (ret || offset < 0 || offset > UINT_MAX) { - pr_info("Failed to parse either an address or a symbol.\n"); - goto out; + trace_probe_log_err(0, BAD_PROBE_ADDR); + goto parse_error; } if (kprobe_on_func_entry(NULL, symbol, offset)) flags |= TPARG_FL_FENTRY; if (offset && is_return && !(flags & TPARG_FL_FENTRY)) { - pr_info("Given offset is not valid for return probe.\n"); - ret = -EINVAL; - goto out; + trace_probe_log_err(0, BAD_RETPROBE); + goto parse_error; } } - argc -= 2; argv += 2; + trace_probe_log_set_index(0); if (event) { - ret = traceprobe_parse_event_name(&event, &group, buf); + ret = traceprobe_parse_event_name(&event, &group, buf, + event - argv[0]); if (ret) - goto out; + goto parse_error; } else { /* Make a new event name */ if (symbol) @@ -691,13 +692,14 @@ static int trace_kprobe_create(int argc, const char *argv[]) /* setup a probe */ tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, - argc, is_return); + argc - 2, is_return); if (IS_ERR(tk)) { ret = PTR_ERR(tk); - /* This must return -ENOMEM otherwise there is a bug */ + /* This must return -ENOMEM, else there is a bug */ WARN_ON_ONCE(ret != -ENOMEM); - goto out; + goto out; /* We know tk is not allocated */ } + argc -= 2; argv += 2; /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { @@ -707,19 +709,32 @@ static int trace_kprobe_create(int argc, const char *argv[]) goto error; } + trace_probe_log_set_index(i + 2); ret = traceprobe_parse_probe_arg(&tk->tp, i, tmp, flags); kfree(tmp); if (ret) - goto error; + goto error; /* This can be -ENOMEM */ } ret = register_trace_kprobe(tk); - if (ret) + if (ret) { + trace_probe_log_set_index(1); + if (ret == -EILSEQ) + trace_probe_log_err(0, BAD_INSN_BNDRY); + else if (ret == -ENOENT) + trace_probe_log_err(0, BAD_PROBE_ADDR); + else if (ret != -ENOMEM) + trace_probe_log_err(0, FAIL_REG_PROBE); goto error; + } + out: + trace_probe_log_clear(); kfree(symbol); return ret; +parse_error: + ret = -EINVAL; error: free_trace_kprobe(tk); goto out; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 8f8411e7835f..a347faced959 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -13,6 +13,11 @@ #include "trace_probe.h" +#undef C +#define C(a, b) b + +static const char *trace_probe_err_text[] = { ERRORS }; + static const char *reserved_field_names[] = { "common_type", "common_flags", @@ -133,6 +138,60 @@ fail: return NULL; } +static struct trace_probe_log trace_probe_log; + +void trace_probe_log_init(const char *subsystem, int argc, const char **argv) +{ + trace_probe_log.subsystem = subsystem; + trace_probe_log.argc = argc; + trace_probe_log.argv = argv; + trace_probe_log.index = 0; +} + +void trace_probe_log_clear(void) +{ + memset(&trace_probe_log, 0, sizeof(trace_probe_log)); +} + +void trace_probe_log_set_index(int index) +{ + trace_probe_log.index = index; +} + +void __trace_probe_log_err(int offset, int err_type) +{ + char *command, *p; + int i, len = 0, pos = 0; + + if (!trace_probe_log.argv) + return; + + /* Recalcurate the length and allocate buffer */ + for (i = 0; i < trace_probe_log.argc; i++) { + if (i == trace_probe_log.index) + pos = len; + len += strlen(trace_probe_log.argv[i]) + 1; + } + command = kzalloc(len, GFP_KERNEL); + if (!command) + return; + + /* And make a command string from argv array */ + p = command; + for (i = 0; i < trace_probe_log.argc; i++) { + len = strlen(trace_probe_log.argv[i]); + strcpy(p, trace_probe_log.argv[i]); + p[len] = ' '; + p += len + 1; + } + *(p - 1) = '\0'; + + tracing_log_err(NULL, trace_probe_log.subsystem, command, + trace_probe_err_text, err_type, pos + offset); + + kfree(command); +} + /* Split symbol and offset. */ int traceprobe_split_symbol_offset(char *symbol, long *offset) { @@ -156,7 +215,7 @@ int traceprobe_split_symbol_offset(char *symbol, long *offset) /* @buf must has MAX_EVENT_NAME_LEN size */ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, - char *buf) + char *buf, int offset) { const char *slash, *event = *pevent; int len; @@ -164,32 +223,33 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, slash = strchr(event, '/'); if (slash) { if (slash == event) { - pr_info("Group name is not specified\n"); + trace_probe_log_err(offset, NO_GROUP_NAME); return -EINVAL; } if (slash - event + 1 > MAX_EVENT_NAME_LEN) { - pr_info("Group name is too long\n"); - return -E2BIG; + trace_probe_log_err(offset, GROUP_TOO_LONG); + return -EINVAL; } strlcpy(buf, event, slash - event + 1); if (!is_good_name(buf)) { - pr_info("Group name must follow the same rules as C identifiers\n"); + trace_probe_log_err(offset, BAD_GROUP_NAME); return -EINVAL; } *pgroup = buf; *pevent = slash + 1; + offset += slash - event + 1; event = *pevent; } len = strlen(event); if (len == 0) { - pr_info("Event name is not specified\n"); + trace_probe_log_err(offset, NO_EVENT_NAME); return -EINVAL; } else if (len > MAX_EVENT_NAME_LEN) { - pr_info("Event name is too long\n"); - return -E2BIG; + trace_probe_log_err(offset, EVENT_TOO_LONG); + return -EINVAL; } if (!is_good_name(event)) { - pr_info("Event name must follow the same rules as C identifiers\n"); + trace_probe_log_err(offset, BAD_EVENT_NAME); return -EINVAL; } return 0; @@ -198,56 +258,67 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) static int parse_probe_vars(char *arg, const struct fetch_type *t, - struct fetch_insn *code, unsigned int flags) + struct fetch_insn *code, unsigned int flags, int offs) { unsigned long param; int ret = 0; int len; if (strcmp(arg, "retval") == 0) { - if (flags & TPARG_FL_RETURN) + if (flags & TPARG_FL_RETURN) { code->op = FETCH_OP_RETVAL; - else + } else { + trace_probe_log_err(offs, RETVAL_ON_PROBE); ret = -EINVAL; + } } else if ((len = str_has_prefix(arg, "stack"))) { if (arg[len] == '\0') { code->op = FETCH_OP_STACKP; } else if (isdigit(arg[len])) { ret = kstrtoul(arg + len, 10, ¶m); - if (ret || ((flags & TPARG_FL_KERNEL) && - param > PARAM_MAX_STACK)) + if (ret) { + goto inval_var; + } else if ((flags & TPARG_FL_KERNEL) && + param > PARAM_MAX_STACK) { + trace_probe_log_err(offs, BAD_STACK_NUM); ret = -EINVAL; - else { + } else { code->op = FETCH_OP_STACK; code->param = (unsigned int)param; } } else - ret = -EINVAL; + goto inval_var; } else if (strcmp(arg, "comm") == 0) { code->op = FETCH_OP_COMM; #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API } else if (((flags & TPARG_FL_MASK) == (TPARG_FL_KERNEL | TPARG_FL_FENTRY)) && (len = str_has_prefix(arg, "arg"))) { - if (!isdigit(arg[len])) - return -EINVAL; ret = kstrtoul(arg + len, 10, ¶m); - if (ret || !param || param > PARAM_MAX_STACK) + if (ret) { + goto inval_var; + } else if (!param || param > PARAM_MAX_STACK) { + trace_probe_log_err(offs, BAD_ARG_NUM); return -EINVAL; + } code->op = FETCH_OP_ARG; code->param = (unsigned int)param - 1; #endif } else - ret = -EINVAL; + goto inval_var; return ret; + +inval_var: + trace_probe_log_err(offs, BAD_VAR); + return -EINVAL; } /* Recursive argument parser */ static int parse_probe_arg(char *arg, const struct fetch_type *type, struct fetch_insn **pcode, struct fetch_insn *end, - unsigned int flags) + unsigned int flags, int offs) { struct fetch_insn *code = *pcode; unsigned long param; @@ -257,7 +328,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, switch (arg[0]) { case '$': - ret = parse_probe_vars(arg + 1, type, code, flags); + ret = parse_probe_vars(arg + 1, type, code, flags, offs); break; case '%': /* named register */ @@ -266,47 +337,57 @@ parse_probe_arg(char *arg, const struct fetch_type *type, code->op = FETCH_OP_REG; code->param = (unsigned int)ret; ret = 0; - } + } else + trace_probe_log_err(offs, BAD_REG_NAME); break; case '@': /* memory, file-offset or symbol */ if (isdigit(arg[1])) { ret = kstrtoul(arg + 1, 0, ¶m); - if (ret) + if (ret) { + trace_probe_log_err(offs, BAD_MEM_ADDR); break; + } /* load address */ code->op = FETCH_OP_IMM; code->immediate = param; } else if (arg[1] == '+') { /* kprobes don't support file offsets */ - if (flags & TPARG_FL_KERNEL) + if (flags & TPARG_FL_KERNEL) { + trace_probe_log_err(offs, FILE_ON_KPROBE); return -EINVAL; - + } ret = kstrtol(arg + 2, 0, &offset); - if (ret) + if (ret) { + trace_probe_log_err(offs, BAD_FILE_OFFS); break; + } code->op = FETCH_OP_FOFFS; code->immediate = (unsigned long)offset; // imm64? } else { /* uprobes don't support symbols */ - if (!(flags & TPARG_FL_KERNEL)) + if (!(flags & TPARG_FL_KERNEL)) { + trace_probe_log_err(offs, SYM_ON_UPROBE); return -EINVAL; - + } /* Preserve symbol for updating */ code->op = FETCH_NOP_SYMBOL; code->data = kstrdup(arg + 1, GFP_KERNEL); if (!code->data) return -ENOMEM; - if (++code == end) - return -E2BIG; - + if (++code == end) { + trace_probe_log_err(offs, TOO_MANY_OPS); + return -EINVAL; + } code->op = FETCH_OP_IMM; code->immediate = 0; } /* These are fetching from memory */ - if (++code == end) - return -E2BIG; + if (++code == end) { + trace_probe_log_err(offs, TOO_MANY_OPS); + return -EINVAL; + } *pcode = code; code->op = FETCH_OP_DEREF; code->offset = offset; @@ -317,28 +398,38 @@ parse_probe_arg(char *arg, const struct fetch_type *type, /* fall through */ case '-': tmp = strchr(arg, '('); - if (!tmp) + if (!tmp) { + trace_probe_log_err(offs, DEREF_NEED_BRACE); return -EINVAL; - + } *tmp = '\0'; ret = kstrtol(arg, 0, &offset); - if (ret) + if (ret) { + trace_probe_log_err(offs, BAD_DEREF_OFFS); break; - + } + offs += (tmp + 1 - arg) + (arg[0] != '-' ? 1 : 0); arg = tmp + 1; tmp = strrchr(arg, ')'); - - if (tmp) { + if (!tmp) { + trace_probe_log_err(offs + strlen(arg), + DEREF_OPEN_BRACE); + return -EINVAL; + } else { const struct fetch_type *t2 = find_fetch_type(NULL); *tmp = '\0'; - ret = parse_probe_arg(arg, t2, &code, end, flags); + ret = parse_probe_arg(arg, t2, &code, end, flags, offs); if (ret) break; - if (code->op == FETCH_OP_COMM) + if (code->op == FETCH_OP_COMM) { + trace_probe_log_err(offs, COMM_CANT_DEREF); return -EINVAL; - if (++code == end) - return -E2BIG; + } + if (++code == end) { + trace_probe_log_err(offs, TOO_MANY_OPS); + return -EINVAL; + } *pcode = code; code->op = FETCH_OP_DEREF; @@ -348,6 +439,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, } if (!ret && code->op == FETCH_OP_NOP) { /* Parsed, but do not find fetch method */ + trace_probe_log_err(offs, BAD_FETCH_ARG); ret = -EINVAL; } return ret; @@ -379,7 +471,7 @@ static int __parse_bitfield_probe_arg(const char *bf, return -EINVAL; code++; if (code->op != FETCH_OP_NOP) - return -E2BIG; + return -EINVAL; *pcode = code; code->op = FETCH_OP_MOD_BF; @@ -392,44 +484,66 @@ static int __parse_bitfield_probe_arg(const char *bf, /* String length checking wrapper */ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, - struct probe_arg *parg, unsigned int flags) + struct probe_arg *parg, unsigned int flags, int offset) { struct fetch_insn *code, *scode, *tmp = NULL; - char *t, *t2; + char *t, *t2, *t3; int ret, len; - if (strlen(arg) > MAX_ARGSTR_LEN) { - pr_info("Argument is too long.: %s\n", arg); - return -ENOSPC; + len = strlen(arg); + if (len > MAX_ARGSTR_LEN) { + trace_probe_log_err(offset, ARG_TOO_LONG); + return -EINVAL; + } else if (len == 0) { + trace_probe_log_err(offset, NO_ARG_BODY); + return -EINVAL; } + parg->comm = kstrdup(arg, GFP_KERNEL); - if (!parg->comm) { - pr_info("Failed to allocate memory for command '%s'.\n", arg); + if (!parg->comm) return -ENOMEM; - } + t = strchr(arg, ':'); if (t) { *t = '\0'; t2 = strchr(++t, '['); if (t2) { - *t2 = '\0'; - parg->count = simple_strtoul(t2 + 1, &t2, 0); - if (strcmp(t2, "]") || parg->count == 0) + *t2++ = '\0'; + t3 = strchr(t2, ']'); + if (!t3) { + offset += t2 + strlen(t2) - arg; + trace_probe_log_err(offset, + ARRAY_NO_CLOSE); + return -EINVAL; + } else if (t3[1] != '\0') { + trace_probe_log_err(offset + t3 + 1 - arg, + BAD_ARRAY_SUFFIX); return -EINVAL; - if (parg->count > MAX_ARRAY_LEN) - return -E2BIG; + } + *t3 = '\0'; + if (kstrtouint(t2, 0, &parg->count) || !parg->count) { + trace_probe_log_err(offset + t2 - arg, + BAD_ARRAY_NUM); + return -EINVAL; + } + if (parg->count > MAX_ARRAY_LEN) { + trace_probe_log_err(offset + t2 - arg, + ARRAY_TOO_BIG); + return -EINVAL; + } } } - /* - * The default type of $comm should be "string", and it can't be - * dereferenced. - */ - if (!t && strcmp(arg, "$comm") == 0) + + /* Since $comm can not be dereferred, we can find $comm by strcmp */ + if (strcmp(arg, "$comm") == 0) { + /* The type of $comm must be "string", and not an array. */ + if (parg->count || (t && strcmp(t, "string"))) + return -EINVAL; parg->type = find_fetch_type("string"); - else + } else parg->type = find_fetch_type(t); if (!parg->type) { - pr_info("Unsupported type: %s\n", t); + trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_TYPE); return -EINVAL; } parg->offset = *size; @@ -444,13 +558,13 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, parg->count); } - code = tmp = kzalloc(sizeof(*code) * FETCH_INSN_MAX, GFP_KERNEL); + code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL); if (!code) return -ENOMEM; code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1], - flags); + flags, offset); if (ret) goto fail; @@ -458,7 +572,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, if (!strcmp(parg->type->name, "string")) { if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM) { - pr_info("string only accepts memory or address.\n"); + trace_probe_log_err(offset + (t ? (t - arg) : 0), + BAD_STRING); ret = -EINVAL; goto fail; } @@ -470,7 +585,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, */ code++; if (code->op != FETCH_OP_NOP) { - ret = -E2BIG; + trace_probe_log_err(offset, TOO_MANY_OPS); + ret = -EINVAL; goto fail; } } @@ -483,7 +599,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, } else { code++; if (code->op != FETCH_OP_NOP) { - ret = -E2BIG; + trace_probe_log_err(offset, TOO_MANY_OPS); + ret = -EINVAL; goto fail; } code->op = FETCH_OP_ST_RAW; @@ -493,20 +610,24 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, /* Modify operation */ if (t != NULL) { ret = __parse_bitfield_probe_arg(t, parg->type, &code); - if (ret) + if (ret) { + trace_probe_log_err(offset + t - arg, BAD_BITFIELD); goto fail; + } } /* Loop(Array) operation */ if (parg->count) { if (scode->op != FETCH_OP_ST_MEM && scode->op != FETCH_OP_ST_STRING) { - pr_info("array only accepts memory or address\n"); + trace_probe_log_err(offset + (t ? (t - arg) : 0), + BAD_STRING); ret = -EINVAL; goto fail; } code++; if (code->op != FETCH_OP_NOP) { - ret = -E2BIG; + trace_probe_log_err(offset, TOO_MANY_OPS); + ret = -EINVAL; goto fail; } code->op = FETCH_OP_LP_ARRAY; @@ -516,7 +637,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, code->op = FETCH_OP_END; /* Shrink down the code buffer */ - parg->code = kzalloc(sizeof(*code) * (code - tmp + 1), GFP_KERNEL); + parg->code = kcalloc(code - tmp + 1, sizeof(*code), GFP_KERNEL); if (!parg->code) ret = -ENOMEM; else @@ -555,15 +676,19 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg, { struct probe_arg *parg = &tp->args[i]; char *body; - int ret; /* Increment count for freeing args in error case */ tp->nr_args++; body = strchr(arg, '='); if (body) { - if (body - arg > MAX_ARG_NAME_LEN || body == arg) + if (body - arg > MAX_ARG_NAME_LEN) { + trace_probe_log_err(0, ARG_NAME_TOO_LONG); + return -EINVAL; + } else if (body == arg) { + trace_probe_log_err(0, NO_ARG_NAME); return -EINVAL; + } parg->name = kmemdup_nul(arg, body - arg, GFP_KERNEL); body++; } else { @@ -575,22 +700,16 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg, return -ENOMEM; if (!is_good_name(parg->name)) { - pr_info("Invalid argument[%d] name: %s\n", - i, parg->name); + trace_probe_log_err(0, BAD_ARG_NAME); return -EINVAL; } - if (traceprobe_conflict_field_name(parg->name, tp->args, i)) { - pr_info("Argument[%d]: '%s' conflicts with another field.\n", - i, parg->name); + trace_probe_log_err(0, USED_ARG_NAME); return -EINVAL; } - /* Parse fetch argument */ - ret = traceprobe_parse_probe_arg_body(body, &tp->size, parg, flags); - if (ret) - pr_info("Parse error at argument[%d]. (%d)\n", i, ret); - return ret; + return traceprobe_parse_probe_arg_body(body, &tp->size, parg, flags, + body - arg); } void traceprobe_free_probe_arg(struct probe_arg *arg) diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 2177c206de15..f9a8c632188b 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -124,6 +124,7 @@ struct fetch_insn { /* fetch + deref*N + store + mod + end <= 16, this allows N=12, enough */ #define FETCH_INSN_MAX 16 +#define FETCH_TOKEN_COMM (-ECOMM) /* Fetch type information table */ struct fetch_type { @@ -280,8 +281,8 @@ extern int traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); extern int traceprobe_split_symbol_offset(char *symbol, long *offset); -extern int traceprobe_parse_event_name(const char **pevent, - const char **pgroup, char *buf); +int traceprobe_parse_event_name(const char **pevent, const char **pgroup, + char *buf, int offset); extern int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return); @@ -298,3 +299,76 @@ extern void destroy_local_trace_uprobe(struct trace_event_call *event_call); #endif extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, size_t offset, struct trace_probe *tp); + +#undef ERRORS +#define ERRORS \ + C(FILE_NOT_FOUND, "Failed to find the given file"), \ + C(NO_REGULAR_FILE, "Not a regular file"), \ + C(BAD_REFCNT, "Invalid reference counter offset"), \ + C(REFCNT_OPEN_BRACE, "Reference counter brace is not closed"), \ + C(BAD_REFCNT_SUFFIX, "Reference counter has wrong suffix"), \ + C(BAD_UPROBE_OFFS, "Invalid uprobe offset"), \ + C(MAXACT_NO_KPROBE, "Maxactive is not for kprobe"), \ + C(BAD_MAXACT, "Invalid maxactive number"), \ + C(MAXACT_TOO_BIG, "Maxactive is too big"), \ + C(BAD_PROBE_ADDR, "Invalid probed address or symbol"), \ + C(BAD_RETPROBE, "Retprobe address must be an function entry"), \ + C(NO_GROUP_NAME, "Group name is not specified"), \ + C(GROUP_TOO_LONG, "Group name is too long"), \ + C(BAD_GROUP_NAME, "Group name must follow the same rules as C identifiers"), \ + C(NO_EVENT_NAME, "Event name is not specified"), \ + C(EVENT_TOO_LONG, "Event name is too long"), \ + C(BAD_EVENT_NAME, "Event name must follow the same rules as C identifiers"), \ + C(RETVAL_ON_PROBE, "$retval is not available on probe"), \ + C(BAD_STACK_NUM, "Invalid stack number"), \ + C(BAD_ARG_NUM, "Invalid argument number"), \ + C(BAD_VAR, "Invalid $-valiable specified"), \ + C(BAD_REG_NAME, "Invalid register name"), \ + C(BAD_MEM_ADDR, "Invalid memory address"), \ + C(FILE_ON_KPROBE, "File offset is not available with kprobe"), \ + C(BAD_FILE_OFFS, "Invalid file offset value"), \ + C(SYM_ON_UPROBE, "Symbol is not available with uprobe"), \ + C(TOO_MANY_OPS, "Dereference is too much nested"), \ + C(DEREF_NEED_BRACE, "Dereference needs a brace"), \ + C(BAD_DEREF_OFFS, "Invalid dereference offset"), \ + C(DEREF_OPEN_BRACE, "Dereference brace is not closed"), \ + C(COMM_CANT_DEREF, "$comm can not be dereferenced"), \ + C(BAD_FETCH_ARG, "Invalid fetch argument"), \ + C(ARRAY_NO_CLOSE, "Array is not closed"), \ + C(BAD_ARRAY_SUFFIX, "Array has wrong suffix"), \ + C(BAD_ARRAY_NUM, "Invalid array size"), \ + C(ARRAY_TOO_BIG, "Array number is too big"), \ + C(BAD_TYPE, "Unknown type is specified"), \ + C(BAD_STRING, "String accepts only memory argument"), \ + C(BAD_BITFIELD, "Invalid bitfield"), \ + C(ARG_NAME_TOO_LONG, "Argument name is too long"), \ + C(NO_ARG_NAME, "Argument name is not specified"), \ + C(BAD_ARG_NAME, "Argument name must follow the same rules as C identifiers"), \ + C(USED_ARG_NAME, "This argument name is already used"), \ + C(ARG_TOO_LONG, "Argument expression is too long"), \ + C(NO_ARG_BODY, "No argument expression"), \ + C(BAD_INSN_BNDRY, "Probe point is not an instruction boundary"),\ + C(FAIL_REG_PROBE, "Failed to register probe event"), + +#undef C +#define C(a, b) TP_ERR_##a + +/* Define TP_ERR_ */ +enum { ERRORS }; + +/* Error text is defined in trace_probe.c */ + +struct trace_probe_log { + const char *subsystem; + const char **argv; + int argc; + int index; +}; + +void trace_probe_log_init(const char *subsystem, int argc, const char **argv); +void trace_probe_log_set_index(int index); +void trace_probe_log_clear(void); +void __trace_probe_log_err(int offset, int err); + +#define trace_probe_log_err(offs, err) \ + __trace_probe_log_err(offs, TP_ERR_##err) diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index 4737bb8c07a3..c30c61f12ddd 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -88,7 +88,7 @@ stage3: /* 3rd stage: store value to buffer */ if (unlikely(!dest)) { if (code->op == FETCH_OP_ST_STRING) { - ret += fetch_store_strlen(val + code->offset); + ret = fetch_store_strlen(val + code->offset); code++; goto array; } else diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 9d402e7fc949..69ee8ef12cee 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -792,7 +792,10 @@ trace_selftest_startup_function_graph(struct tracer *trace, /* check the trace buffer */ ret = trace_test_buffer(&tr->trace_buffer, &count); - trace->reset(tr); + /* Need to also simulate the tr->reset to remove this fgraph_ops */ + tracing_stop_cmdline_record(); + unregister_ftrace_graph(&fgraph_ops); + tracing_start(); if (!ret && !count) { diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index be78d99ee6bc..eb7e06b54741 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -156,7 +156,10 @@ fetch_store_string(unsigned long addr, void *dest, void *base) if (unlikely(!maxlen)) return -ENOMEM; - ret = strncpy_from_user(dst, src, maxlen); + if (addr == FETCH_TOKEN_COMM) + ret = strlcpy(dst, current->comm, maxlen); + else + ret = strncpy_from_user(dst, src, maxlen); if (ret >= 0) { if (ret == maxlen) dst[ret - 1] = '\0'; @@ -180,7 +183,10 @@ fetch_store_strlen(unsigned long addr) int len; void __user *vaddr = (void __force __user *) addr; - len = strnlen_user(vaddr, MAX_STRING_SIZE); + if (addr == FETCH_TOKEN_COMM) + len = strlen(current->comm) + 1; + else + len = strnlen_user(vaddr, MAX_STRING_SIZE); return (len > MAX_STRING_SIZE) ? 0 : len; } @@ -220,6 +226,9 @@ process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, case FETCH_OP_IMM: val = code->immediate; break; + case FETCH_OP_COMM: + val = FETCH_TOKEN_COMM; + break; case FETCH_OP_FOFFS: val = translate_user_vaddr(code->immediate); break; @@ -457,13 +466,19 @@ static int trace_uprobe_create(int argc, const char **argv) return -ECANCELED; } + trace_probe_log_init("trace_uprobe", argc, argv); + trace_probe_log_set_index(1); /* filename is the 2nd argument */ + *arg++ = '\0'; ret = kern_path(filename, LOOKUP_FOLLOW, &path); if (ret) { + trace_probe_log_err(0, FILE_NOT_FOUND); kfree(filename); + trace_probe_log_clear(); return ret; } if (!d_is_reg(path.dentry)) { + trace_probe_log_err(0, NO_REGULAR_FILE); ret = -EINVAL; goto fail_address_parse; } @@ -472,9 +487,16 @@ static int trace_uprobe_create(int argc, const char **argv) rctr = strchr(arg, '('); if (rctr) { rctr_end = strchr(rctr, ')'); - if (rctr > rctr_end || *(rctr_end + 1) != 0) { + if (!rctr_end) { + ret = -EINVAL; + rctr_end = rctr + strlen(rctr); + trace_probe_log_err(rctr_end - filename, + REFCNT_OPEN_BRACE); + goto fail_address_parse; + } else if (rctr_end[1] != '\0') { ret = -EINVAL; - pr_info("Invalid reference counter offset.\n"); + trace_probe_log_err(rctr_end + 1 - filename, + BAD_REFCNT_SUFFIX); goto fail_address_parse; } @@ -482,22 +504,23 @@ static int trace_uprobe_create(int argc, const char **argv) *rctr_end = '\0'; ret = kstrtoul(rctr, 0, &ref_ctr_offset); if (ret) { - pr_info("Invalid reference counter offset.\n"); + trace_probe_log_err(rctr - filename, BAD_REFCNT); goto fail_address_parse; } } /* Parse uprobe offset. */ ret = kstrtoul(arg, 0, &offset); - if (ret) + if (ret) { + trace_probe_log_err(arg - filename, BAD_UPROBE_OFFS); goto fail_address_parse; - - argc -= 2; - argv += 2; + } /* setup a probe */ + trace_probe_log_set_index(0); if (event) { - ret = traceprobe_parse_event_name(&event, &group, buf); + ret = traceprobe_parse_event_name(&event, &group, buf, + event - argv[0]); if (ret) goto fail_address_parse; } else { @@ -519,6 +542,9 @@ static int trace_uprobe_create(int argc, const char **argv) kfree(tail); } + argc -= 2; + argv += 2; + tu = alloc_trace_uprobe(group, event, argc, is_return); if (IS_ERR(tu)) { ret = PTR_ERR(tu); @@ -539,6 +565,7 @@ static int trace_uprobe_create(int argc, const char **argv) goto error; } + trace_probe_log_set_index(i + 2); ret = traceprobe_parse_probe_arg(&tu->tp, i, tmp, is_return ? TPARG_FL_RETURN : 0); kfree(tmp); @@ -547,20 +574,20 @@ static int trace_uprobe_create(int argc, const char **argv) } ret = register_trace_uprobe(tu); - if (ret) - goto error; - return 0; + if (!ret) + goto out; error: free_trace_uprobe(tu); +out: + trace_probe_log_clear(); return ret; fail_address_parse: + trace_probe_log_clear(); path_put(&path); kfree(filename); - pr_info("Failed to parse address or file.\n"); - return ret; } diff --git a/kernel/user.c b/kernel/user.c index 0df9b1640b2a..88b834f0eebc 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -185,7 +185,7 @@ struct user_struct *alloc_uid(kuid_t uid) if (!up) { new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL); if (!new) - goto out_unlock; + return NULL; new->uid = uid; refcount_set(&new->__count, 1); @@ -199,8 +199,6 @@ struct user_struct *alloc_uid(kuid_t uid) spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { - key_put(new->uid_keyring); - key_put(new->session_keyring); kmem_cache_free(uid_cachep, new); } else { uid_hash_insert(new, hashent); @@ -210,9 +208,6 @@ struct user_struct *alloc_uid(kuid_t uid) } return up; - -out_unlock: - return NULL; } static int __init uid_cache_init(void) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 56180c9286f5..9657315405de 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -127,16 +127,16 @@ enum { * * PL: wq_pool_mutex protected. * - * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads. + * PR: wq_pool_mutex protected for writes. RCU protected for reads. * * PW: wq_pool_mutex and wq->mutex protected for writes. Either for reads. * * PWR: wq_pool_mutex and wq->mutex protected for writes. Either or - * sched-RCU for reads. + * RCU for reads. * * WQ: wq->mutex protected. * - * WR: wq->mutex protected for writes. Sched-RCU protected for reads. + * WR: wq->mutex protected for writes. RCU protected for reads. * * MD: wq_mayday_lock protected. */ @@ -183,7 +183,7 @@ struct worker_pool { atomic_t nr_running ____cacheline_aligned_in_smp; /* - * Destruction of pool is sched-RCU protected to allow dereferences + * Destruction of pool is RCU protected to allow dereferences * from get_work_pool(). */ struct rcu_head rcu; @@ -212,7 +212,7 @@ struct pool_workqueue { /* * Release of unbound pwq is punted to system_wq. See put_pwq() * and pwq_unbound_release_workfn() for details. pool_workqueue - * itself is also sched-RCU protected so that the first pwq can be + * itself is also RCU protected so that the first pwq can be * determined without grabbing wq->mutex. */ struct work_struct unbound_release_work; @@ -266,8 +266,8 @@ struct workqueue_struct { char name[WQ_NAME_LEN]; /* I: workqueue name */ /* - * Destruction of workqueue_struct is sched-RCU protected to allow - * walking the workqueues list without grabbing wq_pool_mutex. + * Destruction of workqueue_struct is RCU protected to allow walking + * the workqueues list without grabbing wq_pool_mutex. * This is used to dump all workqueues from sysrq. */ struct rcu_head rcu; @@ -359,20 +359,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); #include <trace/events/workqueue.h> #define assert_rcu_or_pool_mutex() \ - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&wq_pool_mutex), \ - "sched RCU or wq_pool_mutex should be held") + "RCU or wq_pool_mutex should be held") #define assert_rcu_or_wq_mutex(wq) \ - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&wq->mutex), \ - "sched RCU or wq->mutex should be held") + "RCU or wq->mutex should be held") #define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \ - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&wq->mutex) && \ !lockdep_is_held(&wq_pool_mutex), \ - "sched RCU, wq->mutex or wq_pool_mutex should be held") + "RCU, wq->mutex or wq_pool_mutex should be held") #define for_each_cpu_worker_pool(pool, cpu) \ for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ @@ -384,7 +384,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); * @pool: iteration cursor * @pi: integer used for iteration * - * This must be called either with wq_pool_mutex held or sched RCU read + * This must be called either with wq_pool_mutex held or RCU read * locked. If the pool needs to be used beyond the locking in effect, the * caller is responsible for guaranteeing that the pool stays online. * @@ -416,7 +416,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); * @pwq: iteration cursor * @wq: the target workqueue * - * This must be called either with wq->mutex held or sched RCU read locked. + * This must be called either with wq->mutex held or RCU read locked. * If the pwq needs to be used beyond the locking in effect, the caller is * responsible for guaranteeing that the pwq stays online. * @@ -552,7 +552,7 @@ static int worker_pool_assign_id(struct worker_pool *pool) * @wq: the target workqueue * @node: the node ID * - * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU + * This must be called with any of wq_pool_mutex, wq->mutex or RCU * read locked. * If the pwq needs to be used beyond the locking in effect, the caller is * responsible for guaranteeing that the pwq stays online. @@ -696,8 +696,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work) * @work: the work item of interest * * Pools are created and destroyed under wq_pool_mutex, and allows read - * access under sched-RCU read lock. As such, this function should be - * called under wq_pool_mutex or with preemption disabled. + * access under RCU read lock. As such, this function should be + * called under wq_pool_mutex or inside of a rcu_read_lock() region. * * All fields of the returned pool are accessible as long as the above * mentioned locking is in effect. If the returned pool needs to be used @@ -907,6 +907,7 @@ void wq_worker_sleeping(struct task_struct *task) /** * wq_worker_last_func - retrieve worker's last work function + * @task: Task to retrieve last work function of. * * Determine the last function a worker executed. This is called from * the scheduler to get a worker's last known identity. @@ -1126,7 +1127,7 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq) { if (pwq) { /* - * As both pwqs and pools are sched-RCU protected, the + * As both pwqs and pools are RCU protected, the * following lock operations are safe. */ spin_lock_irq(&pwq->pool->lock); @@ -1254,6 +1255,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) return 0; + rcu_read_lock(); /* * The queueing is in progress, or it is already queued. Try to * steal it from ->worklist without clearing WORK_STRUCT_PENDING. @@ -1292,10 +1294,12 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, set_work_pool_and_keep_pending(work, pool->id); spin_unlock(&pool->lock); + rcu_read_unlock(); return 1; } spin_unlock(&pool->lock); fail: + rcu_read_unlock(); local_irq_restore(*flags); if (work_is_canceling(work)) return -ENOENT; @@ -1409,6 +1413,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, if (unlikely(wq->flags & __WQ_DRAINING) && WARN_ON_ONCE(!is_chained_work(wq))) return; + rcu_read_lock(); retry: if (req_cpu == WORK_CPU_UNBOUND) cpu = wq_select_unbound_cpu(raw_smp_processor_id()); @@ -1465,10 +1470,8 @@ retry: /* pwq determined, queue */ trace_workqueue_queue_work(req_cpu, pwq, work); - if (WARN_ON(!list_empty(&work->entry))) { - spin_unlock(&pwq->pool->lock); - return; - } + if (WARN_ON(!list_empty(&work->entry))) + goto out; pwq->nr_in_flight[pwq->work_color]++; work_flags = work_color_to_flags(pwq->work_color); @@ -1486,7 +1489,9 @@ retry: insert_work(pwq, work, worklist, work_flags); +out: spin_unlock(&pwq->pool->lock); + rcu_read_unlock(); } /** @@ -2271,7 +2276,7 @@ __acquires(&pool->lock) if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" - " last function: %pf\n", + " last function: %ps\n", current->comm, preempt_count(), task_pid_nr(current), worker->current_func); debug_show_held_locks(current); @@ -2590,11 +2595,11 @@ static void check_flush_dependency(struct workqueue_struct *target_wq, worker = current_wq_worker(); WARN_ONCE(current->flags & PF_MEMALLOC, - "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf", + "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps", current->pid, current->comm, target_wq->name, target_func); WARN_ONCE(worker && ((worker->current_pwq->wq->flags & (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM), - "workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf", + "workqueue: WQ_MEM_RECLAIM %s:%ps is flushing !WQ_MEM_RECLAIM %s:%ps", worker->current_pwq->wq->name, worker->current_func, target_wq->name, target_func); } @@ -2968,14 +2973,14 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, might_sleep(); - local_irq_disable(); + rcu_read_lock(); pool = get_work_pool(work); if (!pool) { - local_irq_enable(); + rcu_read_unlock(); return false; } - spin_lock(&pool->lock); + spin_lock_irq(&pool->lock); /* see the comment in try_to_grab_pending() with the same code */ pwq = get_work_pwq(work); if (pwq) { @@ -3007,10 +3012,11 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, lock_map_acquire(&pwq->wq->lockdep_map); lock_map_release(&pwq->wq->lockdep_map); } - + rcu_read_unlock(); return true; already_gone: spin_unlock_irq(&pool->lock); + rcu_read_unlock(); return false; } @@ -3497,7 +3503,7 @@ static void rcu_free_pool(struct rcu_head *rcu) * put_unbound_pool - put a worker_pool * @pool: worker_pool to put * - * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU + * Put @pool. If its refcnt reaches zero, it gets destroyed in RCU * safe manner. get_unbound_pool() calls this function on its failure path * and this function should be able to release pools which went through, * successfully or not, init_worker_pool(). @@ -3551,7 +3557,7 @@ static void put_unbound_pool(struct worker_pool *pool) del_timer_sync(&pool->idle_timer); del_timer_sync(&pool->mayday_timer); - /* sched-RCU protected to allow dereferences from get_work_pool() */ + /* RCU protected to allow dereferences from get_work_pool() */ call_rcu(&pool->rcu, rcu_free_pool); } @@ -4202,6 +4208,7 @@ static int init_rescuer(struct workqueue_struct *wq) return 0; } +__printf(1, 4) struct workqueue_struct *alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...) @@ -4465,7 +4472,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) struct pool_workqueue *pwq; bool ret; - rcu_read_lock_sched(); + rcu_read_lock(); + preempt_disable(); if (cpu == WORK_CPU_UNBOUND) cpu = smp_processor_id(); @@ -4476,7 +4484,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); ret = !list_empty(&pwq->delayed_works); - rcu_read_unlock_sched(); + preempt_enable(); + rcu_read_unlock(); return ret; } @@ -4502,15 +4511,15 @@ unsigned int work_busy(struct work_struct *work) if (work_pending(work)) ret |= WORK_BUSY_PENDING; - local_irq_save(flags); + rcu_read_lock(); pool = get_work_pool(work); if (pool) { - spin_lock(&pool->lock); + spin_lock_irqsave(&pool->lock, flags); if (find_worker_executing_work(pool, work)) ret |= WORK_BUSY_RUNNING; - spin_unlock(&pool->lock); + spin_unlock_irqrestore(&pool->lock, flags); } - local_irq_restore(flags); + rcu_read_unlock(); return ret; } @@ -4581,7 +4590,7 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) probe_kernel_read(desc, worker->desc, sizeof(desc) - 1); if (fn || name[0] || desc[0]) { - printk("%sWorkqueue: %s %pf", log_lvl, name, fn); + printk("%sWorkqueue: %s %ps", log_lvl, name, fn); if (strcmp(name, desc)) pr_cont(" (%s)", desc); pr_cont("\n"); @@ -4606,7 +4615,7 @@ static void pr_cont_work(bool comma, struct work_struct *work) pr_cont("%s BAR(%d)", comma ? "," : "", task_pid_nr(barr->task)); } else { - pr_cont("%s %pf", comma ? "," : "", work->func); + pr_cont("%s %ps", comma ? "," : "", work->func); } } @@ -4638,7 +4647,7 @@ static void show_pwq(struct pool_workqueue *pwq) if (worker->current_pwq != pwq) continue; - pr_cont("%s %d%s:%pf", comma ? "," : "", + pr_cont("%s %d%s:%ps", comma ? "," : "", task_pid_nr(worker->task), worker == pwq->wq->rescuer ? "(RESCUER)" : "", worker->current_func); @@ -4694,7 +4703,7 @@ void show_workqueue_state(void) unsigned long flags; int pi; - rcu_read_lock_sched(); + rcu_read_lock(); pr_info("Showing busy workqueues and worker pools:\n"); @@ -4759,7 +4768,7 @@ void show_workqueue_state(void) touch_nmi_watchdog(); } - rcu_read_unlock_sched(); + rcu_read_unlock(); } /* used to show worker information through /proc/PID/{comm,stat,status} */ @@ -5146,16 +5155,16 @@ bool freeze_workqueues_busy(void) * nr_active is monotonically decreasing. It's safe * to peek without lock. */ - rcu_read_lock_sched(); + rcu_read_lock(); for_each_pwq(pwq, wq) { WARN_ON_ONCE(pwq->nr_active < 0); if (pwq->nr_active) { busy = true; - rcu_read_unlock_sched(); + rcu_read_unlock(); goto out_unlock; } } - rcu_read_unlock_sched(); + rcu_read_unlock(); } out_unlock: mutex_unlock(&wq_pool_mutex); @@ -5350,7 +5359,8 @@ static ssize_t wq_pool_ids_show(struct device *dev, const char *delim = ""; int node, written = 0; - rcu_read_lock_sched(); + get_online_cpus(); + rcu_read_lock(); for_each_node(node) { written += scnprintf(buf + written, PAGE_SIZE - written, "%s%d:%d", delim, node, @@ -5358,7 +5368,8 @@ static ssize_t wq_pool_ids_show(struct device *dev, delim = " "; } written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); - rcu_read_unlock_sched(); + rcu_read_unlock(); + put_online_cpus(); return written; } |