diff options
Diffstat (limited to 'kernel')
186 files changed, 17365 insertions, 6796 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 172d151d429c..f85ae5dfa474 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -81,6 +81,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_GCOV_KERNEL) += gcov/ obj-$(CONFIG_KCOV) += kcov.o obj-$(CONFIG_KPROBES) += kprobes.o +obj-$(CONFIG_FAIL_FUNCTION) += fail_function.o obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o diff --git a/kernel/async.c b/kernel/async.c index 2cbd3dd5940d..a893d6170944 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -84,20 +84,24 @@ static atomic_t entry_count; static async_cookie_t lowest_in_progress(struct async_domain *domain) { - struct list_head *pending; + struct async_entry *first = NULL; async_cookie_t ret = ASYNC_COOKIE_MAX; unsigned long flags; spin_lock_irqsave(&async_lock, flags); - if (domain) - pending = &domain->pending; - else - pending = &async_global_pending; + if (domain) { + if (!list_empty(&domain->pending)) + first = list_first_entry(&domain->pending, + struct async_entry, domain_list); + } else { + if (!list_empty(&async_global_pending)) + first = list_first_entry(&async_global_pending, + struct async_entry, global_list); + } - if (!list_empty(pending)) - ret = list_first_entry(pending, struct async_entry, - domain_list)->cookie; + if (first) + ret = first->cookie; spin_unlock_irqrestore(&async_lock, flags); return ret; diff --git a/kernel/audit.c b/kernel/audit.c index 227db99b0f19..670665c6e2a6 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -38,7 +38,8 @@ * 6) Support low-overhead kernel-based filtering to minimize the * information that must be passed to user-space. 
* - * Example user-space utilities: http://people.redhat.com/sgrubb/audit/ + * Audit userspace, documentation, tests, and bug/issue trackers: + * https://github.com/linux-audit */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -180,9 +181,21 @@ static char *audit_feature_names[2] = { "loginuid_immutable", }; - -/* Serialize requests from userspace. */ -DEFINE_MUTEX(audit_cmd_mutex); +/** + * struct audit_ctl_mutex - serialize requests from userspace + * @lock: the mutex used for locking + * @owner: the task which owns the lock + * + * Description: + * This is the lock struct used to ensure we only process userspace requests + * in an orderly fashion. We can't simply use a mutex/lock here because we + * need to track lock ownership so we don't end up blocking the lock owner in + * audit_log_start() or similar. + */ +static struct audit_ctl_mutex { + struct mutex lock; + void *owner; +} audit_cmd_mutex; /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting * audit records. Since printk uses a 1024 byte buffer, this buffer @@ -227,6 +240,36 @@ int auditd_test_task(struct task_struct *task) } /** + * audit_ctl_lock - Take the audit control lock + */ +void audit_ctl_lock(void) +{ + mutex_lock(&audit_cmd_mutex.lock); + audit_cmd_mutex.owner = current; +} + +/** + * audit_ctl_unlock - Drop the audit control lock + */ +void audit_ctl_unlock(void) +{ + audit_cmd_mutex.owner = NULL; + mutex_unlock(&audit_cmd_mutex.lock); +} + +/** + * audit_ctl_owner_current - Test to see if the current task owns the lock + * + * Description: + * Return true if the current task owns the audit control lock, false if it + * doesn't own the lock. 
+ */ +static bool audit_ctl_owner_current(void) +{ + return (current == audit_cmd_mutex.owner); +} + +/** * auditd_pid_vnr - Return the auditd PID relative to the namespace * * Description: @@ -443,15 +486,15 @@ static int audit_set_failure(u32 state) * Drop any references inside the auditd connection tracking struct and free * the memory. */ - static void auditd_conn_free(struct rcu_head *rcu) - { +static void auditd_conn_free(struct rcu_head *rcu) +{ struct auditd_connection *ac; ac = container_of(rcu, struct auditd_connection, rcu); put_pid(ac->pid); put_net(ac->net); kfree(ac); - } +} /** * auditd_set - Set/Reset the auditd connection state @@ -860,8 +903,8 @@ int audit_send_list(void *_dest) struct sock *sk = audit_get_sk(dest->net); /* wait for parent to finish and send an ACK */ - mutex_lock(&audit_cmd_mutex); - mutex_unlock(&audit_cmd_mutex); + audit_ctl_lock(); + audit_ctl_unlock(); while ((skb = __skb_dequeue(&dest->q)) != NULL) netlink_unicast(sk, skb, dest->portid, 0); @@ -902,8 +945,8 @@ static int audit_send_reply_thread(void *arg) struct audit_reply *reply = (struct audit_reply *)arg; struct sock *sk = audit_get_sk(reply->net); - mutex_lock(&audit_cmd_mutex); - mutex_unlock(&audit_cmd_mutex); + audit_ctl_lock(); + audit_ctl_unlock(); /* Ignore failure. It'll only happen if the sender goes away, because our timeout is set to infinite. 
*/ @@ -1058,6 +1101,8 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature return; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE); + if (!ab) + return; audit_log_task_info(ab, current); audit_log_format(ab, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d", audit_feature_names[which], !!old_feature, !!new_feature, @@ -1466,7 +1511,7 @@ static void audit_receive(struct sk_buff *skb) nlh = nlmsg_hdr(skb); len = skb->len; - mutex_lock(&audit_cmd_mutex); + audit_ctl_lock(); while (nlmsg_ok(nlh, len)) { err = audit_receive_msg(skb, nlh); /* if err or if this message says it wants a response */ @@ -1475,7 +1520,7 @@ static void audit_receive(struct sk_buff *skb) nlh = nlmsg_next(nlh, &len); } - mutex_unlock(&audit_cmd_mutex); + audit_ctl_unlock(); } /* Run custom bind function on netlink socket group connect or bind requests. */ @@ -1547,6 +1592,9 @@ static int __init audit_init(void) for (i = 0; i < AUDIT_INODE_BUCKETS; i++) INIT_LIST_HEAD(&audit_inode_hash[i]); + mutex_init(&audit_cmd_mutex.lock); + audit_cmd_mutex.owner = NULL; + pr_info("initializing netlink subsys (%s)\n", audit_default ? "enabled" : "disabled"); register_pernet_subsys(&audit_net_ops); @@ -1567,19 +1615,26 @@ static int __init audit_init(void) } postcore_initcall(audit_init); -/* Process kernel command-line parameter at boot time. audit=0 or audit=1. */ +/* + * Process kernel command-line parameter at boot time. + * audit={0|off} or audit={1|on}. + */ static int __init audit_enable(char *str) { - long val; - - if (kstrtol(str, 0, &val)) - panic("audit: invalid 'audit' parameter value (%s)\n", str); - audit_default = (val ? 
AUDIT_ON : AUDIT_OFF); + if (!strcasecmp(str, "off") || !strcmp(str, "0")) + audit_default = AUDIT_OFF; + else if (!strcasecmp(str, "on") || !strcmp(str, "1")) + audit_default = AUDIT_ON; + else { + pr_err("audit: invalid 'audit' parameter value (%s)\n", str); + audit_default = AUDIT_ON; + } if (audit_default == AUDIT_OFF) audit_initialized = AUDIT_DISABLED; if (audit_set_enabled(audit_default)) - panic("audit: error setting audit state (%d)\n", audit_default); + pr_err("audit: error setting audit state (%d)\n", + audit_default); pr_info("%s\n", audit_default ? "enabled (after initialization)" : "disabled (until reboot)"); @@ -1710,8 +1765,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, * using a PID anchored in the caller's namespace * 2. generator holding the audit_cmd_mutex - we don't want to block * while holding the mutex */ - if (!(auditd_test_task(current) || - (current == __mutex_owner(&audit_cmd_mutex)))) { + if (!(auditd_test_task(current) || audit_ctl_owner_current())) { long stime = audit_backlog_wait_time; while (audit_backlog_limit && @@ -2254,33 +2308,23 @@ EXPORT_SYMBOL(audit_log_task_info); /** * audit_log_link_denied - report a link restriction denial * @operation: specific link operation - * @link: the path that triggered the restriction */ -void audit_log_link_denied(const char *operation, const struct path *link) +void audit_log_link_denied(const char *operation) { struct audit_buffer *ab; - struct audit_names *name; - name = kzalloc(sizeof(*name), GFP_NOFS); - if (!name) + if (!audit_enabled || audit_dummy_context()) return; /* Generate AUDIT_ANOM_LINK with subject, operation, outcome. */ ab = audit_log_start(current->audit_context, GFP_KERNEL, AUDIT_ANOM_LINK); if (!ab) - goto out; + return; audit_log_format(ab, "op=%s", operation); audit_log_task_info(ab, current); audit_log_format(ab, " res=0"); audit_log_end(ab); - - /* Generate AUDIT_PATH record with object. 
*/ - name->type = AUDIT_TYPE_NORMAL; - audit_copy_inode(name, link->dentry, d_backing_inode(link->dentry)); - audit_log_name(current->audit_context, name, link, 0, NULL); -out: - kfree(name); } /** diff --git a/kernel/audit.h b/kernel/audit.h index af5bc59487ed..214e14948370 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -341,4 +341,5 @@ extern struct list_head *audit_killed_trees(void); #define audit_filter_inodes(t,c) AUDIT_DISABLED #endif -extern struct mutex audit_cmd_mutex; +extern void audit_ctl_lock(void); +extern void audit_ctl_unlock(void); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index fd353120e0d9..67e6956c0b61 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -709,7 +709,7 @@ static int prune_tree_thread(void *unused) schedule(); } - mutex_lock(&audit_cmd_mutex); + audit_ctl_lock(); mutex_lock(&audit_filter_mutex); while (!list_empty(&prune_list)) { @@ -727,7 +727,7 @@ static int prune_tree_thread(void *unused) } mutex_unlock(&audit_filter_mutex); - mutex_unlock(&audit_cmd_mutex); + audit_ctl_unlock(); } return 0; } @@ -924,7 +924,7 @@ static void audit_schedule_prune(void) */ void audit_kill_trees(struct list_head *list) { - mutex_lock(&audit_cmd_mutex); + audit_ctl_lock(); mutex_lock(&audit_filter_mutex); while (!list_empty(list)) { @@ -942,7 +942,7 @@ void audit_kill_trees(struct list_head *list) } mutex_unlock(&audit_filter_mutex); - mutex_unlock(&audit_cmd_mutex); + audit_ctl_unlock(); } /* diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 4a1758adb222..d7a807e81451 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -258,8 +258,8 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data * goto exit_err; #ifdef CONFIG_AUDITSYSCALL case AUDIT_FILTER_ENTRY: - if (rule->action == AUDIT_ALWAYS) - goto exit_err; + pr_err("AUDIT_FILTER_ENTRY is deprecated\n"); + goto exit_err; case AUDIT_FILTER_EXIT: case AUDIT_FILTER_TASK: #endif @@ -496,7 +496,6 @@ static struct audit_entry 
*audit_data_to_entry(struct audit_rule_data *data, if (!gid_valid(f->gid)) goto exit_free; break; - case AUDIT_SESSIONID: case AUDIT_ARCH: entry->rule.arch_f = f; break; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e80459f7e132..4e0a4ac803db 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1511,30 +1511,28 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, struct audit_context *context = tsk->audit_context; enum audit_state state; - if (!context) + if (!audit_enabled || !context) return; BUG_ON(context->in_syscall || context->name_count); - if (!audit_enabled) + state = context->state; + if (state == AUDIT_DISABLED) return; + context->dummy = !audit_n_rules; + if (!context->dummy && state == AUDIT_BUILD_CONTEXT) { + context->prio = 0; + if (auditd_test_task(tsk)) + return; + } + context->arch = syscall_get_arch(); context->major = major; context->argv[0] = a1; context->argv[1] = a2; context->argv[2] = a3; context->argv[3] = a4; - - state = context->state; - context->dummy = !audit_n_rules; - if (!context->dummy && state == AUDIT_BUILD_CONTEXT) { - context->prio = 0; - state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); - } - if (state == AUDIT_DISABLED) - return; - context->serial = 0; context->ctime = current_kernel_time64(); context->in_syscall = 1; diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e691da0b3bab..a713fd23ec88 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -9,9 +9,11 @@ obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o obj-$(CONFIG_BPF_SYSCALL) += offload.o ifeq ($(CONFIG_STREAM_PARSER),y) +ifeq ($(CONFIG_INET),y) obj-$(CONFIG_BPF_SYSCALL) += sockmap.o endif endif +endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index ab94d304a634..14750e7c5ee4 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -26,8 +26,10 @@ 
static void bpf_array_free_percpu(struct bpf_array *array) { int i; - for (i = 0; i < array->map.max_entries; i++) + for (i = 0; i < array->map.max_entries; i++) { free_percpu(array->pptrs[i]); + cond_resched(); + } } static int bpf_array_alloc_percpu(struct bpf_array *array) @@ -43,33 +45,42 @@ static int bpf_array_alloc_percpu(struct bpf_array *array) return -ENOMEM; } array->pptrs[i] = ptr; + cond_resched(); } return 0; } /* Called from syscall */ -static struct bpf_map *array_map_alloc(union bpf_attr *attr) +static int array_map_alloc_check(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); - u32 elem_size, index_mask, max_entries; - bool unpriv = !capable(CAP_SYS_ADMIN); - struct bpf_array *array; - u64 array_size, mask64; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || attr->value_size == 0 || attr->map_flags & ~ARRAY_CREATE_FLAG_MASK || (percpu && numa_node != NUMA_NO_NODE)) - return ERR_PTR(-EINVAL); + return -EINVAL; if (attr->value_size > KMALLOC_MAX_SIZE) /* if value_size is bigger, the user space won't be able to * access the elements. 
*/ - return ERR_PTR(-E2BIG); + return -E2BIG; + + return 0; +} + +static struct bpf_map *array_map_alloc(union bpf_attr *attr) +{ + bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; + int ret, numa_node = bpf_map_attr_numa_node(attr); + u32 elem_size, index_mask, max_entries; + bool unpriv = !capable(CAP_SYS_ADMIN); + u64 cost, array_size, mask64; + struct bpf_array *array; elem_size = round_up(attr->value_size, 8); @@ -101,8 +112,19 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array_size += (u64) max_entries * elem_size; /* make sure there is no u32 overflow later in round_up() */ - if (array_size >= U32_MAX - PAGE_SIZE) + cost = array_size; + if (cost >= U32_MAX - PAGE_SIZE) return ERR_PTR(-ENOMEM); + if (percpu) { + cost += (u64)attr->max_entries * elem_size * num_possible_cpus(); + if (cost >= U32_MAX - PAGE_SIZE) + return ERR_PTR(-ENOMEM); + } + cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + ret = bpf_map_precharge_memlock(cost); + if (ret < 0) + return ERR_PTR(ret); /* allocate all map elements and zero-initialize them */ array = bpf_map_area_alloc(array_size, numa_node); @@ -112,26 +134,14 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array->map.unpriv_array = unpriv; /* copy mandatory map attributes */ - array->map.map_type = attr->map_type; - array->map.key_size = attr->key_size; - array->map.value_size = attr->value_size; - array->map.max_entries = attr->max_entries; - array->map.map_flags = attr->map_flags; - array->map.numa_node = numa_node; + bpf_map_init_from_attr(&array->map, attr); + array->map.pages = cost; array->elem_size = elem_size; - if (!percpu) - goto out; - - array_size += (u64) attr->max_entries * elem_size * num_possible_cpus(); - - if (array_size >= U32_MAX - PAGE_SIZE || - bpf_array_alloc_percpu(array)) { + if (percpu && bpf_array_alloc_percpu(array)) { bpf_map_area_free(array); return ERR_PTR(-ENOMEM); } -out: - array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT; return 
&array->map; } @@ -327,6 +337,7 @@ static void array_map_free(struct bpf_map *map) } const struct bpf_map_ops array_map_ops = { + .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, @@ -337,6 +348,7 @@ const struct bpf_map_ops array_map_ops = { }; const struct bpf_map_ops percpu_array_map_ops = { + .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, @@ -345,12 +357,12 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_delete_elem = array_map_delete_elem, }; -static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr) +static int fd_array_map_alloc_check(union bpf_attr *attr) { /* only file descriptors can be stored in this type of map */ if (attr->value_size != sizeof(u32)) - return ERR_PTR(-EINVAL); - return array_map_alloc(attr); + return -EINVAL; + return array_map_alloc_check(attr); } static void fd_array_map_free(struct bpf_map *map) @@ -474,7 +486,8 @@ void bpf_fd_array_map_clear(struct bpf_map *map) } const struct bpf_map_ops prog_array_map_ops = { - .map_alloc = fd_array_map_alloc, + .map_alloc_check = fd_array_map_alloc_check, + .map_alloc = array_map_alloc, .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, @@ -561,7 +574,8 @@ static void perf_event_fd_array_release(struct bpf_map *map, } const struct bpf_map_ops perf_event_array_map_ops = { - .map_alloc = fd_array_map_alloc, + .map_alloc_check = fd_array_map_alloc_check, + .map_alloc = array_map_alloc, .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, @@ -592,7 +606,8 @@ static void cgroup_fd_array_free(struct bpf_map *map) } const struct bpf_map_ops cgroup_array_map_ops = { - .map_alloc = fd_array_map_alloc, + .map_alloc_check = fd_array_map_alloc_check, + .map_alloc = 
array_map_alloc, .map_free = cgroup_fd_array_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, @@ -610,7 +625,7 @@ static struct bpf_map *array_of_map_alloc(union bpf_attr *attr) if (IS_ERR(inner_map_meta)) return inner_map_meta; - map = fd_array_map_alloc(attr); + map = array_map_alloc(attr); if (IS_ERR(map)) { bpf_map_meta_free(inner_map_meta); return map; @@ -673,6 +688,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map, } const struct bpf_map_ops array_of_maps_map_ops = { + .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_of_map_alloc, .map_free = array_of_map_free, .map_get_next_key = array_map_get_next_key, diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index b789ab78d28f..43171a0bb02b 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -495,6 +495,42 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); /** + * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and + * provided by user sockaddr + * @sk: sock struct that will use sockaddr + * @uaddr: sockaddr struct provided by user + * @type: The type of program to be exectuted + * + * socket is expected to be of type INET or INET6. + * + * This function will return %-EPERM if an attached program is found and + * returned value != 1 during execution. In all other cases, 0 is returned. + */ +int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, + struct sockaddr *uaddr, + enum bpf_attach_type type) +{ + struct bpf_sock_addr_kern ctx = { + .sk = sk, + .uaddr = uaddr, + }; + struct cgroup *cgrp; + int ret; + + /* Check socket family since not all sockets represent network + * endpoint (e.g. AF_UNIX). + */ + if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) + return 0; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); + + return ret == 1 ? 
0 : -EPERM; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); + +/** * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock * @sk: socket to get cgroup from * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains @@ -545,7 +581,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission); static const struct bpf_func_proto * -cgroup_dev_func_proto(enum bpf_func_id func_id) +cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -566,8 +602,11 @@ cgroup_dev_func_proto(enum bpf_func_id func_id) static bool cgroup_dev_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { + const int size_default = sizeof(__u32); + if (type == BPF_WRITE) return false; @@ -576,8 +615,17 @@ static bool cgroup_dev_is_valid_access(int off, int size, /* The verifier guarantees that size > 0. 
*/ if (off % size != 0) return false; - if (size != sizeof(__u32)) - return false; + + switch (off) { + case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type): + bpf_ctx_record_field_size(info, size_default); + if (!bpf_ctx_narrow_access_ok(off, size, size_default)) + return false; + break; + default: + if (size != size_default) + return false; + } return true; } diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7949e8b8f94e..d315b393abdd 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -94,6 +94,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) fp->pages = size / PAGE_SIZE; fp->aux = aux; fp->aux->prog = fp; + fp->jit_requested = ebpf_jit_enabled(); INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode); @@ -217,30 +218,40 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) return 0; } -static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn) -{ - return BPF_CLASS(insn->code) == BPF_JMP && - /* Call and Exit are both special jumps with no - * target inside the BPF instruction image. - */ - BPF_OP(insn->code) != BPF_CALL && - BPF_OP(insn->code) != BPF_EXIT; -} - static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta) { struct bpf_insn *insn = prog->insnsi; u32 i, insn_cnt = prog->len; + bool pseudo_call; + u8 code; + int off; for (i = 0; i < insn_cnt; i++, insn++) { - if (!bpf_is_jmp_and_has_target(insn)) + code = insn->code; + if (BPF_CLASS(code) != BPF_JMP) continue; + if (BPF_OP(code) == BPF_EXIT) + continue; + if (BPF_OP(code) == BPF_CALL) { + if (insn->src_reg == BPF_PSEUDO_CALL) + pseudo_call = true; + else + continue; + } else { + pseudo_call = false; + } + off = pseudo_call ? insn->imm : insn->off; /* Adjust offset of jmps if we cross boundaries. 
*/ - if (i < pos && i + insn->off + 1 > pos) - insn->off += delta; - else if (i > pos + delta && i + insn->off + 1 <= pos + delta) - insn->off -= delta; + if (i < pos && i + off + 1 > pos) + off += delta; + else if (i > pos + delta && i + off + 1 <= pos + delta) + off -= delta; + + if (pseudo_call) + insn->imm = off; + else + insn->off = off; } } @@ -289,6 +300,11 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, } #ifdef CONFIG_BPF_JIT +/* All BPF JIT sysctl knobs here. */ +int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); +int bpf_jit_harden __read_mostly; +int bpf_jit_kallsyms __read_mostly; + static __always_inline void bpf_get_prog_addr_region(const struct bpf_prog *prog, unsigned long *symbol_start, @@ -370,8 +386,6 @@ static DEFINE_SPINLOCK(bpf_lock); static LIST_HEAD(bpf_kallsyms); static struct latch_tree_root bpf_tree __cacheline_aligned; -int bpf_jit_kallsyms __read_mostly; - static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux) { WARN_ON_ONCE(!list_empty(&aux->ksym_lnode)); @@ -552,8 +566,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp) bpf_prog_unlock_free(fp); } -int bpf_jit_harden __read_mostly; - static int bpf_jit_blind_insn(const struct bpf_insn *from, const struct bpf_insn *aux, struct bpf_insn *to_buff) @@ -711,7 +723,7 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) struct bpf_insn *insn; int i, rewritten; - if (!bpf_jit_blinding_enabled()) + if (!bpf_jit_blinding_enabled(prog) || prog->blinded) return prog; clone = bpf_prog_clone_create(prog, GFP_USER); @@ -753,13 +765,16 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) i += insn_delta; } + clone->blinded = 1; return clone; } #endif /* CONFIG_BPF_JIT */ /* Base function for offset calculation. Needs to go into .text section, * therefore keeping it non-static as well; will also be used by JITs - * anyway later on, so do not let the compiler omit it. 
+ * anyway later on, so do not let the compiler omit it. This also needs + * to go into kallsyms for correlation from e.g. bpftool, so naming + * must not change. */ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { @@ -767,6 +782,137 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) } EXPORT_SYMBOL_GPL(__bpf_call_base); +/* All UAPI available opcodes. */ +#define BPF_INSN_MAP(INSN_2, INSN_3) \ + /* 32 bit ALU operations. */ \ + /* Register based. */ \ + INSN_3(ALU, ADD, X), \ + INSN_3(ALU, SUB, X), \ + INSN_3(ALU, AND, X), \ + INSN_3(ALU, OR, X), \ + INSN_3(ALU, LSH, X), \ + INSN_3(ALU, RSH, X), \ + INSN_3(ALU, XOR, X), \ + INSN_3(ALU, MUL, X), \ + INSN_3(ALU, MOV, X), \ + INSN_3(ALU, DIV, X), \ + INSN_3(ALU, MOD, X), \ + INSN_2(ALU, NEG), \ + INSN_3(ALU, END, TO_BE), \ + INSN_3(ALU, END, TO_LE), \ + /* Immediate based. */ \ + INSN_3(ALU, ADD, K), \ + INSN_3(ALU, SUB, K), \ + INSN_3(ALU, AND, K), \ + INSN_3(ALU, OR, K), \ + INSN_3(ALU, LSH, K), \ + INSN_3(ALU, RSH, K), \ + INSN_3(ALU, XOR, K), \ + INSN_3(ALU, MUL, K), \ + INSN_3(ALU, MOV, K), \ + INSN_3(ALU, DIV, K), \ + INSN_3(ALU, MOD, K), \ + /* 64 bit ALU operations. */ \ + /* Register based. */ \ + INSN_3(ALU64, ADD, X), \ + INSN_3(ALU64, SUB, X), \ + INSN_3(ALU64, AND, X), \ + INSN_3(ALU64, OR, X), \ + INSN_3(ALU64, LSH, X), \ + INSN_3(ALU64, RSH, X), \ + INSN_3(ALU64, XOR, X), \ + INSN_3(ALU64, MUL, X), \ + INSN_3(ALU64, MOV, X), \ + INSN_3(ALU64, ARSH, X), \ + INSN_3(ALU64, DIV, X), \ + INSN_3(ALU64, MOD, X), \ + INSN_2(ALU64, NEG), \ + /* Immediate based. */ \ + INSN_3(ALU64, ADD, K), \ + INSN_3(ALU64, SUB, K), \ + INSN_3(ALU64, AND, K), \ + INSN_3(ALU64, OR, K), \ + INSN_3(ALU64, LSH, K), \ + INSN_3(ALU64, RSH, K), \ + INSN_3(ALU64, XOR, K), \ + INSN_3(ALU64, MUL, K), \ + INSN_3(ALU64, MOV, K), \ + INSN_3(ALU64, ARSH, K), \ + INSN_3(ALU64, DIV, K), \ + INSN_3(ALU64, MOD, K), \ + /* Call instruction. */ \ + INSN_2(JMP, CALL), \ + /* Exit instruction. 
*/ \ + INSN_2(JMP, EXIT), \ + /* Jump instructions. */ \ + /* Register based. */ \ + INSN_3(JMP, JEQ, X), \ + INSN_3(JMP, JNE, X), \ + INSN_3(JMP, JGT, X), \ + INSN_3(JMP, JLT, X), \ + INSN_3(JMP, JGE, X), \ + INSN_3(JMP, JLE, X), \ + INSN_3(JMP, JSGT, X), \ + INSN_3(JMP, JSLT, X), \ + INSN_3(JMP, JSGE, X), \ + INSN_3(JMP, JSLE, X), \ + INSN_3(JMP, JSET, X), \ + /* Immediate based. */ \ + INSN_3(JMP, JEQ, K), \ + INSN_3(JMP, JNE, K), \ + INSN_3(JMP, JGT, K), \ + INSN_3(JMP, JLT, K), \ + INSN_3(JMP, JGE, K), \ + INSN_3(JMP, JLE, K), \ + INSN_3(JMP, JSGT, K), \ + INSN_3(JMP, JSLT, K), \ + INSN_3(JMP, JSGE, K), \ + INSN_3(JMP, JSLE, K), \ + INSN_3(JMP, JSET, K), \ + INSN_2(JMP, JA), \ + /* Store instructions. */ \ + /* Register based. */ \ + INSN_3(STX, MEM, B), \ + INSN_3(STX, MEM, H), \ + INSN_3(STX, MEM, W), \ + INSN_3(STX, MEM, DW), \ + INSN_3(STX, XADD, W), \ + INSN_3(STX, XADD, DW), \ + /* Immediate based. */ \ + INSN_3(ST, MEM, B), \ + INSN_3(ST, MEM, H), \ + INSN_3(ST, MEM, W), \ + INSN_3(ST, MEM, DW), \ + /* Load instructions. */ \ + /* Register based. */ \ + INSN_3(LDX, MEM, B), \ + INSN_3(LDX, MEM, H), \ + INSN_3(LDX, MEM, W), \ + INSN_3(LDX, MEM, DW), \ + /* Immediate based. */ \ + INSN_3(LD, IMM, DW), \ + /* Misc (old cBPF carry-over). */ \ + INSN_3(LD, ABS, B), \ + INSN_3(LD, ABS, H), \ + INSN_3(LD, ABS, W), \ + INSN_3(LD, IND, B), \ + INSN_3(LD, IND, H), \ + INSN_3(LD, IND, W) + +bool bpf_opcode_in_insntable(u8 code) +{ +#define BPF_INSN_2_TBL(x, y) [BPF_##x | BPF_##y] = true +#define BPF_INSN_3_TBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = true + static const bool public_insntable[256] = { + [0 ... 255] = false, + /* Now overwrite non-defaults ... 
*/ + BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL), + }; +#undef BPF_INSN_3_TBL +#undef BPF_INSN_2_TBL + return public_insntable[code]; +} + #ifndef CONFIG_BPF_JIT_ALWAYS_ON /** * __bpf_prog_run - run eBPF program on a given context @@ -775,118 +921,21 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); * * Decode and execute eBPF instructions. */ -static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, - u64 *stack) +static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) { u64 tmp; +#define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y +#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z static const void *jumptable[256] = { [0 ... 255] = &&default_label, /* Now overwrite non-defaults ... */ - /* 32 bit ALU operations */ - [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X, - [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K, - [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X, - [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K, - [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X, - [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K, - [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X, - [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K, - [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X, - [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K, - [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X, - [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K, - [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X, - [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K, - [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X, - [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K, - [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X, - [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K, - [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X, - [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K, - [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X, - [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K, - [BPF_ALU | BPF_NEG] = &&ALU_NEG, - [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE, - [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE, - /* 64 bit ALU operations */ - [BPF_ALU64 | BPF_ADD | BPF_X] = 
&&ALU64_ADD_X, - [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K, - [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X, - [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K, - [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X, - [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K, - [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X, - [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K, - [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X, - [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K, - [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X, - [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K, - [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X, - [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K, - [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X, - [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K, - [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X, - [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K, - [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X, - [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K, - [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X, - [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K, - [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X, - [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K, - [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, - /* Call instruction */ - [BPF_JMP | BPF_CALL] = &&JMP_CALL, + BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL), + /* Non-UAPI available opcodes. 
*/ + [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS, [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, - /* Jumps */ - [BPF_JMP | BPF_JA] = &&JMP_JA, - [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, - [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K, - [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X, - [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K, - [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X, - [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K, - [BPF_JMP | BPF_JLT | BPF_X] = &&JMP_JLT_X, - [BPF_JMP | BPF_JLT | BPF_K] = &&JMP_JLT_K, - [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X, - [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K, - [BPF_JMP | BPF_JLE | BPF_X] = &&JMP_JLE_X, - [BPF_JMP | BPF_JLE | BPF_K] = &&JMP_JLE_K, - [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X, - [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K, - [BPF_JMP | BPF_JSLT | BPF_X] = &&JMP_JSLT_X, - [BPF_JMP | BPF_JSLT | BPF_K] = &&JMP_JSLT_K, - [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X, - [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K, - [BPF_JMP | BPF_JSLE | BPF_X] = &&JMP_JSLE_X, - [BPF_JMP | BPF_JSLE | BPF_K] = &&JMP_JSLE_K, - [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X, - [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K, - /* Program return */ - [BPF_JMP | BPF_EXIT] = &&JMP_EXIT, - /* Store instructions */ - [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B, - [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H, - [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W, - [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW, - [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W, - [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW, - [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B, - [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H, - [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W, - [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW, - /* Load instructions */ - [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B, - [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H, - [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W, - [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW, - [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W, - [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H, - [BPF_LD | 
BPF_ABS | BPF_B] = &&LD_ABS_B, - [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, - [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, - [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, - [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW, }; +#undef BPF_INSN_3_LBL +#undef BPF_INSN_2_LBL u32 tail_call_cnt = 0; void *ptr; int off; @@ -950,14 +999,10 @@ select_insn: (*(s64 *) &DST) >>= IMM; CONT; ALU64_MOD_X: - if (unlikely(SRC == 0)) - return 0; div64_u64_rem(DST, SRC, &tmp); DST = tmp; CONT; ALU_MOD_X: - if (unlikely((u32)SRC == 0)) - return 0; tmp = (u32) DST; DST = do_div(tmp, (u32) SRC); CONT; @@ -970,13 +1015,9 @@ select_insn: DST = do_div(tmp, (u32) IMM); CONT; ALU64_DIV_X: - if (unlikely(SRC == 0)) - return 0; DST = div64_u64(DST, SRC); CONT; ALU_DIV_X: - if (unlikely((u32)SRC == 0)) - return 0; tmp = (u32) DST; do_div(tmp, (u32) SRC); DST = (u32) tmp; @@ -1026,6 +1067,13 @@ select_insn: BPF_R4, BPF_R5); CONT; + JMP_CALL_ARGS: + BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2, + BPF_R3, BPF_R4, + BPF_R5, + insn + insn->off + 1); + CONT; + JMP_TAIL_CALL: { struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2; struct bpf_array *array = container_of(map, struct bpf_array, map); @@ -1280,8 +1328,14 @@ load_byte: goto load_byte; default_label: - /* If we ever reach this, we have a bug somewhere. */ - WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); + /* If we ever reach this, we have a bug somewhere. Die hard here + * instead of just returning 0; we could be somewhere in a subprog, + * so execution could continue otherwise which we do /not/ want. + * + * Note, verifier whitelists all opcodes in bpf_opcode_in_insntable(). 
+ */ + pr_warn("BPF interpreter: unknown opcode %02x\n", insn->code); + BUG_ON(1); return 0; } STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */ @@ -1298,6 +1352,23 @@ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn return ___bpf_prog_run(regs, insn, stack); \ } +#define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size +#define DEFINE_BPF_PROG_RUN_ARGS(stack_size) \ +static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \ + const struct bpf_insn *insn) \ +{ \ + u64 stack[stack_size / sizeof(u64)]; \ + u64 regs[MAX_BPF_REG]; \ +\ + FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ + BPF_R1 = r1; \ + BPF_R2 = r2; \ + BPF_R3 = r3; \ + BPF_R4 = r4; \ + BPF_R5 = r5; \ + return ___bpf_prog_run(regs, insn, stack); \ +} + #define EVAL1(FN, X) FN(X) #define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y) #define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y) @@ -1309,6 +1380,10 @@ EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192); EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384); EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512); +EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 32, 64, 96, 128, 160, 192); +EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 224, 256, 288, 320, 352, 384); +EVAL4(DEFINE_BPF_PROG_RUN_ARGS, 416, 448, 480, 512); + #define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size), static unsigned int (*interpreters[])(const void *ctx, @@ -1317,11 +1392,33 @@ EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) }; +#undef PROG_NAME_LIST +#define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size), +static u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, + const struct bpf_insn *insn) = { +EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) +EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) +EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) +}; +#undef PROG_NAME_LIST + +void bpf_patch_call_args(struct bpf_insn *insn, u32 
stack_depth) +{ + stack_depth = max_t(u32, stack_depth, 1); + insn->off = (s16) insn->imm; + insn->imm = interpreters_args[(round_up(stack_depth, 32) / 32) - 1] - + __bpf_call_base_args; + insn->code = BPF_JMP | BPF_CALL_ARGS; +} #else -static unsigned int __bpf_prog_ret0(const void *ctx, - const struct bpf_insn *insn) +static unsigned int __bpf_prog_ret0_warn(const void *ctx, + const struct bpf_insn *insn) { + /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON + * is not working properly, so warn about it! + */ + WARN_ON_ONCE(1); return 0; } #endif @@ -1329,6 +1426,9 @@ static unsigned int __bpf_prog_ret0(const void *ctx, bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { + if (fp->kprobe_override) + return false; + if (!array->owner_prog_type) { /* There's no owner yet where we could check for * compatibility. @@ -1378,7 +1478,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; #else - fp->bpf_func = __bpf_prog_ret0; + fp->bpf_func = __bpf_prog_ret0_warn; #endif /* eBPF JITs can rewrite the program in case constant @@ -1476,23 +1576,41 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, __u32 __user *prog_ids, u32 cnt) { struct bpf_prog **prog; - u32 i = 0, id; - + unsigned long err = 0; + u32 i = 0, *ids; + bool nospc; + + /* users of this function are doing: + * cnt = bpf_prog_array_length(); + * if (cnt > 0) + * bpf_prog_array_copy_to_user(..., cnt); + * so below kcalloc doesn't need extra cnt > 0 check, but + * bpf_prog_array_length() releases rcu lock and + * prog array could have been swapped with empty or larger array, + * so always copy 'cnt' prog_ids to the user. 
+ * In a rare race the user will see zero prog_ids + */ + ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN); + if (!ids) + return -ENOMEM; rcu_read_lock(); prog = rcu_dereference(progs)->progs; for (; *prog; prog++) { - id = (*prog)->aux->id; - if (copy_to_user(prog_ids + i, &id, sizeof(id))) { - rcu_read_unlock(); - return -EFAULT; - } + if (*prog == &dummy_bpf_prog.prog) + continue; + ids[i] = (*prog)->aux->id; if (++i == cnt) { prog++; break; } } + nospc = !!(*prog); rcu_read_unlock(); - if (*prog) + err = copy_to_user(prog_ids, ids, cnt * sizeof(u32)); + kfree(ids); + if (err) + return -EFAULT; + if (nospc) return -ENOSPC; return 0; } @@ -1564,14 +1682,41 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, return 0; } +int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, + __u32 __user *prog_ids, u32 request_cnt, + __u32 __user *prog_cnt) +{ + u32 cnt = 0; + + if (array) + cnt = bpf_prog_array_length(array); + + if (copy_to_user(prog_cnt, &cnt, sizeof(cnt))) + return -EFAULT; + + /* return early if user requested only program count or nothing to copy */ + if (!request_cnt || !cnt) + return 0; + + return bpf_prog_array_copy_to_user(array, prog_ids, request_cnt); +} + static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; + int i; aux = container_of(work, struct bpf_prog_aux, work); if (bpf_prog_is_dev_bound(aux)) bpf_prog_offload_destroy(aux->prog); - bpf_jit_free(aux->prog); + for (i = 0; i < aux->func_cnt; i++) + bpf_jit_free(aux->func[i]); + if (aux->func_cnt) { + kfree(aux->func); + bpf_prog_unlock_free(aux->prog); + } else { + bpf_jit_free(aux->prog); + } } /* Free internal BPF program */ diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index ce5b669003b2..a4bb0b34375a 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -94,13 +94,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) if (!cmap) return ERR_PTR(-ENOMEM); - /* mandatory map attributes */ - 
cmap->map.map_type = attr->map_type; - cmap->map.key_size = attr->key_size; - cmap->map.value_size = attr->value_size; - cmap->map.max_entries = attr->max_entries; - cmap->map.map_flags = attr->map_flags; - cmap->map.numa_node = bpf_map_attr_numa_node(attr); + bpf_map_init_from_attr(&cmap->map, attr); /* Pre-limit array size based on NR_CPUS, not final CPU check */ if (cmap->map.max_entries > NR_CPUS) { @@ -143,7 +137,7 @@ free_cmap: return ERR_PTR(err); } -void __cpu_map_queue_destructor(void *ptr) +static void __cpu_map_queue_destructor(void *ptr) { /* The tear-down procedure should have made sure that queue is * empty. See __cpu_map_entry_replace() and work-queue @@ -222,8 +216,8 @@ static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp) return xdp_pkt; } -struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, - struct xdp_pkt *xdp_pkt) +static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, + struct xdp_pkt *xdp_pkt) { unsigned int frame_size; void *pkt_data_start; @@ -337,9 +331,10 @@ static int cpu_map_kthread_run(void *data) return 0; } -struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id) +static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, + int map_id) { - gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN; + gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; int numa, err; @@ -395,7 +390,7 @@ free_rcu: return NULL; } -void __cpu_map_entry_free(struct rcu_head *rcu) +static void __cpu_map_entry_free(struct rcu_head *rcu) { struct bpf_cpu_map_entry *rcpu; int cpu; @@ -438,8 +433,8 @@ void __cpu_map_entry_free(struct rcu_head *rcu) * cpu_map_kthread_stop, which waits for an RCU graze period before * stopping kthread, emptying the queue. 
*/ -void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, - u32 key_cpu, struct bpf_cpu_map_entry *rcpu) +static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, + u32 key_cpu, struct bpf_cpu_map_entry *rcpu) { struct bpf_cpu_map_entry *old_rcpu; @@ -451,7 +446,7 @@ void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, } } -int cpu_map_delete_elem(struct bpf_map *map, void *key) +static int cpu_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); u32 key_cpu = *(u32 *)key; @@ -464,8 +459,8 @@ int cpu_map_delete_elem(struct bpf_map *map, void *key) return 0; } -int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) +static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); struct bpf_cpu_map_entry *rcpu; @@ -502,7 +497,7 @@ int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, return 0; } -void cpu_map_free(struct bpf_map *map) +static void cpu_map_free(struct bpf_map *map) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); int cpu; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index ebdef54bf7df..565f9ece9115 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -93,13 +93,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (!dtab) return ERR_PTR(-ENOMEM); - /* mandatory map attributes */ - dtab->map.map_type = attr->map_type; - dtab->map.key_size = attr->key_size; - dtab->map.value_size = attr->value_size; - dtab->map.max_entries = attr->max_entries; - dtab->map.map_flags = attr->map_flags; - dtab->map.numa_node = bpf_map_attr_numa_node(attr); + bpf_map_init_from_attr(&dtab->map, attr); /* make sure page count doesn't overflow */ cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index e682850c9715..d6b76377cb6e 
100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -21,10 +21,39 @@ static const char * const func_id_str[] = { }; #undef __BPF_FUNC_STR_FN -const char *func_id_name(int id) +static const char *__func_get_name(const struct bpf_insn_cbs *cbs, + const struct bpf_insn *insn, + char *buff, size_t len) { BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); + if (insn->src_reg != BPF_PSEUDO_CALL && + insn->imm >= 0 && insn->imm < __BPF_FUNC_MAX_ID && + func_id_str[insn->imm]) + return func_id_str[insn->imm]; + + if (cbs && cbs->cb_call) + return cbs->cb_call(cbs->private_data, insn); + + if (insn->src_reg == BPF_PSEUDO_CALL) + snprintf(buff, len, "%+d", insn->imm); + + return buff; +} + +static const char *__func_imm_name(const struct bpf_insn_cbs *cbs, + const struct bpf_insn *insn, + u64 full_imm, char *buff, size_t len) +{ + if (cbs && cbs->cb_imm) + return cbs->cb_imm(cbs->private_data, insn, full_imm); + + snprintf(buff, len, "0x%llx", (unsigned long long)full_imm); + return buff; +} + +const char *func_id_name(int id) +{ if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) return func_id_str[id]; else @@ -83,40 +112,43 @@ static const char *const bpf_jmp_string[16] = { [BPF_EXIT >> 4] = "exit", }; -static void print_bpf_end_insn(bpf_insn_print_cb verbose, - struct bpf_verifier_env *env, +static void print_bpf_end_insn(bpf_insn_print_t verbose, + void *private_data, const struct bpf_insn *insn) { - verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, + verbose(private_data, "(%02x) r%d = %s%d r%d\n", + insn->code, insn->dst_reg, BPF_SRC(insn->code) == BPF_TO_BE ? 
"be" : "le", insn->imm, insn->dst_reg); } -void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, - const struct bpf_insn *insn, bool allow_ptr_leaks) +void print_bpf_insn(const struct bpf_insn_cbs *cbs, + const struct bpf_insn *insn, + bool allow_ptr_leaks) { + const bpf_insn_print_t verbose = cbs->cb_print; u8 class = BPF_CLASS(insn->code); if (class == BPF_ALU || class == BPF_ALU64) { if (BPF_OP(insn->code) == BPF_END) { if (class == BPF_ALU64) - verbose(env, "BUG_alu64_%02x\n", insn->code); + verbose(cbs->private_data, "BUG_alu64_%02x\n", insn->code); else - print_bpf_end_insn(verbose, env, insn); + print_bpf_end_insn(verbose, cbs->private_data, insn); } else if (BPF_OP(insn->code) == BPF_NEG) { - verbose(env, "(%02x) r%d = %s-r%d\n", + verbose(cbs->private_data, "(%02x) r%d = %s-r%d\n", insn->code, insn->dst_reg, class == BPF_ALU ? "(u32) " : "", insn->dst_reg); } else if (BPF_SRC(insn->code) == BPF_X) { - verbose(env, "(%02x) %sr%d %s %sr%d\n", + verbose(cbs->private_data, "(%02x) %sr%d %s %sr%d\n", insn->code, class == BPF_ALU ? "(u32) " : "", insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], class == BPF_ALU ? "(u32) " : "", insn->src_reg); } else { - verbose(env, "(%02x) %sr%d %s %s%d\n", + verbose(cbs->private_data, "(%02x) %sr%d %s %s%d\n", insn->code, class == BPF_ALU ? 
"(u32) " : "", insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], @@ -125,46 +157,46 @@ void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, } } else if (class == BPF_STX) { if (BPF_MODE(insn->code) == BPF_MEM) - verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n", + verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); else if (BPF_MODE(insn->code) == BPF_XADD) - verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", + verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); else - verbose(env, "BUG_%02x\n", insn->code); + verbose(cbs->private_data, "BUG_%02x\n", insn->code); } else if (class == BPF_ST) { if (BPF_MODE(insn->code) != BPF_MEM) { - verbose(env, "BUG_st_%02x\n", insn->code); + verbose(cbs->private_data, "BUG_st_%02x\n", insn->code); return; } - verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n", + verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->imm); } else if (class == BPF_LDX) { if (BPF_MODE(insn->code) != BPF_MEM) { - verbose(env, "BUG_ldx_%02x\n", insn->code); + verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code); return; } - verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n", + verbose(cbs->private_data, "(%02x) r%d = *(%s *)(r%d %+d)\n", insn->code, insn->dst_reg, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->off); } else if (class == BPF_LD) { if (BPF_MODE(insn->code) == BPF_ABS) { - verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n", + verbose(cbs->private_data, "(%02x) r0 = *(%s *)skb[%d]\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->imm); } else if (BPF_MODE(insn->code) == BPF_IND) { - verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n", + verbose(cbs->private_data, "(%02x) r0 = *(%s 
*)skb[r%d + %d]\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->imm); @@ -175,40 +207,55 @@ void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, */ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; + char tmp[64]; if (map_ptr && !allow_ptr_leaks) imm = 0; - verbose(env, "(%02x) r%d = 0x%llx\n", insn->code, - insn->dst_reg, (unsigned long long)imm); + verbose(cbs->private_data, "(%02x) r%d = %s\n", + insn->code, insn->dst_reg, + __func_imm_name(cbs, insn, imm, + tmp, sizeof(tmp))); } else { - verbose(env, "BUG_ld_%02x\n", insn->code); + verbose(cbs->private_data, "BUG_ld_%02x\n", insn->code); return; } } else if (class == BPF_JMP) { u8 opcode = BPF_OP(insn->code); if (opcode == BPF_CALL) { - verbose(env, "(%02x) call %s#%d\n", insn->code, - func_id_name(insn->imm), insn->imm); + char tmp[64]; + + if (insn->src_reg == BPF_PSEUDO_CALL) { + verbose(cbs->private_data, "(%02x) call pc%s\n", + insn->code, + __func_get_name(cbs, insn, + tmp, sizeof(tmp))); + } else { + strcpy(tmp, "unknown"); + verbose(cbs->private_data, "(%02x) call %s#%d\n", insn->code, + __func_get_name(cbs, insn, + tmp, sizeof(tmp)), + insn->imm); + } } else if (insn->code == (BPF_JMP | BPF_JA)) { - verbose(env, "(%02x) goto pc%+d\n", + verbose(cbs->private_data, "(%02x) goto pc%+d\n", insn->code, insn->off); } else if (insn->code == (BPF_JMP | BPF_EXIT)) { - verbose(env, "(%02x) exit\n", insn->code); + verbose(cbs->private_data, "(%02x) exit\n", insn->code); } else if (BPF_SRC(insn->code) == BPF_X) { - verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n", + verbose(cbs->private_data, "(%02x) if r%d %s r%d goto pc%+d\n", insn->code, insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], insn->src_reg, insn->off); } else { - verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n", + verbose(cbs->private_data, "(%02x) if r%d %s 0x%x goto pc%+d\n", insn->code, insn->dst_reg, 
bpf_jmp_string[BPF_OP(insn->code) >> 4], insn->imm, insn->off); } } else { - verbose(env, "(%02x) %s\n", + verbose(cbs->private_data, "(%02x) %s\n", insn->code, bpf_class_string[class]); } } diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h index 8de977e420b6..e1324a834a24 100644 --- a/kernel/bpf/disasm.h +++ b/kernel/bpf/disasm.h @@ -17,16 +17,32 @@ #include <linux/bpf.h> #include <linux/kernel.h> #include <linux/stringify.h> +#ifndef __KERNEL__ +#include <stdio.h> +#include <string.h> +#endif extern const char *const bpf_alu_string[16]; extern const char *const bpf_class_string[8]; const char *func_id_name(int id); -struct bpf_verifier_env; -typedef void (*bpf_insn_print_cb)(struct bpf_verifier_env *env, - const char *, ...); -void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, - const struct bpf_insn *insn, bool allow_ptr_leaks); +typedef __printf(2, 3) void (*bpf_insn_print_t)(void *private_data, + const char *, ...); +typedef const char *(*bpf_insn_revmap_call_t)(void *private_data, + const struct bpf_insn *insn); +typedef const char *(*bpf_insn_print_imm_t)(void *private_data, + const struct bpf_insn *insn, + __u64 full_imm); + +struct bpf_insn_cbs { + bpf_insn_print_t cb_print; + bpf_insn_revmap_call_t cb_call; + bpf_insn_print_imm_t cb_imm; + void *private_data; +}; +void print_bpf_insn(const struct bpf_insn_cbs *cbs, + const struct bpf_insn *insn, + bool allow_ptr_leaks); #endif diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3905d4bc5b80..b76828f23b49 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -227,7 +227,7 @@ static int alloc_extra_elems(struct bpf_htab *htab) } /* Called from syscall */ -static struct bpf_map *htab_map_alloc(union bpf_attr *attr) +static int htab_map_alloc_check(union bpf_attr *attr) { bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); @@ -241,9 +241,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) 
bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); int numa_node = bpf_map_attr_numa_node(attr); - struct bpf_htab *htab; - int err, i; - u64 cost; BUILD_BUG_ON(offsetof(struct htab_elem, htab) != offsetof(struct htab_elem, hash_node.pprev)); @@ -254,40 +251,68 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) /* LRU implementation is much complicated than other * maps. Hence, limit to CAP_SYS_ADMIN for now. */ - return ERR_PTR(-EPERM); + return -EPERM; if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK) /* reserved bits should not be used */ - return ERR_PTR(-EINVAL); + return -EINVAL; if (!lru && percpu_lru) - return ERR_PTR(-EINVAL); + return -EINVAL; if (lru && !prealloc) - return ERR_PTR(-ENOTSUPP); + return -ENOTSUPP; if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru)) - return ERR_PTR(-EINVAL); + return -EINVAL; + + /* check sanity of attributes. + * value_size == 0 may be allowed in the future to use map as a set + */ + if (attr->max_entries == 0 || attr->key_size == 0 || + attr->value_size == 0) + return -EINVAL; + + if (attr->key_size > MAX_BPF_STACK) + /* eBPF programs initialize keys on stack, so they cannot be + * larger than max stack size + */ + return -E2BIG; + + if (attr->value_size >= KMALLOC_MAX_SIZE - + MAX_BPF_STACK - sizeof(struct htab_elem)) + /* if value_size is bigger, the user space won't be able to + * access the elements via bpf syscall. This check also makes + * sure that the elem_size doesn't overflow and it's + * kmalloc-able later in htab_map_update_elem() + */ + return -E2BIG; + + return 0; +} + +static struct bpf_map *htab_map_alloc(union bpf_attr *attr) +{ + bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || + attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); + bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || + attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); + /* percpu_lru means each cpu has its own LRU list. 
+ * it is different from BPF_MAP_TYPE_PERCPU_HASH where + * the map's value itself is percpu. percpu_lru has + * nothing to do with the map's value. + */ + bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); + bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); + struct bpf_htab *htab; + int err, i; + u64 cost; htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) return ERR_PTR(-ENOMEM); - /* mandatory map attributes */ - htab->map.map_type = attr->map_type; - htab->map.key_size = attr->key_size; - htab->map.value_size = attr->value_size; - htab->map.max_entries = attr->max_entries; - htab->map.map_flags = attr->map_flags; - htab->map.numa_node = numa_node; - - /* check sanity of attributes. - * value_size == 0 may be allowed in the future to use map as a set - */ - err = -EINVAL; - if (htab->map.max_entries == 0 || htab->map.key_size == 0 || - htab->map.value_size == 0) - goto free_htab; + bpf_map_init_from_attr(&htab->map, attr); if (percpu_lru) { /* ensure each CPU's lru list has >=1 elements. @@ -304,22 +329,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) /* hash table size must be power of 2 */ htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); - err = -E2BIG; - if (htab->map.key_size > MAX_BPF_STACK) - /* eBPF programs initialize keys on stack, so they cannot be - * larger than max stack size - */ - goto free_htab; - - if (htab->map.value_size >= KMALLOC_MAX_SIZE - - MAX_BPF_STACK - sizeof(struct htab_elem)) - /* if value_size is bigger, the user space won't be able to - * access the elements via bpf syscall. 
This check also makes - * sure that the elem_size doesn't overflow and it's - * kmalloc-able later in htab_map_update_elem() - */ - goto free_htab; - htab->elem_size = sizeof(struct htab_elem) + round_up(htab->map.key_size, 8); if (percpu) @@ -327,6 +336,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) else htab->elem_size += round_up(htab->map.value_size, 8); + err = -E2BIG; /* prevent zero size kmalloc and check for u32 overflow */ if (htab->n_buckets == 0 || htab->n_buckets > U32_MAX / sizeof(struct bucket)) @@ -1143,6 +1153,7 @@ static void htab_map_free(struct bpf_map *map) } const struct bpf_map_ops htab_map_ops = { + .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1153,6 +1164,7 @@ const struct bpf_map_ops htab_map_ops = { }; const struct bpf_map_ops htab_lru_map_ops = { + .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1236,6 +1248,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, } const struct bpf_map_ops htab_percpu_map_ops = { + .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1245,6 +1258,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { }; const struct bpf_map_ops htab_lru_percpu_map_ops = { + .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1253,11 +1267,11 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_delete_elem = htab_lru_map_delete_elem, }; -static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr) +static int fd_htab_map_alloc_check(union bpf_attr *attr) { if (attr->value_size != sizeof(u32)) - return ERR_PTR(-EINVAL); - return htab_map_alloc(attr); + return -EINVAL; + return htab_map_alloc_check(attr); } 
static void fd_htab_map_free(struct bpf_map *map) @@ -1328,7 +1342,7 @@ static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr) if (IS_ERR(inner_map_meta)) return inner_map_meta; - map = fd_htab_map_alloc(attr); + map = htab_map_alloc(attr); if (IS_ERR(map)) { bpf_map_meta_free(inner_map_meta); return map; @@ -1372,6 +1386,7 @@ static void htab_of_map_free(struct bpf_map *map) } const struct bpf_map_ops htab_of_maps_map_ops = { + .map_alloc_check = fd_htab_map_alloc_check, .map_alloc = htab_of_map_alloc, .map_free = htab_of_map_free, .map_get_next_key = htab_map_get_next_key, diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 81e2f6995adb..bf6da59ae0d0 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -178,6 +178,9 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) static struct dentry * bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) { + /* Dots in names (e.g. "/sys/fs/bpf/foo.bar") are reserved for future + * extensions. 
+ */ if (strchr(dentry->d_name.name, '.')) return ERR_PTR(-EPERM); diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 885e45479680..b4b5b81e7251 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -522,12 +522,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); /* copy mandatory map attributes */ - trie->map.map_type = attr->map_type; - trie->map.key_size = attr->key_size; - trie->map.value_size = attr->value_size; - trie->map.max_entries = attr->max_entries; - trie->map.map_flags = attr->map_flags; - trie->map.numa_node = bpf_map_attr_numa_node(attr); + bpf_map_init_from_attr(&trie->map, attr); trie->data_size = attr->key_size - offsetof(struct bpf_lpm_trie_key, data); trie->max_prefixlen = trie->data_size * 8; @@ -560,7 +555,10 @@ static void trie_free(struct bpf_map *map) struct lpm_trie_node __rcu **slot; struct lpm_trie_node *node; - raw_spin_lock(&trie->lock); + /* Wait for outstanding programs to complete + * update/lookup/delete/get_next_key and free the trie. + */ + synchronize_rcu(); /* Always start at the root and walk down to a node that has no * children. 
Then free that node, nullify its reference in the parent @@ -571,10 +569,9 @@ static void trie_free(struct bpf_map *map) slot = &trie->root; for (;;) { - node = rcu_dereference_protected(*slot, - lockdep_is_held(&trie->lock)); + node = rcu_dereference_protected(*slot, 1); if (!node) - goto unlock; + goto out; if (rcu_access_pointer(node->child[0])) { slot = &node->child[0]; @@ -592,13 +589,100 @@ static void trie_free(struct bpf_map *map) } } -unlock: - raw_spin_unlock(&trie->lock); +out: + kfree(trie); } -static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key) +static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) { - return -ENOTSUPP; + struct lpm_trie_node *node, *next_node = NULL, *parent, *search_root; + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct bpf_lpm_trie_key *key = _key, *next_key = _next_key; + struct lpm_trie_node **node_stack = NULL; + int err = 0, stack_ptr = -1; + unsigned int next_bit; + size_t matchlen; + + /* The get_next_key follows postorder. For the 4 node example in + * the top of this file, the trie_get_next_key() returns the following + * one after another: + * 192.168.0.0/24 + * 192.168.1.0/24 + * 192.168.128.0/24 + * 192.168.0.0/16 + * + * The idea is to return more specific keys before less specific ones. 
+ */ + + /* Empty trie */ + search_root = rcu_dereference(trie->root); + if (!search_root) + return -ENOENT; + + /* For invalid key, find the leftmost node in the trie */ + if (!key || key->prefixlen > trie->max_prefixlen) + goto find_leftmost; + + node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node *), + GFP_ATOMIC | __GFP_NOWARN); + if (!node_stack) + return -ENOMEM; + + /* Try to find the exact node for the given key */ + for (node = search_root; node;) { + node_stack[++stack_ptr] = node; + matchlen = longest_prefix_match(trie, node, key); + if (node->prefixlen != matchlen || + node->prefixlen == key->prefixlen) + break; + + next_bit = extract_bit(key->data, node->prefixlen); + node = rcu_dereference(node->child[next_bit]); + } + if (!node || node->prefixlen != key->prefixlen || + (node->flags & LPM_TREE_NODE_FLAG_IM)) + goto find_leftmost; + + /* The node with the exactly-matching key has been found, + * find the first node in postorder after the matched node. + */ + node = node_stack[stack_ptr]; + while (stack_ptr > 0) { + parent = node_stack[stack_ptr - 1]; + if (rcu_dereference(parent->child[0]) == node) { + search_root = rcu_dereference(parent->child[1]); + if (search_root) + goto find_leftmost; + } + if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) { + next_node = parent; + goto do_copy; + } + + node = parent; + stack_ptr--; + } + + /* did not find anything */ + err = -ENOENT; + goto free_stack; + +find_leftmost: + /* Find the leftmost non-intermediate node, all intermediate nodes + * have exact two children, so this function will never return NULL. 
+ */ + for (node = search_root; node;) { + if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) + next_node = node; + node = rcu_dereference(node->child[0]); + } +do_copy: + next_key->prefixlen = next_node->prefixlen; + memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key, data), + next_node->data, trie->data_size); +free_stack: + kfree(node_stack); + return err; } const struct bpf_map_ops trie_map_ops = { diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 8455b89d1bbf..c9401075b58c 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -16,18 +16,35 @@ #include <linux/bpf.h> #include <linux/bpf_verifier.h> #include <linux/bug.h> +#include <linux/kdev_t.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/printk.h> +#include <linux/proc_ns.h> #include <linux/rtnetlink.h> +#include <linux/rwsem.h> -/* protected by RTNL */ +/* Protects bpf_prog_offload_devs, bpf_map_offload_devs and offload members + * of all progs. + * RTNL lock cannot be taken when holding this lock. 
+ */ +static DECLARE_RWSEM(bpf_devs_lock); static LIST_HEAD(bpf_prog_offload_devs); +static LIST_HEAD(bpf_map_offload_devs); + +static int bpf_dev_offload_check(struct net_device *netdev) +{ + if (!netdev) + return -EINVAL; + if (!netdev->netdev_ops->ndo_bpf) + return -EOPNOTSUPP; + return 0; +} int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) { - struct net *net = current->nsproxy->net_ns; - struct bpf_dev_offload *offload; + struct bpf_prog_offload *offload; + int err; if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && attr->prog_type != BPF_PROG_TYPE_XDP) @@ -41,34 +58,44 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) return -ENOMEM; offload->prog = prog; - init_waitqueue_head(&offload->verifier_done); - rtnl_lock(); - offload->netdev = __dev_get_by_index(net, attr->prog_ifindex); - if (!offload->netdev) { - rtnl_unlock(); - kfree(offload); - return -EINVAL; - } + offload->netdev = dev_get_by_index(current->nsproxy->net_ns, + attr->prog_ifindex); + err = bpf_dev_offload_check(offload->netdev); + if (err) + goto err_maybe_put; + down_write(&bpf_devs_lock); + if (offload->netdev->reg_state != NETREG_REGISTERED) { + err = -EINVAL; + goto err_unlock; + } prog->aux->offload = offload; list_add_tail(&offload->offloads, &bpf_prog_offload_devs); - rtnl_unlock(); + dev_put(offload->netdev); + up_write(&bpf_devs_lock); return 0; +err_unlock: + up_write(&bpf_devs_lock); +err_maybe_put: + if (offload->netdev) + dev_put(offload->netdev); + kfree(offload); + return err; } static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, struct netdev_bpf *data) { - struct net_device *netdev = prog->aux->offload->netdev; + struct bpf_prog_offload *offload = prog->aux->offload; + struct net_device *netdev; ASSERT_RTNL(); - if (!netdev) + if (!offload) return -ENODEV; - if (!netdev->netdev_ops->ndo_bpf) - return -EOPNOTSUPP; + netdev = offload->netdev; data->command = cmd; @@ -87,62 +114,63 @@ int 
bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) if (err) goto exit_unlock; - env->dev_ops = data.verifier.ops; - + env->prog->aux->offload->dev_ops = data.verifier.ops; env->prog->aux->offload->dev_state = true; - env->prog->aux->offload->verifier_running = true; exit_unlock: rtnl_unlock(); return err; } +int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, + int insn_idx, int prev_insn_idx) +{ + struct bpf_prog_offload *offload; + int ret = -ENODEV; + + down_read(&bpf_devs_lock); + offload = env->prog->aux->offload; + if (offload) + ret = offload->dev_ops->insn_hook(env, insn_idx, prev_insn_idx); + up_read(&bpf_devs_lock); + + return ret; +} + static void __bpf_prog_offload_destroy(struct bpf_prog *prog) { - struct bpf_dev_offload *offload = prog->aux->offload; + struct bpf_prog_offload *offload = prog->aux->offload; struct netdev_bpf data = {}; - /* Caution - if netdev is destroyed before the program, this function - * will be called twice. - */ - data.offload.prog = prog; - if (offload->verifier_running) - wait_event(offload->verifier_done, !offload->verifier_running); - if (offload->dev_state) WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data)); - offload->dev_state = false; + /* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */ + bpf_prog_free_id(prog, true); + list_del_init(&offload->offloads); - offload->netdev = NULL; + kfree(offload); + prog->aux->offload = NULL; } void bpf_prog_offload_destroy(struct bpf_prog *prog) { - struct bpf_dev_offload *offload = prog->aux->offload; - - offload->verifier_running = false; - wake_up(&offload->verifier_done); - rtnl_lock(); - __bpf_prog_offload_destroy(prog); + down_write(&bpf_devs_lock); + if (prog->aux->offload) + __bpf_prog_offload_destroy(prog); + up_write(&bpf_devs_lock); rtnl_unlock(); - - kfree(offload); } static int bpf_prog_offload_translate(struct bpf_prog *prog) { - struct bpf_dev_offload *offload = prog->aux->offload; struct netdev_bpf data = {}; int ret; 
data.offload.prog = prog; - offload->verifier_running = false; - wake_up(&offload->verifier_done); - rtnl_lock(); ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data); rtnl_unlock(); @@ -164,14 +192,323 @@ int bpf_prog_offload_compile(struct bpf_prog *prog) return bpf_prog_offload_translate(prog); } +struct ns_get_path_bpf_prog_args { + struct bpf_prog *prog; + struct bpf_prog_info *info; +}; + +static struct ns_common *bpf_prog_offload_info_fill_ns(void *private_data) +{ + struct ns_get_path_bpf_prog_args *args = private_data; + struct bpf_prog_aux *aux = args->prog->aux; + struct ns_common *ns; + struct net *net; + + rtnl_lock(); + down_read(&bpf_devs_lock); + + if (aux->offload) { + args->info->ifindex = aux->offload->netdev->ifindex; + net = dev_net(aux->offload->netdev); + get_net(net); + ns = &net->ns; + } else { + args->info->ifindex = 0; + ns = NULL; + } + + up_read(&bpf_devs_lock); + rtnl_unlock(); + + return ns; +} + +int bpf_prog_offload_info_fill(struct bpf_prog_info *info, + struct bpf_prog *prog) +{ + struct ns_get_path_bpf_prog_args args = { + .prog = prog, + .info = info, + }; + struct bpf_prog_aux *aux = prog->aux; + struct inode *ns_inode; + struct path ns_path; + char __user *uinsns; + void *res; + u32 ulen; + + res = ns_get_path_cb(&ns_path, bpf_prog_offload_info_fill_ns, &args); + if (IS_ERR(res)) { + if (!info->ifindex) + return -ENODEV; + return PTR_ERR(res); + } + + down_read(&bpf_devs_lock); + + if (!aux->offload) { + up_read(&bpf_devs_lock); + return -ENODEV; + } + + ulen = info->jited_prog_len; + info->jited_prog_len = aux->offload->jited_len; + if (info->jited_prog_len & ulen) { + uinsns = u64_to_user_ptr(info->jited_prog_insns); + ulen = min_t(u32, info->jited_prog_len, ulen); + if (copy_to_user(uinsns, aux->offload->jited_image, ulen)) { + up_read(&bpf_devs_lock); + return -EFAULT; + } + } + + up_read(&bpf_devs_lock); + + ns_inode = ns_path.dentry->d_inode; + info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev); + 
info->netns_ino = ns_inode->i_ino; + path_put(&ns_path); + + return 0; +} + const struct bpf_prog_ops bpf_offload_prog_ops = { }; +static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap, + enum bpf_netdev_command cmd) +{ + struct netdev_bpf data = {}; + struct net_device *netdev; + + ASSERT_RTNL(); + + data.command = cmd; + data.offmap = offmap; + /* Caller must make sure netdev is valid */ + netdev = offmap->netdev; + + return netdev->netdev_ops->ndo_bpf(netdev, &data); +} + +struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) +{ + struct net *net = current->nsproxy->net_ns; + struct bpf_offloaded_map *offmap; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + if (attr->map_type != BPF_MAP_TYPE_ARRAY && + attr->map_type != BPF_MAP_TYPE_HASH) + return ERR_PTR(-EINVAL); + + offmap = kzalloc(sizeof(*offmap), GFP_USER); + if (!offmap) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&offmap->map, attr); + + rtnl_lock(); + down_write(&bpf_devs_lock); + offmap->netdev = __dev_get_by_index(net, attr->map_ifindex); + err = bpf_dev_offload_check(offmap->netdev); + if (err) + goto err_unlock; + + err = bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_ALLOC); + if (err) + goto err_unlock; + + list_add_tail(&offmap->offloads, &bpf_map_offload_devs); + up_write(&bpf_devs_lock); + rtnl_unlock(); + + return &offmap->map; + +err_unlock: + up_write(&bpf_devs_lock); + rtnl_unlock(); + kfree(offmap); + return ERR_PTR(err); +} + +static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap) +{ + WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE)); + /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */ + bpf_map_free_id(&offmap->map, true); + list_del_init(&offmap->offloads); + offmap->netdev = NULL; +} + +void bpf_map_offload_map_free(struct bpf_map *map) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + + rtnl_lock(); + down_write(&bpf_devs_lock); + if (offmap->netdev) + __bpf_map_offload_destroy(offmap); 
+ up_write(&bpf_devs_lock); + rtnl_unlock(); + + kfree(offmap); +} + +int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + int ret = -ENODEV; + + down_read(&bpf_devs_lock); + if (offmap->netdev) + ret = offmap->dev_ops->map_lookup_elem(offmap, key, value); + up_read(&bpf_devs_lock); + + return ret; +} + +int bpf_map_offload_update_elem(struct bpf_map *map, + void *key, void *value, u64 flags) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + int ret = -ENODEV; + + if (unlikely(flags > BPF_EXIST)) + return -EINVAL; + + down_read(&bpf_devs_lock); + if (offmap->netdev) + ret = offmap->dev_ops->map_update_elem(offmap, key, value, + flags); + up_read(&bpf_devs_lock); + + return ret; +} + +int bpf_map_offload_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + int ret = -ENODEV; + + down_read(&bpf_devs_lock); + if (offmap->netdev) + ret = offmap->dev_ops->map_delete_elem(offmap, key); + up_read(&bpf_devs_lock); + + return ret; +} + +int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + int ret = -ENODEV; + + down_read(&bpf_devs_lock); + if (offmap->netdev) + ret = offmap->dev_ops->map_get_next_key(offmap, key, next_key); + up_read(&bpf_devs_lock); + + return ret; +} + +struct ns_get_path_bpf_map_args { + struct bpf_offloaded_map *offmap; + struct bpf_map_info *info; +}; + +static struct ns_common *bpf_map_offload_info_fill_ns(void *private_data) +{ + struct ns_get_path_bpf_map_args *args = private_data; + struct ns_common *ns; + struct net *net; + + rtnl_lock(); + down_read(&bpf_devs_lock); + + if (args->offmap->netdev) { + args->info->ifindex = args->offmap->netdev->ifindex; + net = dev_net(args->offmap->netdev); + get_net(net); + ns = &net->ns; + } else { + args->info->ifindex = 0; + ns = NULL; + } + + up_read(&bpf_devs_lock); + 
rtnl_unlock(); + + return ns; +} + +int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map) +{ + struct ns_get_path_bpf_map_args args = { + .offmap = map_to_offmap(map), + .info = info, + }; + struct inode *ns_inode; + struct path ns_path; + void *res; + + res = ns_get_path_cb(&ns_path, bpf_map_offload_info_fill_ns, &args); + if (IS_ERR(res)) { + if (!info->ifindex) + return -ENODEV; + return PTR_ERR(res); + } + + ns_inode = ns_path.dentry->d_inode; + info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev); + info->netns_ino = ns_inode->i_ino; + path_put(&ns_path); + + return 0; +} + +bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map) +{ + struct bpf_offloaded_map *offmap; + struct bpf_prog_offload *offload; + bool ret; + + if (!bpf_prog_is_dev_bound(prog->aux) || !bpf_map_is_dev_bound(map)) + return false; + + down_read(&bpf_devs_lock); + offload = prog->aux->offload; + offmap = map_to_offmap(map); + + ret = offload && offload->netdev == offmap->netdev; + up_read(&bpf_devs_lock); + + return ret; +} + +static void bpf_offload_orphan_all_progs(struct net_device *netdev) +{ + struct bpf_prog_offload *offload, *tmp; + + list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, offloads) + if (offload->netdev == netdev) + __bpf_prog_offload_destroy(offload->prog); +} + +static void bpf_offload_orphan_all_maps(struct net_device *netdev) +{ + struct bpf_offloaded_map *offmap, *tmp; + + list_for_each_entry_safe(offmap, tmp, &bpf_map_offload_devs, offloads) + if (offmap->netdev == netdev) + __bpf_map_offload_destroy(offmap); +} + static int bpf_offload_notification(struct notifier_block *notifier, ulong event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); - struct bpf_dev_offload *offload, *tmp; ASSERT_RTNL(); @@ -181,11 +518,10 @@ static int bpf_offload_notification(struct notifier_block *notifier, if (netdev->reg_state != NETREG_UNREGISTERING) break; - list_for_each_entry_safe(offload, tmp, 
&bpf_prog_offload_devs, - offloads) { - if (offload->netdev == netdev) - __bpf_prog_offload_destroy(offload->prog); - } + down_write(&bpf_devs_lock); + bpf_offload_orphan_all_progs(netdev); + bpf_offload_orphan_all_maps(netdev); + up_write(&bpf_devs_lock); break; default: break; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 1712d319c2d8..8dd9210d7db7 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -38,8 +38,11 @@ #include <linux/skbuff.h> #include <linux/workqueue.h> #include <linux/list.h> +#include <linux/mm.h> #include <net/strparser.h> #include <net/tcp.h> +#include <linux/ptr_ring.h> +#include <net/inet_common.h> #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) @@ -47,6 +50,7 @@ struct bpf_stab { struct bpf_map map; struct sock **sock_map; + struct bpf_prog *bpf_tx_msg; struct bpf_prog *bpf_parse; struct bpf_prog *bpf_verdict; }; @@ -62,8 +66,7 @@ struct smap_psock_map_entry { struct smap_psock { struct rcu_head rcu; - /* refcnt is used inside sk_callback_lock */ - u32 refcnt; + refcount_t refcnt; /* datapath variables */ struct sk_buff_head rxqueue; @@ -74,7 +77,17 @@ struct smap_psock { int save_off; struct sk_buff *save_skb; + /* datapath variables for tx_msg ULP */ + struct sock *sk_redir; + int apply_bytes; + int cork_bytes; + int sg_size; + int eval; + struct sk_msg_buff *cork; + struct list_head ingress; + struct strparser strp; + struct bpf_prog *bpf_tx_msg; struct bpf_prog *bpf_parse; struct bpf_prog *bpf_verdict; struct list_head maps; @@ -86,30 +99,939 @@ struct smap_psock { struct work_struct tx_work; struct work_struct gc_work; + struct proto *sk_proto; + void (*save_close)(struct sock *sk, long timeout); void (*save_data_ready)(struct sock *sk); void (*save_write_space)(struct sock *sk); - void (*save_state_change)(struct sock *sk); }; +static void smap_release_sock(struct smap_psock *psock, struct sock *sock); +static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t 
len, + int nonblock, int flags, int *addr_len); +static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); +static int bpf_tcp_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags); + static inline struct smap_psock *smap_psock_sk(const struct sock *sk) { return rcu_dereference_sk_user_data(sk); } -/* compute the linear packet data range [data, data_end) for skb when - * sk_skb type programs are in use. - */ -static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb) +static bool bpf_tcp_stream_read(const struct sock *sk) +{ + struct smap_psock *psock; + bool empty = true; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto out; + empty = list_empty(&psock->ingress); +out: + rcu_read_unlock(); + return !empty; +} + +static struct proto tcp_bpf_proto; +static int bpf_tcp_init(struct sock *sk) +{ + struct smap_psock *psock; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + return -EINVAL; + } + + if (unlikely(psock->sk_proto)) { + rcu_read_unlock(); + return -EBUSY; + } + + psock->save_close = sk->sk_prot->close; + psock->sk_proto = sk->sk_prot; + + if (psock->bpf_tx_msg) { + tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg; + tcp_bpf_proto.sendpage = bpf_tcp_sendpage; + tcp_bpf_proto.recvmsg = bpf_tcp_recvmsg; + tcp_bpf_proto.stream_memory_read = bpf_tcp_stream_read; + } + + sk->sk_prot = &tcp_bpf_proto; + rcu_read_unlock(); + return 0; +} + +static void smap_release_sock(struct smap_psock *psock, struct sock *sock); +static int free_start_sg(struct sock *sk, struct sk_msg_buff *md); + +static void bpf_tcp_release(struct sock *sk) +{ + struct smap_psock *psock; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto out; + + if (psock->cork) { + free_start_sg(psock->sock, psock->cork); + kfree(psock->cork); + psock->cork = NULL; + } + + if (psock->sk_proto) { + sk->sk_prot = psock->sk_proto; + psock->sk_proto = NULL; 
+ } +out: + rcu_read_unlock(); +} + +static void bpf_tcp_close(struct sock *sk, long timeout) { - TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb); + void (*close_fun)(struct sock *sk, long timeout); + struct smap_psock_map_entry *e, *tmp; + struct sk_msg_buff *md, *mtmp; + struct smap_psock *psock; + struct sock *osk; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + return sk->sk_prot->close(sk, timeout); + } + + /* The psock may be destroyed anytime after exiting the RCU critial + * section so by the time we use close_fun the psock may no longer + * be valid. However, bpf_tcp_close is called with the sock lock + * held so the close hook and sk are still valid. + */ + close_fun = psock->save_close; + + write_lock_bh(&sk->sk_callback_lock); + if (psock->cork) { + free_start_sg(psock->sock, psock->cork); + kfree(psock->cork); + psock->cork = NULL; + } + + list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { + list_del(&md->list); + free_start_sg(psock->sock, md); + kfree(md); + } + + list_for_each_entry_safe(e, tmp, &psock->maps, list) { + osk = cmpxchg(e->entry, sk, NULL); + if (osk == sk) { + list_del(&e->list); + smap_release_sock(psock, sk); + } + } + write_unlock_bh(&sk->sk_callback_lock); + rcu_read_unlock(); + close_fun(sk, timeout); } enum __sk_action { __SK_DROP = 0, __SK_PASS, __SK_REDIRECT, + __SK_NONE, }; +static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = { + .name = "bpf_tcp", + .uid = TCP_ULP_BPF, + .user_visible = false, + .owner = NULL, + .init = bpf_tcp_init, + .release = bpf_tcp_release, +}; + +static int memcopy_from_iter(struct sock *sk, + struct sk_msg_buff *md, + struct iov_iter *from, int bytes) +{ + struct scatterlist *sg = md->sg_data; + int i = md->sg_curr, rc = -ENOSPC; + + do { + int copy; + char *to; + + if (md->sg_copybreak >= sg[i].length) { + md->sg_copybreak = 0; + + if (++i == MAX_SKB_FRAGS) + i = 0; + + if (i == md->sg_end) + break; + } + + copy = 
sg[i].length - md->sg_copybreak; + to = sg_virt(&sg[i]) + md->sg_copybreak; + md->sg_copybreak += copy; + + if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) + rc = copy_from_iter_nocache(to, copy, from); + else + rc = copy_from_iter(to, copy, from); + + if (rc != copy) { + rc = -EFAULT; + goto out; + } + + bytes -= copy; + if (!bytes) + break; + + md->sg_copybreak = 0; + if (++i == MAX_SKB_FRAGS) + i = 0; + } while (i != md->sg_end); +out: + md->sg_curr = i; + return rc; +} + +static int bpf_tcp_push(struct sock *sk, int apply_bytes, + struct sk_msg_buff *md, + int flags, bool uncharge) +{ + bool apply = apply_bytes; + struct scatterlist *sg; + int offset, ret = 0; + struct page *p; + size_t size; + + while (1) { + sg = md->sg_data + md->sg_start; + size = (apply && apply_bytes < sg->length) ? + apply_bytes : sg->length; + offset = sg->offset; + + tcp_rate_check_app_limited(sk); + p = sg_page(sg); +retry: + ret = do_tcp_sendpages(sk, p, offset, size, flags); + if (ret != size) { + if (ret > 0) { + if (apply) + apply_bytes -= ret; + size -= ret; + offset += ret; + if (uncharge) + sk_mem_uncharge(sk, ret); + goto retry; + } + + sg->length = size; + sg->offset = offset; + return ret; + } + + if (apply) + apply_bytes -= ret; + sg->offset += ret; + sg->length -= ret; + if (uncharge) + sk_mem_uncharge(sk, ret); + + if (!sg->length) { + put_page(p); + md->sg_start++; + if (md->sg_start == MAX_SKB_FRAGS) + md->sg_start = 0; + sg_init_table(sg, 1); + + if (md->sg_start == md->sg_end) + break; + } + + if (apply && !apply_bytes) + break; + } + return 0; +} + +static inline void bpf_compute_data_pointers_sg(struct sk_msg_buff *md) +{ + struct scatterlist *sg = md->sg_data + md->sg_start; + + if (md->sg_copy[md->sg_start]) { + md->data = md->data_end = 0; + } else { + md->data = sg_virt(sg); + md->data_end = md->data + sg->length; + } +} + +static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) +{ + struct scatterlist *sg = md->sg_data; + int i = 
md->sg_start; + + do { + int uncharge = (bytes < sg[i].length) ? bytes : sg[i].length; + + sk_mem_uncharge(sk, uncharge); + bytes -= uncharge; + if (!bytes) + break; + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + } while (i != md->sg_end); +} + +static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) +{ + struct scatterlist *sg = md->sg_data; + int i = md->sg_start, free; + + while (bytes && sg[i].length) { + free = sg[i].length; + if (bytes < free) { + sg[i].length -= bytes; + sg[i].offset += bytes; + sk_mem_uncharge(sk, bytes); + break; + } + + sk_mem_uncharge(sk, sg[i].length); + put_page(sg_page(&sg[i])); + bytes -= sg[i].length; + sg[i].length = 0; + sg[i].page_link = 0; + sg[i].offset = 0; + i++; + + if (i == MAX_SKB_FRAGS) + i = 0; + } +} + +static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) +{ + struct scatterlist *sg = md->sg_data; + int i = start, free = 0; + + while (sg[i].length) { + free += sg[i].length; + sk_mem_uncharge(sk, sg[i].length); + put_page(sg_page(&sg[i])); + sg[i].length = 0; + sg[i].page_link = 0; + sg[i].offset = 0; + i++; + + if (i == MAX_SKB_FRAGS) + i = 0; + } + + return free; +} + +static int free_start_sg(struct sock *sk, struct sk_msg_buff *md) +{ + int free = free_sg(sk, md->sg_start, md); + + md->sg_start = md->sg_end; + return free; +} + +static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md) +{ + return free_sg(sk, md->sg_curr, md); +} + +static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md) +{ + return ((_rc == SK_PASS) ? + (md->map ? 
__SK_REDIRECT : __SK_PASS) : + __SK_DROP); +} + +static unsigned int smap_do_tx_msg(struct sock *sk, + struct smap_psock *psock, + struct sk_msg_buff *md) +{ + struct bpf_prog *prog; + unsigned int rc, _rc; + + preempt_disable(); + rcu_read_lock(); + + /* If the policy was removed mid-send then default to 'accept' */ + prog = READ_ONCE(psock->bpf_tx_msg); + if (unlikely(!prog)) { + _rc = SK_PASS; + goto verdict; + } + + bpf_compute_data_pointers_sg(md); + rc = (*prog->bpf_func)(md, prog->insnsi); + psock->apply_bytes = md->apply_bytes; + + /* Moving return codes from UAPI namespace into internal namespace */ + _rc = bpf_map_msg_verdict(rc, md); + + /* The psock has a refcount on the sock but not on the map and because + * we need to drop rcu read lock here its possible the map could be + * removed between here and when we need it to execute the sock + * redirect. So do the map lookup now for future use. + */ + if (_rc == __SK_REDIRECT) { + if (psock->sk_redir) + sock_put(psock->sk_redir); + psock->sk_redir = do_msg_redirect_map(md); + if (!psock->sk_redir) { + _rc = __SK_DROP; + goto verdict; + } + sock_hold(psock->sk_redir); + } +verdict: + rcu_read_unlock(); + preempt_enable(); + + return _rc; +} + +static int bpf_tcp_ingress(struct sock *sk, int apply_bytes, + struct smap_psock *psock, + struct sk_msg_buff *md, int flags) +{ + bool apply = apply_bytes; + size_t size, copied = 0; + struct sk_msg_buff *r; + int err = 0, i; + + r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_KERNEL); + if (unlikely(!r)) + return -ENOMEM; + + lock_sock(sk); + r->sg_start = md->sg_start; + i = md->sg_start; + + do { + r->sg_data[i] = md->sg_data[i]; + + size = (apply && apply_bytes < md->sg_data[i].length) ? 
+ apply_bytes : md->sg_data[i].length; + + if (!sk_wmem_schedule(sk, size)) { + if (!copied) + err = -ENOMEM; + break; + } + + sk_mem_charge(sk, size); + r->sg_data[i].length = size; + md->sg_data[i].length -= size; + md->sg_data[i].offset += size; + copied += size; + + if (md->sg_data[i].length) { + get_page(sg_page(&r->sg_data[i])); + r->sg_end = (i + 1) == MAX_SKB_FRAGS ? 0 : i + 1; + } else { + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + r->sg_end = i; + } + + if (apply) { + apply_bytes -= size; + if (!apply_bytes) + break; + } + } while (i != md->sg_end); + + md->sg_start = i; + + if (!err) { + list_add_tail(&r->list, &psock->ingress); + sk->sk_data_ready(sk); + } else { + free_start_sg(sk, r); + kfree(r); + } + + release_sock(sk); + return err; +} + +static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, + struct sk_msg_buff *md, + int flags) +{ + struct smap_psock *psock; + struct scatterlist *sg; + int i, err, free = 0; + bool ingress = !!(md->flags & BPF_F_INGRESS); + + sg = md->sg_data; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto out_rcu; + + if (!refcount_inc_not_zero(&psock->refcnt)) + goto out_rcu; + + rcu_read_unlock(); + + if (ingress) { + err = bpf_tcp_ingress(sk, send, psock, md, flags); + } else { + lock_sock(sk); + err = bpf_tcp_push(sk, send, md, flags, false); + release_sock(sk); + } + smap_release_sock(psock, sk); + if (unlikely(err)) + goto out; + return 0; +out_rcu: + rcu_read_unlock(); +out: + i = md->sg_start; + while (sg[i].length) { + free += sg[i].length; + put_page(sg_page(&sg[i])); + sg[i].length = 0; + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + } + return free; +} + +static inline void bpf_md_init(struct smap_psock *psock) +{ + if (!psock->apply_bytes) { + psock->eval = __SK_NONE; + if (psock->sk_redir) { + sock_put(psock->sk_redir); + psock->sk_redir = NULL; + } + } +} + +static void apply_bytes_dec(struct smap_psock *psock, int i) +{ + if (psock->apply_bytes) { + if (psock->apply_bytes < 
i) + psock->apply_bytes = 0; + else + psock->apply_bytes -= i; + } +} + +static int bpf_exec_tx_verdict(struct smap_psock *psock, + struct sk_msg_buff *m, + struct sock *sk, + int *copied, int flags) +{ + bool cork = false, enospc = (m->sg_start == m->sg_end); + struct sock *redir; + int err = 0; + int send; + +more_data: + if (psock->eval == __SK_NONE) + psock->eval = smap_do_tx_msg(sk, psock, m); + + if (m->cork_bytes && + m->cork_bytes > psock->sg_size && !enospc) { + psock->cork_bytes = m->cork_bytes - psock->sg_size; + if (!psock->cork) { + psock->cork = kcalloc(1, + sizeof(struct sk_msg_buff), + GFP_ATOMIC | __GFP_NOWARN); + + if (!psock->cork) { + err = -ENOMEM; + goto out_err; + } + } + memcpy(psock->cork, m, sizeof(*m)); + goto out_err; + } + + send = psock->sg_size; + if (psock->apply_bytes && psock->apply_bytes < send) + send = psock->apply_bytes; + + switch (psock->eval) { + case __SK_PASS: + err = bpf_tcp_push(sk, send, m, flags, true); + if (unlikely(err)) { + *copied -= free_start_sg(sk, m); + break; + } + + apply_bytes_dec(psock, send); + psock->sg_size -= send; + break; + case __SK_REDIRECT: + redir = psock->sk_redir; + apply_bytes_dec(psock, send); + + if (psock->cork) { + cork = true; + psock->cork = NULL; + } + + return_mem_sg(sk, send, m); + release_sock(sk); + + err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags); + lock_sock(sk); + + if (cork) { + free_start_sg(sk, m); + kfree(m); + m = NULL; + } + if (unlikely(err)) + *copied -= err; + else + psock->sg_size -= send; + break; + case __SK_DROP: + default: + free_bytes_sg(sk, send, m); + apply_bytes_dec(psock, send); + *copied -= send; + psock->sg_size -= send; + err = -EACCES; + break; + } + + if (likely(!err)) { + bpf_md_init(psock); + if (m && + m->sg_data[m->sg_start].page_link && + m->sg_data[m->sg_start].length) + goto more_data; + } + +out_err: + return err; +} + +static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, int *addr_len) +{ 
+ struct iov_iter *iter = &msg->msg_iter; + struct smap_psock *psock; + int copied = 0; + + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto out; + + if (unlikely(!refcount_inc_not_zero(&psock->refcnt))) + goto out; + rcu_read_unlock(); + + if (!skb_queue_empty(&sk->sk_receive_queue)) + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + + lock_sock(sk); + while (copied != len) { + struct scatterlist *sg; + struct sk_msg_buff *md; + int i; + + md = list_first_entry_or_null(&psock->ingress, + struct sk_msg_buff, list); + if (unlikely(!md)) + break; + i = md->sg_start; + do { + struct page *page; + int n, copy; + + sg = &md->sg_data[i]; + copy = sg->length; + page = sg_page(sg); + + if (copied + copy > len) + copy = len - copied; + + n = copy_page_to_iter(page, sg->offset, copy, iter); + if (n != copy) { + md->sg_start = i; + release_sock(sk); + smap_release_sock(psock, sk); + return -EFAULT; + } + + copied += copy; + sg->offset += copy; + sg->length -= copy; + sk_mem_uncharge(sk, copy); + + if (!sg->length) { + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + if (!md->skb) + put_page(page); + } + if (copied == len) + break; + } while (i != md->sg_end); + md->sg_start = i; + + if (!sg->length && md->sg_start == md->sg_end) { + list_del(&md->list); + if (md->skb) + consume_skb(md->skb); + kfree(md); + } + } + + release_sock(sk); + smap_release_sock(psock, sk); + return copied; +out: + rcu_read_unlock(); + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); +} + + +static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +{ + int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS; + struct sk_msg_buff md = {0}; + unsigned int sg_copy = 0; + struct smap_psock *psock; + int copied = 0, err = 0; + struct scatterlist *sg; + long timeo; + + /* Its possible a sock event or user removed the psock _but_ the ops + * have not been 
reprogrammed yet so we get here. In this case fallback + * to tcp_sendmsg. Note this only works because we _only_ ever allow + * a single ULP there is no hierarchy here. + */ + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + return tcp_sendmsg(sk, msg, size); + } + + /* Increment the psock refcnt to ensure its not released while sending a + * message. Required because sk lookup and bpf programs are used in + * separate rcu critical sections. Its OK if we lose the map entry + * but we can't lose the sock reference. + */ + if (!refcount_inc_not_zero(&psock->refcnt)) { + rcu_read_unlock(); + return tcp_sendmsg(sk, msg, size); + } + + sg = md.sg_data; + sg_init_marker(sg, MAX_SKB_FRAGS); + rcu_read_unlock(); + + lock_sock(sk); + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + + while (msg_data_left(msg)) { + struct sk_msg_buff *m; + bool enospc = false; + int copy; + + if (sk->sk_err) { + err = sk->sk_err; + goto out_err; + } + + copy = msg_data_left(msg); + if (!sk_stream_memory_free(sk)) + goto wait_for_sndbuf; + + m = psock->cork_bytes ? psock->cork : &md; + m->sg_curr = m->sg_copybreak ? m->sg_curr : m->sg_end; + err = sk_alloc_sg(sk, copy, m->sg_data, + m->sg_start, &m->sg_end, &sg_copy, + m->sg_end - 1); + if (err) { + if (err != -ENOSPC) + goto wait_for_memory; + enospc = true; + copy = sg_copy; + } + + err = memcopy_from_iter(sk, m, &msg->msg_iter, copy); + if (err < 0) { + free_curr_sg(sk, m); + goto out_err; + } + + psock->sg_size += copy; + copied += copy; + sg_copy = 0; + + /* When bytes are being corked skip running BPF program and + * applying verdict unless there is no more buffer space. In + * the ENOSPC case simply run BPF prorgram with currently + * accumulated data. We don't have much choice at this point + * we could try extending the page frags or chaining complex + * frags but even in these cases _eventually_ we will hit an + * OOM scenario. 
More complex recovery schemes may be + * implemented in the future, but BPF programs must handle + * the case where apply_cork requests are not honored. The + * canonical method to verify this is to check data length. + */ + if (psock->cork_bytes) { + if (copy > psock->cork_bytes) + psock->cork_bytes = 0; + else + psock->cork_bytes -= copy; + + if (psock->cork_bytes && !enospc) + goto out_cork; + + /* All cork bytes accounted for re-run filter */ + psock->eval = __SK_NONE; + psock->cork_bytes = 0; + } + + err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags); + if (unlikely(err < 0)) + goto out_err; + continue; +wait_for_sndbuf: + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +wait_for_memory: + err = sk_stream_wait_memory(sk, &timeo); + if (err) + goto out_err; + } +out_err: + if (err < 0) + err = sk_stream_error(sk, msg->msg_flags, err); +out_cork: + release_sock(sk); + smap_release_sock(psock, sk); + return copied ? copied : err; +} + +static int bpf_tcp_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags) +{ + struct sk_msg_buff md = {0}, *m = NULL; + int err = 0, copied = 0; + struct smap_psock *psock; + struct scatterlist *sg; + bool enospc = false; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto accept; + + if (!refcount_inc_not_zero(&psock->refcnt)) + goto accept; + rcu_read_unlock(); + + lock_sock(sk); + + if (psock->cork_bytes) { + m = psock->cork; + sg = &m->sg_data[m->sg_end]; + } else { + m = &md; + sg = m->sg_data; + sg_init_marker(sg, MAX_SKB_FRAGS); + } + + /* Catch case where ring is full and sendpage is stalled. 
*/ + if (unlikely(m->sg_end == m->sg_start && + m->sg_data[m->sg_end].length)) + goto out_err; + + psock->sg_size += size; + sg_set_page(sg, page, size, offset); + get_page(page); + m->sg_copy[m->sg_end] = true; + sk_mem_charge(sk, size); + m->sg_end++; + copied = size; + + if (m->sg_end == MAX_SKB_FRAGS) + m->sg_end = 0; + + if (m->sg_end == m->sg_start) + enospc = true; + + if (psock->cork_bytes) { + if (size > psock->cork_bytes) + psock->cork_bytes = 0; + else + psock->cork_bytes -= size; + + if (psock->cork_bytes && !enospc) + goto out_err; + + /* All cork bytes accounted for re-run filter */ + psock->eval = __SK_NONE; + psock->cork_bytes = 0; + } + + err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags); +out_err: + release_sock(sk); + smap_release_sock(psock, sk); + return copied ? copied : err; +accept: + rcu_read_unlock(); + return tcp_sendpage(sk, page, offset, size, flags); +} + +static void bpf_tcp_msg_add(struct smap_psock *psock, + struct sock *sk, + struct bpf_prog *tx_msg) +{ + struct bpf_prog *orig_tx_msg; + + orig_tx_msg = xchg(&psock->bpf_tx_msg, tx_msg); + if (orig_tx_msg) + bpf_prog_put(orig_tx_msg); +} + +static int bpf_tcp_ulp_register(void) +{ + tcp_bpf_proto = tcp_prot; + tcp_bpf_proto.close = bpf_tcp_close; + /* Once BPF TX ULP is registered it is never unregistered. It + * will be in the ULP list for the lifetime of the system. Doing + * duplicate registers is not a problem. 
+ */ + return tcp_register_ulp(&bpf_tcp_ulp_ops); +} + static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) { struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict); @@ -137,27 +1059,72 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) __SK_DROP; } +static int smap_do_ingress(struct smap_psock *psock, struct sk_buff *skb) +{ + struct sock *sk = psock->sock; + int copied = 0, num_sg; + struct sk_msg_buff *r; + + r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_ATOMIC); + if (unlikely(!r)) + return -EAGAIN; + + if (!sk_rmem_schedule(sk, skb, skb->len)) { + kfree(r); + return -EAGAIN; + } + + sg_init_table(r->sg_data, MAX_SKB_FRAGS); + num_sg = skb_to_sgvec(skb, r->sg_data, 0, skb->len); + if (unlikely(num_sg < 0)) { + kfree(r); + return num_sg; + } + sk_mem_charge(sk, skb->len); + copied = skb->len; + r->sg_start = 0; + r->sg_end = num_sg == MAX_SKB_FRAGS ? 0 : num_sg; + r->skb = skb; + list_add_tail(&r->list, &psock->ingress); + sk->sk_data_ready(sk); + return copied; +} + static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) { + struct smap_psock *peer; struct sock *sk; + __u32 in; int rc; rc = smap_verdict_func(psock, skb); switch (rc) { case __SK_REDIRECT: sk = do_sk_redirect_map(skb); - if (likely(sk)) { - struct smap_psock *peer = smap_psock_sk(sk); - - if (likely(peer && - test_bit(SMAP_TX_RUNNING, &peer->state) && - !sock_flag(sk, SOCK_DEAD) && - sock_writeable(sk))) { - skb_set_owner_w(skb, sk); - skb_queue_tail(&peer->rxqueue, skb); - schedule_work(&peer->tx_work); - break; - } + if (!sk) { + kfree_skb(skb); + break; + } + + peer = smap_psock_sk(sk); + in = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS; + + if (unlikely(!peer || sock_flag(sk, SOCK_DEAD) || + !test_bit(SMAP_TX_RUNNING, &peer->state))) { + kfree_skb(skb); + break; + } + + if (!in && sock_writeable(sk)) { + skb_set_owner_w(skb, sk); + skb_queue_tail(&peer->rxqueue, skb); + schedule_work(&peer->tx_work); + break; + 
} else if (in && + atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) { + skb_queue_tail(&peer->rxqueue, skb); + schedule_work(&peer->tx_work); + break; } /* Fall through and free skb otherwise */ case __SK_DROP: @@ -174,68 +1141,6 @@ static void smap_report_sk_error(struct smap_psock *psock, int err) sk->sk_error_report(sk); } -static void smap_release_sock(struct smap_psock *psock, struct sock *sock); - -/* Called with lock_sock(sk) held */ -static void smap_state_change(struct sock *sk) -{ - struct smap_psock_map_entry *e, *tmp; - struct smap_psock *psock; - struct socket_wq *wq; - struct sock *osk; - - rcu_read_lock(); - - /* Allowing transitions into an established syn_recv states allows - * for early binding sockets to a smap object before the connection - * is established. - */ - switch (sk->sk_state) { - case TCP_SYN_SENT: - case TCP_SYN_RECV: - case TCP_ESTABLISHED: - break; - case TCP_CLOSE_WAIT: - case TCP_CLOSING: - case TCP_LAST_ACK: - case TCP_FIN_WAIT1: - case TCP_FIN_WAIT2: - case TCP_LISTEN: - break; - case TCP_CLOSE: - /* Only release if the map entry is in fact the sock in - * question. There is a case where the operator deletes - * the sock from the map, but the TCP sock is closed before - * the psock is detached. Use cmpxchg to verify correct - * sock is removed. 
- */ - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - break; - write_lock_bh(&sk->sk_callback_lock); - list_for_each_entry_safe(e, tmp, &psock->maps, list) { - osk = cmpxchg(e->entry, sk, NULL); - if (osk == sk) { - list_del(&e->list); - smap_release_sock(psock, sk); - } - } - write_unlock_bh(&sk->sk_callback_lock); - break; - default: - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - break; - smap_report_sk_error(psock, EPIPE); - break; - } - - wq = rcu_dereference(sk->sk_wq); - if (skwq_has_sleeper(wq)) - wake_up_interruptible_all(&wq->wait); - rcu_read_unlock(); -} - static void smap_read_sock_strparser(struct strparser *strp, struct sk_buff *skb) { @@ -281,15 +1186,23 @@ static void smap_tx_work(struct work_struct *w) } while ((skb = skb_dequeue(&psock->rxqueue))) { + __u32 flags; + rem = skb->len; off = 0; start: + flags = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS; do { - if (likely(psock->sock->sk_socket)) - n = skb_send_sock_locked(psock->sock, - skb, off, rem); - else + if (likely(psock->sock->sk_socket)) { + if (flags) + n = smap_do_ingress(psock, skb); + else + n = skb_send_sock_locked(psock->sock, + skb, off, rem); + } else { n = -EINVAL; + } + if (n <= 0) { if (n == -EAGAIN) { /* Retry when space is available */ @@ -307,7 +1220,9 @@ start: rem -= n; off += n; } while (rem); - kfree_skb(skb); + + if (!flags) + kfree_skb(skb); } out: release_sock(psock->sock); @@ -330,10 +1245,8 @@ static void smap_stop_sock(struct smap_psock *psock, struct sock *sk) return; sk->sk_data_ready = psock->save_data_ready; sk->sk_write_space = psock->save_write_space; - sk->sk_state_change = psock->save_state_change; psock->save_data_ready = NULL; psock->save_write_space = NULL; - psock->save_state_change = NULL; strp_stop(&psock->strp); psock->strp_enabled = false; } @@ -354,14 +1267,13 @@ static void smap_destroy_psock(struct rcu_head *rcu) static void smap_release_sock(struct smap_psock *psock, struct sock *sock) { - psock->refcnt--; - if (psock->refcnt) - 
return; - - smap_stop_sock(psock, sock); - clear_bit(SMAP_TX_RUNNING, &psock->state); - rcu_assign_sk_user_data(sock, NULL); - call_rcu_sched(&psock->rcu, smap_destroy_psock); + if (refcount_dec_and_test(&psock->refcnt)) { + tcp_cleanup_ulp(sock); + smap_stop_sock(psock, sock); + clear_bit(SMAP_TX_RUNNING, &psock->state); + rcu_assign_sk_user_data(sock, NULL); + call_rcu_sched(&psock->rcu, smap_destroy_psock); + } } static int smap_parse_func_strparser(struct strparser *strp, @@ -395,7 +1307,6 @@ static int smap_parse_func_strparser(struct strparser *strp, return rc; } - static int smap_read_sock_done(struct strparser *strp, int err) { return err; @@ -435,10 +1346,8 @@ static void smap_start_sock(struct smap_psock *psock, struct sock *sk) return; psock->save_data_ready = sk->sk_data_ready; psock->save_write_space = sk->sk_write_space; - psock->save_state_change = sk->sk_state_change; sk->sk_data_ready = smap_data_ready; sk->sk_write_space = smap_write_space; - sk->sk_state_change = smap_state_change; psock->strp_enabled = true; } @@ -451,6 +1360,7 @@ static void sock_map_remove_complete(struct bpf_stab *stab) static void smap_gc_work(struct work_struct *w) { struct smap_psock_map_entry *e, *tmp; + struct sk_msg_buff *md, *mtmp; struct smap_psock *psock; psock = container_of(w, struct smap_psock, gc_work); @@ -467,12 +1377,28 @@ static void smap_gc_work(struct work_struct *w) bpf_prog_put(psock->bpf_parse); if (psock->bpf_verdict) bpf_prog_put(psock->bpf_verdict); + if (psock->bpf_tx_msg) + bpf_prog_put(psock->bpf_tx_msg); + + if (psock->cork) { + free_start_sg(psock->sock, psock->cork); + kfree(psock->cork); + } + + list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { + list_del(&md->list); + free_start_sg(psock->sock, md); + kfree(md); + } list_for_each_entry_safe(e, tmp, &psock->maps, list) { list_del(&e->list); kfree(e); } + if (psock->sk_redir) + sock_put(psock->sk_redir); + sock_put(psock->sock); kfree(psock); } @@ -488,12 +1414,14 @@ static struct 
smap_psock *smap_init_psock(struct sock *sock, if (!psock) return ERR_PTR(-ENOMEM); + psock->eval = __SK_NONE; psock->sock = sock; skb_queue_head_init(&psock->rxqueue); INIT_WORK(&psock->tx_work, smap_tx_work); INIT_WORK(&psock->gc_work, smap_gc_work); INIT_LIST_HEAD(&psock->maps); - psock->refcnt = 1; + INIT_LIST_HEAD(&psock->ingress); + refcount_set(&psock->refcnt, 1); rcu_assign_sk_user_data(sock, psock); sock_hold(sock); @@ -503,8 +1431,8 @@ static struct smap_psock *smap_init_psock(struct sock *sock, static struct bpf_map *sock_map_alloc(union bpf_attr *attr) { struct bpf_stab *stab; - int err = -EINVAL; u64 cost; + int err; if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); @@ -517,20 +1445,19 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) if (attr->value_size > KMALLOC_MAX_SIZE) return ERR_PTR(-E2BIG); + err = bpf_tcp_ulp_register(); + if (err && err != -EEXIST) + return ERR_PTR(err); + stab = kzalloc(sizeof(*stab), GFP_USER); if (!stab) return ERR_PTR(-ENOMEM); - /* mandatory map attributes */ - stab->map.map_type = attr->map_type; - stab->map.key_size = attr->key_size; - stab->map.value_size = attr->value_size; - stab->map.max_entries = attr->max_entries; - stab->map.map_flags = attr->map_flags; - stab->map.numa_node = bpf_map_attr_numa_node(attr); + bpf_map_init_from_attr(&stab->map, attr); /* make sure page count doesn't overflow */ cost = (u64) stab->map.max_entries * sizeof(struct sock *); + err = -EINVAL; if (cost >= U32_MAX - PAGE_SIZE) goto free_stab; @@ -604,11 +1531,6 @@ static void sock_map_free(struct bpf_map *map) } rcu_read_unlock(); - if (stab->bpf_verdict) - bpf_prog_put(stab->bpf_verdict); - if (stab->bpf_parse) - bpf_prog_put(stab->bpf_parse); - sock_map_remove_complete(stab); } @@ -702,10 +1624,11 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); struct smap_psock_map_entry *e = NULL; - struct bpf_prog *verdict, *parse; + struct 
bpf_prog *verdict, *parse, *tx_msg; struct sock *osock, *sock; struct smap_psock *psock; u32 i = *(u32 *)key; + bool new = false; int err; if (unlikely(flags > BPF_EXIST)) @@ -728,6 +1651,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, */ verdict = READ_ONCE(stab->bpf_verdict); parse = READ_ONCE(stab->bpf_parse); + tx_msg = READ_ONCE(stab->bpf_tx_msg); if (parse && verdict) { /* bpf prog refcnt may be zero if a concurrent attach operation @@ -746,6 +1670,17 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, } } + if (tx_msg) { + tx_msg = bpf_prog_inc_not_zero(stab->bpf_tx_msg); + if (IS_ERR(tx_msg)) { + if (verdict) + bpf_prog_put(verdict); + if (parse) + bpf_prog_put(parse); + return PTR_ERR(tx_msg); + } + } + write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); @@ -760,7 +1695,14 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, err = -EBUSY; goto out_progs; } - psock->refcnt++; + if (READ_ONCE(psock->bpf_tx_msg) && tx_msg) { + err = -EBUSY; + goto out_progs; + } + if (!refcount_inc_not_zero(&psock->refcnt)) { + err = -EAGAIN; + goto out_progs; + } } else { psock = smap_init_psock(sock, stab); if (IS_ERR(psock)) { @@ -769,6 +1711,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, } set_bit(SMAP_TX_RUNNING, &psock->state); + new = true; } e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); @@ -781,6 +1724,14 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, /* 3. At this point we have a reference to a valid psock that is * running. Attach any BPF programs needed. 
*/ + if (tx_msg) + bpf_tcp_msg_add(psock, sock, tx_msg); + if (new) { + err = tcp_set_ulp_id(sock, TCP_ULP_BPF); + if (err) + goto out_free; + } + if (parse && verdict && !psock->strp_enabled) { err = smap_init_sock(psock, sock); if (err) @@ -802,8 +1753,6 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, struct smap_psock *opsock = smap_psock_sk(osock); write_lock_bh(&osock->sk_callback_lock); - if (osock != sock && parse) - smap_stop_sock(opsock, osock); smap_list_remove(opsock, &stab->sock_map[i]); smap_release_sock(opsock, osock); write_unlock_bh(&osock->sk_callback_lock); @@ -816,6 +1765,8 @@ out_progs: bpf_prog_put(verdict); if (parse) bpf_prog_put(parse); + if (tx_msg) + bpf_prog_put(tx_msg); write_unlock_bh(&sock->sk_callback_lock); kfree(e); return err; @@ -830,6 +1781,9 @@ int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) return -EINVAL; switch (type) { + case BPF_SK_MSG_VERDICT: + orig = xchg(&stab->bpf_tx_msg, prog); + break; case BPF_SK_SKB_STREAM_PARSER: orig = xchg(&stab->bpf_parse, prog); break; @@ -880,6 +1834,23 @@ static int sock_map_update_elem(struct bpf_map *map, return err; } +static void sock_map_release(struct bpf_map *map, struct file *map_file) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct bpf_prog *orig; + + orig = xchg(&stab->bpf_parse, NULL); + if (orig) + bpf_prog_put(orig); + orig = xchg(&stab->bpf_verdict, NULL); + if (orig) + bpf_prog_put(orig); + + orig = xchg(&stab->bpf_tx_msg, NULL); + if (orig) + bpf_prog_put(orig); +} + const struct bpf_map_ops sock_map_ops = { .map_alloc = sock_map_alloc, .map_free = sock_map_free, @@ -887,6 +1858,7 @@ const struct bpf_map_ops sock_map_ops = { .map_get_next_key = sock_map_get_next_key, .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_map_delete_elem, + .map_release = sock_map_release, }; BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, diff --git a/kernel/bpf/stackmap.c 
b/kernel/bpf/stackmap.c index a15bc636cc98..57eeb1234b67 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -9,16 +9,19 @@ #include <linux/filter.h> #include <linux/stacktrace.h> #include <linux/perf_event.h> +#include <linux/elf.h> +#include <linux/pagemap.h> #include "percpu_freelist.h" -#define STACK_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +#define STACK_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \ + BPF_F_STACK_BUILD_ID) struct stack_map_bucket { struct pcpu_freelist_node fnode; u32 hash; u32 nr; - u64 ip[]; + u64 data[]; }; struct bpf_stack_map { @@ -29,6 +32,17 @@ struct bpf_stack_map { struct stack_map_bucket *buckets[]; }; +static inline bool stack_map_use_build_id(struct bpf_map *map) +{ + return (map->map_flags & BPF_F_STACK_BUILD_ID); +} + +static inline int stack_map_data_size(struct bpf_map *map) +{ + return stack_map_use_build_id(map) ? + sizeof(struct bpf_stack_build_id) : sizeof(u64); +} + static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) { u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size; @@ -68,8 +82,16 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - value_size < 8 || value_size % 8 || - value_size / 8 > sysctl_perf_event_max_stack) + value_size < 8 || value_size % 8) + return ERR_PTR(-EINVAL); + + BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64)); + if (attr->map_flags & BPF_F_STACK_BUILD_ID) { + if (value_size % sizeof(struct bpf_stack_build_id) || + value_size / sizeof(struct bpf_stack_build_id) + > sysctl_perf_event_max_stack) + return ERR_PTR(-EINVAL); + } else if (value_size / 8 > sysctl_perf_event_max_stack) return ERR_PTR(-EINVAL); /* hash table size must be power of 2 */ @@ -88,14 +110,10 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) if (cost >= U32_MAX - PAGE_SIZE) goto free_smap; - 
smap->map.map_type = attr->map_type; - smap->map.key_size = attr->key_size; + bpf_map_init_from_attr(&smap->map, attr); smap->map.value_size = value_size; - smap->map.max_entries = attr->max_entries; - smap->map.map_flags = attr->map_flags; smap->n_buckets = n_buckets; smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - smap->map.numa_node = bpf_map_attr_numa_node(attr); err = bpf_map_precharge_memlock(smap->map.pages); if (err) @@ -118,13 +136,184 @@ free_smap: return ERR_PTR(err); } +#define BPF_BUILD_ID 3 +/* + * Parse build id from the note segment. This logic can be shared between + * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are + * identical. + */ +static inline int stack_map_parse_build_id(void *page_addr, + unsigned char *build_id, + void *note_start, + Elf32_Word note_size) +{ + Elf32_Word note_offs = 0, new_offs; + + /* check for overflow */ + if (note_start < page_addr || note_start + note_size < note_start) + return -EINVAL; + + /* only supports note that fits in the first page */ + if (note_start + note_size > page_addr + PAGE_SIZE) + return -EINVAL; + + while (note_offs + sizeof(Elf32_Nhdr) < note_size) { + Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs); + + if (nhdr->n_type == BPF_BUILD_ID && + nhdr->n_namesz == sizeof("GNU") && + nhdr->n_descsz == BPF_BUILD_ID_SIZE) { + memcpy(build_id, + note_start + note_offs + + ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr), + BPF_BUILD_ID_SIZE); + return 0; + } + new_offs = note_offs + sizeof(Elf32_Nhdr) + + ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4); + if (new_offs <= note_offs) /* overflow */ + break; + note_offs = new_offs; + } + return -EINVAL; +} + +/* Parse build ID from 32-bit ELF */ +static int stack_map_get_build_id_32(void *page_addr, + unsigned char *build_id) +{ + Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr; + Elf32_Phdr *phdr; + int i; + + /* only supports phdr that fits in one page */ + if (ehdr->e_phnum > + (PAGE_SIZE - sizeof(Elf32_Ehdr)) / 
sizeof(Elf32_Phdr)) + return -EINVAL; + + phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr)); + + for (i = 0; i < ehdr->e_phnum; ++i) + if (phdr[i].p_type == PT_NOTE) + return stack_map_parse_build_id(page_addr, build_id, + page_addr + phdr[i].p_offset, + phdr[i].p_filesz); + return -EINVAL; +} + +/* Parse build ID from 64-bit ELF */ +static int stack_map_get_build_id_64(void *page_addr, + unsigned char *build_id) +{ + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr; + Elf64_Phdr *phdr; + int i; + + /* only supports phdr that fits in one page */ + if (ehdr->e_phnum > + (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr)) + return -EINVAL; + + phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr)); + + for (i = 0; i < ehdr->e_phnum; ++i) + if (phdr[i].p_type == PT_NOTE) + return stack_map_parse_build_id(page_addr, build_id, + page_addr + phdr[i].p_offset, + phdr[i].p_filesz); + return -EINVAL; +} + +/* Parse build ID of ELF file mapped to vma */ +static int stack_map_get_build_id(struct vm_area_struct *vma, + unsigned char *build_id) +{ + Elf32_Ehdr *ehdr; + struct page *page; + void *page_addr; + int ret; + + /* only works for page backed storage */ + if (!vma->vm_file) + return -EINVAL; + + page = find_get_page(vma->vm_file->f_mapping, 0); + if (!page) + return -EFAULT; /* page not mapped */ + + ret = -EINVAL; + page_addr = page_address(page); + ehdr = (Elf32_Ehdr *)page_addr; + + /* compare magic x7f "ELF" */ + if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0) + goto out; + + /* only support executable file and shared object file */ + if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) + goto out; + + if (ehdr->e_ident[EI_CLASS] == ELFCLASS32) + ret = stack_map_get_build_id_32(page_addr, build_id); + else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64) + ret = stack_map_get_build_id_64(page_addr, build_id); +out: + put_page(page); + return ret; +} + +static void stack_map_get_build_id_offset(struct bpf_map *map, + struct stack_map_bucket *bucket, + u64 *ips, u32 
trace_nr, bool user) +{ + int i; + struct vm_area_struct *vma; + struct bpf_stack_build_id *id_offs; + + bucket->nr = trace_nr; + id_offs = (struct bpf_stack_build_id *)bucket->data; + + /* + * We cannot do up_read() in nmi context, so build_id lookup is + * only supported for non-nmi events. If at some point, it is + * possible to run find_vma() without taking the semaphore, we + * would like to allow build_id lookup in nmi context. + * + * Same fallback is used for kernel stack (!user) on a stackmap + * with build_id. + */ + if (!user || !current || !current->mm || in_nmi() || + down_read_trylock(&current->mm->mmap_sem) == 0) { + /* cannot access current->mm, fall back to ips */ + for (i = 0; i < trace_nr; i++) { + id_offs[i].status = BPF_STACK_BUILD_ID_IP; + id_offs[i].ip = ips[i]; + } + return; + } + + for (i = 0; i < trace_nr; i++) { + vma = find_vma(current->mm, ips[i]); + if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) { + /* per entry fall back to ips */ + id_offs[i].status = BPF_STACK_BUILD_ID_IP; + id_offs[i].ip = ips[i]; + continue; + } + id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i] + - vma->vm_start; + id_offs[i].status = BPF_STACK_BUILD_ID_VALID; + } + up_read(&current->mm->mmap_sem); +} + BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u64, flags) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct perf_callchain_entry *trace; struct stack_map_bucket *bucket, *new_bucket, *old_bucket; - u32 max_depth = map->value_size / 8; + u32 max_depth = map->value_size / stack_map_data_size(map); /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ u32 init_nr = sysctl_perf_event_max_stack - max_depth; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; @@ -132,6 +321,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, bool user = flags & BPF_F_USER_STACK; bool kernel = !user; u64 *ips; + bool hash_matches; if (unlikely(flags &
~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) @@ -160,24 +350,43 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, id = hash & (smap->n_buckets - 1); bucket = READ_ONCE(smap->buckets[id]); - if (bucket && bucket->hash == hash) { - if (flags & BPF_F_FAST_STACK_CMP) + hash_matches = bucket && bucket->hash == hash; + /* fast cmp */ + if (hash_matches && flags & BPF_F_FAST_STACK_CMP) + return id; + + if (stack_map_use_build_id(map)) { + /* for build_id+offset, pop a bucket before slow cmp */ + new_bucket = (struct stack_map_bucket *) + pcpu_freelist_pop(&smap->freelist); + if (unlikely(!new_bucket)) + return -ENOMEM; + stack_map_get_build_id_offset(map, new_bucket, ips, + trace_nr, user); + trace_len = trace_nr * sizeof(struct bpf_stack_build_id); + if (hash_matches && bucket->nr == trace_nr && + memcmp(bucket->data, new_bucket->data, trace_len) == 0) { + pcpu_freelist_push(&smap->freelist, &new_bucket->fnode); return id; - if (bucket->nr == trace_nr && - memcmp(bucket->ip, ips, trace_len) == 0) + } + if (bucket && !(flags & BPF_F_REUSE_STACKID)) { + pcpu_freelist_push(&smap->freelist, &new_bucket->fnode); + return -EEXIST; + } + } else { + if (hash_matches && bucket->nr == trace_nr && + memcmp(bucket->data, ips, trace_len) == 0) return id; + if (bucket && !(flags & BPF_F_REUSE_STACKID)) + return -EEXIST; + + new_bucket = (struct stack_map_bucket *) + pcpu_freelist_pop(&smap->freelist); + if (unlikely(!new_bucket)) + return -ENOMEM; + memcpy(new_bucket->data, ips, trace_len); } - /* this call stack is not in the map, try to add it */ - if (bucket && !(flags & BPF_F_REUSE_STACKID)) - return -EEXIST; - - new_bucket = (struct stack_map_bucket *) - pcpu_freelist_pop(&smap->freelist); - if (unlikely(!new_bucket)) - return -ENOMEM; - - memcpy(new_bucket->ip, ips, trace_len); new_bucket->hash = hash; new_bucket->nr = trace_nr; @@ -216,8 +425,8 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void 
*value) if (!bucket) return -ENOENT; - trace_len = bucket->nr * sizeof(u64); - memcpy(value, bucket->ip, trace_len); + trace_len = bucket->nr * stack_map_data_size(map); + memcpy(value, bucket->data, trace_len); memset(value + trace_len, 0, map->value_size - trace_len); old_bucket = xchg(&smap->buckets[id], bucket); @@ -226,9 +435,33 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) return 0; } -static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +static int stack_map_get_next_key(struct bpf_map *map, void *key, + void *next_key) { - return -EINVAL; + struct bpf_stack_map *smap = container_of(map, + struct bpf_stack_map, map); + u32 id; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (!key) { + id = 0; + } else { + id = *(u32 *)key; + if (id >= smap->n_buckets || !smap->buckets[id]) + id = 0; + else + id++; + } + + while (id < smap->n_buckets && !smap->buckets[id]) + id++; + + if (id >= smap->n_buckets) + return -ENOENT; + + *(u32 *)next_key = id; + return 0; } static int stack_map_update_elem(struct bpf_map *map, void *key, void *value, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5cb783fc8224..4ca46df19c9a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -94,18 +94,34 @@ static int check_uarg_tail_zero(void __user *uaddr, return 0; } +const struct bpf_map_ops bpf_map_offload_ops = { + .map_alloc = bpf_map_offload_map_alloc, + .map_free = bpf_map_offload_map_free, +}; + static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) { + const struct bpf_map_ops *ops; struct bpf_map *map; + int err; - if (attr->map_type >= ARRAY_SIZE(bpf_map_types) || - !bpf_map_types[attr->map_type]) + if (attr->map_type >= ARRAY_SIZE(bpf_map_types)) + return ERR_PTR(-EINVAL); + ops = bpf_map_types[attr->map_type]; + if (!ops) return ERR_PTR(-EINVAL); - map = bpf_map_types[attr->map_type]->map_alloc(attr); + if (ops->map_alloc_check) { + err = ops->map_alloc_check(attr); + if (err) + return 
ERR_PTR(err); + } + if (attr->map_ifindex) + ops = &bpf_map_offload_ops; + map = ops->map_alloc(attr); if (IS_ERR(map)) return map; - map->ops = bpf_map_types[attr->map_type]; + map->ops = ops; map->map_type = attr->map_type; return map; } @@ -134,6 +150,16 @@ void bpf_map_area_free(void *area) kvfree(area); } +void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) +{ + map->map_type = attr->map_type; + map->key_size = attr->key_size; + map->value_size = attr->value_size; + map->max_entries = attr->max_entries; + map->map_flags = attr->map_flags; + map->numa_node = bpf_map_attr_numa_node(attr); +} + int bpf_map_precharge_memlock(u32 pages) { struct user_struct *user = get_current_user(); @@ -177,11 +203,13 @@ static int bpf_map_alloc_id(struct bpf_map *map) { int id; + idr_preload(GFP_KERNEL); spin_lock_bh(&map_idr_lock); id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC); if (id > 0) map->id = id; spin_unlock_bh(&map_idr_lock); + idr_preload_end(); if (WARN_ON_ONCE(!id)) return -ENOSPC; @@ -189,16 +217,25 @@ static int bpf_map_alloc_id(struct bpf_map *map) return id > 0 ? 0 : id; } -static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) +void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) { unsigned long flags; + /* Offloaded maps are removed from the IDR store when their device + * disappears - even if someone holds an fd to them they are unusable, + * the memory is gone, all ops will fail; they are simply waiting for + * refcnt to drop to be freed. 
+ */ + if (!map->id) + return; + if (do_idr_lock) spin_lock_irqsave(&map_idr_lock, flags); else __acquire(&map_idr_lock); idr_remove(&map_idr, map->id); + map->id = 0; if (do_idr_lock) spin_unlock_irqrestore(&map_idr_lock, flags); @@ -378,7 +415,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src) return 0; } -#define BPF_MAP_CREATE_LAST_FIELD map_name +#define BPF_MAP_CREATE_LAST_FIELD map_ifindex /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -566,8 +603,10 @@ static int map_lookup_elem(union bpf_attr *attr) if (!value) goto free_key; - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_lookup_elem(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value); @@ -654,7 +693,10 @@ static int map_update_elem(union bpf_attr *attr) goto free_value; /* Need to create a kthread, thus must support schedule */ - if (map->map_type == BPF_MAP_TYPE_CPUMAP) { + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_update_elem(map, key, value, attr->flags); + goto out; + } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) { err = map->ops->map_update_elem(map, key, value, attr->flags); goto out; } @@ -669,10 +711,7 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_percpu_hash_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_update(map, key, value, attr->flags); - } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || - map->map_type == BPF_MAP_TYPE_PROG_ARRAY || - map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || - map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) { + } else if (IS_FD_ARRAY(map)) { rcu_read_lock(); err = 
bpf_fd_array_map_update_elem(map, f.file, key, value, attr->flags); @@ -731,6 +770,11 @@ static int map_delete_elem(union bpf_attr *attr) goto err_put; } + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_delete_elem(map, key); + goto out; + } + preempt_disable(); __this_cpu_inc(bpf_prog_active); rcu_read_lock(); @@ -738,7 +782,7 @@ static int map_delete_elem(union bpf_attr *attr) rcu_read_unlock(); __this_cpu_dec(bpf_prog_active); preempt_enable(); - +out: if (!err) trace_bpf_map_delete_elem(map, ufd, key); kfree(key); @@ -788,9 +832,15 @@ static int map_get_next_key(union bpf_attr *attr) if (!next_key) goto free_key; + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_get_next_key(map, key, next_key); + goto out; + } + rcu_read_lock(); err = map->ops->map_get_next_key(map, key, next_key); rcu_read_unlock(); +out: if (err) goto free_next_key; @@ -892,11 +942,13 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog) { int id; + idr_preload(GFP_KERNEL); spin_lock_bh(&prog_idr_lock); id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC); if (id > 0) prog->aux->id = id; spin_unlock_bh(&prog_idr_lock); + idr_preload_end(); /* id is in [1, INT_MAX) */ if (WARN_ON_ONCE(!id)) @@ -905,9 +957,13 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog) return id > 0 ? 0 : id; } -static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) +void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) { - /* cBPF to eBPF migrations are currently not in the idr store. */ + /* cBPF to eBPF migrations are currently not in the idr store. + * Offloaded programs are removed from the store when their device + * disappears - even if someone grabs an fd to them they are unusable, + * simply waiting for refcnt to drop to be freed. 
+ */ if (!prog->aux->id) return; @@ -917,6 +973,7 @@ static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) __acquire(&prog_idr_lock); idr_remove(&prog_idr, prog->aux->id); + prog->aux->id = 0; if (do_idr_lock) spin_unlock_bh(&prog_idr_lock); @@ -937,10 +994,16 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic_dec_and_test(&prog->aux->refcnt)) { + int i; + trace_bpf_prog_put_rcu(prog); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); + + for (i = 0; i < prog->aux->func_cnt; i++) + bpf_prog_kallsyms_del(prog->aux->func[i]); bpf_prog_kallsyms_del(prog); + call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } } @@ -1108,8 +1171,63 @@ struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, } EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); +/* Initially all BPF programs could be loaded w/o specifying + * expected_attach_type. Later for some of them specifying expected_attach_type + * at load time became required so that program could be validated properly. + * Programs of types that are allowed to be loaded both w/ and w/o (for + * backward compatibility) expected_attach_type, should have the default attach + * type assigned to expected_attach_type for the latter case, so that it can be + * validated later at attach time. + * + * bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if + * prog type requires it but has some attach types that have to be backward + * compatible. + */ +static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr) +{ + switch (attr->prog_type) { + case BPF_PROG_TYPE_CGROUP_SOCK: + /* Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't + * exist so checking for non-zero is the way to go here. 
+ */ + if (!attr->expected_attach_type) + attr->expected_attach_type = + BPF_CGROUP_INET_SOCK_CREATE; + break; + } +} + +static int +bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, + enum bpf_attach_type expected_attach_type) +{ + switch (prog_type) { + case BPF_PROG_TYPE_CGROUP_SOCK: + switch (expected_attach_type) { + case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: + return 0; + default: + return -EINVAL; + } + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + switch (expected_attach_type) { + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + return 0; + default: + return -EINVAL; + } + default: + return 0; + } +} + /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD prog_ifindex +#define BPF_PROG_LOAD_LAST_FIELD expected_attach_type static int bpf_prog_load(union bpf_attr *attr) { @@ -1146,11 +1264,19 @@ static int bpf_prog_load(union bpf_attr *attr) !capable(CAP_SYS_ADMIN)) return -EPERM; + bpf_prog_load_fixup_attach_type(attr); + if (bpf_prog_load_check_attach_type(type, attr->expected_attach_type)) + return -EINVAL; + /* plain bpf_prog allocation */ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); if (!prog) return -ENOMEM; + prog->expected_attach_type = attr->expected_attach_type; + + prog->aux->offload_requested = !!attr->prog_ifindex; + err = security_bpf_prog_alloc(prog->aux); if (err) goto free_prog_nouncharge; @@ -1172,7 +1298,7 @@ static int bpf_prog_load(union bpf_attr *attr) atomic_set(&prog->aux->refcnt, 1); prog->gpl_compatible = is_gpl ? 
1 : 0; - if (attr->prog_ifindex) { + if (bpf_prog_is_dev_bound(prog->aux)) { err = bpf_prog_offload_init(prog, attr); if (err) goto free_prog; @@ -1194,7 +1320,8 @@ static int bpf_prog_load(union bpf_attr *attr) goto free_used_maps; /* eBPF program is ready to be JITed */ - prog = bpf_prog_select_runtime(prog, &err); + if (!prog->bpf_func) + prog = bpf_prog_select_runtime(prog, &err); if (err < 0) goto free_used_maps; @@ -1249,11 +1376,99 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } +struct bpf_raw_tracepoint { + struct bpf_raw_event_map *btp; + struct bpf_prog *prog; +}; + +static int bpf_raw_tracepoint_release(struct inode *inode, struct file *filp) +{ + struct bpf_raw_tracepoint *raw_tp = filp->private_data; + + if (raw_tp->prog) { + bpf_probe_unregister(raw_tp->btp, raw_tp->prog); + bpf_prog_put(raw_tp->prog); + } + kfree(raw_tp); + return 0; +} + +static const struct file_operations bpf_raw_tp_fops = { + .release = bpf_raw_tracepoint_release, + .read = bpf_dummy_read, + .write = bpf_dummy_write, +}; + +#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd + +static int bpf_raw_tracepoint_open(const union bpf_attr *attr) +{ + struct bpf_raw_tracepoint *raw_tp; + struct bpf_raw_event_map *btp; + struct bpf_prog *prog; + char tp_name[128]; + int tp_fd, err; + + if (strncpy_from_user(tp_name, u64_to_user_ptr(attr->raw_tracepoint.name), + sizeof(tp_name) - 1) < 0) + return -EFAULT; + tp_name[sizeof(tp_name) - 1] = 0; + + btp = bpf_find_raw_tracepoint(tp_name); + if (!btp) + return -ENOENT; + + raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER); + if (!raw_tp) + return -ENOMEM; + raw_tp->btp = btp; + + prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd, + BPF_PROG_TYPE_RAW_TRACEPOINT); + if (IS_ERR(prog)) { + err = PTR_ERR(prog); + goto out_free_tp; + } + + err = bpf_probe_register(raw_tp->btp, prog); + if (err) + goto out_put_prog; + + raw_tp->prog = prog; + tp_fd = anon_inode_getfd("bpf-raw-tracepoint", &bpf_raw_tp_fops, 
raw_tp, + O_CLOEXEC); + if (tp_fd < 0) { + bpf_probe_unregister(raw_tp->btp, prog); + err = tp_fd; + goto out_put_prog; + } + return tp_fd; + +out_put_prog: + bpf_prog_put(prog); +out_free_tp: + kfree(raw_tp); + return err; +} + #ifdef CONFIG_CGROUP_BPF +static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, + enum bpf_attach_type attach_type) +{ + switch (prog->type) { + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + return attach_type == prog->expected_attach_type ? 0 : -EINVAL; + default: + return 0; + } +} + #define BPF_PROG_ATTACH_LAST_FIELD attach_flags -static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach) +static int sockmap_get_from_fd(const union bpf_attr *attr, + int type, bool attach) { struct bpf_prog *prog = NULL; int ufd = attr->target_fd; @@ -1267,8 +1482,7 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach) return PTR_ERR(map); if (attach) { - prog = bpf_prog_get_type(attr->attach_bpf_fd, - BPF_PROG_TYPE_SK_SKB); + prog = bpf_prog_get_type(attr->attach_bpf_fd, type); if (IS_ERR(prog)) { fdput(f); return PTR_ERR(prog); @@ -1312,17 +1526,27 @@ static int bpf_prog_attach(const union bpf_attr *attr) ptype = BPF_PROG_TYPE_CGROUP_SKB; break; case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: ptype = BPF_PROG_TYPE_CGROUP_SOCK; break; + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; + break; case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; case BPF_CGROUP_DEVICE: ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; + case BPF_SK_MSG_VERDICT: + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, true); case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr, true); + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true); default: return -EINVAL; } 
@@ -1331,6 +1555,11 @@ static int bpf_prog_attach(const union bpf_attr *attr) if (IS_ERR(prog)) return PTR_ERR(prog); + if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) { + bpf_prog_put(prog); + return -EINVAL; + } + cgrp = cgroup_get_from_fd(attr->target_fd); if (IS_ERR(cgrp)) { bpf_prog_put(prog); @@ -1367,17 +1596,27 @@ static int bpf_prog_detach(const union bpf_attr *attr) ptype = BPF_PROG_TYPE_CGROUP_SKB; break; case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: ptype = BPF_PROG_TYPE_CGROUP_SOCK; break; + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; + break; case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; case BPF_CGROUP_DEVICE: ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; + case BPF_SK_MSG_VERDICT: + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, false); case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr, false); + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false); default: return -EINVAL; } @@ -1416,6 +1655,12 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: break; @@ -1439,6 +1684,8 @@ static int bpf_prog_test_run(const union bpf_attr *attr, struct bpf_prog *prog; int ret = -ENOTSUPP; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; if (CHECK_ATTR(BPF_PROG_TEST_RUN)) return -EINVAL; @@ -1551,6 +1798,67 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) return fd; } +static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, 
+ unsigned long addr) +{ + int i; + + for (i = 0; i < prog->aux->used_map_cnt; i++) + if (prog->aux->used_maps[i] == (void *)addr) + return prog->aux->used_maps[i]; + return NULL; +} + +static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) +{ + const struct bpf_map *map; + struct bpf_insn *insns; + u64 imm; + int i; + + insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog), + GFP_USER); + if (!insns) + return insns; + + for (i = 0; i < prog->len; i++) { + if (insns[i].code == (BPF_JMP | BPF_TAIL_CALL)) { + insns[i].code = BPF_JMP | BPF_CALL; + insns[i].imm = BPF_FUNC_tail_call; + /* fall-through */ + } + if (insns[i].code == (BPF_JMP | BPF_CALL) || + insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) { + if (insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) + insns[i].code = BPF_JMP | BPF_CALL; + if (!bpf_dump_raw_ok()) + insns[i].imm = 0; + continue; + } + + if (insns[i].code != (BPF_LD | BPF_IMM | BPF_DW)) + continue; + + imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; + map = bpf_map_from_imm(prog, imm); + if (map) { + insns[i].src_reg = BPF_PSEUDO_MAP_FD; + insns[i].imm = map->id; + insns[i + 1].imm = 0; + continue; + } + + if (!bpf_dump_raw_ok() && + imm == (unsigned long)prog->aux) { + insns[i].imm = 0; + insns[i + 1].imm = 0; + continue; + } + } + + return insns; +} + static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -1598,24 +1906,51 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, goto done; } - ulen = info.jited_prog_len; - info.jited_prog_len = prog->jited_len; - if (info.jited_prog_len && ulen) { - uinsns = u64_to_user_ptr(info.jited_prog_insns); - ulen = min_t(u32, info.jited_prog_len, ulen); - if (copy_to_user(uinsns, prog->bpf_func, ulen)) - return -EFAULT; - } - ulen = info.xlated_prog_len; info.xlated_prog_len = bpf_prog_insn_size(prog); if (info.xlated_prog_len && ulen) { + struct bpf_insn *insns_sanitized; + bool fault; + + if (prog->blinded && 
!bpf_dump_raw_ok()) { + info.xlated_prog_insns = 0; + goto done; + } + insns_sanitized = bpf_insn_prepare_dump(prog); + if (!insns_sanitized) + return -ENOMEM; uinsns = u64_to_user_ptr(info.xlated_prog_insns); ulen = min_t(u32, info.xlated_prog_len, ulen); - if (copy_to_user(uinsns, prog->insnsi, ulen)) + fault = copy_to_user(uinsns, insns_sanitized, ulen); + kfree(insns_sanitized); + if (fault) return -EFAULT; } + if (bpf_prog_is_dev_bound(prog->aux)) { + err = bpf_prog_offload_info_fill(&info, prog); + if (err) + return err; + goto done; + } + + /* NOTE: the following code is supposed to be skipped for offload. + * bpf_prog_offload_info_fill() is the place to fill similar fields + * for offload. + */ + ulen = info.jited_prog_len; + info.jited_prog_len = prog->jited_len; + if (info.jited_prog_len && ulen) { + if (bpf_dump_raw_ok()) { + uinsns = u64_to_user_ptr(info.jited_prog_insns); + ulen = min_t(u32, info.jited_prog_len, ulen); + if (copy_to_user(uinsns, prog->bpf_func, ulen)) + return -EFAULT; + } else { + info.jited_prog_insns = 0; + } + } + done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) @@ -1646,6 +1981,12 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, info.map_flags = map->map_flags; memcpy(info.name, map->name, sizeof(map->name)); + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_info_fill(&info, map); + if (err) + return err; + } + if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) return -EFAULT; @@ -1687,7 +2028,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz union bpf_attr attr = {}; int err; - if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled) + if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) return -EPERM; err = check_uarg_tail_zero(uattr, sizeof(attr), size); @@ -1759,6 +2100,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case 
BPF_OBJ_GET_INFO_BY_FD: err = bpf_obj_get_info_by_fd(&attr, uattr); break; + case BPF_RAW_TRACEPOINT_OPEN: + err = bpf_raw_tracepoint_open(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 13551e623501..5dd1dcb902bf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20,6 +20,8 @@ #include <linux/file.h> #include <linux/vmalloc.h> #include <linux/stringify.h> +#include <linux/bsearch.h> +#include <linux/sort.h> #include "disasm.h" @@ -166,23 +168,12 @@ struct bpf_call_arg_meta { static DEFINE_MUTEX(bpf_verifier_lock); -/* log_level controls verbosity level of eBPF verifier. - * verbose() is used to dump the verification trace to the log, so the user - * can figure out what's wrong with the program - */ -static __printf(2, 3) void verbose(struct bpf_verifier_env *env, - const char *fmt, ...) +void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, + va_list args) { - struct bpf_verifer_log *log = &env->log; unsigned int n; - va_list args; - - if (!log->level || !log->ubuf || bpf_verifier_log_full(log)) - return; - va_start(args, fmt); n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args); - va_end(args); WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1, "verifier log line truncated - local buffer too short\n"); @@ -196,6 +187,37 @@ static __printf(2, 3) void verbose(struct bpf_verifier_env *env, log->ubuf = NULL; } +/* log_level controls verbosity level of eBPF verifier. + * bpf_verifier_log_write() is used to dump the verification trace to the log, + * so the user can figure out what's wrong with the program + */ +__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, + const char *fmt, ...) 
+{ + va_list args; + + if (!bpf_verifier_log_needed(&env->log)) + return; + + va_start(args, fmt); + bpf_verifier_vlog(&env->log, fmt, args); + va_end(args); +} +EXPORT_SYMBOL_GPL(bpf_verifier_log_write); + +__printf(2, 3) static void verbose(void *private_data, const char *fmt, ...) +{ + struct bpf_verifier_env *env = private_data; + va_list args; + + if (!bpf_verifier_log_needed(&env->log)) + return; + + va_start(args, fmt); + bpf_verifier_vlog(&env->log, fmt, args); + va_end(args); +} + static bool type_is_pkt_pointer(enum bpf_reg_type type) { return type == PTR_TO_PACKET || @@ -216,23 +238,48 @@ static const char * const reg_type_str[] = { [PTR_TO_PACKET_END] = "pkt_end", }; +static void print_liveness(struct bpf_verifier_env *env, + enum bpf_reg_liveness live) +{ + if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN)) + verbose(env, "_"); + if (live & REG_LIVE_READ) + verbose(env, "r"); + if (live & REG_LIVE_WRITTEN) + verbose(env, "w"); +} + +static struct bpf_func_state *func(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg) +{ + struct bpf_verifier_state *cur = env->cur_state; + + return cur->frame[reg->frameno]; +} + static void print_verifier_state(struct bpf_verifier_env *env, - struct bpf_verifier_state *state) + const struct bpf_func_state *state) { - struct bpf_reg_state *reg; + const struct bpf_reg_state *reg; enum bpf_reg_type t; int i; + if (state->frameno) + verbose(env, " frame%d:", state->frameno); for (i = 0; i < MAX_BPF_REG; i++) { reg = &state->regs[i]; t = reg->type; if (t == NOT_INIT) continue; - verbose(env, " R%d=%s", i, reg_type_str[t]); + verbose(env, " R%d", i); + print_liveness(env, reg->live); + verbose(env, "=%s", reg_type_str[t]); if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && tnum_is_const(reg->var_off)) { /* reg->off should be 0 for SCALAR_VALUE */ verbose(env, "%lld", reg->var_off.value + reg->off); + if (t == PTR_TO_STACK) + verbose(env, ",call_%d", func(env, reg)->callsite); } else { verbose(env, "(id=%d", reg->id); 
if (t != SCALAR_VALUE) @@ -277,16 +324,21 @@ static void print_verifier_state(struct bpf_verifier_env *env, } } for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] == STACK_SPILL) - verbose(env, " fp%d=%s", - -MAX_BPF_STACK + i * BPF_REG_SIZE, + if (state->stack[i].slot_type[0] == STACK_SPILL) { + verbose(env, " fp%d", + (-i - 1) * BPF_REG_SIZE); + print_liveness(env, state->stack[i].spilled_ptr.live); + verbose(env, "=%s", reg_type_str[state->stack[i].spilled_ptr.type]); + } + if (state->stack[i].slot_type[0] == STACK_ZERO) + verbose(env, " fp%d=0", (-i - 1) * BPF_REG_SIZE); } verbose(env, "\n"); } -static int copy_stack_state(struct bpf_verifier_state *dst, - const struct bpf_verifier_state *src) +static int copy_stack_state(struct bpf_func_state *dst, + const struct bpf_func_state *src) { if (!src->stack) return 0; @@ -302,13 +354,13 @@ static int copy_stack_state(struct bpf_verifier_state *dst, /* do_check() starts with zero-sized stack in struct bpf_verifier_state to * make it consume minimal amount of memory. check_stack_write() access from - * the program calls into realloc_verifier_state() to grow the stack size. + * the program calls into realloc_func_state() to grow the stack size. * Note there is a non-zero 'parent' pointer inside bpf_verifier_state * which this function copies over. 
It points to previous bpf_verifier_state * which is never reallocated */ -static int realloc_verifier_state(struct bpf_verifier_state *state, int size, - bool copy_old) +static int realloc_func_state(struct bpf_func_state *state, int size, + bool copy_old) { u32 old_size = state->allocated_stack; struct bpf_stack_state *new_stack; @@ -341,10 +393,23 @@ static int realloc_verifier_state(struct bpf_verifier_state *state, int size, return 0; } +static void free_func_state(struct bpf_func_state *state) +{ + if (!state) + return; + kfree(state->stack); + kfree(state); +} + static void free_verifier_state(struct bpf_verifier_state *state, bool free_self) { - kfree(state->stack); + int i; + + for (i = 0; i <= state->curframe; i++) { + free_func_state(state->frame[i]); + state->frame[i] = NULL; + } if (free_self) kfree(state); } @@ -352,18 +417,46 @@ static void free_verifier_state(struct bpf_verifier_state *state, /* copy verifier state from src to dst growing dst stack space * when necessary to accommodate larger src stack */ -static int copy_verifier_state(struct bpf_verifier_state *dst, - const struct bpf_verifier_state *src) +static int copy_func_state(struct bpf_func_state *dst, + const struct bpf_func_state *src) { int err; - err = realloc_verifier_state(dst, src->allocated_stack, false); + err = realloc_func_state(dst, src->allocated_stack, false); if (err) return err; - memcpy(dst, src, offsetof(struct bpf_verifier_state, allocated_stack)); + memcpy(dst, src, offsetof(struct bpf_func_state, allocated_stack)); return copy_stack_state(dst, src); } +static int copy_verifier_state(struct bpf_verifier_state *dst_state, + const struct bpf_verifier_state *src) +{ + struct bpf_func_state *dst; + int i, err; + + /* if dst has more stack frames then src frame, free them */ + for (i = src->curframe + 1; i <= dst_state->curframe; i++) { + free_func_state(dst_state->frame[i]); + dst_state->frame[i] = NULL; + } + dst_state->curframe = src->curframe; + dst_state->parent = 
src->parent; + for (i = 0; i <= src->curframe; i++) { + dst = dst_state->frame[i]; + if (!dst) { + dst = kzalloc(sizeof(*dst), GFP_KERNEL); + if (!dst) + return -ENOMEM; + dst_state->frame[i] = dst; + } + err = copy_func_state(dst, src->frame[i]); + if (err) + return err; + } + return 0; +} + static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, int *insn_idx) { @@ -416,6 +509,8 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, } return &elem->st; err: + free_verifier_state(env->cur_state, true); + env->cur_state = NULL; /* pop all elements and return */ while (!pop_stack(env, NULL, NULL)); return NULL; @@ -449,6 +544,13 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg) __mark_reg_known(reg, 0); } +static void __mark_reg_const_zero(struct bpf_reg_state *reg) +{ + __mark_reg_known(reg, 0); + reg->off = 0; + reg->type = SCALAR_VALUE; +} + static void mark_reg_known_zero(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno) { @@ -560,6 +662,7 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg) reg->id = 0; reg->off = 0; reg->var_off = tnum_unknown; + reg->frameno = 0; __mark_reg_unbounded(reg); } @@ -568,8 +671,8 @@ static void mark_reg_unknown(struct bpf_verifier_env *env, { if (WARN_ON(regno >= MAX_BPF_REG)) { verbose(env, "mark_reg_unknown(regs, %u)\n", regno); - /* Something bad happened, let's kill all regs */ - for (regno = 0; regno < MAX_BPF_REG; regno++) + /* Something bad happened, let's kill all regs except FP */ + for (regno = 0; regno < BPF_REG_FP; regno++) __mark_reg_not_init(regs + regno); return; } @@ -587,8 +690,8 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, { if (WARN_ON(regno >= MAX_BPF_REG)) { verbose(env, "mark_reg_not_init(regs, %u)\n", regno); - /* Something bad happened, let's kill all regs */ - for (regno = 0; regno < MAX_BPF_REG; regno++) + /* Something bad happened, let's kill all regs except FP */ + for (regno = 0; regno < BPF_REG_FP; 
regno++) __mark_reg_not_init(regs + regno); return; } @@ -596,8 +699,9 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, } static void init_reg_state(struct bpf_verifier_env *env, - struct bpf_reg_state *regs) + struct bpf_func_state *state) { + struct bpf_reg_state *regs = state->regs; int i; for (i = 0; i < MAX_BPF_REG; i++) { @@ -608,41 +712,218 @@ static void init_reg_state(struct bpf_verifier_env *env, /* frame pointer */ regs[BPF_REG_FP].type = PTR_TO_STACK; mark_reg_known_zero(env, regs, BPF_REG_FP); + regs[BPF_REG_FP].frameno = state->frameno; /* 1st arg to a function */ regs[BPF_REG_1].type = PTR_TO_CTX; mark_reg_known_zero(env, regs, BPF_REG_1); } +#define BPF_MAIN_FUNC (-1) +static void init_func_state(struct bpf_verifier_env *env, + struct bpf_func_state *state, + int callsite, int frameno, int subprogno) +{ + state->callsite = callsite; + state->frameno = frameno; + state->subprogno = subprogno; + init_reg_state(env, state); +} + enum reg_arg_type { SRC_OP, /* register is used as source operand */ DST_OP, /* register is used as destination operand */ DST_OP_NO_MARK /* same as above, check only, don't mark */ }; -static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno) +static int cmp_subprogs(const void *a, const void *b) +{ + return *(int *)a - *(int *)b; +} + +static int find_subprog(struct bpf_verifier_env *env, int off) +{ + u32 *p; + + p = bsearch(&off, env->subprog_starts, env->subprog_cnt, + sizeof(env->subprog_starts[0]), cmp_subprogs); + if (!p) + return -ENOENT; + return p - env->subprog_starts; + +} + +static int add_subprog(struct bpf_verifier_env *env, int off) +{ + int insn_cnt = env->prog->len; + int ret; + + if (off >= insn_cnt || off < 0) { + verbose(env, "call to invalid destination\n"); + return -EINVAL; + } + ret = find_subprog(env, off); + if (ret >= 0) + return 0; + if (env->subprog_cnt >= BPF_MAX_SUBPROGS) { + verbose(env, "too many subprograms\n"); + return -E2BIG; + } + 
env->subprog_starts[env->subprog_cnt++] = off; + sort(env->subprog_starts, env->subprog_cnt, + sizeof(env->subprog_starts[0]), cmp_subprogs, NULL); + return 0; +} + +static int check_subprogs(struct bpf_verifier_env *env) +{ + int i, ret, subprog_start, subprog_end, off, cur_subprog = 0; + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + + /* determine subprog starts. The end is one before the next starts */ + for (i = 0; i < insn_cnt; i++) { + if (insn[i].code != (BPF_JMP | BPF_CALL)) + continue; + if (insn[i].src_reg != BPF_PSEUDO_CALL) + continue; + if (!env->allow_ptr_leaks) { + verbose(env, "function calls to other bpf functions are allowed for root only\n"); + return -EPERM; + } + if (bpf_prog_is_dev_bound(env->prog->aux)) { + verbose(env, "function calls in offloaded programs are not supported yet\n"); + return -EINVAL; + } + ret = add_subprog(env, i + insn[i].imm + 1); + if (ret < 0) + return ret; + } + + if (env->log.level > 1) + for (i = 0; i < env->subprog_cnt; i++) + verbose(env, "func#%d @%d\n", i, env->subprog_starts[i]); + + /* now check that all jumps are within the same subprog */ + subprog_start = 0; + if (env->subprog_cnt == cur_subprog) + subprog_end = insn_cnt; + else + subprog_end = env->subprog_starts[cur_subprog++]; + for (i = 0; i < insn_cnt; i++) { + u8 code = insn[i].code; + + if (BPF_CLASS(code) != BPF_JMP) + goto next; + if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) + goto next; + off = i + insn[i].off + 1; + if (off < subprog_start || off >= subprog_end) { + verbose(env, "jump out of range from insn %d to %d\n", i, off); + return -EINVAL; + } +next: + if (i == subprog_end - 1) { + /* to avoid fall-through from one subprog into another + * the last insn of the subprog should be either exit + * or unconditional jump back + */ + if (code != (BPF_JMP | BPF_EXIT) && + code != (BPF_JMP | BPF_JA)) { + verbose(env, "last insn is not an exit or jmp\n"); + return -EINVAL; + } + subprog_start = 
subprog_end; + if (env->subprog_cnt == cur_subprog) + subprog_end = insn_cnt; + else + subprog_end = env->subprog_starts[cur_subprog++]; + } + } + return 0; +} + +static +struct bpf_verifier_state *skip_callee(struct bpf_verifier_env *env, + const struct bpf_verifier_state *state, + struct bpf_verifier_state *parent, + u32 regno) +{ + struct bpf_verifier_state *tmp = NULL; + + /* 'parent' could be a state of caller and + * 'state' could be a state of callee. In such case + * parent->curframe < state->curframe + * and it's ok for r1 - r5 registers + * + * 'parent' could be a callee's state after it bpf_exit-ed. + * In such case parent->curframe > state->curframe + * and it's ok for r0 only + */ + if (parent->curframe == state->curframe || + (parent->curframe < state->curframe && + regno >= BPF_REG_1 && regno <= BPF_REG_5) || + (parent->curframe > state->curframe && + regno == BPF_REG_0)) + return parent; + + if (parent->curframe > state->curframe && + regno >= BPF_REG_6) { + /* for callee saved regs we have to skip the whole chain + * of states that belong to callee and mark as LIVE_READ + * the registers before the call + */ + tmp = parent; + while (tmp && tmp->curframe != state->curframe) { + tmp = tmp->parent; + } + if (!tmp) + goto bug; + parent = tmp; + } else { + goto bug; + } + return parent; +bug: + verbose(env, "verifier bug regno %d tmp %p\n", regno, tmp); + verbose(env, "regno %d parent frame %d current frame %d\n", + regno, parent->curframe, state->curframe); + return NULL; +} + +static int mark_reg_read(struct bpf_verifier_env *env, + const struct bpf_verifier_state *state, + struct bpf_verifier_state *parent, + u32 regno) { - struct bpf_verifier_state *parent = state->parent; + bool writes = parent == state->parent; /* Observe write marks */ if (regno == BPF_REG_FP) /* We don't need to worry about FP liveness because it's read-only */ - return; + return 0; while (parent) { /* if read wasn't screened by an earlier write ... 
*/ - if (state->regs[regno].live & REG_LIVE_WRITTEN) + if (writes && state->frame[state->curframe]->regs[regno].live & REG_LIVE_WRITTEN) break; + parent = skip_callee(env, state, parent, regno); + if (!parent) + return -EFAULT; /* ... then we depend on parent's value */ - parent->regs[regno].live |= REG_LIVE_READ; + parent->frame[parent->curframe]->regs[regno].live |= REG_LIVE_READ; state = parent; parent = state->parent; + writes = true; } + return 0; } static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, enum reg_arg_type t) { - struct bpf_reg_state *regs = env->cur_state->regs; + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_reg_state *regs = state->regs; if (regno >= MAX_BPF_REG) { verbose(env, "R%d is invalid\n", regno); @@ -655,7 +936,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, verbose(env, "R%d !read_ok\n", regno); return -EACCES; } - mark_reg_read(env->cur_state, regno); + return mark_reg_read(env, vstate, vstate->parent, regno); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { @@ -686,17 +967,25 @@ static bool is_spillable_regtype(enum bpf_reg_type type) } } +/* Does this register contain a constant zero? 
*/ +static bool register_is_null(struct bpf_reg_state *reg) +{ + return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0); +} + /* check_stack_read/write functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ static int check_stack_write(struct bpf_verifier_env *env, - struct bpf_verifier_state *state, int off, - int size, int value_regno) + struct bpf_func_state *state, /* func where register points to */ + int off, int size, int value_regno) { + struct bpf_func_state *cur; /* state of the current function */ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; + enum bpf_reg_type type; - err = realloc_verifier_state(state, round_up(slot + 1, BPF_REG_SIZE), - true); + err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE), + true); if (err) return err; /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, @@ -709,8 +998,9 @@ static int check_stack_write(struct bpf_verifier_env *env, return -EACCES; } + cur = env->cur_state->frame[env->cur_state->curframe]; if (value_regno >= 0 && - is_spillable_regtype(state->regs[value_regno].type)) { + is_spillable_regtype((type = cur->regs[value_regno].type))) { /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { @@ -718,51 +1008,116 @@ static int check_stack_write(struct bpf_verifier_env *env, return -EACCES; } + if (state != cur && type == PTR_TO_STACK) { + verbose(env, "cannot spill pointers to stack into stack frame of the caller\n"); + return -EINVAL; + } + /* save register state */ - state->stack[spi].spilled_ptr = state->regs[value_regno]; + state->stack[spi].spilled_ptr = cur->regs[value_regno]; state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; for (i = 0; i < BPF_REG_SIZE; i++) state->stack[spi].slot_type[i] = STACK_SPILL; } else { + u8 type = STACK_MISC; + /* regular write of data into stack */ state->stack[spi].spilled_ptr = (struct bpf_reg_state) {}; + /* only mark the slot as 
written if all 8 bytes were written + * otherwise read propagation may incorrectly stop too soon + * when stack slots are partially written. + * This heuristic means that read propagation will be + * conservative, since it will add reg_live_read marks + * to stack slots all the way to first state when programs + * writes+reads less than 8 bytes + */ + if (size == BPF_REG_SIZE) + state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; + + /* when we zero initialize stack slots mark them as such */ + if (value_regno >= 0 && + register_is_null(&cur->regs[value_regno])) + type = STACK_ZERO; + for (i = 0; i < size; i++) state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = - STACK_MISC; + type; } return 0; } -static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slot) +/* registers of every function are unique and mark_reg_read() propagates + * the liveness in the following cases: + * - from callee into caller for R1 - R5 that were used as arguments + * - from caller into callee for R0 that used as result of the call + * - from caller to the same caller skipping states of the callee for R6 - R9, + * since R6 - R9 are callee saved by implicit function prologue and + * caller's R6 != callee's R6, so when we propagate liveness up to + * parent states we need to skip callee states for R6 - R9. + * + * stack slot marking is different, since stacks of caller and callee are + * accessible in both (since caller can pass a pointer to caller's stack to + * callee which can pass it to another function), hence mark_stack_slot_read() + * has to propagate the stack liveness to all parent states at given frame number. + * Consider code: + * f1() { + * ptr = fp - 8; + * *ptr = ctx; + * call f2 { + * .. = *ptr; + * } + * .. = *ptr; + * } + * First *ptr is reading from f1's stack and mark_stack_slot_read() has + * to mark liveness at the f1's frame and not f2's frame. 
+ * Second *ptr is also reading from f1's stack and mark_stack_slot_read() has + * to propagate liveness to f2 states at f1's frame level and further into + * f1 states at f1's frame level until write into that stack slot + */ +static void mark_stack_slot_read(struct bpf_verifier_env *env, + const struct bpf_verifier_state *state, + struct bpf_verifier_state *parent, + int slot, int frameno) { - struct bpf_verifier_state *parent = state->parent; + bool writes = parent == state->parent; /* Observe write marks */ while (parent) { + if (parent->frame[frameno]->allocated_stack <= slot * BPF_REG_SIZE) + /* since LIVE_WRITTEN mark is only done for full 8-byte + * write the read marks are conservative and parent + * state may not even have the stack allocated. In such case + * end the propagation, since the loop reached beginning + * of the function + */ + break; /* if read wasn't screened by an earlier write ... */ - if (state->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) + if (writes && state->frame[frameno]->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) break; /* ... 
then we depend on parent's value */ - parent->stack[slot].spilled_ptr.live |= REG_LIVE_READ; + parent->frame[frameno]->stack[slot].spilled_ptr.live |= REG_LIVE_READ; state = parent; parent = state->parent; + writes = true; } } static int check_stack_read(struct bpf_verifier_env *env, - struct bpf_verifier_state *state, int off, int size, - int value_regno) + struct bpf_func_state *reg_state /* func where register points to */, + int off, int size, int value_regno) { + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; u8 *stype; - if (state->allocated_stack <= slot) { + if (reg_state->allocated_stack <= slot) { verbose(env, "invalid read from stack off %d+0 size %d\n", off, size); return -EACCES; } - stype = state->stack[spi].slot_type; + stype = reg_state->stack[spi].slot_type; if (stype[0] == STACK_SPILL) { if (size != BPF_REG_SIZE) { @@ -778,21 +1133,44 @@ static int check_stack_read(struct bpf_verifier_env *env, if (value_regno >= 0) { /* restore register state from stack */ - state->regs[value_regno] = state->stack[spi].spilled_ptr; - mark_stack_slot_read(state, spi); + state->regs[value_regno] = reg_state->stack[spi].spilled_ptr; + /* mark reg as written since spilled pointer state likely + * has its liveness marks cleared by is_state_visited() + * which resets stack/reg liveness for state transitions + */ + state->regs[value_regno].live |= REG_LIVE_WRITTEN; } + mark_stack_slot_read(env, vstate, vstate->parent, spi, + reg_state->frameno); return 0; } else { + int zeros = 0; + for (i = 0; i < size; i++) { - if (stype[(slot - i) % BPF_REG_SIZE] != STACK_MISC) { - verbose(env, "invalid read from stack off %d+%d size %d\n", - off, i, size); - return -EACCES; + if (stype[(slot - i) % BPF_REG_SIZE] == STACK_MISC) + continue; + if (stype[(slot - i) % BPF_REG_SIZE] == STACK_ZERO) { + zeros++; + continue; + } + verbose(env, "invalid read from stack off 
%d+%d size %d\n", + off, i, size); + return -EACCES; + } + mark_stack_slot_read(env, vstate, vstate->parent, spi, + reg_state->frameno); + if (value_regno >= 0) { + if (zeros == size) { + /* any size read into register is zero extended, + * so the whole register == const_zero + */ + __mark_reg_const_zero(&state->regs[value_regno]); + } else { + /* have read misc data from the stack */ + mark_reg_unknown(env, state->regs, value_regno); } + state->regs[value_regno].live |= REG_LIVE_WRITTEN; } - if (value_regno >= 0) - /* have read misc data from the stack */ - mark_reg_unknown(env, state->regs, value_regno); return 0; } } @@ -817,7 +1195,8 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed) { - struct bpf_verifier_state *state = env->cur_state; + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *reg = &state->regs[regno]; int err; @@ -881,6 +1260,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, case BPF_PROG_TYPE_XDP: case BPF_PROG_TYPE_LWT_XMIT: case BPF_PROG_TYPE_SK_SKB: + case BPF_PROG_TYPE_SK_MSG: if (meta) return meta->pkt_access; @@ -943,7 +1323,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, }; if (env->ops->is_valid_access && - env->ops->is_valid_access(off, size, t, &info)) { + env->ops->is_valid_access(off, size, t, env->prog, &info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower @@ -985,6 +1365,13 @@ static bool is_ctx_reg(struct bpf_verifier_env *env, int regno) return reg->type == PTR_TO_CTX; } +static bool is_pkt_reg(struct bpf_verifier_env *env, int regno) +{ + const struct bpf_reg_state *reg = cur_regs(env) + regno; + 
+ return type_is_pkt_pointer(reg->type); +} + static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int off, int size, bool strict) @@ -1045,10 +1432,10 @@ static int check_generic_ptr_alignment(struct bpf_verifier_env *env, } static int check_ptr_alignment(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, - int off, int size) + const struct bpf_reg_state *reg, int off, + int size, bool strict_alignment_once) { - bool strict = env->strict_alignment; + bool strict = env->strict_alignment || strict_alignment_once; const char *pointer_desc = ""; switch (reg->type) { @@ -1079,6 +1466,103 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, strict); } +static int update_stack_depth(struct bpf_verifier_env *env, + const struct bpf_func_state *func, + int off) +{ + u16 stack = env->subprog_stack_depth[func->subprogno]; + + if (stack >= -off) + return 0; + + /* update known max for given subprogram */ + env->subprog_stack_depth[func->subprogno] = -off; + return 0; +} + +/* starting from main bpf function walk all instructions of the function + * and recursively walk all callees that given function can call. + * Ignore jump and exit insns. + * Since recursion is prevented by check_cfg() this algorithm + * only needs a local stack of MAX_CALL_FRAMES to remember callsites + */ +static int check_max_stack_depth(struct bpf_verifier_env *env) +{ + int depth = 0, frame = 0, subprog = 0, i = 0, subprog_end; + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + int ret_insn[MAX_CALL_FRAMES]; + int ret_prog[MAX_CALL_FRAMES]; + +process_func: + /* round up to 32-bytes, since this is granularity + * of interpreter stack size + */ + depth += round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32); + if (depth > MAX_BPF_STACK) { + verbose(env, "combined stack size of %d calls is %d. 
Too large\n", + frame + 1, depth); + return -EACCES; + } +continue_func: + if (env->subprog_cnt == subprog) + subprog_end = insn_cnt; + else + subprog_end = env->subprog_starts[subprog]; + for (; i < subprog_end; i++) { + if (insn[i].code != (BPF_JMP | BPF_CALL)) + continue; + if (insn[i].src_reg != BPF_PSEUDO_CALL) + continue; + /* remember insn and function to return to */ + ret_insn[frame] = i + 1; + ret_prog[frame] = subprog; + + /* find the callee */ + i = i + insn[i].imm + 1; + subprog = find_subprog(env, i); + if (subprog < 0) { + WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", + i); + return -EFAULT; + } + subprog++; + frame++; + if (frame >= MAX_CALL_FRAMES) { + WARN_ONCE(1, "verifier bug. Call stack is too deep\n"); + return -EFAULT; + } + goto process_func; + } + /* end of for() loop means the last insn of the 'subprog' + * was reached. Doesn't matter whether it was JA or EXIT + */ + if (frame == 0) + return 0; + depth -= round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32); + frame--; + i = ret_insn[frame]; + subprog = ret_prog[frame]; + goto continue_func; +} + +#ifndef CONFIG_BPF_JIT_ALWAYS_ON +static int get_callee_stack_depth(struct bpf_verifier_env *env, + const struct bpf_insn *insn, int idx) +{ + int start = idx + insn->imm + 1, subprog; + + subprog = find_subprog(env, start); + if (subprog < 0) { + WARN_ONCE(1, "verifier bug. 
No program starts at insn %d\n", + start); + return -EFAULT; + } + subprog++; + return env->subprog_stack_depth[subprog]; +} +#endif + /* truncate register to smaller size (in bytes) * must be called with size < BPF_REG_SIZE */ @@ -1108,13 +1592,13 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) * if t==write && value_regno==-1, some unknown value is stored into memory * if t==read && value_regno==-1, don't care what we read from memory */ -static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off, - int bpf_size, enum bpf_access_type t, - int value_regno) +static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, + int off, int bpf_size, enum bpf_access_type t, + int value_regno, bool strict_alignment_once) { - struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = regs + regno; + struct bpf_func_state *state; int size, err = 0; size = bpf_size_to_bytes(bpf_size); @@ -1122,7 +1606,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return size; /* alignment checks will add in reg->off themselves */ - err = check_ptr_alignment(env, reg, off, size); + err = check_ptr_alignment(env, reg, off, size, strict_alignment_once); if (err) return err; @@ -1203,8 +1687,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return -EACCES; } - if (env->prog->aux->stack_depth < -off) - env->prog->aux->stack_depth = -off; + state = func(env, reg); + err = update_stack_depth(env, state, off); + if (err) + return err; if (t == BPF_WRITE) err = check_stack_write(env, state, off, size, @@ -1265,27 +1751,23 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins return -EACCES; } - if (is_ctx_reg(env, insn->dst_reg)) { - verbose(env, "BPF_XADD stores into R%d context is not allowed\n", - insn->dst_reg); + if (is_ctx_reg(env, insn->dst_reg) || + 
is_pkt_reg(env, insn->dst_reg)) { + verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", + insn->dst_reg, is_ctx_reg(env, insn->dst_reg) ? + "context" : "packet"); return -EACCES; } /* check whether atomic_add can read the memory */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_READ, -1); + BPF_SIZE(insn->code), BPF_READ, -1, true); if (err) return err; /* check whether atomic_add can write into the same memory */ return check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_WRITE, -1); -} - -/* Does this register contain a constant zero? */ -static bool register_is_null(struct bpf_reg_state reg) -{ - return reg.type == SCALAR_VALUE && tnum_equals_const(reg.var_off, 0); + BPF_SIZE(insn->code), BPF_WRITE, -1, true); } /* when register 'regno' is passed into function that will read 'access_size' @@ -1298,32 +1780,32 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, int access_size, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { - struct bpf_verifier_state *state = env->cur_state; - struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *reg = cur_regs(env) + regno; + struct bpf_func_state *state = func(env, reg); int off, i, slot, spi; - if (regs[regno].type != PTR_TO_STACK) { + if (reg->type != PTR_TO_STACK) { /* Allow zero-byte read from NULL, regardless of pointer type */ if (zero_size_allowed && access_size == 0 && - register_is_null(regs[regno])) + register_is_null(reg)) return 0; verbose(env, "R%d type=%s expected=%s\n", regno, - reg_type_str[regs[regno].type], + reg_type_str[reg->type], reg_type_str[PTR_TO_STACK]); return -EACCES; } /* Only allow fixed-offset stack reads */ - if (!tnum_is_const(regs[regno].var_off)) { + if (!tnum_is_const(reg->var_off)) { char tn_buf[48]; - tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off); + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, "invalid variable stack read R%d 
var_off=%s\n", regno, tn_buf); return -EACCES; } - off = regs[regno].off + regs[regno].var_off.value; + off = reg->off + reg->var_off.value; if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || access_size < 0 || (access_size == 0 && !zero_size_allowed)) { verbose(env, "invalid stack type R%d off=%d access_size=%d\n", @@ -1331,9 +1813,6 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, return -EACCES; } - if (env->prog->aux->stack_depth < -off) - env->prog->aux->stack_depth = -off; - if (meta && meta->raw_mode) { meta->access_size = access_size; meta->regno = regno; @@ -1341,17 +1820,32 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, } for (i = 0; i < access_size; i++) { + u8 *stype; + slot = -(off + i) - 1; spi = slot / BPF_REG_SIZE; - if (state->allocated_stack <= slot || - state->stack[spi].slot_type[slot % BPF_REG_SIZE] != - STACK_MISC) { - verbose(env, "invalid indirect read from stack off %d+%d size %d\n", - off, i, access_size); - return -EACCES; + if (state->allocated_stack <= slot) + goto err; + stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE]; + if (*stype == STACK_MISC) + goto mark; + if (*stype == STACK_ZERO) { + /* helper can write anything into the stack */ + *stype = STACK_MISC; + goto mark; } +err: + verbose(env, "invalid indirect read from stack off %d+%d size %d\n", + off, i, access_size); + return -EACCES; +mark: + /* reading any byte out of 8-byte 'spill_slot' will cause + * the whole slot to be marked as 'read' + */ + mark_stack_slot_read(env, env->cur_state, env->cur_state->parent, + spi, state->frameno); } - return 0; + return update_stack_depth(env, state, off); } static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, @@ -1374,6 +1868,19 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, } } +static bool arg_type_is_mem_ptr(enum bpf_arg_type type) +{ + return type == ARG_PTR_TO_MEM || + type == ARG_PTR_TO_MEM_OR_NULL || 
+ type == ARG_PTR_TO_UNINIT_MEM; +} + +static bool arg_type_is_mem_size(enum bpf_arg_type type) +{ + return type == ARG_CONST_SIZE || + type == ARG_CONST_SIZE_OR_ZERO; +} + static int check_func_arg(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) @@ -1423,15 +1930,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_CTX; if (type != expected_type) goto err_type; - } else if (arg_type == ARG_PTR_TO_MEM || - arg_type == ARG_PTR_TO_MEM_OR_NULL || - arg_type == ARG_PTR_TO_UNINIT_MEM) { + } else if (arg_type_is_mem_ptr(arg_type)) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be * passed in as argument, it's a SCALAR_VALUE type. Final test * happens during stack boundary checking. */ - if (register_is_null(*reg) && + if (register_is_null(reg) && arg_type == ARG_PTR_TO_MEM_OR_NULL) /* final test in check_stack_boundary() */; else if (!type_is_pkt_pointer(type) && @@ -1486,25 +1991,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_stack_boundary(env, regno, meta->map_ptr->value_size, false, NULL); - } else if (arg_type == ARG_CONST_SIZE || - arg_type == ARG_CONST_SIZE_OR_ZERO) { + } else if (arg_type_is_mem_size(arg_type)) { bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); - /* bpf_xxx(..., buf, len) call will access 'len' bytes - * from stack pointer 'buf'. Check it - * note: regno == len, regno - 1 == buf - */ - if (regno == 0) { - /* kernel subsystem misconfigured verifier */ - verbose(env, - "ARG_CONST_SIZE cannot be first argument\n"); - return -EACCES; - } - /* The register is SCALAR_VALUE; the access check * happens using its boundaries. 
*/ - if (!tnum_is_const(reg->var_off)) /* For unprivileged variable accesses, disable raw * mode so that the program is required to @@ -1592,7 +2084,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_SOCKMAP: if (func_id != BPF_FUNC_sk_redirect_map && func_id != BPF_FUNC_sock_map_update && - func_id != BPF_FUNC_map_delete_elem) + func_id != BPF_FUNC_map_delete_elem && + func_id != BPF_FUNC_msg_redirect_map) goto error; break; default: @@ -1604,6 +2097,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_tail_call: if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) goto error; + if (env->subprog_cnt) { + verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n"); + return -EINVAL; + } break; case BPF_FUNC_perf_event_read: case BPF_FUNC_perf_event_output: @@ -1626,6 +2123,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_FUNC_sk_redirect_map: + case BPF_FUNC_msg_redirect_map: if (map->map_type != BPF_MAP_TYPE_SOCKMAP) goto error; break; @@ -1644,7 +2142,7 @@ error: return -EINVAL; } -static int check_raw_mode(const struct bpf_func_proto *fn) +static bool check_raw_mode_ok(const struct bpf_func_proto *fn) { int count = 0; @@ -1659,15 +2157,52 @@ static int check_raw_mode(const struct bpf_func_proto *fn) if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM) count++; - return count > 1 ? -EINVAL : 0; + /* We only support one arg being in raw mode at the moment, + * which is sufficient for the helper functions we have + * right now. 
+ */ + return count <= 1; +} + +static bool check_args_pair_invalid(enum bpf_arg_type arg_curr, + enum bpf_arg_type arg_next) +{ + return (arg_type_is_mem_ptr(arg_curr) && + !arg_type_is_mem_size(arg_next)) || + (!arg_type_is_mem_ptr(arg_curr) && + arg_type_is_mem_size(arg_next)); +} + +static bool check_arg_pair_ok(const struct bpf_func_proto *fn) +{ + /* bpf_xxx(..., buf, len) call will access 'len' + * bytes from memory 'buf'. Both arg types need + * to be paired, so make sure there's no buggy + * helper function specification. + */ + if (arg_type_is_mem_size(fn->arg1_type) || + arg_type_is_mem_ptr(fn->arg5_type) || + check_args_pair_invalid(fn->arg1_type, fn->arg2_type) || + check_args_pair_invalid(fn->arg2_type, fn->arg3_type) || + check_args_pair_invalid(fn->arg3_type, fn->arg4_type) || + check_args_pair_invalid(fn->arg4_type, fn->arg5_type)) + return false; + + return true; +} + +static int check_func_proto(const struct bpf_func_proto *fn) +{ + return check_raw_mode_ok(fn) && + check_arg_pair_ok(fn) ? 0 : -EINVAL; } /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] * are now invalid, so turn them into unknown SCALAR_VALUE. 
*/ -static void clear_all_pkt_pointers(struct bpf_verifier_env *env) +static void __clear_all_pkt_pointers(struct bpf_verifier_env *env, + struct bpf_func_state *state) { - struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *regs = state->regs, *reg; int i; @@ -1684,7 +2219,121 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) } } -static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) +static void clear_all_pkt_pointers(struct bpf_verifier_env *env) +{ + struct bpf_verifier_state *vstate = env->cur_state; + int i; + + for (i = 0; i <= vstate->curframe; i++) + __clear_all_pkt_pointers(env, vstate->frame[i]); +} + +static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx) +{ + struct bpf_verifier_state *state = env->cur_state; + struct bpf_func_state *caller, *callee; + int i, subprog, target_insn; + + if (state->curframe + 1 >= MAX_CALL_FRAMES) { + verbose(env, "the call stack of %d frames is too deep\n", + state->curframe + 2); + return -E2BIG; + } + + target_insn = *insn_idx + insn->imm; + subprog = find_subprog(env, target_insn + 1); + if (subprog < 0) { + verbose(env, "verifier bug. No program starts at insn %d\n", + target_insn + 1); + return -EFAULT; + } + + caller = state->frame[state->curframe]; + if (state->frame[state->curframe + 1]) { + verbose(env, "verifier bug. Frame %d already allocated\n", + state->curframe + 1); + return -EFAULT; + } + + callee = kzalloc(sizeof(*callee), GFP_KERNEL); + if (!callee) + return -ENOMEM; + state->frame[state->curframe + 1] = callee; + + /* callee cannot access r0, r6 - r9 for reading and has to write + * into its own stack before reading from it. 
+ * callee can read/write into caller's stack + */ + init_func_state(env, callee, + /* remember the callsite, it will be used by bpf_exit */ + *insn_idx /* callsite */, + state->curframe + 1 /* frameno within this callchain */, + subprog + 1 /* subprog number within this prog */); + + /* copy r1 - r5 args that callee can access */ + for (i = BPF_REG_1; i <= BPF_REG_5; i++) + callee->regs[i] = caller->regs[i]; + + /* after the call registers r0 - r5 were scratched */ + for (i = 0; i < CALLER_SAVED_REGS; i++) { + mark_reg_not_init(env, caller->regs, caller_saved[i]); + check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); + } + + /* only increment it after check_reg_arg() finished */ + state->curframe++; + + /* and go analyze first insn of the callee */ + *insn_idx = target_insn; + + if (env->log.level) { + verbose(env, "caller:\n"); + print_verifier_state(env, caller); + verbose(env, "callee:\n"); + print_verifier_state(env, callee); + } + return 0; +} + +static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) +{ + struct bpf_verifier_state *state = env->cur_state; + struct bpf_func_state *caller, *callee; + struct bpf_reg_state *r0; + + callee = state->frame[state->curframe]; + r0 = &callee->regs[BPF_REG_0]; + if (r0->type == PTR_TO_STACK) { + /* technically it's ok to return caller's stack pointer + * (or caller's caller's pointer) back to the caller, + * since these pointers are valid. 
Only current stack + * pointer will be invalid as soon as function exits, + * but let's be conservative + */ + verbose(env, "cannot return stack pointer to the caller\n"); + return -EINVAL; + } + + state->curframe--; + caller = state->frame[state->curframe]; + /* return to the caller whatever r0 had in the callee */ + caller->regs[BPF_REG_0] = *r0; + + *insn_idx = callee->callsite + 1; + if (env->log.level) { + verbose(env, "returning from callee:\n"); + print_verifier_state(env, callee); + verbose(env, "to caller at %d:\n", *insn_idx); + print_verifier_state(env, caller); + } + /* clear everything in the callee */ + free_func_state(callee); + state->frame[state->curframe + 1] = NULL; + return 0; +} + +static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx) { const struct bpf_func_proto *fn = NULL; struct bpf_reg_state *regs; @@ -1700,8 +2349,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) } if (env->ops->get_func_proto) - fn = env->ops->get_func_proto(func_id); - + fn = env->ops->get_func_proto(func_id, env->prog); if (!fn) { verbose(env, "unknown func %s#%d\n", func_id_name(func_id), func_id); @@ -1725,10 +2373,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; - /* We only support one arg being in raw mode at the moment, which - * is sufficient for the helper functions we have right now. - */ - err = check_raw_mode(fn); + err = check_func_proto(fn); if (err) { verbose(env, "kernel subsystem misconfigured func %s#%d\n", func_id_name(func_id), func_id); @@ -1763,7 +2408,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) * is inferred from register state. 
*/ for (i = 0; i < meta.access_size; i++) { - err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, BPF_WRITE, -1); + err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, + BPF_WRITE, -1, false); if (err) return err; } @@ -1884,7 +2530,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, const struct bpf_reg_state *ptr_reg, const struct bpf_reg_state *off_reg) { - struct bpf_reg_state *regs = cur_regs(env), *dst_reg; + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_reg_state *regs = state->regs, *dst_reg; bool known = tnum_is_const(off_reg->var_off); s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value, smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; @@ -2319,7 +2967,9 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg; + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg; struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; u8 opcode = BPF_OP(insn->code); @@ -2370,12 +3020,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* Got here implies adding two SCALAR_VALUEs */ if (WARN_ON_ONCE(ptr_reg)) { - print_verifier_state(env, env->cur_state); + print_verifier_state(env, state); verbose(env, "verifier internal error: unexpected ptr_reg\n"); return -EINVAL; } if (WARN_ON(!src_reg)) { - print_verifier_state(env, env->cur_state); + print_verifier_state(env, state); verbose(env, "verifier internal error: no src_reg\n"); return -EINVAL; } @@ -2537,14 +3187,15 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } -static void find_good_pkt_pointers(struct bpf_verifier_state 
*state, +static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, struct bpf_reg_state *dst_reg, enum bpf_reg_type type, bool range_right_open) { + struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = state->regs, *reg; u16 new_range; - int i; + int i, j; if (dst_reg->off < 0 || (dst_reg->off == 0 && range_right_open)) @@ -2614,12 +3265,15 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, /* keep the maximum range already checked */ regs[i].range = max(regs[i].range, new_range); - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] != STACK_SPILL) - continue; - reg = &state->stack[i].spilled_ptr; - if (reg->type == type && reg->id == dst_reg->id) - reg->range = max(reg->range, new_range); + for (j = 0; j <= vstate->curframe; j++) { + state = vstate->frame[j]; + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) + continue; + reg = &state->stack[i].spilled_ptr; + if (reg->type == type && reg->id == dst_reg->id) + reg->range = max(reg->range, new_range); + } } } @@ -2857,20 +3511,24 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, /* The logic is similar to find_good_pkt_pointers(), both could eventually * be folded together at some point. 
*/ -static void mark_map_regs(struct bpf_verifier_state *state, u32 regno, +static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno, bool is_null) { + struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = state->regs; u32 id = regs[regno].id; - int i; + int i, j; for (i = 0; i < MAX_BPF_REG; i++) mark_map_reg(regs, i, id, is_null); - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] != STACK_SPILL) - continue; - mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null); + for (j = 0; j <= vstate->curframe; j++) { + state = vstate->frame[j]; + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) + continue; + mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null); + } } } @@ -2970,8 +3628,10 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { - struct bpf_verifier_state *other_branch, *this_branch = env->cur_state; - struct bpf_reg_state *regs = this_branch->regs, *dst_reg; + struct bpf_verifier_state *this_branch = env->cur_state; + struct bpf_verifier_state *other_branch; + struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; + struct bpf_reg_state *dst_reg, *other_branch_regs; u8 opcode = BPF_OP(insn->code); int err; @@ -3014,8 +3674,9 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (BPF_SRC(insn->code) == BPF_K && (opcode == BPF_JEQ || opcode == BPF_JNE) && dst_reg->type == SCALAR_VALUE && - tnum_equals_const(dst_reg->var_off, insn->imm)) { - if (opcode == BPF_JEQ) { + tnum_is_const(dst_reg->var_off)) { + if ((opcode == BPF_JEQ && dst_reg->var_off.value == insn->imm) || + (opcode == BPF_JNE && dst_reg->var_off.value != insn->imm)) { /* if (imm == imm) goto pc+off; * only follow the goto, ignore fall-through */ @@ -3033,6 +3694,7 @@ static int 
check_cond_jmp_op(struct bpf_verifier_env *env, other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx); if (!other_branch) return -EFAULT; + other_branch_regs = other_branch->frame[other_branch->curframe]->regs; /* detect if we are comparing against a constant value so we can adjust * our min/max values for our dst register. @@ -3045,22 +3707,22 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (dst_reg->type == SCALAR_VALUE && regs[insn->src_reg].type == SCALAR_VALUE) { if (tnum_is_const(regs[insn->src_reg].var_off)) - reg_set_min_max(&other_branch->regs[insn->dst_reg], + reg_set_min_max(&other_branch_regs[insn->dst_reg], dst_reg, regs[insn->src_reg].var_off.value, opcode); else if (tnum_is_const(dst_reg->var_off)) - reg_set_min_max_inv(&other_branch->regs[insn->src_reg], + reg_set_min_max_inv(&other_branch_regs[insn->src_reg], &regs[insn->src_reg], dst_reg->var_off.value, opcode); else if (opcode == BPF_JEQ || opcode == BPF_JNE) /* Comparing for equality, we can combine knowledge */ - reg_combine_min_max(&other_branch->regs[insn->src_reg], - &other_branch->regs[insn->dst_reg], + reg_combine_min_max(&other_branch_regs[insn->src_reg], + &other_branch_regs[insn->dst_reg], &regs[insn->src_reg], &regs[insn->dst_reg], opcode); } } else if (dst_reg->type == SCALAR_VALUE) { - reg_set_min_max(&other_branch->regs[insn->dst_reg], + reg_set_min_max(&other_branch_regs[insn->dst_reg], dst_reg, insn->imm, opcode); } @@ -3081,7 +3743,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return -EACCES; } if (env->log.level) - print_verifier_state(env, this_branch); + print_verifier_state(env, this_branch->frame[this_branch->curframe]); return 0; } @@ -3166,6 +3828,18 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } + if (env->subprog_cnt) { + /* when program has LD_ABS insn JITs and interpreter assume + * that r1 == ctx == skb which is not the case for callees + * that can have arbitrary arguments. 
It's problematic + * for main prog as well since JITs would need to analyze + * all functions in order to make proper register save/restore + * decisions in the main prog. Hence disallow LD_ABS with calls + */ + verbose(env, "BPF_LD_[ABS|IND] instructions cannot be mixed with bpf-to-bpf calls\n"); + return -EINVAL; + } + if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || BPF_SIZE(insn->code) == BPF_DW || (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { @@ -3213,6 +3887,7 @@ static int check_return_code(struct bpf_verifier_env *env) switch (env->prog->type) { case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: break; @@ -3342,6 +4017,10 @@ static int check_cfg(struct bpf_verifier_env *env) int ret = 0; int i, t; + ret = check_subprogs(env); + if (ret < 0) + return ret; + insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) return -ENOMEM; @@ -3374,6 +4053,14 @@ peek_stack: goto err_free; if (t + 1 < insn_cnt) env->explored_states[t + 1] = STATE_LIST_MARK; + if (insns[t].src_reg == BPF_PSEUDO_CALL) { + env->explored_states[t] = STATE_LIST_MARK; + ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); + if (ret == 1) + goto peek_stack; + else if (ret < 0) + goto err_free; + } } else if (opcode == BPF_JA) { if (BPF_SRC(insns[t].code) != BPF_K) { ret = -EINVAL; @@ -3492,11 +4179,21 @@ static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap) static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, struct idpair *idmap) { + bool equal; + if (!(rold->live & REG_LIVE_READ)) /* explored state didn't use this */ return true; - if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, live)) == 0) + equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, frameno)) == 0; + + if (rold->type == PTR_TO_STACK) + /* two stack pointers are equal only if they're pointing to + * the same stack frame, since fp-8 in foo != 
fp-8 in bar + */ + return equal && rold->frameno == rcur->frameno; + + if (equal) return true; if (rold->type == NOT_INIT) @@ -3568,7 +4265,6 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, tnum_in(rold->var_off, rcur->var_off); case PTR_TO_CTX: case CONST_PTR_TO_MAP: - case PTR_TO_STACK: case PTR_TO_PACKET_END: /* Only valid matches are exact, which memcmp() above * would have accepted @@ -3583,8 +4279,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; } -static bool stacksafe(struct bpf_verifier_state *old, - struct bpf_verifier_state *cur, +static bool stacksafe(struct bpf_func_state *old, + struct bpf_func_state *cur, struct idpair *idmap) { int i, spi; @@ -3602,8 +4298,19 @@ static bool stacksafe(struct bpf_verifier_state *old, for (i = 0; i < old->allocated_stack; i++) { spi = i / BPF_REG_SIZE; + if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) + /* explored state didn't use this */ + continue; + if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID) continue; + /* if old state was safe with misc data in the stack + * it will be safe with zero-initialized stack. 
+ * The opposite is not true + */ + if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC && + cur->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_ZERO) + continue; if (old->stack[spi].slot_type[i % BPF_REG_SIZE] != cur->stack[spi].slot_type[i % BPF_REG_SIZE]) /* Ex: old explored (safe) state has STACK_SPILL in @@ -3660,9 +4367,8 @@ static bool stacksafe(struct bpf_verifier_state *old, * whereas register type in current state is meaningful, it means that * the current state will reach 'bpf_exit' instruction safely */ -static bool states_equal(struct bpf_verifier_env *env, - struct bpf_verifier_state *old, - struct bpf_verifier_state *cur) +static bool func_states_equal(struct bpf_func_state *old, + struct bpf_func_state *cur) { struct idpair *idmap; bool ret = false; @@ -3686,71 +4392,72 @@ out_free: return ret; } +static bool states_equal(struct bpf_verifier_env *env, + struct bpf_verifier_state *old, + struct bpf_verifier_state *cur) +{ + int i; + + if (old->curframe != cur->curframe) + return false; + + /* for states to be equal callsites have to be the same + * and all frame states need to be equivalent + */ + for (i = 0; i <= old->curframe; i++) { + if (old->frame[i]->callsite != cur->frame[i]->callsite) + return false; + if (!func_states_equal(old->frame[i], cur->frame[i])) + return false; + } + return true; +} + /* A write screens off any subsequent reads; but write marks come from the - * straight-line code between a state and its parent. When we arrive at a - * jump target (in the first iteration of the propagate_liveness() loop), - * we didn't arrive by the straight-line code, so read marks in state must - * propagate to parent regardless of state's write marks. + * straight-line code between a state and its parent. When we arrive at an + * equivalent state (jump target or such) we didn't arrive by the straight-line + * code, so read marks in the state must propagate to the parent regardless + * of the state's write marks. 
That's what 'parent == state->parent' comparison + * in mark_reg_read() and mark_stack_slot_read() is for. */ -static bool do_propagate_liveness(const struct bpf_verifier_state *state, - struct bpf_verifier_state *parent) +static int propagate_liveness(struct bpf_verifier_env *env, + const struct bpf_verifier_state *vstate, + struct bpf_verifier_state *vparent) { - bool writes = parent == state->parent; /* Observe write marks */ - bool touched = false; /* any changes made? */ - int i; + int i, frame, err = 0; + struct bpf_func_state *state, *parent; - if (!parent) - return touched; + if (vparent->curframe != vstate->curframe) { + WARN(1, "propagate_live: parent frame %d current frame %d\n", + vparent->curframe, vstate->curframe); + return -EFAULT; + } /* Propagate read liveness of registers... */ BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); /* We don't need to worry about FP liveness because it's read-only */ for (i = 0; i < BPF_REG_FP; i++) { - if (parent->regs[i].live & REG_LIVE_READ) - continue; - if (writes && (state->regs[i].live & REG_LIVE_WRITTEN)) + if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ) continue; - if (state->regs[i].live & REG_LIVE_READ) { - parent->regs[i].live |= REG_LIVE_READ; - touched = true; + if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) { + err = mark_reg_read(env, vstate, vparent, i); + if (err) + return err; } } + /* ... 
and stack slots */ - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && - i < parent->allocated_stack / BPF_REG_SIZE; i++) { - if (parent->stack[i].slot_type[0] != STACK_SPILL) - continue; - if (state->stack[i].slot_type[0] != STACK_SPILL) - continue; - if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) - continue; - if (writes && - (state->stack[i].spilled_ptr.live & REG_LIVE_WRITTEN)) - continue; - if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) { - parent->stack[i].spilled_ptr.live |= REG_LIVE_READ; - touched = true; + for (frame = 0; frame <= vstate->curframe; frame++) { + state = vstate->frame[frame]; + parent = vparent->frame[frame]; + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && + i < parent->allocated_stack / BPF_REG_SIZE; i++) { + if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) + continue; + if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) + mark_stack_slot_read(env, vstate, vparent, i, frame); } } - return touched; -} - -/* "parent" is "a state from which we reach the current state", but initially - * it is not the state->parent (i.e. "the state whose straight-line code leads - * to the current state"), instead it is the state that happened to arrive at - * a (prunable) equivalent of the current state. See comment above - * do_propagate_liveness() for consequences of this. - * This function is just a more efficient way of calling mark_reg_read() or - * mark_stack_slot_read() on each reg in "parent" that is read in "state", - * though it requires that parent != state->parent in the call arguments. 
- */ -static void propagate_liveness(const struct bpf_verifier_state *state, - struct bpf_verifier_state *parent) -{ - while (do_propagate_liveness(state, parent)) { - /* Something changed, so we need to feed those changes onward */ - state = parent; - parent = state->parent; - } + return err; } static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) @@ -3758,7 +4465,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl; struct bpf_verifier_state *cur = env->cur_state; - int i, err; + int i, j, err; sl = env->explored_states[insn_idx]; if (!sl) @@ -3779,7 +4486,9 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * they'll be immediately forgotten as we're pruning * this state and will pop a new one. */ - propagate_liveness(&sl->state, cur); + err = propagate_liveness(env, &sl->state, cur); + if (err) + return err; return 1; } sl = sl->next; @@ -3787,9 +4496,10 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) /* there were no equivalent states, remember current one. * technically the current state is not proven to be safe yet, - * but it will either reach bpf_exit (which means it's safe) or - * it will be rejected. Since there are no loops, we won't be - * seeing this 'insn_idx' instruction again on the way to bpf_exit + * but it will either reach outer most bpf_exit (which means it's safe) + * or it will be rejected. Since there are no loops, we won't be + * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx) + * again on the way to bpf_exit */ new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); if (!new_sl) @@ -3813,19 +4523,15 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * explored_states can get read marks.) 
*/ for (i = 0; i < BPF_REG_FP; i++) - cur->regs[i].live = REG_LIVE_NONE; - for (i = 0; i < cur->allocated_stack / BPF_REG_SIZE; i++) - if (cur->stack[i].slot_type[0] == STACK_SPILL) - cur->stack[i].spilled_ptr.live = REG_LIVE_NONE; - return 0; -} + cur->frame[cur->curframe]->regs[i].live = REG_LIVE_NONE; -static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, - int insn_idx, int prev_insn_idx) -{ - if (env->dev_ops && env->dev_ops->insn_hook) - return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx); + /* all stack frames are accessible from callee, clear them all */ + for (j = 0; j <= cur->curframe; j++) { + struct bpf_func_state *frame = cur->frame[j]; + for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) + frame->stack[i].spilled_ptr.live = REG_LIVE_NONE; + } return 0; } @@ -3834,7 +4540,7 @@ static int do_check(struct bpf_verifier_env *env) struct bpf_verifier_state *state; struct bpf_insn *insns = env->prog->insnsi; struct bpf_reg_state *regs; - int insn_cnt = env->prog->len; + int insn_cnt = env->prog->len, i; int insn_idx, prev_insn_idx = 0; int insn_processed = 0; bool do_print_state = false; @@ -3842,9 +4548,18 @@ static int do_check(struct bpf_verifier_env *env) state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); if (!state) return -ENOMEM; - env->cur_state = state; - init_reg_state(env, state->regs); + state->curframe = 0; state->parent = NULL; + state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); + if (!state->frame[0]) { + kfree(state); + return -ENOMEM; + } + env->cur_state = state; + init_func_state(env, state->frame[0], + BPF_MAIN_FUNC /* callsite */, + 0 /* frameno */, + 0 /* subprogno, zero == main subprog */); insn_idx = 0; for (;;) { struct bpf_insn *insn; @@ -3891,19 +4606,26 @@ static int do_check(struct bpf_verifier_env *env) else verbose(env, "\nfrom %d to %d:", prev_insn_idx, insn_idx); - print_verifier_state(env, state); + print_verifier_state(env, state->frame[state->curframe]); 
do_print_state = false; } if (env->log.level) { + const struct bpf_insn_cbs cbs = { + .cb_print = verbose, + .private_data = env, + }; + verbose(env, "%d: ", insn_idx); - print_bpf_insn(verbose, env, insn, - env->allow_ptr_leaks); + print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); } - err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx); - if (err) - return err; + if (bpf_prog_is_dev_bound(env->prog->aux)) { + err = bpf_prog_offload_verify_insn(env, insn_idx, + prev_insn_idx); + if (err) + return err; + } regs = cur_regs(env); env->insn_aux_data[insn_idx].seen = true; @@ -3933,7 +4655,7 @@ static int do_check(struct bpf_verifier_env *env) */ err = check_mem_access(env, insn_idx, insn->src_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, - insn->dst_reg); + insn->dst_reg, false); if (err) return err; @@ -3985,7 +4707,7 @@ static int do_check(struct bpf_verifier_env *env) /* check that memory (dst_reg + off) is writeable */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, - insn->src_reg); + insn->src_reg, false); if (err) return err; @@ -4020,7 +4742,7 @@ static int do_check(struct bpf_verifier_env *env) /* check that memory (dst_reg + off) is writeable */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, - -1); + -1, false); if (err) return err; @@ -4030,13 +4752,17 @@ static int do_check(struct bpf_verifier_env *env) if (opcode == BPF_CALL) { if (BPF_SRC(insn->code) != BPF_K || insn->off != 0 || - insn->src_reg != BPF_REG_0 || + (insn->src_reg != BPF_REG_0 && + insn->src_reg != BPF_PSEUDO_CALL) || insn->dst_reg != BPF_REG_0) { verbose(env, "BPF_CALL uses reserved fields\n"); return -EINVAL; } - err = check_call(env, insn->imm, insn_idx); + if (insn->src_reg == BPF_PSEUDO_CALL) + err = check_func_call(env, insn, &insn_idx); + else + err = check_helper_call(env, insn->imm, insn_idx); if (err) return err; @@ -4061,6 +4787,16 @@ static int do_check(struct 
bpf_verifier_env *env) return -EINVAL; } + if (state->curframe) { + /* exit from nested function */ + prev_insn_idx = insn_idx; + err = prepare_func_exit(env, &insn_idx); + if (err) + return err; + do_print_state = true; + continue; + } + /* eBPF calling convetion is such that R0 is used * to return the value from eBPF program. * Make sure that it's readable at this time @@ -4121,8 +4857,17 @@ process_bpf_exit: insn_idx++; } - verbose(env, "processed %d insns, stack depth %d\n", insn_processed, - env->prog->aux->stack_depth); + verbose(env, "processed %d insns (limit %d), stack depth ", + insn_processed, BPF_COMPLEXITY_LIMIT_INSNS); + for (i = 0; i < env->subprog_cnt + 1; i++) { + u32 depth = env->subprog_stack_depth[i]; + + verbose(env, "%d", depth); + if (i + 1 < env->subprog_cnt + 1) + verbose(env, "+"); + } + verbose(env, "\n"); + env->prog->aux->stack_depth = env->subprog_stack_depth[0]; return 0; } @@ -4155,6 +4900,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, return -EINVAL; } } + + if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && + !bpf_offload_dev_match(prog, map)) { + verbose(env, "offload device mismatch between prog and map\n"); + return -EINVAL; + } + return 0; } @@ -4252,6 +5004,13 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) next_insn: insn++; i++; + continue; + } + + /* Basic sanity check before we invest more work here. 
*/ + if (!bpf_opcode_in_insntable(insn->code)) { + verbose(env, "unknown opcode %02x\n", insn->code); + return -EINVAL; } } @@ -4308,6 +5067,19 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, return 0; } +static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len) +{ + int i; + + if (len == 1) + return; + for (i = 0; i < env->subprog_cnt; i++) { + if (env->subprog_starts[i] < off) + continue; + env->subprog_starts[i] += len - 1; + } +} + static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, const struct bpf_insn *patch, u32 len) { @@ -4318,17 +5090,25 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of return NULL; if (adjust_insn_aux_data(env, new_prog->len, off, len)) return NULL; + adjust_subprog_starts(env, off, len); return new_prog; } -/* The verifier does more data flow analysis than llvm and will not explore - * branches that are dead at run time. Malicious programs can have dead code - * too. Therefore replace all dead at-run-time code with nops. +/* The verifier does more data flow analysis than llvm and will not + * explore branches that are dead at run time. Malicious programs can + * have dead code too. Therefore replace all dead at-run-time code + * with 'ja -1'. + * + * Just nops are not optimal, e.g. if they would sit at the end of the + * program and through another bug we would manage to jump there, then + * we'd execute beyond program memory otherwise. Returning exception + * code also wouldn't work since we can have subprogs where the dead + * code could be located. 
*/ static void sanitize_dead_code(struct bpf_verifier_env *env) { struct bpf_insn_aux_data *aux_data = env->insn_aux_data; - struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0); + struct bpf_insn trap = BPF_JMP_IMM(BPF_JA, 0, 0, -1); struct bpf_insn *insn = env->prog->insnsi; const int insn_cnt = env->prog->len; int i; @@ -4336,7 +5116,7 @@ static void sanitize_dead_code(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++) { if (aux_data[i].seen) continue; - memcpy(insn + i, &nop, sizeof(nop)); + memcpy(insn + i, &trap, sizeof(trap)); } } @@ -4452,6 +5232,180 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) return 0; } +static int jit_subprogs(struct bpf_verifier_env *env) +{ + struct bpf_prog *prog = env->prog, **func, *tmp; + int i, j, subprog_start, subprog_end = 0, len, subprog; + struct bpf_insn *insn; + void *old_bpf_func; + int err = -ENOMEM; + + if (env->subprog_cnt == 0) + return 0; + + for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { + if (insn->code != (BPF_JMP | BPF_CALL) || + insn->src_reg != BPF_PSEUDO_CALL) + continue; + subprog = find_subprog(env, i + insn->imm + 1); + if (subprog < 0) { + WARN_ONCE(1, "verifier bug. 
No program starts at insn %d\n", + i + insn->imm + 1); + return -EFAULT; + } + /* temporarily remember subprog id inside insn instead of + * aux_data, since next loop will split up all insns into funcs + */ + insn->off = subprog + 1; + /* remember original imm in case JIT fails and fallback + * to interpreter will be needed + */ + env->insn_aux_data[i].call_imm = insn->imm; + /* point imm to __bpf_call_base+1 from JITs point of view */ + insn->imm = 1; + } + + func = kzalloc(sizeof(prog) * (env->subprog_cnt + 1), GFP_KERNEL); + if (!func) + return -ENOMEM; + + for (i = 0; i <= env->subprog_cnt; i++) { + subprog_start = subprog_end; + if (env->subprog_cnt == i) + subprog_end = prog->len; + else + subprog_end = env->subprog_starts[i]; + + len = subprog_end - subprog_start; + func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER); + if (!func[i]) + goto out_free; + memcpy(func[i]->insnsi, &prog->insnsi[subprog_start], + len * sizeof(struct bpf_insn)); + func[i]->type = prog->type; + func[i]->len = len; + if (bpf_prog_calc_tag(func[i])) + goto out_free; + func[i]->is_func = 1; + /* Use bpf_prog_F_tag to indicate functions in stack traces. 
+ * Long term would need debug info to populate names + */ + func[i]->aux->name[0] = 'F'; + func[i]->aux->stack_depth = env->subprog_stack_depth[i]; + func[i]->jit_requested = 1; + func[i] = bpf_int_jit_compile(func[i]); + if (!func[i]->jited) { + err = -ENOTSUPP; + goto out_free; + } + cond_resched(); + } + /* at this point all bpf functions were successfully JITed + * now populate all bpf_calls with correct addresses and + * run last pass of JIT + */ + for (i = 0; i <= env->subprog_cnt; i++) { + insn = func[i]->insnsi; + for (j = 0; j < func[i]->len; j++, insn++) { + if (insn->code != (BPF_JMP | BPF_CALL) || + insn->src_reg != BPF_PSEUDO_CALL) + continue; + subprog = insn->off; + insn->off = 0; + insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) + func[subprog]->bpf_func - + __bpf_call_base; + } + } + for (i = 0; i <= env->subprog_cnt; i++) { + old_bpf_func = func[i]->bpf_func; + tmp = bpf_int_jit_compile(func[i]); + if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) { + verbose(env, "JIT doesn't support bpf-to-bpf calls\n"); + err = -EFAULT; + goto out_free; + } + cond_resched(); + } + + /* finally lock prog and jit images for all functions and + * populate kallsysm + */ + for (i = 0; i <= env->subprog_cnt; i++) { + bpf_prog_lock_ro(func[i]); + bpf_prog_kallsyms_add(func[i]); + } + + /* Last step: make now unused interpreter insns from main + * prog consistent for later dump requests, so they can + * later look the same as if they were interpreted only. 
+ */ + for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { + unsigned long addr; + + if (insn->code != (BPF_JMP | BPF_CALL) || + insn->src_reg != BPF_PSEUDO_CALL) + continue; + insn->off = env->insn_aux_data[i].call_imm; + subprog = find_subprog(env, i + insn->off + 1); + addr = (unsigned long)func[subprog + 1]->bpf_func; + addr &= PAGE_MASK; + insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) + addr - __bpf_call_base; + } + + prog->jited = 1; + prog->bpf_func = func[0]->bpf_func; + prog->aux->func = func; + prog->aux->func_cnt = env->subprog_cnt + 1; + return 0; +out_free: + for (i = 0; i <= env->subprog_cnt; i++) + if (func[i]) + bpf_jit_free(func[i]); + kfree(func); + /* cleanup main prog to be interpreted */ + prog->jit_requested = 0; + for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { + if (insn->code != (BPF_JMP | BPF_CALL) || + insn->src_reg != BPF_PSEUDO_CALL) + continue; + insn->off = 0; + insn->imm = env->insn_aux_data[i].call_imm; + } + return err; +} + +static int fixup_call_args(struct bpf_verifier_env *env) +{ +#ifndef CONFIG_BPF_JIT_ALWAYS_ON + struct bpf_prog *prog = env->prog; + struct bpf_insn *insn = prog->insnsi; + int i, depth; +#endif + int err; + + err = 0; + if (env->prog->jit_requested) { + err = jit_subprogs(env); + if (err == 0) + return 0; + } +#ifndef CONFIG_BPF_JIT_ALWAYS_ON + for (i = 0; i < prog->len; i++, insn++) { + if (insn->code != (BPF_JMP | BPF_CALL) || + insn->src_reg != BPF_PSEUDO_CALL) + continue; + depth = get_callee_stack_depth(env, insn, i); + if (depth < 0) + return depth; + bpf_patch_call_args(insn, depth); + } + err = 0; +#endif + return err; +} + /* fixup insn->imm field of bpf_call instructions * and inline eligible helpers as explicit sequence of BPF instructions * @@ -4469,15 +5423,37 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) int i, cnt, delta = 0; for (i = 0; i < insn_cnt; i++, insn++) { - if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) || + if (insn->code == (BPF_ALU64 | 
BPF_MOD | BPF_X) || + insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || + insn->code == (BPF_ALU | BPF_MOD | BPF_X) || insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { - /* due to JIT bugs clear upper 32-bits of src register - * before div/mod operation - */ - insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg); - insn_buf[1] = *insn; - cnt = 2; - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; + struct bpf_insn mask_and_div[] = { + BPF_MOV32_REG(insn->src_reg, insn->src_reg), + /* Rx div 0 -> 0 */ + BPF_JMP_IMM(BPF_JNE, insn->src_reg, 0, 2), + BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg), + BPF_JMP_IMM(BPF_JA, 0, 0, 1), + *insn, + }; + struct bpf_insn mask_and_mod[] = { + BPF_MOV32_REG(insn->src_reg, insn->src_reg), + /* Rx mod 0 -> Rx */ + BPF_JMP_IMM(BPF_JEQ, insn->src_reg, 0, 1), + *insn, + }; + struct bpf_insn *patchlet; + + if (insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || + insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { + patchlet = mask_and_div + (is64 ? 1 : 0); + cnt = ARRAY_SIZE(mask_and_div) - (is64 ? 1 : 0); + } else { + patchlet = mask_and_mod + (is64 ? 1 : 0); + cnt = ARRAY_SIZE(mask_and_mod) - (is64 ? 
1 : 0); + } + + new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt); if (!new_prog) return -ENOMEM; @@ -4489,11 +5465,15 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) if (insn->code != (BPF_JMP | BPF_CALL)) continue; + if (insn->src_reg == BPF_PSEUDO_CALL) + continue; if (insn->imm == BPF_FUNC_get_route_realm) prog->dst_needed = 1; if (insn->imm == BPF_FUNC_get_prandom_u32) bpf_user_rnd_init_once(); + if (insn->imm == BPF_FUNC_override_return) + prog->kprobe_override = 1; if (insn->imm == BPF_FUNC_tail_call) { /* If we tail call into other programs, we * cannot make any assumptions since they can @@ -4545,7 +5525,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup * handlers are currently limited to 64 bit only. */ - if (ebpf_jit_enabled() && BITS_PER_LONG == 64 && + if (prog->jit_requested && BITS_PER_LONG == 64 && insn->imm == BPF_FUNC_map_lookup_elem) { map_ptr = env->insn_aux_data[i + delta].map_ptr; if (map_ptr == BPF_MAP_PTR_POISON || @@ -4593,7 +5573,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn = new_prog->insnsi + i + delta; } patch_call_imm: - fn = env->ops->get_func_proto(insn->imm); + fn = env->ops->get_func_proto(insn->imm, env->prog); /* all functions that have prototype and verifier allowed * programs to call them, must be real in-kernel functions */ @@ -4635,7 +5615,7 @@ static void free_states(struct bpf_verifier_env *env) int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) { struct bpf_verifier_env *env; - struct bpf_verifer_log *log; + struct bpf_verifier_log *log; int ret = -EINVAL; /* no program is valid */ @@ -4680,7 +5660,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; - if (env->prog->aux->offload) { + if (bpf_prog_is_dev_bound(env->prog->aux)) { ret = bpf_prog_offload_verifier_prep(env); if (ret) goto err_unlock; @@ 
-4697,12 +5677,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (!env->explored_states) goto skip_full_check; + env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); + ret = check_cfg(env); if (ret < 0) goto skip_full_check; - env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); - ret = do_check(env); if (env->cur_state) { free_verifier_state(env->cur_state, true); @@ -4717,12 +5697,18 @@ skip_full_check: sanitize_dead_code(env); if (ret == 0) + ret = check_max_stack_depth(env); + + if (ret == 0) /* program is valid, convert *(u32*)(ctx + off) accesses */ ret = convert_ctx_accesses(env); if (ret == 0) ret = fixup_bpf_calls(env); + if (ret == 0) + ret = fixup_call_args(env); + if (log->level && bpf_verifier_log_full(log)) ret = -ENOSPC; if (log->level && !log->ubuf) { diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 8cda3bc3ae22..a662bfcbea0e 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3183,6 +3183,16 @@ static int cgroup_enable_threaded(struct cgroup *cgrp) if (cgroup_is_threaded(cgrp)) return 0; + /* + * If @cgroup is populated or has domain controllers enabled, it + * can't be switched. While the below cgroup_can_be_thread_root() + * test can catch the same conditions, that's only when @parent is + * not mixable, so let's check it explicitly. + */ + if (cgroup_is_populated(cgrp) || + cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask) + return -EOPNOTSUPP; + /* we're joining the parent's domain, ensure its validity */ if (!cgroup_is_valid_domain(dom_cgrp) || !cgroup_can_be_thread_root(dom_cgrp)) @@ -4514,10 +4524,10 @@ static struct cftype cgroup_base_files[] = { * and thus involve punting to css->destroy_work adding two additional * steps to the already complex sequence. 
*/ -static void css_free_work_fn(struct work_struct *work) +static void css_free_rwork_fn(struct work_struct *work) { - struct cgroup_subsys_state *css = - container_of(work, struct cgroup_subsys_state, destroy_work); + struct cgroup_subsys_state *css = container_of(to_rcu_work(work), + struct cgroup_subsys_state, destroy_rwork); struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; @@ -4563,15 +4573,6 @@ static void css_free_work_fn(struct work_struct *work) } } -static void css_free_rcu_fn(struct rcu_head *rcu_head) -{ - struct cgroup_subsys_state *css = - container_of(rcu_head, struct cgroup_subsys_state, rcu_head); - - INIT_WORK(&css->destroy_work, css_free_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); -} - static void css_release_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = @@ -4621,7 +4622,8 @@ static void css_release_work_fn(struct work_struct *work) mutex_unlock(&cgroup_mutex); - call_rcu(&css->rcu_head, css_free_rcu_fn); + INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); + queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); } static void css_release(struct percpu_ref *ref) @@ -4755,7 +4757,8 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, err_list_del: list_del_rcu(&css->sibling); err_free_css: - call_rcu(&css->rcu_head, css_free_rcu_fn); + INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); + queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); return ERR_PTR(err); } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index f7efa7b4d825..b42037e6e81d 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1254,9 +1254,9 @@ done: return retval; } -int current_cpuset_is_being_rebound(void) +bool current_cpuset_is_being_rebound(void) { - int ret; + bool ret; rcu_read_lock(); ret = task_cs(current) == cpuset_being_rebound; diff --git a/kernel/compat.c b/kernel/compat.c index d1cee656a7ed..6d21894806b4 100644 --- a/kernel/compat.c +++ 
b/kernel/compat.c @@ -355,7 +355,7 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len, ret = sched_getaffinity(pid, mask); if (ret == 0) { - size_t retlen = min_t(size_t, len, cpumask_size()); + unsigned int retlen = min(len, cpumask_size()); if (compat_put_bitmap(user_mask_ptr, cpumask_bits(mask), retlen * 8)) ret = -EFAULT; @@ -488,80 +488,6 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat) } EXPORT_SYMBOL_GPL(get_compat_sigset); -int -put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set, - unsigned int size) -{ - /* size <= sizeof(compat_sigset_t) <= sizeof(sigset_t) */ -#ifdef __BIG_ENDIAN - compat_sigset_t v; - switch (_NSIG_WORDS) { - case 4: v.sig[7] = (set->sig[3] >> 32); v.sig[6] = set->sig[3]; - case 3: v.sig[5] = (set->sig[2] >> 32); v.sig[4] = set->sig[2]; - case 2: v.sig[3] = (set->sig[1] >> 32); v.sig[2] = set->sig[1]; - case 1: v.sig[1] = (set->sig[0] >> 32); v.sig[0] = set->sig[0]; - } - return copy_to_user(compat, &v, size) ? -EFAULT : 0; -#else - return copy_to_user(compat, set, size) ? 
-EFAULT : 0; -#endif -} - -#ifdef CONFIG_NUMA -COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, - compat_uptr_t __user *, pages32, - const int __user *, nodes, - int __user *, status, - int, flags) -{ - const void __user * __user *pages; - int i; - - pages = compat_alloc_user_space(nr_pages * sizeof(void *)); - for (i = 0; i < nr_pages; i++) { - compat_uptr_t p; - - if (get_user(p, pages32 + i) || - put_user(compat_ptr(p), pages + i)) - return -EFAULT; - } - return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); -} - -COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid, - compat_ulong_t, maxnode, - const compat_ulong_t __user *, old_nodes, - const compat_ulong_t __user *, new_nodes) -{ - unsigned long __user *old = NULL; - unsigned long __user *new = NULL; - nodemask_t tmp_mask; - unsigned long nr_bits; - unsigned long size; - - nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES); - size = ALIGN(nr_bits, BITS_PER_LONG) / 8; - if (old_nodes) { - if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits)) - return -EFAULT; - old = compat_alloc_user_space(new_nodes ? size * 2 : size); - if (new_nodes) - new = old + size / sizeof(unsigned long); - if (copy_to_user(old, nodes_addr(tmp_mask), size)) - return -EFAULT; - } - if (new_nodes) { - if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits)) - return -EFAULT; - if (new == NULL) - new = compat_alloc_user_space(size); - if (copy_to_user(new, nodes_addr(tmp_mask), size)) - return -EFAULT; - } - return sys_migrate_pages(pid, nr_bits + 1, old, new); -} -#endif - /* * Allocate user-space memory for the duration of a single system call, * in order to marshall parameters inside a compat thunk. 
diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config index 8d9643767142..108fecc20fc1 100644 --- a/kernel/configs/kvm_guest.config +++ b/kernel/configs/kvm_guest.config @@ -18,6 +18,7 @@ CONFIG_VIRTUALIZATION=y CONFIG_HYPERVISOR_GUEST=y CONFIG_PARAVIRT=y CONFIG_KVM_GUEST=y +CONFIG_S390_GUEST=y CONFIG_VIRTIO=y CONFIG_VIRTIO_PCI=y CONFIG_VIRTIO_BLK=y diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config index 7fa0c4ae6394..9bfdffc100da 100644 --- a/kernel/configs/tiny.config +++ b/kernel/configs/tiny.config @@ -10,3 +10,7 @@ CONFIG_OPTIMIZE_INLINING=y # CONFIG_SLAB is not set # CONFIG_SLUB is not set CONFIG_SLOB=y +CONFIG_CC_STACKPROTECTOR_NONE=y +# CONFIG_CC_STACKPROTECTOR_REGULAR is not set +# CONFIG_CC_STACKPROTECTOR_STRONG is not set +# CONFIG_CC_STACKPROTECTOR_AUTO is not set diff --git a/kernel/cpu.c b/kernel/cpu.c index 53f7dc65f9a3..0db8938fbb23 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -124,24 +124,11 @@ struct cpuhp_step { }; static DEFINE_MUTEX(cpuhp_state_mutex); -static struct cpuhp_step cpuhp_bp_states[]; -static struct cpuhp_step cpuhp_ap_states[]; - -static bool cpuhp_is_ap_state(enum cpuhp_state state) -{ - /* - * The extra check for CPUHP_TEARDOWN_CPU is only for documentation - * purposes as that state is handled explicitly in cpu_down. - */ - return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; -} +static struct cpuhp_step cpuhp_hp_states[]; static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) { - struct cpuhp_step *sp; - - sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states; - return sp + state; + return cpuhp_hp_states + state; } /** @@ -239,6 +226,15 @@ err: } #ifdef CONFIG_SMP +static bool cpuhp_is_ap_state(enum cpuhp_state state) +{ + /* + * The extra check for CPUHP_TEARDOWN_CPU is only for documentation + * purposes as that state is handled explicitly in cpu_down. 
+ */ + return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; +} + static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup) { struct completion *done = bringup ? &st->done_up : &st->done_down; @@ -1224,7 +1220,7 @@ int __boot_cpu_id; #endif /* CONFIG_SMP */ /* Boot processor state steps */ -static struct cpuhp_step cpuhp_bp_states[] = { +static struct cpuhp_step cpuhp_hp_states[] = { [CPUHP_OFFLINE] = { .name = "offline", .startup.single = NULL, @@ -1289,24 +1285,6 @@ static struct cpuhp_step cpuhp_bp_states[] = { .teardown.single = NULL, .cant_stop = true, }, - /* - * Handled on controll processor until the plugged processor manages - * this itself. - */ - [CPUHP_TEARDOWN_CPU] = { - .name = "cpu:teardown", - .startup.single = NULL, - .teardown.single = takedown_cpu, - .cant_stop = true, - }, -#else - [CPUHP_BRINGUP_CPU] = { }, -#endif -}; - -/* Application processor state steps */ -static struct cpuhp_step cpuhp_ap_states[] = { -#ifdef CONFIG_SMP /* Final state before CPU kills itself */ [CPUHP_AP_IDLE_DEAD] = { .name = "idle:dead", @@ -1340,6 +1318,16 @@ static struct cpuhp_step cpuhp_ap_states[] = { [CPUHP_AP_ONLINE] = { .name = "ap:online", }, + /* + * Handled on controll processor until the plugged processor manages + * this itself. 
+ */ + [CPUHP_TEARDOWN_CPU] = { + .name = "cpu:teardown", + .startup.single = NULL, + .teardown.single = takedown_cpu, + .cant_stop = true, + }, /* Handle smpboot threads park/unpark */ [CPUHP_AP_SMPBOOT_THREADS] = { .name = "smpboot/threads:online", @@ -1408,11 +1396,11 @@ static int cpuhp_reserve_state(enum cpuhp_state state) switch (state) { case CPUHP_AP_ONLINE_DYN: - step = cpuhp_ap_states + CPUHP_AP_ONLINE_DYN; + step = cpuhp_hp_states + CPUHP_AP_ONLINE_DYN; end = CPUHP_AP_ONLINE_DYN_END; break; case CPUHP_BP_PREPARE_DYN: - step = cpuhp_bp_states + CPUHP_BP_PREPARE_DYN; + step = cpuhp_hp_states + CPUHP_BP_PREPARE_DYN; end = CPUHP_BP_PREPARE_DYN_END; break; default: diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 4f63597c824d..f7674d676889 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -376,6 +376,7 @@ phys_addr_t __weak paddr_vmcoreinfo_note(void) { return __pa(vmcoreinfo_note); } +EXPORT_SYMBOL(paddr_vmcoreinfo_note); static int __init crash_save_vmcoreinfo_init(void) { @@ -453,6 +454,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PG_lru); VMCOREINFO_NUMBER(PG_private); VMCOREINFO_NUMBER(PG_swapcache); + VMCOREINFO_NUMBER(PG_swapbacked); VMCOREINFO_NUMBER(PG_slab); #ifdef CONFIG_MEMORY_FAILURE VMCOREINFO_NUMBER(PG_hwpoison); diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index 90ff129c88a2..62c301ad0773 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -242,11 +242,11 @@ static void kdb_printbp(kdb_bp_t *bp, int i) kdb_symbol_print(bp->bp_addr, NULL, KDB_SP_DEFAULT); if (bp->bp_enabled) - kdb_printf("\n is enabled"); + kdb_printf("\n is enabled "); else kdb_printf("\n is disabled"); - kdb_printf("\taddr at %016lx, hardtype=%d installed=%d\n", + kdb_printf(" addr at %016lx, hardtype=%d installed=%d\n", bp->bp_addr, bp->bp_type, bp->bp_installed); kdb_printf("\n"); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 
dbb0781a0533..e405677ee08d 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1150,6 +1150,16 @@ void kdb_set_current_task(struct task_struct *p) kdb_current_regs = NULL; } +static void drop_newline(char *buf) +{ + size_t len = strlen(buf); + + if (len == 0) + return; + if (*(buf + len - 1) == '\n') + *(buf + len - 1) = '\0'; +} + /* * kdb_local - The main code for kdb. This routine is invoked on a * specific processor, it is not global. The main kdb() routine @@ -1327,6 +1337,7 @@ do_full_getstr: cmdptr = cmd_head; diag = kdb_parse(cmdbuf); if (diag == KDB_NOTFOUND) { + drop_newline(cmdbuf); kdb_printf("Unknown kdb command: '%s'\n", cmdbuf); diag = 0; } @@ -1566,6 +1577,7 @@ static int kdb_md(int argc, const char **argv) int symbolic = 0; int valid = 0; int phys = 0; + int raw = 0; kdbgetintenv("MDCOUNT", &mdcount); kdbgetintenv("RADIX", &radix); @@ -1575,9 +1587,10 @@ static int kdb_md(int argc, const char **argv) repeat = mdcount * 16 / bytesperword; if (strcmp(argv[0], "mdr") == 0) { - if (argc != 2) + if (argc == 2 || (argc == 0 && last_addr != 0)) + valid = raw = 1; + else return KDB_ARGCOUNT; - valid = 1; } else if (isdigit(argv[0][2])) { bytesperword = (int)(argv[0][2] - '0'); if (bytesperword == 0) { @@ -1613,7 +1626,10 @@ static int kdb_md(int argc, const char **argv) radix = last_radix; bytesperword = last_bytesperword; repeat = last_repeat; - mdcount = ((repeat * bytesperword) + 15) / 16; + if (raw) + mdcount = repeat; + else + mdcount = ((repeat * bytesperword) + 15) / 16; } if (argc) { @@ -1630,7 +1646,10 @@ static int kdb_md(int argc, const char **argv) diag = kdbgetularg(argv[nextarg], &val); if (!diag) { mdcount = (int) val; - repeat = mdcount * 16 / bytesperword; + if (raw) + repeat = mdcount; + else + repeat = mdcount * 16 / bytesperword; } } if (argc >= nextarg+1) { @@ -1640,8 +1659,15 @@ static int kdb_md(int argc, const char **argv) } } - if (strcmp(argv[0], "mdr") == 0) - return kdb_mdr(addr, mdcount); + if 
(strcmp(argv[0], "mdr") == 0) { + int ret; + last_addr = addr; + ret = kdb_mdr(addr, mdcount); + last_addr += mdcount; + last_repeat = mdcount; + last_bytesperword = bytesperword; // to make REPEAT happy + return ret; + } switch (radix) { case 10: @@ -2473,41 +2499,6 @@ static int kdb_kill(int argc, const char **argv) return 0; } -struct kdb_tm { - int tm_sec; /* seconds */ - int tm_min; /* minutes */ - int tm_hour; /* hours */ - int tm_mday; /* day of the month */ - int tm_mon; /* month */ - int tm_year; /* year */ -}; - -static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm) -{ - /* This will work from 1970-2099, 2100 is not a leap year */ - static int mon_day[] = { 31, 29, 31, 30, 31, 30, 31, - 31, 30, 31, 30, 31 }; - memset(tm, 0, sizeof(*tm)); - tm->tm_sec = tv->tv_sec % (24 * 60 * 60); - tm->tm_mday = tv->tv_sec / (24 * 60 * 60) + - (2 * 365 + 1); /* shift base from 1970 to 1968 */ - tm->tm_min = tm->tm_sec / 60 % 60; - tm->tm_hour = tm->tm_sec / 60 / 60; - tm->tm_sec = tm->tm_sec % 60; - tm->tm_year = 68 + 4*(tm->tm_mday / (4*365+1)); - tm->tm_mday %= (4*365+1); - mon_day[1] = 29; - while (tm->tm_mday >= mon_day[tm->tm_mon]) { - tm->tm_mday -= mon_day[tm->tm_mon]; - if (++tm->tm_mon == 12) { - tm->tm_mon = 0; - ++tm->tm_year; - mon_day[1] = 28; - } - } - ++tm->tm_mday; -} - /* * Most of this code has been lifted from kernel/timer.c::sys_sysinfo(). 
* I cannot call that code directly from kdb, it has an unconditional @@ -2515,10 +2506,10 @@ static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm) */ static void kdb_sysinfo(struct sysinfo *val) { - struct timespec uptime; - ktime_get_ts(&uptime); + u64 uptime = ktime_get_mono_fast_ns(); + memset(val, 0, sizeof(*val)); - val->uptime = uptime.tv_sec; + val->uptime = div_u64(uptime, NSEC_PER_SEC); val->loads[0] = avenrun[0]; val->loads[1] = avenrun[1]; val->loads[2] = avenrun[2]; @@ -2533,8 +2524,8 @@ static void kdb_sysinfo(struct sysinfo *val) */ static int kdb_summary(int argc, const char **argv) { - struct timespec now; - struct kdb_tm tm; + time64_t now; + struct tm tm; struct sysinfo val; if (argc) @@ -2548,9 +2539,9 @@ static int kdb_summary(int argc, const char **argv) kdb_printf("domainname %s\n", init_uts_ns.name.domainname); kdb_printf("ccversion %s\n", __stringify(CCVERSION)); - now = __current_kernel_time(); - kdb_gmtime(&now, &tm); - kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d " + now = __ktime_get_real_seconds(); + time64_to_tm(now, 0, &tm); + kdb_printf("date %04ld-%02d-%02d %02d:%02d:%02d " "tz_minuteswest %d\n", 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec, diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index d35cc2d3a4cc..990b3cc526c8 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -129,13 +129,13 @@ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab) } if (i >= ARRAY_SIZE(kdb_name_table)) { debug_kfree(kdb_name_table[0]); - memcpy(kdb_name_table, kdb_name_table+1, + memmove(kdb_name_table, kdb_name_table+1, sizeof(kdb_name_table[0]) * (ARRAY_SIZE(kdb_name_table)-1)); } else { debug_kfree(knt1); knt1 = kdb_name_table[i]; - memcpy(kdb_name_table+i, kdb_name_table+i+1, + memmove(kdb_name_table+i, kdb_name_table+i+1, sizeof(kdb_name_table[0]) * (ARRAY_SIZE(kdb_name_table)-i-1)); } diff --git a/kernel/events/core.c b/kernel/events/core.c 
index d0d9bfb47d2e..2d5fe26551f8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -430,7 +430,7 @@ static void update_perf_cpu_limits(void) WRITE_ONCE(perf_sample_allowed_ns, tmp); } -static int perf_rotate_context(struct perf_cpu_context *cpuctx); +static bool perf_rotate_context(struct perf_cpu_context *cpuctx); int perf_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -643,7 +643,7 @@ static void perf_event_update_sibling_time(struct perf_event *leader) { struct perf_event *sibling; - list_for_each_entry(sibling, &leader->sibling_list, group_entry) + for_each_sibling_event(sibling, leader) perf_event_update_time(sibling); } @@ -724,9 +724,15 @@ static inline void __update_cgrp_time(struct perf_cgroup *cgrp) static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) { - struct perf_cgroup *cgrp_out = cpuctx->cgrp; - if (cgrp_out) - __update_cgrp_time(cgrp_out); + struct perf_cgroup *cgrp = cpuctx->cgrp; + struct cgroup_subsys_state *css; + + if (cgrp) { + for (css = &cgrp->css; css; css = css->parent) { + cgrp = container_of(css, struct perf_cgroup, css); + __update_cgrp_time(cgrp); + } + } } static inline void update_cgrp_time_from_event(struct perf_event *event) @@ -754,6 +760,7 @@ perf_cgroup_set_timestamp(struct task_struct *task, { struct perf_cgroup *cgrp; struct perf_cgroup_info *info; + struct cgroup_subsys_state *css; /* * ctx->lock held by caller @@ -764,8 +771,12 @@ perf_cgroup_set_timestamp(struct task_struct *task, return; cgrp = perf_cgroup_from_task(task, ctx); - info = this_cpu_ptr(cgrp->info); - info->timestamp = ctx->timestamp; + + for (css = &cgrp->css; css; css = css->parent) { + cgrp = container_of(css, struct perf_cgroup, css); + info = this_cpu_ptr(cgrp->info); + info->timestamp = ctx->timestamp; + } } static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list); @@ -937,27 +948,39 @@ list_update_cgroup_event(struct perf_event *event, if (!is_cgroup_event(event)) 
return; - if (add && ctx->nr_cgroups++) - return; - else if (!add && --ctx->nr_cgroups) - return; /* * Because cgroup events are always per-cpu events, * this will always be called from the right CPU. */ cpuctx = __get_cpu_context(ctx); - cpuctx_entry = &cpuctx->cgrp_cpuctx_entry; - /* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/ - if (add) { + + /* + * Since setting cpuctx->cgrp is conditional on the current @cgrp + * matching the event's cgroup, we must do this for every new event, + * because if the first would mismatch, the second would not try again + * and we would leave cpuctx->cgrp unset. + */ + if (add && !cpuctx->cgrp) { struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); - list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list)); if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) cpuctx->cgrp = cgrp; - } else { - list_del(cpuctx_entry); - cpuctx->cgrp = NULL; } + + if (add && ctx->nr_cgroups++) + return; + else if (!add && --ctx->nr_cgroups) + return; + + /* no cgroup running */ + if (!add) + cpuctx->cgrp = NULL; + + cpuctx_entry = &cpuctx->cgrp_cpuctx_entry; + if (add) + list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list)); + else + list_del(cpuctx_entry); } #else /* !CONFIG_CGROUP_PERF */ @@ -1041,7 +1064,7 @@ list_update_cgroup_event(struct perf_event *event, static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) { struct perf_cpu_context *cpuctx; - int rotations = 0; + bool rotations; lockdep_assert_irqs_disabled(); @@ -1460,8 +1483,21 @@ static enum event_type_t get_event_type(struct perf_event *event) return event_type; } -static struct list_head * -ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) +/* + * Helper function to initialize event group nodes. 
+ */ +static void init_event_group(struct perf_event *event) +{ + RB_CLEAR_NODE(&event->group_node); + event->group_index = 0; +} + +/* + * Extract pinned or flexible groups from the context + * based on event attrs bits. + */ +static struct perf_event_groups * +get_event_groups(struct perf_event *event, struct perf_event_context *ctx) { if (event->attr.pinned) return &ctx->pinned_groups; @@ -1470,6 +1506,156 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) } /* + * Helper function to initializes perf_event_group trees. + */ +static void perf_event_groups_init(struct perf_event_groups *groups) +{ + groups->tree = RB_ROOT; + groups->index = 0; +} + +/* + * Compare function for event groups; + * + * Implements complex key that first sorts by CPU and then by virtual index + * which provides ordering when rotating groups for the same CPU. + */ +static bool +perf_event_groups_less(struct perf_event *left, struct perf_event *right) +{ + if (left->cpu < right->cpu) + return true; + if (left->cpu > right->cpu) + return false; + + if (left->group_index < right->group_index) + return true; + if (left->group_index > right->group_index) + return false; + + return false; +} + +/* + * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for + * key (see perf_event_groups_less). This places it last inside the CPU + * subtree. 
+ */ +static void +perf_event_groups_insert(struct perf_event_groups *groups, + struct perf_event *event) +{ + struct perf_event *node_event; + struct rb_node *parent; + struct rb_node **node; + + event->group_index = ++groups->index; + + node = &groups->tree.rb_node; + parent = *node; + + while (*node) { + parent = *node; + node_event = container_of(*node, struct perf_event, group_node); + + if (perf_event_groups_less(event, node_event)) + node = &parent->rb_left; + else + node = &parent->rb_right; + } + + rb_link_node(&event->group_node, parent, node); + rb_insert_color(&event->group_node, &groups->tree); +} + +/* + * Helper function to insert event into the pinned or flexible groups. + */ +static void +add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_event_groups *groups; + + groups = get_event_groups(event, ctx); + perf_event_groups_insert(groups, event); +} + +/* + * Delete a group from a tree. + */ +static void +perf_event_groups_delete(struct perf_event_groups *groups, + struct perf_event *event) +{ + WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) || + RB_EMPTY_ROOT(&groups->tree)); + + rb_erase(&event->group_node, &groups->tree); + init_event_group(event); +} + +/* + * Helper function to delete event from its groups. + */ +static void +del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_event_groups *groups; + + groups = get_event_groups(event, ctx); + perf_event_groups_delete(groups, event); +} + +/* + * Get the leftmost event in the @cpu subtree. 
+ */ +static struct perf_event * +perf_event_groups_first(struct perf_event_groups *groups, int cpu) +{ + struct perf_event *node_event = NULL, *match = NULL; + struct rb_node *node = groups->tree.rb_node; + + while (node) { + node_event = container_of(node, struct perf_event, group_node); + + if (cpu < node_event->cpu) { + node = node->rb_left; + } else if (cpu > node_event->cpu) { + node = node->rb_right; + } else { + match = node_event; + node = node->rb_left; + } + } + + return match; +} + +/* + * Like rb_entry_next_safe() for the @cpu subtree. + */ +static struct perf_event * +perf_event_groups_next(struct perf_event *event) +{ + struct perf_event *next; + + next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node); + if (next && next->cpu == event->cpu) + return next; + + return NULL; +} + +/* + * Iterate through the whole groups tree. + */ +#define perf_event_groups_for_each(event, groups) \ + for (event = rb_entry_safe(rb_first(&((groups)->tree)), \ + typeof(*event), group_node); event; \ + event = rb_entry_safe(rb_next(&event->group_node), \ + typeof(*event), group_node)) + +/* * Add a event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ @@ -1489,12 +1675,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) * perf_group_detach can, at all times, locate all siblings. 
*/ if (event->group_leader == event) { - struct list_head *list; - event->group_caps = event->event_caps; - - list = ctx_group_list(event, ctx); - list_add_tail(&event->group_entry, list); + add_event_to_groups(event, ctx); } list_update_cgroup_event(event, ctx, true); @@ -1652,12 +1834,12 @@ static void perf_group_attach(struct perf_event *event) group_leader->group_caps &= event->event_caps; - list_add_tail(&event->group_entry, &group_leader->sibling_list); + list_add_tail(&event->sibling_list, &group_leader->sibling_list); group_leader->nr_siblings++; perf_event__header_size(group_leader); - list_for_each_entry(pos, &group_leader->sibling_list, group_entry) + for_each_sibling_event(pos, group_leader) perf_event__header_size(pos); } @@ -1688,7 +1870,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) list_del_rcu(&event->event_entry); if (event->group_leader == event) - list_del_init(&event->group_entry); + del_event_from_groups(event, ctx); /* * If event was in error state, then keep it @@ -1706,9 +1888,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) static void perf_group_detach(struct perf_event *event) { struct perf_event *sibling, *tmp; - struct list_head *list = NULL; + struct perf_event_context *ctx = event->ctx; - lockdep_assert_held(&event->ctx->lock); + lockdep_assert_held(&ctx->lock); /* * We can have double detach due to exit/hot-unplug + close. @@ -1722,34 +1904,42 @@ static void perf_group_detach(struct perf_event *event) * If this is a sibling, remove it from its group. */ if (event->group_leader != event) { - list_del_init(&event->group_entry); + list_del_init(&event->sibling_list); event->group_leader->nr_siblings--; goto out; } - if (!list_empty(&event->group_entry)) - list = &event->group_entry; - /* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them * to whatever list we are on. 
*/ - list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { - if (list) - list_move_tail(&sibling->group_entry, list); + list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) { + sibling->group_leader = sibling; + list_del_init(&sibling->sibling_list); /* Inherit group flags from the previous leader */ sibling->group_caps = event->group_caps; + if (!RB_EMPTY_NODE(&event->group_node)) { + add_event_to_groups(sibling, event->ctx); + + if (sibling->state == PERF_EVENT_STATE_ACTIVE) { + struct list_head *list = sibling->attr.pinned ? + &ctx->pinned_active : &ctx->flexible_active; + + list_add_tail(&sibling->active_list, list); + } + } + WARN_ON_ONCE(sibling->ctx != event->ctx); } out: perf_event__header_size(event->group_leader); - list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry) + for_each_sibling_event(tmp, event->group_leader) perf_event__header_size(tmp); } @@ -1772,13 +1962,13 @@ static inline int __pmu_filter_match(struct perf_event *event) */ static inline int pmu_filter_match(struct perf_event *event) { - struct perf_event *child; + struct perf_event *sibling; if (!__pmu_filter_match(event)) return 0; - list_for_each_entry(child, &event->sibling_list, group_entry) { - if (!__pmu_filter_match(child)) + for_each_sibling_event(sibling, event) { + if (!__pmu_filter_match(sibling)) return 0; } @@ -1805,6 +1995,13 @@ event_sched_out(struct perf_event *event, if (event->state != PERF_EVENT_STATE_ACTIVE) return; + /* + * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but + * we can schedule events _OUT_ individually through things like + * __perf_remove_from_context(). 
+ */ + list_del_init(&event->active_list); + perf_pmu_disable(event->pmu); event->pmu->del(event, 0); @@ -1845,7 +2042,7 @@ group_sched_out(struct perf_event *group_event, /* * Schedule out siblings (if any): */ - list_for_each_entry(event, &group_event->sibling_list, group_entry) + for_each_sibling_event(event, group_event) event_sched_out(event, cpuctx, ctx); perf_pmu_enable(ctx->pmu); @@ -2124,7 +2321,7 @@ group_sched_in(struct perf_event *group_event, /* * Schedule in siblings as one group (if any): */ - list_for_each_entry(event, &group_event->sibling_list, group_entry) { + for_each_sibling_event(event, group_event) { if (event_sched_in(event, cpuctx, ctx)) { partial_group = event; goto group_error; @@ -2140,7 +2337,7 @@ group_error: * partial group before returning: * The events up to the failed event are scheduled out normally. */ - list_for_each_entry(event, &group_event->sibling_list, group_entry) { + for_each_sibling_event(event, group_event) { if (event == partial_group) break; @@ -2246,7 +2443,7 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, struct perf_event_context *task_ctx, enum event_type_t event_type) { - enum event_type_t ctx_event_type = event_type & EVENT_ALL; + enum event_type_t ctx_event_type; bool cpu_event = !!(event_type & EVENT_CPU); /* @@ -2256,6 +2453,8 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, if (event_type & EVENT_PINNED) event_type |= EVENT_FLEXIBLE; + ctx_event_type = event_type & EVENT_ALL; + perf_pmu_disable(cpuctx->ctx.pmu); if (task_ctx) task_ctx_sched_out(cpuctx, task_ctx, event_type); @@ -2315,6 +2514,18 @@ static int __perf_install_in_context(void *info) raw_spin_lock(&task_ctx->lock); } +#ifdef CONFIG_CGROUP_PERF + if (is_cgroup_event(event)) { + /* + * If the current cgroup doesn't match the event's + * cgroup, we should not try to schedule it. 
+ */ + struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); + reprogram = cgroup_is_descendant(cgrp->css.cgroup, + event->cgrp->css.cgroup); + } +#endif + if (reprogram) { ctx_sched_out(ctx, cpuctx, EVENT_TIME); add_event_to_ctx(event, ctx); @@ -2648,12 +2859,47 @@ int perf_event_refresh(struct perf_event *event, int refresh) } EXPORT_SYMBOL_GPL(perf_event_refresh); +static int perf_event_modify_breakpoint(struct perf_event *bp, + struct perf_event_attr *attr) +{ + int err; + + _perf_event_disable(bp); + + err = modify_user_hw_breakpoint_check(bp, attr, true); + if (err) { + if (!bp->attr.disabled) + _perf_event_enable(bp); + + return err; + } + + if (!attr->disabled) + _perf_event_enable(bp); + return 0; +} + +static int perf_event_modify_attr(struct perf_event *event, + struct perf_event_attr *attr) +{ + if (event->attr.type != attr->type) + return -EINVAL; + + switch (event->attr.type) { + case PERF_TYPE_BREAKPOINT: + return perf_event_modify_breakpoint(event, attr); + default: + /* Place holder for future additions. 
*/ + return -EOPNOTSUPP; + } +} + static void ctx_sched_out(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type) { + struct perf_event *event, *tmp; int is_active = ctx->is_active; - struct perf_event *event; lockdep_assert_held(&ctx->lock); @@ -2700,12 +2946,12 @@ static void ctx_sched_out(struct perf_event_context *ctx, perf_pmu_disable(ctx->pmu); if (is_active & EVENT_PINNED) { - list_for_each_entry(event, &ctx->pinned_groups, group_entry) + list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list) group_sched_out(event, cpuctx, ctx); } if (is_active & EVENT_FLEXIBLE) { - list_for_each_entry(event, &ctx->flexible_groups, group_entry) + list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list) group_sched_out(event, cpuctx, ctx); } perf_pmu_enable(ctx->pmu); @@ -2992,53 +3238,116 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); } -static void -ctx_pinned_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) +static int visit_groups_merge(struct perf_event_groups *groups, int cpu, + int (*func)(struct perf_event *, void *), void *data) { - struct perf_event *event; + struct perf_event **evt, *evt1, *evt2; + int ret; - list_for_each_entry(event, &ctx->pinned_groups, group_entry) { - if (event->state <= PERF_EVENT_STATE_OFF) - continue; - if (!event_filter_match(event)) - continue; + evt1 = perf_event_groups_first(groups, -1); + evt2 = perf_event_groups_first(groups, cpu); + + while (evt1 || evt2) { + if (evt1 && evt2) { + if (evt1->group_index < evt2->group_index) + evt = &evt1; + else + evt = &evt2; + } else if (evt1) { + evt = &evt1; + } else { + evt = &evt2; + } - if (group_can_go_on(event, cpuctx, 1)) - group_sched_in(event, cpuctx, ctx); + ret = func(*evt, data); + if (ret) + return ret; - /* - * If this pinned group hasn't been scheduled, - * put it in error state. 
- */ - if (event->state == PERF_EVENT_STATE_INACTIVE) - perf_event_set_state(event, PERF_EVENT_STATE_ERROR); + *evt = perf_event_groups_next(*evt); } + + return 0; +} + +struct sched_in_data { + struct perf_event_context *ctx; + struct perf_cpu_context *cpuctx; + int can_add_hw; +}; + +static int pinned_sched_in(struct perf_event *event, void *data) +{ + struct sched_in_data *sid = data; + + if (event->state <= PERF_EVENT_STATE_OFF) + return 0; + + if (!event_filter_match(event)) + return 0; + + if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) { + if (!group_sched_in(event, sid->cpuctx, sid->ctx)) + list_add_tail(&event->active_list, &sid->ctx->pinned_active); + } + + /* + * If this pinned group hasn't been scheduled, + * put it in error state. + */ + if (event->state == PERF_EVENT_STATE_INACTIVE) + perf_event_set_state(event, PERF_EVENT_STATE_ERROR); + + return 0; +} + +static int flexible_sched_in(struct perf_event *event, void *data) +{ + struct sched_in_data *sid = data; + + if (event->state <= PERF_EVENT_STATE_OFF) + return 0; + + if (!event_filter_match(event)) + return 0; + + if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) { + if (!group_sched_in(event, sid->cpuctx, sid->ctx)) + list_add_tail(&event->active_list, &sid->ctx->flexible_active); + else + sid->can_add_hw = 0; + } + + return 0; +} + +static void +ctx_pinned_sched_in(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx) +{ + struct sched_in_data sid = { + .ctx = ctx, + .cpuctx = cpuctx, + .can_add_hw = 1, + }; + + visit_groups_merge(&ctx->pinned_groups, + smp_processor_id(), + pinned_sched_in, &sid); } static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx) { - struct perf_event *event; - int can_add_hw = 1; - - list_for_each_entry(event, &ctx->flexible_groups, group_entry) { - /* Ignore events in OFF or ERROR state */ - if (event->state <= PERF_EVENT_STATE_OFF) - continue; - /* - * Listen to the 'cpu' scheduling filter 
constraint - * of events: - */ - if (!event_filter_match(event)) - continue; + struct sched_in_data sid = { + .ctx = ctx, + .cpuctx = cpuctx, + .can_add_hw = 1, + }; - if (group_can_go_on(event, cpuctx, can_add_hw)) { - if (group_sched_in(event, cpuctx, ctx)) - can_add_hw = 0; - } - } + visit_groups_merge(&ctx->flexible_groups, + smp_processor_id(), + flexible_sched_in, &sid); } static void @@ -3119,7 +3428,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, * However, if task's ctx is not carrying any pinned * events, no need to flip the cpuctx's events around. */ - if (!list_empty(&ctx->pinned_groups)) + if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); perf_event_sched_in(cpuctx, ctx, task); perf_pmu_enable(ctx->pmu); @@ -3348,55 +3657,81 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, } /* - * Round-robin a context's events: + * Move @event to the tail of the @ctx's elegible events. */ -static void rotate_ctx(struct perf_event_context *ctx) +static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event) { /* * Rotate the first entry last of non-pinned groups. Rotation might be * disabled by the inheritance code. 
*/ - if (!ctx->rotate_disable) - list_rotate_left(&ctx->flexible_groups); + if (ctx->rotate_disable) + return; + + perf_event_groups_delete(&ctx->flexible_groups, event); + perf_event_groups_insert(&ctx->flexible_groups, event); +} + +static inline struct perf_event * +ctx_first_active(struct perf_event_context *ctx) +{ + return list_first_entry_or_null(&ctx->flexible_active, + struct perf_event, active_list); } -static int perf_rotate_context(struct perf_cpu_context *cpuctx) +static bool perf_rotate_context(struct perf_cpu_context *cpuctx) { + struct perf_event *cpu_event = NULL, *task_event = NULL; + bool cpu_rotate = false, task_rotate = false; struct perf_event_context *ctx = NULL; - int rotate = 0; + + /* + * Since we run this from IRQ context, nobody can install new + * events, thus the event count values are stable. + */ if (cpuctx->ctx.nr_events) { if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) - rotate = 1; + cpu_rotate = true; } ctx = cpuctx->task_ctx; if (ctx && ctx->nr_events) { if (ctx->nr_events != ctx->nr_active) - rotate = 1; + task_rotate = true; } - if (!rotate) - goto done; + if (!(cpu_rotate || task_rotate)) + return false; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(cpuctx->ctx.pmu); - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - if (ctx) + if (task_rotate) + task_event = ctx_first_active(ctx); + if (cpu_rotate) + cpu_event = ctx_first_active(&cpuctx->ctx); + + /* + * As per the order given at ctx_resched() first 'pop' task flexible + * and then, if needed CPU flexible. 
+ */ + if (task_event || (ctx && cpu_event)) ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); + if (cpu_event) + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - rotate_ctx(&cpuctx->ctx); - if (ctx) - rotate_ctx(ctx); + if (task_event) + rotate_ctx(ctx, task_event); + if (cpu_event) + rotate_ctx(&cpuctx->ctx, cpu_event); perf_event_sched_in(cpuctx, ctx, current); perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); -done: - return rotate; + return true; } void perf_event_task_tick(void) @@ -3541,7 +3876,7 @@ static void __perf_event_read(void *info) pmu->read(event); - list_for_each_entry(sub, &event->sibling_list, group_entry) { + for_each_sibling_event(sub, event) { if (sub->state == PERF_EVENT_STATE_ACTIVE) { /* * Use sibling's PMU rather than @event's since @@ -3715,9 +4050,11 @@ static void __perf_event_init_context(struct perf_event_context *ctx) raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->active_ctx_list); - INIT_LIST_HEAD(&ctx->pinned_groups); - INIT_LIST_HEAD(&ctx->flexible_groups); + perf_event_groups_init(&ctx->pinned_groups); + perf_event_groups_init(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); + INIT_LIST_HEAD(&ctx->pinned_active); + INIT_LIST_HEAD(&ctx->flexible_active); atomic_set(&ctx->refcount, 1); } @@ -4110,6 +4447,9 @@ static void _free_event(struct perf_event *event) if (event->ctx) put_ctx(event->ctx); + if (event->hw.target) + put_task_struct(event->hw.target); + exclusive_event_destroy(event); module_put(event->pmu->module); @@ -4387,7 +4727,7 @@ static int __perf_read_group_add(struct perf_event *leader, if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); - list_for_each_entry(sub, &leader->sibling_list, group_entry) { + for_each_sibling_event(sub, leader) { values[n++] += perf_event_count(sub); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); @@ -4524,7 +4864,7 @@ static __poll_t perf_poll(struct file *file, poll_table *wait) { struct 
perf_event *event = file->private_data; struct ring_buffer *rb; - __poll_t events = POLLHUP; + __poll_t events = EPOLLHUP; poll_wait(file, &event->waitq, wait); @@ -4581,7 +4921,7 @@ static void perf_event_for_each(struct perf_event *event, event = event->group_leader; perf_event_for_each_child(event, func); - list_for_each_entry(sibling, &event->sibling_list, group_entry) + for_each_sibling_event(sibling, event) perf_event_for_each_child(sibling, func); } @@ -4663,6 +5003,8 @@ static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd); +static int perf_copy_attr(struct perf_event_attr __user *uattr, + struct perf_event_attr *attr); static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) { @@ -4732,6 +5074,20 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon rcu_read_unlock(); return 0; } + + case PERF_EVENT_IOC_QUERY_BPF: + return perf_event_query_prog_array(event, (void __user *)arg); + + case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: { + struct perf_event_attr new_attr; + int err = perf_copy_attr((struct perf_event_attr __user *)arg, + &new_attr); + + if (err) + return err; + + return perf_event_modify_attr(event, &new_attr); + } default: return -ENOTTY; } @@ -5727,7 +6083,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = running; - if (leader != event) + if ((leader != event) && + (leader->state == PERF_EVENT_STATE_ACTIVE)) leader->pmu->read(leader); values[n++] = perf_event_count(leader); @@ -5736,7 +6093,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, __output_copy(handle, values, n * sizeof(u64)); - list_for_each_entry(sub, &leader->sibling_list, group_entry) { + for_each_sibling_event(sub, leader) { n = 
0; if ((sub != event) && @@ -7993,9 +8350,127 @@ static struct pmu perf_tracepoint = { .read = perf_swevent_read, }; +#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) +/* + * Flags in config, used by dynamic PMU kprobe and uprobe + * The flags should match following PMU_FORMAT_ATTR(). + * + * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe + * if not set, create kprobe/uprobe + */ +enum perf_probe_config { + PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */ +}; + +PMU_FORMAT_ATTR(retprobe, "config:0"); + +static struct attribute *probe_attrs[] = { + &format_attr_retprobe.attr, + NULL, +}; + +static struct attribute_group probe_format_group = { + .name = "format", + .attrs = probe_attrs, +}; + +static const struct attribute_group *probe_attr_groups[] = { + &probe_format_group, + NULL, +}; +#endif + +#ifdef CONFIG_KPROBE_EVENTS +static int perf_kprobe_event_init(struct perf_event *event); +static struct pmu perf_kprobe = { + .task_ctx_nr = perf_sw_context, + .event_init = perf_kprobe_event_init, + .add = perf_trace_add, + .del = perf_trace_del, + .start = perf_swevent_start, + .stop = perf_swevent_stop, + .read = perf_swevent_read, + .attr_groups = probe_attr_groups, +}; + +static int perf_kprobe_event_init(struct perf_event *event) +{ + int err; + bool is_retprobe; + + if (event->attr.type != perf_kprobe.type) + return -ENOENT; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + /* + * no branch sampling for probe events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; + err = perf_kprobe_init(event, is_retprobe); + if (err) + return err; + + event->destroy = perf_kprobe_destroy; + + return 0; +} +#endif /* CONFIG_KPROBE_EVENTS */ + +#ifdef CONFIG_UPROBE_EVENTS +static int perf_uprobe_event_init(struct perf_event *event); +static struct pmu perf_uprobe = { + .task_ctx_nr = perf_sw_context, + .event_init = perf_uprobe_event_init, + .add = 
perf_trace_add, + .del = perf_trace_del, + .start = perf_swevent_start, + .stop = perf_swevent_stop, + .read = perf_swevent_read, + .attr_groups = probe_attr_groups, +}; + +static int perf_uprobe_event_init(struct perf_event *event) +{ + int err; + bool is_retprobe; + + if (event->attr.type != perf_uprobe.type) + return -ENOENT; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + /* + * no branch sampling for probe events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; + err = perf_uprobe_init(event, is_retprobe); + if (err) + return err; + + event->destroy = perf_uprobe_destroy; + + return 0; +} +#endif /* CONFIG_UPROBE_EVENTS */ + static inline void perf_tp_register(void) { perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); +#ifdef CONFIG_KPROBE_EVENTS + perf_pmu_register(&perf_kprobe, "kprobe", -1); +#endif +#ifdef CONFIG_UPROBE_EVENTS + perf_pmu_register(&perf_uprobe, "uprobe", -1); +#endif } static void perf_event_free_filter(struct perf_event *event) @@ -8072,13 +8547,32 @@ static void perf_event_free_bpf_handler(struct perf_event *event) } #endif +/* + * returns true if the event is a tracepoint, or a kprobe/upprobe created + * with perf_event_open() + */ +static inline bool perf_event_is_tracing(struct perf_event *event) +{ + if (event->pmu == &perf_tracepoint) + return true; +#ifdef CONFIG_KPROBE_EVENTS + if (event->pmu == &perf_kprobe) + return true; +#endif +#ifdef CONFIG_UPROBE_EVENTS + if (event->pmu == &perf_uprobe) + return true; +#endif + return false; +} + static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) { bool is_kprobe, is_tracepoint, is_syscall_tp; struct bpf_prog *prog; int ret; - if (event->attr.type != PERF_TYPE_TRACEPOINT) + if (!perf_event_is_tracing(event)) return perf_event_set_bpf_handler(event, prog_fd); is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; @@ -8100,6 +8594,13 @@ static int 
perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) return -EINVAL; } + /* Kprobe override only works for kprobes, not uprobes. */ + if (prog->kprobe_override && + !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) { + bpf_prog_put(prog); + return -EINVAL; + } + if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); @@ -8117,7 +8618,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) static void perf_event_free_bpf_prog(struct perf_event *event) { - if (event->attr.type != PERF_TYPE_TRACEPOINT) { + if (!perf_event_is_tracing(event)) { perf_event_free_bpf_handler(event); return; } @@ -8313,7 +8814,8 @@ restart: * * for kernel addresses: <start address>[/<size>] * * for object files: <start address>[/<size>]@</path/to/object/file> * - * if <size> is not specified, the range is treated as a single address. + * if <size> is not specified or is zero, the range is treated as a single + * address; not valid for ACTION=="filter". 
*/ enum { IF_ACT_NONE = -1, @@ -8363,6 +8865,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, return -ENOMEM; while ((start = strsep(&fstr, " ,\n")) != NULL) { + static const enum perf_addr_filter_action_t actions[] = { + [IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER, + [IF_ACT_START] = PERF_ADDR_FILTER_ACTION_START, + [IF_ACT_STOP] = PERF_ADDR_FILTER_ACTION_STOP, + }; ret = -EINVAL; if (!*start) @@ -8379,12 +8886,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, switch (token) { case IF_ACT_FILTER: case IF_ACT_START: - filter->filter = 1; - case IF_ACT_STOP: if (state != IF_STATE_ACTION) goto fail; + filter->action = actions[token]; state = IF_STATE_SOURCE; break; @@ -8397,15 +8903,12 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, if (state != IF_STATE_SOURCE) goto fail; - if (token == IF_SRC_FILE || token == IF_SRC_KERNEL) - filter->range = 1; - *args[0].to = 0; ret = kstrtoul(args[0].from, 0, &filter->offset); if (ret) goto fail; - if (filter->range) { + if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) { *args[1].to = 0; ret = kstrtoul(args[1].from, 0, &filter->size); if (ret) @@ -8413,7 +8916,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, } if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) { - int fpos = filter->range ? 2 : 1; + int fpos = token == IF_SRC_FILE ? 2 : 1; filename = match_strdup(&args[fpos]); if (!filename) { @@ -8439,6 +8942,14 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, if (kernel && event->attr.exclude_kernel) goto fail; + /* + * ACTION "filter" must have a non-zero length region + * specified. 
+ */ + if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER && + !filter->size) + goto fail; + if (!kernel) { if (!filename) goto fail; @@ -8536,47 +9047,36 @@ fail_clear_files: return ret; } -static int -perf_tracepoint_set_filter(struct perf_event *event, char *filter_str) -{ - struct perf_event_context *ctx = event->ctx; - int ret; - - /* - * Beware, here be dragons!! - * - * the tracepoint muck will deadlock against ctx->mutex, but the tracepoint - * stuff does not actually need it. So temporarily drop ctx->mutex. As per - * perf_event_ctx_lock() we already have a reference on ctx. - * - * This can result in event getting moved to a different ctx, but that - * does not affect the tracepoint state. - */ - mutex_unlock(&ctx->mutex); - ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); - mutex_lock(&ctx->mutex); - - return ret; -} - static int perf_event_set_filter(struct perf_event *event, void __user *arg) { - char *filter_str; int ret = -EINVAL; - - if ((event->attr.type != PERF_TYPE_TRACEPOINT || - !IS_ENABLED(CONFIG_EVENT_TRACING)) && - !has_addr_filter(event)) - return -EINVAL; + char *filter_str; filter_str = strndup_user(arg, PAGE_SIZE); if (IS_ERR(filter_str)) return PTR_ERR(filter_str); - if (IS_ENABLED(CONFIG_EVENT_TRACING) && - event->attr.type == PERF_TYPE_TRACEPOINT) - ret = perf_tracepoint_set_filter(event, filter_str); - else if (has_addr_filter(event)) +#ifdef CONFIG_EVENT_TRACING + if (perf_event_is_tracing(event)) { + struct perf_event_context *ctx = event->ctx; + + /* + * Beware, here be dragons!! + * + * the tracepoint muck will deadlock against ctx->mutex, but + * the tracepoint stuff does not actually need it. So + * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we + * already have a reference on ctx. + * + * This can result in event getting moved to a different ctx, + * but that does not affect the tracepoint state. 
+ */ + mutex_unlock(&ctx->mutex); + ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); + mutex_lock(&ctx->mutex); + } else +#endif + if (has_addr_filter(event)) ret = perf_event_set_addr_filter(event, filter_str); kfree(filter_str); @@ -9429,9 +9929,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, mutex_init(&event->child_mutex); INIT_LIST_HEAD(&event->child_list); - INIT_LIST_HEAD(&event->group_entry); INIT_LIST_HEAD(&event->event_entry); INIT_LIST_HEAD(&event->sibling_list); + INIT_LIST_HEAD(&event->active_list); + init_event_group(event); INIT_LIST_HEAD(&event->rb_entry); INIT_LIST_HEAD(&event->active_entry); INIT_LIST_HEAD(&event->addr_filters.list); @@ -9465,6 +9966,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, * and we cannot use the ctx information because we need the * pmu before we get a ctx. */ + get_task_struct(task); event->hw.target = task; } @@ -9580,6 +10082,8 @@ err_ns: perf_detach_cgroup(event); if (event->ns) put_pid_ns(event->ns); + if (event->hw.target) + put_task_struct(event->hw.target); kfree(event); return ERR_PTR(err); @@ -9706,6 +10210,9 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, ret = -EINVAL; } + if (!attr->sample_max_stack) + attr->sample_max_stack = sysctl_perf_event_max_stack; + if (attr->sample_type & PERF_SAMPLE_REGS_INTR) ret = perf_reg_validate(attr->sample_regs_intr); out: @@ -9919,9 +10426,6 @@ SYSCALL_DEFINE5(perf_event_open, perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) return -EACCES; - if (!attr.sample_max_stack) - attr.sample_max_stack = sysctl_perf_event_max_stack; - /* * In cgroup mode, the pid argument is used to pass the fd * opened to the cgroup directory in cgroupfs. 
The cpu argument @@ -10195,8 +10699,7 @@ SYSCALL_DEFINE5(perf_event_open, perf_remove_from_context(group_leader, 0); put_ctx(gctx); - list_for_each_entry(sibling, &group_leader->sibling_list, - group_entry) { + for_each_sibling_event(sibling, group_leader) { perf_remove_from_context(sibling, 0); put_ctx(gctx); } @@ -10217,8 +10720,7 @@ SYSCALL_DEFINE5(perf_event_open, * By installing siblings first we NO-OP because they're not * reachable through the group lists. */ - list_for_each_entry(sibling, &group_leader->sibling_list, - group_entry) { + for_each_sibling_event(sibling, group_leader) { perf_event__state_init(sibling); perf_install_in_context(ctx, sibling, sibling->cpu); get_ctx(ctx); @@ -10857,7 +11359,7 @@ static int inherit_group(struct perf_event *parent_event, * case inherit_event() will create individual events, similar to what * perf_group_detach() would do anyway. */ - list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { + for_each_sibling_event(sub, parent_event) { child_ctr = inherit_event(sub, parent, parent_ctx, child, leader, child_ctx); if (IS_ERR(child_ctr)) @@ -10956,7 +11458,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ - list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { + perf_event_groups_for_each(event, &parent_ctx->pinned_groups) { ret = inherit_task_group(event, parent, parent_ctx, child, ctxn, &inherited_all); if (ret) @@ -10972,7 +11474,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) parent_ctx->rotate_disable = 1; raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); - list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { + perf_event_groups_for_each(event, &parent_ctx->flexible_groups) { ret = inherit_task_group(event, parent, parent_ctx, child, ctxn, &inherited_all); if (ret) diff --git a/kernel/events/hw_breakpoint.c 
b/kernel/events/hw_breakpoint.c index 3f8cb1e14588..6e28d2866be5 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -44,6 +44,7 @@ #include <linux/list.h> #include <linux/cpu.h> #include <linux/smp.h> +#include <linux/bug.h> #include <linux/hw_breakpoint.h> /* @@ -85,9 +86,9 @@ __weak int hw_breakpoint_weight(struct perf_event *bp) return 1; } -static inline enum bp_type_idx find_slot_idx(struct perf_event *bp) +static inline enum bp_type_idx find_slot_idx(u64 bp_type) { - if (bp->attr.bp_type & HW_BREAKPOINT_RW) + if (bp_type & HW_BREAKPOINT_RW) return TYPE_DATA; return TYPE_INST; @@ -122,7 +123,7 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) list_for_each_entry(iter, &bp_task_head, hw.bp_list) { if (iter->hw.target == tsk && - find_slot_idx(iter) == type && + find_slot_idx(iter->attr.bp_type) == type && (iter->cpu < 0 || cpu == iter->cpu)) count += hw_breakpoint_weight(iter); } @@ -277,7 +278,7 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) * ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *)) * + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM */ -static int __reserve_bp_slot(struct perf_event *bp) +static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) { struct bp_busy_slots slots = {0}; enum bp_type_idx type; @@ -288,11 +289,11 @@ static int __reserve_bp_slot(struct perf_event *bp) return -ENOMEM; /* Basic checks */ - if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY || - bp->attr.bp_type == HW_BREAKPOINT_INVALID) + if (bp_type == HW_BREAKPOINT_EMPTY || + bp_type == HW_BREAKPOINT_INVALID) return -EINVAL; - type = find_slot_idx(bp); + type = find_slot_idx(bp_type); weight = hw_breakpoint_weight(bp); fetch_bp_busy_slots(&slots, bp, type); @@ -317,19 +318,19 @@ int reserve_bp_slot(struct perf_event *bp) mutex_lock(&nr_bp_mutex); - ret = __reserve_bp_slot(bp); + ret = __reserve_bp_slot(bp, bp->attr.bp_type); mutex_unlock(&nr_bp_mutex); return ret; } -static 
void __release_bp_slot(struct perf_event *bp) +static void __release_bp_slot(struct perf_event *bp, u64 bp_type) { enum bp_type_idx type; int weight; - type = find_slot_idx(bp); + type = find_slot_idx(bp_type); weight = hw_breakpoint_weight(bp); toggle_bp_slot(bp, false, type, weight); } @@ -339,11 +340,43 @@ void release_bp_slot(struct perf_event *bp) mutex_lock(&nr_bp_mutex); arch_unregister_hw_breakpoint(bp); - __release_bp_slot(bp); + __release_bp_slot(bp, bp->attr.bp_type); mutex_unlock(&nr_bp_mutex); } +static int __modify_bp_slot(struct perf_event *bp, u64 old_type) +{ + int err; + + __release_bp_slot(bp, old_type); + + err = __reserve_bp_slot(bp, bp->attr.bp_type); + if (err) { + /* + * Reserve the old_type slot back in case + * there's no space for the new type. + * + * This must succeed, because we just released + * the old_type slot in the __release_bp_slot + * call above. If not, something is broken. + */ + WARN_ON(__reserve_bp_slot(bp, old_type)); + } + + return err; +} + +static int modify_bp_slot(struct perf_event *bp, u64 old_type) +{ + int ret; + + mutex_lock(&nr_bp_mutex); + ret = __modify_bp_slot(bp, old_type); + mutex_unlock(&nr_bp_mutex); + return ret; +} + /* * Allow the kernel debugger to reserve breakpoint slots without * taking a lock using the dbg_* variant of for the reserve and @@ -354,7 +387,7 @@ int dbg_reserve_bp_slot(struct perf_event *bp) if (mutex_is_locked(&nr_bp_mutex)) return -1; - return __reserve_bp_slot(bp); + return __reserve_bp_slot(bp, bp->attr.bp_type); } int dbg_release_bp_slot(struct perf_event *bp) @@ -362,7 +395,7 @@ int dbg_release_bp_slot(struct perf_event *bp) if (mutex_is_locked(&nr_bp_mutex)) return -1; - __release_bp_slot(bp); + __release_bp_slot(bp, bp->attr.bp_type); return 0; } @@ -423,20 +456,45 @@ register_user_hw_breakpoint(struct perf_event_attr *attr, } EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); +int +modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr, + bool check) 
+{ + u64 old_addr = bp->attr.bp_addr; + u64 old_len = bp->attr.bp_len; + int old_type = bp->attr.bp_type; + bool modify = attr->bp_type != old_type; + int err = 0; + + bp->attr.bp_addr = attr->bp_addr; + bp->attr.bp_type = attr->bp_type; + bp->attr.bp_len = attr->bp_len; + + if (check && memcmp(&bp->attr, attr, sizeof(*attr))) + return -EINVAL; + + err = validate_hw_breakpoint(bp); + if (!err && modify) + err = modify_bp_slot(bp, old_type); + + if (err) { + bp->attr.bp_addr = old_addr; + bp->attr.bp_type = old_type; + bp->attr.bp_len = old_len; + return err; + } + + bp->attr.disabled = attr->disabled; + return 0; +} + /** * modify_user_hw_breakpoint - modify a user-space hardware breakpoint * @bp: the breakpoint structure to modify * @attr: new breakpoint attributes - * @triggered: callback to trigger when we hit the breakpoint - * @tsk: pointer to 'task_struct' of the process to which the address belongs */ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { - u64 old_addr = bp->attr.bp_addr; - u64 old_len = bp->attr.bp_len; - int old_type = bp->attr.bp_type; - int err = 0; - /* * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it * will not be possible to raise IPIs that invoke __perf_event_disable. 
@@ -448,30 +506,14 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att else perf_event_disable(bp); - bp->attr.bp_addr = attr->bp_addr; - bp->attr.bp_type = attr->bp_type; - bp->attr.bp_len = attr->bp_len; - - if (attr->disabled) - goto end; + if (!attr->disabled) { + int err = modify_user_hw_breakpoint_check(bp, attr, false); - err = validate_hw_breakpoint(bp); - if (!err) + if (err) + return err; perf_event_enable(bp); - - if (err) { - bp->attr.bp_addr = old_addr; - bp->attr.bp_type = old_type; - bp->attr.bp_len = old_len; - if (!bp->attr.disabled) - perf_event_enable(bp); - - return err; + bp->attr.disabled = 0; } - -end: - bp->attr.disabled = attr->disabled; - return 0; } EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 141aa2ca8728..6c6b3c48db71 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -19,7 +19,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle) { - atomic_set(&handle->rb->poll, POLLIN); + atomic_set(&handle->rb->poll, EPOLLIN); handle->event->pending_wakeup = 1; irq_work_queue(&handle->event->pending); diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index 0975b0268545..a5697119290e 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -19,7 +19,6 @@ #include <linux/syscalls.h> #include <linux/sysctl.h> #include <linux/types.h> -#include <linux/fs_struct.h> #ifdef CONFIG_PROC_FS static int execdomains_proc_show(struct seq_file *m, void *v) diff --git a/kernel/exit.c b/kernel/exit.c index 995453d9fb55..c3c7ac560114 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1691,7 +1691,7 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, */ SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) { - return sys_wait4(pid, stat_addr, options, NULL); + return kernel_wait4(pid, stat_addr, options, NULL); } #endif diff --git a/kernel/extable.c b/kernel/extable.c 
index a17fdb63dc3e..6a5b61ebc66c 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -64,7 +64,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) return e; } -static inline int init_kernel_text(unsigned long addr) +int init_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_sinittext && addr < (unsigned long)_einittext) diff --git a/kernel/fail_function.c b/kernel/fail_function.c new file mode 100644 index 000000000000..1d5632d8bbcc --- /dev/null +++ b/kernel/fail_function.c @@ -0,0 +1,359 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fail_function.c: Function-based error injection + */ +#include <linux/error-injection.h> +#include <linux/debugfs.h> +#include <linux/fault-inject.h> +#include <linux/kallsyms.h> +#include <linux/kprobes.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/uaccess.h> + +static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs); + +static void fei_post_handler(struct kprobe *kp, struct pt_regs *regs, + unsigned long flags) +{ + /* + * A dummy post handler is required to prohibit optimizing, because + * jump optimization does not support execution path overriding. 
+ */ +} + +struct fei_attr { + struct list_head list; + struct kprobe kp; + unsigned long retval; +}; +static DEFINE_MUTEX(fei_lock); +static LIST_HEAD(fei_attr_list); +static DECLARE_FAULT_ATTR(fei_fault_attr); +static struct dentry *fei_debugfs_dir; + +static unsigned long adjust_error_retval(unsigned long addr, unsigned long retv) +{ + switch (get_injectable_error_type(addr)) { + case EI_ETYPE_NULL: + if (retv != 0) + return 0; + break; + case EI_ETYPE_ERRNO: + if (retv < (unsigned long)-MAX_ERRNO) + return (unsigned long)-EINVAL; + break; + case EI_ETYPE_ERRNO_NULL: + if (retv != 0 && retv < (unsigned long)-MAX_ERRNO) + return (unsigned long)-EINVAL; + break; + } + + return retv; +} + +static struct fei_attr *fei_attr_new(const char *sym, unsigned long addr) +{ + struct fei_attr *attr; + + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + if (attr) { + attr->kp.symbol_name = kstrdup(sym, GFP_KERNEL); + if (!attr->kp.symbol_name) { + kfree(attr); + return NULL; + } + attr->kp.pre_handler = fei_kprobe_handler; + attr->kp.post_handler = fei_post_handler; + attr->retval = adjust_error_retval(addr, 0); + INIT_LIST_HEAD(&attr->list); + } + return attr; +} + +static void fei_attr_free(struct fei_attr *attr) +{ + if (attr) { + kfree(attr->kp.symbol_name); + kfree(attr); + } +} + +static struct fei_attr *fei_attr_lookup(const char *sym) +{ + struct fei_attr *attr; + + list_for_each_entry(attr, &fei_attr_list, list) { + if (!strcmp(attr->kp.symbol_name, sym)) + return attr; + } + + return NULL; +} + +static bool fei_attr_is_valid(struct fei_attr *_attr) +{ + struct fei_attr *attr; + + list_for_each_entry(attr, &fei_attr_list, list) { + if (attr == _attr) + return true; + } + + return false; +} + +static int fei_retval_set(void *data, u64 val) +{ + struct fei_attr *attr = data; + unsigned long retv = (unsigned long)val; + int err = 0; + + mutex_lock(&fei_lock); + /* + * Since this operation can be done after retval file is removed, + * It is safer to check the attr is still 
valid before accessing + * its member. + */ + if (!fei_attr_is_valid(attr)) { + err = -ENOENT; + goto out; + } + + if (attr->kp.addr) { + if (adjust_error_retval((unsigned long)attr->kp.addr, + val) != retv) + err = -EINVAL; + } + if (!err) + attr->retval = val; +out: + mutex_unlock(&fei_lock); + + return err; +} + +static int fei_retval_get(void *data, u64 *val) +{ + struct fei_attr *attr = data; + int err = 0; + + mutex_lock(&fei_lock); + /* Here we also validate @attr to ensure it still exists. */ + if (!fei_attr_is_valid(attr)) + err = -ENOENT; + else + *val = attr->retval; + mutex_unlock(&fei_lock); + + return err; +} +DEFINE_DEBUGFS_ATTRIBUTE(fei_retval_ops, fei_retval_get, fei_retval_set, + "%llx\n"); + +static int fei_debugfs_add_attr(struct fei_attr *attr) +{ + struct dentry *dir; + + dir = debugfs_create_dir(attr->kp.symbol_name, fei_debugfs_dir); + if (!dir) + return -ENOMEM; + + if (!debugfs_create_file("retval", 0600, dir, attr, &fei_retval_ops)) { + debugfs_remove_recursive(dir); + return -ENOMEM; + } + + return 0; +} + +static void fei_debugfs_remove_attr(struct fei_attr *attr) +{ + struct dentry *dir; + + dir = debugfs_lookup(attr->kp.symbol_name, fei_debugfs_dir); + if (dir) + debugfs_remove_recursive(dir); +} + +static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs) +{ + struct fei_attr *attr = container_of(kp, struct fei_attr, kp); + + if (should_fail(&fei_fault_attr, 1)) { + regs_set_return_value(regs, attr->retval); + override_function_with_return(regs); + /* Kprobe specific fixup */ + reset_current_kprobe(); + preempt_enable_no_resched(); + return 1; + } + + return 0; +} +NOKPROBE_SYMBOL(fei_kprobe_handler) + +static void *fei_seq_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&fei_lock); + return seq_list_start(&fei_attr_list, *pos); +} + +static void fei_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&fei_lock); +} + +static void *fei_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return 
seq_list_next(v, &fei_attr_list, pos); +} + +static int fei_seq_show(struct seq_file *m, void *v) +{ + struct fei_attr *attr = list_entry(v, struct fei_attr, list); + + seq_printf(m, "%pf\n", attr->kp.addr); + return 0; +} + +static const struct seq_operations fei_seq_ops = { + .start = fei_seq_start, + .next = fei_seq_next, + .stop = fei_seq_stop, + .show = fei_seq_show, +}; + +static int fei_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &fei_seq_ops); +} + +static void fei_attr_remove(struct fei_attr *attr) +{ + fei_debugfs_remove_attr(attr); + unregister_kprobe(&attr->kp); + list_del(&attr->list); + fei_attr_free(attr); +} + +static void fei_attr_remove_all(void) +{ + struct fei_attr *attr, *n; + + list_for_each_entry_safe(attr, n, &fei_attr_list, list) { + fei_attr_remove(attr); + } +} + +static ssize_t fei_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + struct fei_attr *attr; + unsigned long addr; + char *buf, *sym; + int ret; + + /* cut off if it is too long */ + if (count > KSYM_NAME_LEN) + count = KSYM_NAME_LEN; + buf = kmalloc(sizeof(char) * (count + 1), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, buffer, count)) { + ret = -EFAULT; + goto out; + } + buf[count] = '\0'; + sym = strstrip(buf); + + mutex_lock(&fei_lock); + + /* Writing just spaces will remove all injection points */ + if (sym[0] == '\0') { + fei_attr_remove_all(); + ret = count; + goto out; + } + /* Writing !function will remove one injection point */ + if (sym[0] == '!') { + attr = fei_attr_lookup(sym + 1); + if (!attr) { + ret = -ENOENT; + goto out; + } + fei_attr_remove(attr); + ret = count; + goto out; + } + + addr = kallsyms_lookup_name(sym); + if (!addr) { + ret = -EINVAL; + goto out; + } + if (!within_error_injection_list(addr)) { + ret = -ERANGE; + goto out; + } + if (fei_attr_lookup(sym)) { + ret = -EBUSY; + goto out; + } + attr = fei_attr_new(sym, addr); + if (!attr) { + ret = -ENOMEM; + goto 
out; + } + + ret = register_kprobe(&attr->kp); + if (!ret) + ret = fei_debugfs_add_attr(attr); + if (ret < 0) + fei_attr_remove(attr); + else { + list_add_tail(&attr->list, &fei_attr_list); + ret = count; + } +out: + kfree(buf); + mutex_unlock(&fei_lock); + return ret; +} + +static const struct file_operations fei_ops = { + .open = fei_open, + .read = seq_read, + .write = fei_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init fei_debugfs_init(void) +{ + struct dentry *dir; + + dir = fault_create_debugfs_attr("fail_function", NULL, + &fei_fault_attr); + if (IS_ERR(dir)) + return PTR_ERR(dir); + + /* injectable attribute is just a symlink of error_inject/list */ + if (!debugfs_create_symlink("injectable", dir, + "../error_injection/list")) + goto error; + + if (!debugfs_create_file("inject", 0600, dir, NULL, &fei_ops)) + goto error; + + fei_debugfs_dir = dir; + + return 0; +error: + debugfs_remove_recursive(dir); + return -ENOMEM; +} + +late_initcall(fei_debugfs_init); diff --git a/kernel/fork.c b/kernel/fork.c index 2295fc69717f..242c8c93d285 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -77,6 +77,7 @@ #include <linux/blkdev.h> #include <linux/fs_struct.h> #include <linux/magic.h> +#include <linux/sched/mm.h> #include <linux/perf_event.h> #include <linux/posix-timers.h> #include <linux/user-return-notifier.h> @@ -282,8 +283,9 @@ static void free_thread_stack(struct task_struct *tsk) void thread_stack_cache_init(void) { - thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE, - THREAD_SIZE, 0, NULL); + thread_stack_cache = kmem_cache_create_usercopy("thread_stack", + THREAD_SIZE, THREAD_SIZE, 0, 0, + THREAD_SIZE, NULL); BUG_ON(thread_stack_cache == NULL); } # endif @@ -390,6 +392,237 @@ void free_task(struct task_struct *tsk) } EXPORT_SYMBOL(free_task); +#ifdef CONFIG_MMU +static __latent_entropy int dup_mmap(struct mm_struct *mm, + struct mm_struct *oldmm) +{ + struct vm_area_struct *mpnt, *tmp, *prev, **pprev; + struct 
rb_node **rb_link, *rb_parent; + int retval; + unsigned long charge; + LIST_HEAD(uf); + + uprobe_start_dup_mmap(); + if (down_write_killable(&oldmm->mmap_sem)) { + retval = -EINTR; + goto fail_uprobe_end; + } + flush_cache_dup_mm(oldmm); + uprobe_dup_mmap(oldmm, mm); + /* + * Not linked in yet - no deadlock potential: + */ + down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); + + /* No ordering required: file already has been exposed. */ + RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); + + mm->total_vm = oldmm->total_vm; + mm->data_vm = oldmm->data_vm; + mm->exec_vm = oldmm->exec_vm; + mm->stack_vm = oldmm->stack_vm; + + rb_link = &mm->mm_rb.rb_node; + rb_parent = NULL; + pprev = &mm->mmap; + retval = ksm_fork(mm, oldmm); + if (retval) + goto out; + retval = khugepaged_fork(mm, oldmm); + if (retval) + goto out; + + prev = NULL; + for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { + struct file *file; + + if (mpnt->vm_flags & VM_DONTCOPY) { + vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); + continue; + } + charge = 0; + if (mpnt->vm_flags & VM_ACCOUNT) { + unsigned long len = vma_pages(mpnt); + + if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ + goto fail_nomem; + charge = len; + } + tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + if (!tmp) + goto fail_nomem; + *tmp = *mpnt; + INIT_LIST_HEAD(&tmp->anon_vma_chain); + retval = vma_dup_policy(mpnt, tmp); + if (retval) + goto fail_nomem_policy; + tmp->vm_mm = mm; + retval = dup_userfaultfd(tmp, &uf); + if (retval) + goto fail_nomem_anon_vma_fork; + if (tmp->vm_flags & VM_WIPEONFORK) { + /* VM_WIPEONFORK gets a clean slate in the child. 
*/ + tmp->anon_vma = NULL; + if (anon_vma_prepare(tmp)) + goto fail_nomem_anon_vma_fork; + } else if (anon_vma_fork(tmp, mpnt)) + goto fail_nomem_anon_vma_fork; + tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); + tmp->vm_next = tmp->vm_prev = NULL; + file = tmp->vm_file; + if (file) { + struct inode *inode = file_inode(file); + struct address_space *mapping = file->f_mapping; + + get_file(file); + if (tmp->vm_flags & VM_DENYWRITE) + atomic_dec(&inode->i_writecount); + i_mmap_lock_write(mapping); + if (tmp->vm_flags & VM_SHARED) + atomic_inc(&mapping->i_mmap_writable); + flush_dcache_mmap_lock(mapping); + /* insert tmp into the share list, just after mpnt */ + vma_interval_tree_insert_after(tmp, mpnt, + &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); + i_mmap_unlock_write(mapping); + } + + /* + * Clear hugetlb-related page reserves for children. This only + * affects MAP_PRIVATE mappings. Faults generated by the child + * are not guaranteed to succeed, even if read-only + */ + if (is_vm_hugetlb_page(tmp)) + reset_vma_resv_huge_pages(tmp); + + /* + * Link in the new vma and copy the page table entries. 
+ */ + *pprev = tmp; + pprev = &tmp->vm_next; + tmp->vm_prev = prev; + prev = tmp; + + __vma_link_rb(mm, tmp, rb_link, rb_parent); + rb_link = &tmp->vm_rb.rb_right; + rb_parent = &tmp->vm_rb; + + mm->map_count++; + if (!(tmp->vm_flags & VM_WIPEONFORK)) + retval = copy_page_range(mm, oldmm, mpnt); + + if (tmp->vm_ops && tmp->vm_ops->open) + tmp->vm_ops->open(tmp); + + if (retval) + goto out; + } + /* a new mm has just been created */ + arch_dup_mmap(oldmm, mm); + retval = 0; +out: + up_write(&mm->mmap_sem); + flush_tlb_mm(oldmm); + up_write(&oldmm->mmap_sem); + dup_userfaultfd_complete(&uf); +fail_uprobe_end: + uprobe_end_dup_mmap(); + return retval; +fail_nomem_anon_vma_fork: + mpol_put(vma_policy(tmp)); +fail_nomem_policy: + kmem_cache_free(vm_area_cachep, tmp); +fail_nomem: + retval = -ENOMEM; + vm_unacct_memory(charge); + goto out; +} + +static inline int mm_alloc_pgd(struct mm_struct *mm) +{ + mm->pgd = pgd_alloc(mm); + if (unlikely(!mm->pgd)) + return -ENOMEM; + return 0; +} + +static inline void mm_free_pgd(struct mm_struct *mm) +{ + pgd_free(mm, mm->pgd); +} +#else +static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) +{ + down_write(&oldmm->mmap_sem); + RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); + up_write(&oldmm->mmap_sem); + return 0; +} +#define mm_alloc_pgd(mm) (0) +#define mm_free_pgd(mm) +#endif /* CONFIG_MMU */ + +static void check_mm(struct mm_struct *mm) +{ + int i; + + for (i = 0; i < NR_MM_COUNTERS; i++) { + long x = atomic_long_read(&mm->rss_stat.count[i]); + + if (unlikely(x)) + printk(KERN_ALERT "BUG: Bad rss-counter state " + "mm:%p idx:%d val:%ld\n", mm, i, x); + } + + if (mm_pgtables_bytes(mm)) + pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", + mm_pgtables_bytes(mm)); + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS + VM_BUG_ON_MM(mm->pmd_huge_pte, mm); +#endif +} + +#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) +#define free_mm(mm) (kmem_cache_free(mm_cachep, 
(mm))) + +/* + * Called when the last reference to the mm + * is dropped: either by a lazy thread or by + * mmput. Free the page directory and the mm. + */ +void __mmdrop(struct mm_struct *mm) +{ + BUG_ON(mm == &init_mm); + WARN_ON_ONCE(mm == current->mm); + WARN_ON_ONCE(mm == current->active_mm); + mm_free_pgd(mm); + destroy_context(mm); + hmm_mm_destroy(mm); + mmu_notifier_mm_destroy(mm); + check_mm(mm); + put_user_ns(mm->user_ns); + free_mm(mm); +} +EXPORT_SYMBOL_GPL(__mmdrop); + +static void mmdrop_async_fn(struct work_struct *work) +{ + struct mm_struct *mm; + + mm = container_of(work, struct mm_struct, async_put_work); + __mmdrop(mm); +} + +static void mmdrop_async(struct mm_struct *mm) +{ + if (unlikely(atomic_dec_and_test(&mm->mm_count))) { + INIT_WORK(&mm->async_put_work, mmdrop_async_fn); + schedule_work(&mm->async_put_work); + } +} + static inline void free_signal_struct(struct signal_struct *sig) { taskstats_tgid_free(sig); @@ -457,6 +690,21 @@ static void set_max_threads(unsigned int max_threads_suggested) int arch_task_struct_size __read_mostly; #endif +static void task_struct_whitelist(unsigned long *offset, unsigned long *size) +{ + /* Fetch thread_struct whitelist for the architecture. */ + arch_thread_struct_whitelist(offset, size); + + /* + * Handle zero-sized whitelist or empty thread_struct, otherwise + * adjust offset to position of thread_struct in task_struct. 
+ */ + if (unlikely(*size == 0)) + *offset = 0; + else + *offset += offsetof(struct task_struct, thread); +} + void __init fork_init(void) { int i; @@ -465,11 +713,14 @@ void __init fork_init(void) #define ARCH_MIN_TASKALIGN 0 #endif int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN); + unsigned long useroffset, usersize; /* create a slab on which task_structs can be allocated */ - task_struct_cachep = kmem_cache_create("task_struct", + task_struct_whitelist(&useroffset, &usersize); + task_struct_cachep = kmem_cache_create_usercopy("task_struct", arch_task_struct_size, align, - SLAB_PANIC|SLAB_ACCOUNT, NULL); + SLAB_PANIC|SLAB_ACCOUNT, + useroffset, usersize, NULL); #endif /* do the arch specific task caches init */ @@ -594,181 +845,8 @@ free_tsk: return NULL; } -#ifdef CONFIG_MMU -static __latent_entropy int dup_mmap(struct mm_struct *mm, - struct mm_struct *oldmm) -{ - struct vm_area_struct *mpnt, *tmp, *prev, **pprev; - struct rb_node **rb_link, *rb_parent; - int retval; - unsigned long charge; - LIST_HEAD(uf); - - uprobe_start_dup_mmap(); - if (down_write_killable(&oldmm->mmap_sem)) { - retval = -EINTR; - goto fail_uprobe_end; - } - flush_cache_dup_mm(oldmm); - uprobe_dup_mmap(oldmm, mm); - /* - * Not linked in yet - no deadlock potential: - */ - down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); - - /* No ordering required: file already has been exposed. 
*/ - RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); - - mm->total_vm = oldmm->total_vm; - mm->data_vm = oldmm->data_vm; - mm->exec_vm = oldmm->exec_vm; - mm->stack_vm = oldmm->stack_vm; - - rb_link = &mm->mm_rb.rb_node; - rb_parent = NULL; - pprev = &mm->mmap; - retval = ksm_fork(mm, oldmm); - if (retval) - goto out; - retval = khugepaged_fork(mm, oldmm); - if (retval) - goto out; - - prev = NULL; - for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { - struct file *file; - - if (mpnt->vm_flags & VM_DONTCOPY) { - vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); - continue; - } - charge = 0; - if (mpnt->vm_flags & VM_ACCOUNT) { - unsigned long len = vma_pages(mpnt); - - if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ - goto fail_nomem; - charge = len; - } - tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); - if (!tmp) - goto fail_nomem; - *tmp = *mpnt; - INIT_LIST_HEAD(&tmp->anon_vma_chain); - retval = vma_dup_policy(mpnt, tmp); - if (retval) - goto fail_nomem_policy; - tmp->vm_mm = mm; - retval = dup_userfaultfd(tmp, &uf); - if (retval) - goto fail_nomem_anon_vma_fork; - if (tmp->vm_flags & VM_WIPEONFORK) { - /* VM_WIPEONFORK gets a clean slate in the child. 
*/ - tmp->anon_vma = NULL; - if (anon_vma_prepare(tmp)) - goto fail_nomem_anon_vma_fork; - } else if (anon_vma_fork(tmp, mpnt)) - goto fail_nomem_anon_vma_fork; - tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); - tmp->vm_next = tmp->vm_prev = NULL; - file = tmp->vm_file; - if (file) { - struct inode *inode = file_inode(file); - struct address_space *mapping = file->f_mapping; - - get_file(file); - if (tmp->vm_flags & VM_DENYWRITE) - atomic_dec(&inode->i_writecount); - i_mmap_lock_write(mapping); - if (tmp->vm_flags & VM_SHARED) - atomic_inc(&mapping->i_mmap_writable); - flush_dcache_mmap_lock(mapping); - /* insert tmp into the share list, just after mpnt */ - vma_interval_tree_insert_after(tmp, mpnt, - &mapping->i_mmap); - flush_dcache_mmap_unlock(mapping); - i_mmap_unlock_write(mapping); - } - - /* - * Clear hugetlb-related page reserves for children. This only - * affects MAP_PRIVATE mappings. Faults generated by the child - * are not guaranteed to succeed, even if read-only - */ - if (is_vm_hugetlb_page(tmp)) - reset_vma_resv_huge_pages(tmp); - - /* - * Link in the new vma and copy the page table entries. 
- */ - *pprev = tmp; - pprev = &tmp->vm_next; - tmp->vm_prev = prev; - prev = tmp; - - __vma_link_rb(mm, tmp, rb_link, rb_parent); - rb_link = &tmp->vm_rb.rb_right; - rb_parent = &tmp->vm_rb; - - mm->map_count++; - if (!(tmp->vm_flags & VM_WIPEONFORK)) - retval = copy_page_range(mm, oldmm, mpnt); - - if (tmp->vm_ops && tmp->vm_ops->open) - tmp->vm_ops->open(tmp); - - if (retval) - goto out; - } - /* a new mm has just been created */ - retval = arch_dup_mmap(oldmm, mm); -out: - up_write(&mm->mmap_sem); - flush_tlb_mm(oldmm); - up_write(&oldmm->mmap_sem); - dup_userfaultfd_complete(&uf); -fail_uprobe_end: - uprobe_end_dup_mmap(); - return retval; -fail_nomem_anon_vma_fork: - mpol_put(vma_policy(tmp)); -fail_nomem_policy: - kmem_cache_free(vm_area_cachep, tmp); -fail_nomem: - retval = -ENOMEM; - vm_unacct_memory(charge); - goto out; -} - -static inline int mm_alloc_pgd(struct mm_struct *mm) -{ - mm->pgd = pgd_alloc(mm); - if (unlikely(!mm->pgd)) - return -ENOMEM; - return 0; -} - -static inline void mm_free_pgd(struct mm_struct *mm) -{ - pgd_free(mm, mm->pgd); -} -#else -static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) -{ - down_write(&oldmm->mmap_sem); - RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); - up_write(&oldmm->mmap_sem); - return 0; -} -#define mm_alloc_pgd(mm) (0) -#define mm_free_pgd(mm) -#endif /* CONFIG_MMU */ - __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); -#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) -#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) - static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT; static int __init coredump_filter_setup(char *s) @@ -858,27 +936,6 @@ fail_nopgd: return NULL; } -static void check_mm(struct mm_struct *mm) -{ - int i; - - for (i = 0; i < NR_MM_COUNTERS; i++) { - long x = atomic_long_read(&mm->rss_stat.count[i]); - - if (unlikely(x)) - printk(KERN_ALERT "BUG: Bad rss-counter state " - "mm:%p idx:%d val:%ld\n", mm, i, x); - } - - if 
(mm_pgtables_bytes(mm)) - pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", - mm_pgtables_bytes(mm)); - -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS - VM_BUG_ON_MM(mm->pmd_huge_pte, mm); -#endif -} - /* * Allocate and initialize an mm_struct. */ @@ -894,24 +951,6 @@ struct mm_struct *mm_alloc(void) return mm_init(mm, current, current_user_ns()); } -/* - * Called when the last reference to the mm - * is dropped: either by a lazy thread or by - * mmput. Free the page directory and the mm. - */ -void __mmdrop(struct mm_struct *mm) -{ - BUG_ON(mm == &init_mm); - mm_free_pgd(mm); - destroy_context(mm); - hmm_mm_destroy(mm); - mmu_notifier_mm_destroy(mm); - check_mm(mm); - put_user_ns(mm->user_ns); - free_mm(mm); -} -EXPORT_SYMBOL_GPL(__mmdrop); - static inline void __mmput(struct mm_struct *mm) { VM_BUG_ON(atomic_read(&mm->mm_users)); @@ -1161,8 +1200,8 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) * not set up a proper pointer then tough luck. 
*/ put_user(0, tsk->clear_child_tid); - sys_futex(tsk->clear_child_tid, FUTEX_WAKE, - 1, NULL, NULL, 0); + do_futex(tsk->clear_child_tid, FUTEX_WAKE, + 1, NULL, NULL, 0, 0); } tsk->clear_child_tid = NULL; } @@ -1544,6 +1583,10 @@ static __latent_entropy struct task_struct *copy_process( int retval; struct task_struct *p; + /* + * Don't allow sharing the root directory with processes in a different + * namespace + */ if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); @@ -2019,6 +2062,8 @@ long _do_fork(unsigned long clone_flags, int __user *child_tidptr, unsigned long tls) { + struct completion vfork; + struct pid *pid; struct task_struct *p; int trace = 0; long nr; @@ -2044,43 +2089,40 @@ long _do_fork(unsigned long clone_flags, p = copy_process(clone_flags, stack_start, stack_size, child_tidptr, NULL, trace, tls, NUMA_NO_NODE); add_latent_entropy(); + + if (IS_ERR(p)) + return PTR_ERR(p); + /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. 
*/ - if (!IS_ERR(p)) { - struct completion vfork; - struct pid *pid; + trace_sched_process_fork(current, p); - trace_sched_process_fork(current, p); + pid = get_task_pid(p, PIDTYPE_PID); + nr = pid_vnr(pid); - pid = get_task_pid(p, PIDTYPE_PID); - nr = pid_vnr(pid); + if (clone_flags & CLONE_PARENT_SETTID) + put_user(nr, parent_tidptr); - if (clone_flags & CLONE_PARENT_SETTID) - put_user(nr, parent_tidptr); - - if (clone_flags & CLONE_VFORK) { - p->vfork_done = &vfork; - init_completion(&vfork); - get_task_struct(p); - } - - wake_up_new_task(p); + if (clone_flags & CLONE_VFORK) { + p->vfork_done = &vfork; + init_completion(&vfork); + get_task_struct(p); + } - /* forking complete and child started to run, tell ptracer */ - if (unlikely(trace)) - ptrace_event_pid(trace, pid); + wake_up_new_task(p); - if (clone_flags & CLONE_VFORK) { - if (!wait_for_vfork_done(p, &vfork)) - ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid); - } + /* forking complete and child started to run, tell ptracer */ + if (unlikely(trace)) + ptrace_event_pid(trace, pid); - put_pid(pid); - } else { - nr = PTR_ERR(p); + if (clone_flags & CLONE_VFORK) { + if (!wait_for_vfork_done(p, &vfork)) + ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid); } + + put_pid(pid); return nr; } @@ -2224,9 +2266,11 @@ void __init proc_caches_init(void) * maximum number of CPU's we can ever have. The cpumask_allocation * is at the end of the structure, exactly for that reason. */ - mm_cachep = kmem_cache_create("mm_struct", + mm_cachep = kmem_cache_create_usercopy("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, + offsetof(struct mm_struct, saved_auxv), + sizeof_field(struct mm_struct, saved_auxv), NULL); vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); mmap_init(); @@ -2312,7 +2356,7 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp * constructed. Here we are modifying the current, active, * task_struct. 
*/ -SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) +int ksys_unshare(unsigned long unshare_flags) { struct fs_struct *fs, *new_fs = NULL; struct files_struct *fd, *new_fd = NULL; @@ -2428,6 +2472,11 @@ bad_unshare_out: return err; } +SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) +{ + return ksys_unshare(unshare_flags); +} + /* * Helper to unshare the files of the current task. * We don't want to expose copy_files internals to diff --git a/kernel/futex.c b/kernel/futex.c index 7f719d110908..1f450e092c74 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -862,24 +862,6 @@ static void put_pi_state(struct futex_pi_state *pi_state) } } -/* - * Look up the task based on what TID userspace gave us. - * We dont trust it. - */ -static struct task_struct *futex_find_get_task(pid_t pid) -{ - struct task_struct *p; - - rcu_read_lock(); - p = find_task_by_vpid(pid); - if (p) - get_task_struct(p); - - rcu_read_unlock(); - - return p; -} - #ifdef CONFIG_FUTEX_PI /* @@ -1183,7 +1165,7 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key, */ if (!pid) return -ESRCH; - p = futex_find_get_task(pid); + p = find_get_task_by_vpid(pid); if (!p) return -ESRCH; diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 6fc87ccda1d7..c6766f326072 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -132,3 +132,9 @@ config GENERIC_IRQ_DEBUGFS If you don't know what to do here, say N. endmenu + +config GENERIC_IRQ_MULTI_HANDLER + depends on !MULTI_IRQ_HANDLER + bool + help + Allow to specify the low level IRQ handler at run time. 
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index a37a3b4b6342..f4f29b9d90ee 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -39,7 +39,7 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, } } -static cpumask_var_t *alloc_node_to_possible_cpumask(void) +static cpumask_var_t *alloc_node_to_cpumask(void) { cpumask_var_t *masks; int node; @@ -62,7 +62,7 @@ out_unwind: return NULL; } -static void free_node_to_possible_cpumask(cpumask_var_t *masks) +static void free_node_to_cpumask(cpumask_var_t *masks) { int node; @@ -71,7 +71,7 @@ static void free_node_to_possible_cpumask(cpumask_var_t *masks) kfree(masks); } -static void build_node_to_possible_cpumask(cpumask_var_t *masks) +static void build_node_to_cpumask(cpumask_var_t *masks) { int cpu; @@ -79,14 +79,14 @@ static void build_node_to_possible_cpumask(cpumask_var_t *masks) cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]); } -static int get_nodes_in_cpumask(cpumask_var_t *node_to_possible_cpumask, +static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, const struct cpumask *mask, nodemask_t *nodemsk) { int n, nodes = 0; /* Calculate the number of nodes in the supplied affinity mask */ for_each_node(n) { - if (cpumask_intersects(mask, node_to_possible_cpumask[n])) { + if (cpumask_intersects(mask, node_to_cpumask[n])) { node_set(n, *nodemsk); nodes++; } @@ -94,73 +94,46 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_possible_cpumask, return nodes; } -/** - * irq_create_affinity_masks - Create affinity masks for multiqueue spreading - * @nvecs: The total number of vectors - * @affd: Description of the affinity requirements - * - * Returns the masks pointer or NULL if allocation failed. 
- */ -struct cpumask * -irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) +static int irq_build_affinity_masks(const struct irq_affinity *affd, + int startvec, int numvecs, + cpumask_var_t *node_to_cpumask, + const struct cpumask *cpu_mask, + struct cpumask *nmsk, + struct cpumask *masks) { - int n, nodes, cpus_per_vec, extra_vecs, curvec; - int affv = nvecs - affd->pre_vectors - affd->post_vectors; - int last_affv = affv + affd->pre_vectors; + int n, nodes, cpus_per_vec, extra_vecs, done = 0; + int last_affv = affd->pre_vectors + numvecs; + int curvec = startvec; nodemask_t nodemsk = NODE_MASK_NONE; - struct cpumask *masks; - cpumask_var_t nmsk, *node_to_possible_cpumask; - - /* - * If there aren't any vectors left after applying the pre/post - * vectors don't bother with assigning affinity. - */ - if (!affv) - return NULL; - - if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) - return NULL; - - masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); - if (!masks) - goto out; - node_to_possible_cpumask = alloc_node_to_possible_cpumask(); - if (!node_to_possible_cpumask) - goto out; + if (!cpumask_weight(cpu_mask)) + return 0; - /* Fill out vectors at the beginning that don't need affinity */ - for (curvec = 0; curvec < affd->pre_vectors; curvec++) - cpumask_copy(masks + curvec, irq_default_affinity); - - /* Stabilize the cpumasks */ - get_online_cpus(); - build_node_to_possible_cpumask(node_to_possible_cpumask); - nodes = get_nodes_in_cpumask(node_to_possible_cpumask, cpu_possible_mask, - &nodemsk); + nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk); /* * If the number of nodes in the mask is greater than or equal the * number of vectors we just spread the vectors across the nodes. 
*/ - if (affv <= nodes) { + if (numvecs <= nodes) { for_each_node_mask(n, nodemsk) { - cpumask_copy(masks + curvec, - node_to_possible_cpumask[n]); - if (++curvec == last_affv) + cpumask_copy(masks + curvec, node_to_cpumask[n]); + if (++done == numvecs) break; + if (++curvec == last_affv) + curvec = affd->pre_vectors; } - goto done; + goto out; } for_each_node_mask(n, nodemsk) { int ncpus, v, vecs_to_assign, vecs_per_node; /* Spread the vectors per node */ - vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes; + vecs_per_node = (numvecs - (curvec - affd->pre_vectors)) / nodes; /* Get the cpus on this node which are in the mask */ - cpumask_and(nmsk, cpu_possible_mask, node_to_possible_cpumask[n]); + cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); /* Calculate the number of cpus per vector */ ncpus = cpumask_weight(nmsk); @@ -181,19 +154,96 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec); } - if (curvec >= last_affv) + done += v; + if (done >= numvecs) break; + if (curvec >= last_affv) + curvec = affd->pre_vectors; --nodes; } -done: +out: + return done; +} + +/** + * irq_create_affinity_masks - Create affinity masks for multiqueue spreading + * @nvecs: The total number of vectors + * @affd: Description of the affinity requirements + * + * Returns the masks pointer or NULL if allocation failed. + */ +struct cpumask * +irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) +{ + int affvecs = nvecs - affd->pre_vectors - affd->post_vectors; + int curvec, usedvecs; + cpumask_var_t nmsk, npresmsk, *node_to_cpumask; + struct cpumask *masks = NULL; + + /* + * If there aren't any vectors left after applying the pre/post + * vectors don't bother with assigning affinity. 
+ */ + if (nvecs == affd->pre_vectors + affd->post_vectors) + return NULL; + + if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) + return NULL; + + if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL)) + goto outcpumsk; + + node_to_cpumask = alloc_node_to_cpumask(); + if (!node_to_cpumask) + goto outnpresmsk; + + masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); + if (!masks) + goto outnodemsk; + + /* Fill out vectors at the beginning that don't need affinity */ + for (curvec = 0; curvec < affd->pre_vectors; curvec++) + cpumask_copy(masks + curvec, irq_default_affinity); + + /* Stabilize the cpumasks */ + get_online_cpus(); + build_node_to_cpumask(node_to_cpumask); + + /* Spread on present CPUs starting from affd->pre_vectors */ + usedvecs = irq_build_affinity_masks(affd, curvec, affvecs, + node_to_cpumask, cpu_present_mask, + nmsk, masks); + + /* + * Spread on non present CPUs starting from the next vector to be + * handled. If the spreading of present CPUs already exhausted the + * vector space, assign the non present CPUs to the already spread + * out vectors. 
+ */ + if (usedvecs >= affvecs) + curvec = affd->pre_vectors; + else + curvec = affd->pre_vectors + usedvecs; + cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); + usedvecs += irq_build_affinity_masks(affd, curvec, affvecs, + node_to_cpumask, npresmsk, + nmsk, masks); put_online_cpus(); /* Fill out vectors at the end that don't need affinity */ + if (usedvecs >= affvecs) + curvec = affd->pre_vectors + affvecs; + else + curvec = affd->pre_vectors + usedvecs; for (; curvec < nvecs; curvec++) cpumask_copy(masks + curvec, irq_default_affinity); - free_node_to_possible_cpumask(node_to_possible_cpumask); -out: + +outnodemsk: + free_node_to_cpumask(node_to_cpumask); +outnpresmsk: + free_cpumask_var(npresmsk); +outcpumsk: free_cpumask_var(nmsk); return masks; } diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 4e8089b319ae..16cbf6beb276 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/irq/autoprobe.c - * * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar * * This file contains the interrupt probing code and driver APIs. @@ -71,7 +69,7 @@ unsigned long probe_irq_on(void) raw_spin_lock_irq(&desc->lock); if (!desc->action && irq_settings_can_probe(desc)) { desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; - if (irq_startup(desc, IRQ_NORESEND, IRQ_START_FORCE)) + if (irq_activate_and_startup(desc, IRQ_NORESEND)) desc->istate |= IRQS_PENDING; } raw_spin_unlock_irq(&desc->lock); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 043bfc35b353..a2b3d9de999c 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1,13 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/irq/chip.c - * * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006, Thomas Gleixner, Russell King * - * This file contains the core interrupt handling code, for irq-chip - * based architectures. 
- * - * Detailed information is available in Documentation/core-api/genericirq.rst + * This file contains the core interrupt handling code, for irq-chip based + * architectures. Detailed information is available in + * Documentation/core-api/genericirq.rst */ #include <linux/irq.h> @@ -294,11 +292,11 @@ int irq_activate(struct irq_desc *desc) return 0; } -void irq_activate_and_startup(struct irq_desc *desc, bool resend) +int irq_activate_and_startup(struct irq_desc *desc, bool resend) { if (WARN_ON(irq_activate(desc))) - return; - irq_startup(desc, resend, IRQ_START_FORCE); + return 0; + return irq_startup(desc, resend, IRQ_START_FORCE); } static void __irq_disable(struct irq_desc *desc, bool mask); diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 9eb09aef0313..5b1072e394b2 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Generic cpu hotunplug interrupt migration code copied from the * arch/arm implementation diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h index e4d3819a91cc..8ccb326d2977 100644 --- a/kernel/irq/debug.h +++ b/kernel/irq/debug.h @@ -3,8 +3,6 @@ * Debugging printout: */ -#include <linux/kallsyms.h> - #define ___P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f) #define ___PS(f) if (desc->istate & f) printk("%14s set\n", #f) /* FIXME */ @@ -19,14 +17,14 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); - printk("->handle_irq(): %p, ", desc->handle_irq); - print_symbol("%s\n", (unsigned long)desc->handle_irq); - printk("->irq_data.chip(): %p, ", desc->irq_data.chip); - print_symbol("%s\n", (unsigned long)desc->irq_data.chip); + printk("->handle_irq(): %p, %pS\n", + desc->handle_irq, desc->handle_irq); + printk("->irq_data.chip(): %p, %pS\n", + desc->irq_data.chip, 
desc->irq_data.chip); printk("->action(): %p\n", desc->action); if (desc->action) { - printk("->action->handler(): %p, ", desc->action->handler); - print_symbol("%s\n", (unsigned long)desc->action->handler); + printk("->action->handler(): %p, %pS\n", + desc->action->handler, desc->action->handler); } ___P(IRQ_LEVEL); diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index acfaaef8672a..4dadeb3d6666 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -1,8 +1,6 @@ -/* - * Copyright 2017 Thomas Gleixner <tglx@linutronix.de> - * - * This file is licensed under the GPL V2. - */ +// SPDX-License-Identifier: GPL-2.0 +// Copyright 2017 Thomas Gleixner <tglx@linutronix.de> + #include <linux/irqdomain.h> #include <linux/irq.h> #include <linux/uaccess.h> diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 194c506d9d20..6a682c229e10 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/module.h> #include <linux/interrupt.h> #include <linux/device.h> diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c index 326a67f2410b..0b0cdf206dc4 100644 --- a/kernel/irq/dummychip.c +++ b/kernel/irq/dummychip.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006, Thomas Gleixner, Russell King diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 508c03dfef25..e2999a070a99 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Library implementing the most common irq chip callback functions * diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 79f987b942b8..38554bc35375 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -1,12 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/irq/handle.c - * * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 
2005-2006, Thomas Gleixner, Russell King * - * This file contains the core interrupt handling code. - * - * Detailed information is available in Documentation/core-api/genericirq.rst + * This file contains the core interrupt handling code. Detailed + * information is available in Documentation/core-api/genericirq.rst * */ @@ -20,6 +18,10 @@ #include "internals.h" +#ifdef CONFIG_GENERIC_IRQ_MULTI_HANDLER +void (*handle_arch_irq)(struct pt_regs *) __ro_after_init; +#endif + /** * handle_bad_irq - handle spurious and unhandled irqs * @desc: description of the interrupt @@ -207,3 +209,14 @@ irqreturn_t handle_irq_event(struct irq_desc *desc) irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); return ret; } + +#ifdef CONFIG_GENERIC_IRQ_MULTI_HANDLER +int __init set_handle_irq(void (*handle_irq)(struct pt_regs *)) +{ + if (handle_arch_irq) + return -EBUSY; + + handle_arch_irq = handle_irq; + return 0; +} +#endif diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index ab19371eab9b..ca6afa267070 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -76,7 +76,7 @@ extern void __enable_irq(struct irq_desc *desc); #define IRQ_START_COND false extern int irq_activate(struct irq_desc *desc); -extern void irq_activate_and_startup(struct irq_desc *desc, bool resend); +extern int irq_activate_and_startup(struct irq_desc *desc, bool resend); extern int irq_startup(struct irq_desc *desc, bool resend, bool force); extern void irq_shutdown(struct irq_desc *desc); diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 259a22aa9934..8b778e37dc6d 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -1,6 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/irq/ipi.c - * * Copyright (C) 2015 Imagination Technologies Ltd * Author: Qais Yousef <qais.yousef@imgtec.com> * diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index 24caabf1a0f7..fc4f361a86bb 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -1,3 +1,4 @@ +// 
SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Bartosz Golaszewski <brgl@bgdev.pl> * @@ -7,6 +8,7 @@ * option) any later version. */ +#include <linux/slab.h> #include <linux/irq_sim.h> #include <linux/irq.h> @@ -49,7 +51,8 @@ static void irq_sim_handle_irq(struct irq_work *work) * @sim: The interrupt simulator object to initialize. * @num_irqs: Number of interrupts to allocate * - * Returns 0 on success and a negative error number on failure. + * On success: return the base of the allocated interrupt range. + * On failure: a negative errno. */ int irq_sim_init(struct irq_sim *sim, unsigned int num_irqs) { @@ -78,7 +81,7 @@ int irq_sim_init(struct irq_sim *sim, unsigned int num_irqs) init_irq_work(&sim->work_ctx.work, irq_sim_handle_irq); sim->irq_count = num_irqs; - return 0; + return sim->irq_base; } EXPORT_SYMBOL_GPL(irq_sim_init); @@ -110,7 +113,8 @@ static void devm_irq_sim_release(struct device *dev, void *res) * @sim: The interrupt simulator object to initialize. * @num_irqs: Number of interrupts to allocate * - * Returns 0 on success and a negative error number on failure. + * On success: return the base of the allocated interrupt range. + * On failure: a negative errno. 
*/ int devm_irq_sim_init(struct device *dev, struct irq_sim *sim, unsigned int num_irqs) @@ -123,7 +127,7 @@ int devm_irq_sim_init(struct device *dev, struct irq_sim *sim, return -ENOMEM; rv = irq_sim_init(sim, num_irqs); - if (rv) { + if (rv < 0) { devres_free(dr); return rv; } @@ -131,7 +135,7 @@ int devm_irq_sim_init(struct device *dev, struct irq_sim *sim, dr->sim = sim; devres_add(dev, dr); - return 0; + return rv; } EXPORT_SYMBOL_GPL(devm_irq_sim_init); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 49b54e9979cc..afc7f902d74a 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -1,10 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006, Thomas Gleixner, Russell King * - * This file contains the interrupt descriptor management code - * - * Detailed information is available in Documentation/core-api/genericirq.rst + * This file contains the interrupt descriptor management code. Detailed + * information is available in Documentation/core-api/genericirq.rst * */ #include <linux/irq.h> @@ -210,6 +210,22 @@ static ssize_t type_show(struct kobject *kobj, } IRQ_ATTR_RO(type); +static ssize_t wakeup_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); + ssize_t ret = 0; + + raw_spin_lock_irq(&desc->lock); + ret = sprintf(buf, "%s\n", + irqd_is_wakeup_set(&desc->irq_data) ? 
"enabled" : "disabled"); + raw_spin_unlock_irq(&desc->lock); + + return ret; + +} +IRQ_ATTR_RO(wakeup); + static ssize_t name_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -253,6 +269,7 @@ static struct attribute *irq_attrs[] = { &chip_name_attr.attr, &hwirq_attr.attr, &type_attr.attr, + &wakeup_attr.attr, &name_attr.attr, &actions_attr.attr, NULL diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index e6a9c36470ee..5d9fc01b60a6 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 + #define pr_fmt(fmt) "irq: " fmt #include <linux/acpi.h> @@ -1726,25 +1728,14 @@ static int irq_domain_debug_show(struct seq_file *m, void *p) irq_domain_debug_show_one(m, d, 0); return 0; } - -static int irq_domain_debug_open(struct inode *inode, struct file *file) -{ - return single_open(file, irq_domain_debug_show, inode->i_private); -} - -static const struct file_operations dfs_domain_ops = { - .open = irq_domain_debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(irq_domain_debug); static void debugfs_add_domain_dir(struct irq_domain *d) { if (!d->name || !domain_dir || d->debugfs_file) return; d->debugfs_file = debugfs_create_file(d->name, 0444, domain_dir, d, - &dfs_domain_ops); + &irq_domain_debug_fops); } static void debugfs_remove_domain_dir(struct irq_domain *d) @@ -1760,7 +1751,8 @@ void __init irq_domain_debugfs_init(struct dentry *root) if (!domain_dir) return; - debugfs_create_file("default", 0444, domain_dir, NULL, &dfs_domain_ops); + debugfs_create_file("default", 0444, domain_dir, NULL, + &irq_domain_debug_fops); mutex_lock(&irq_domain_mutex); list_for_each_entry(d, &irq_domain_list, link) debugfs_add_domain_dir(d); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0f922729bab9..e3336d904f64 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1,6 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 
/* - * linux/kernel/irq/manage.c - * * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006 Thomas Gleixner * @@ -855,10 +854,14 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) * This code is triggered unconditionally. Check the affinity * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. */ - if (cpumask_available(desc->irq_common_data.affinity)) - cpumask_copy(mask, desc->irq_common_data.affinity); - else + if (cpumask_available(desc->irq_common_data.affinity)) { + const struct cpumask *m; + + m = irq_data_get_effective_affinity_mask(&desc->irq_data); + cpumask_copy(mask, m); + } else { valid = false; + } raw_spin_unlock_irq(&desc->lock); if (valid) @@ -1519,9 +1522,9 @@ EXPORT_SYMBOL_GPL(setup_irq); * Internal function to unregister an irqaction - used to free * regular and special interrupts that are part of the architecture. */ -static struct irqaction *__free_irq(unsigned int irq, void *dev_id) +static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) { - struct irq_desc *desc = irq_to_desc(irq); + unsigned irq = desc->irq_data.irq; struct irqaction *action, **action_ptr; unsigned long flags; @@ -1651,7 +1654,7 @@ void remove_irq(unsigned int irq, struct irqaction *act) struct irq_desc *desc = irq_to_desc(irq); if (desc && !WARN_ON(irq_settings_is_per_cpu_devid(desc))) - __free_irq(irq, act->dev_id); + __free_irq(desc, act->dev_id); } EXPORT_SYMBOL_GPL(remove_irq); @@ -1685,7 +1688,7 @@ const void *free_irq(unsigned int irq, void *dev_id) desc->affinity_notify = NULL; #endif - action = __free_irq(irq, dev_id); + action = __free_irq(desc, dev_id); if (!action) return NULL; diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index 5187dfe809ac..5092494bf261 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -1,8 +1,6 @@ -/* - * Copyright (C) 2017 Thomas Gleixner <tglx@linutronix.de> - * - * SPDX-License-Identifier: GPL-2.0 - */ +// SPDX-License-Identifier: 
GPL-2.0 +// Copyright (C) 2017 Thomas Gleixner <tglx@linutronix.de> + #include <linux/spinlock.h> #include <linux/seq_file.h> #include <linux/bitmap.h> @@ -16,6 +14,7 @@ struct cpumap { unsigned int available; unsigned int allocated; unsigned int managed; + bool initialized; bool online; unsigned long alloc_map[IRQ_MATRIX_SIZE]; unsigned long managed_map[IRQ_MATRIX_SIZE]; @@ -81,9 +80,11 @@ void irq_matrix_online(struct irq_matrix *m) BUG_ON(cm->online); - bitmap_zero(cm->alloc_map, m->matrix_bits); - cm->available = m->alloc_size - (cm->managed + m->systembits_inalloc); - cm->allocated = 0; + if (!cm->initialized) { + cm->available = m->alloc_size; + cm->available -= cm->managed + m->systembits_inalloc; + cm->initialized = true; + } m->global_available += cm->available; cm->online = true; m->online_maps++; @@ -370,14 +371,16 @@ void irq_matrix_free(struct irq_matrix *m, unsigned int cpu, if (WARN_ON_ONCE(bit < m->alloc_start || bit >= m->alloc_end)) return; - if (cm->online) { - clear_bit(bit, cm->alloc_map); - cm->allocated--; + clear_bit(bit, cm->alloc_map); + cm->allocated--; + + if (cm->online) m->total_allocated--; - if (!managed) { - cm->available++; + + if (!managed) { + cm->available++; + if (cm->online) m->global_available++; - } } trace_irq_matrix_free(bit, cpu, m, cm); } diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 2f3c4f5382cc..2a8571f72b17 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -1,6 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/irq/msi.c - * * Copyright (C) 2014 Intel Corp. * Author: Jiang Liu <jiang.liu@linux.intel.com> * diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 6bd9b58429cc..d6961d3c6f9e 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -1,6 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/irq/pm.c - * * Copyright (C) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. * * This file contains power management functions related to interrupts. 
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index e8f374971e37..7cb091d81d91 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/irq/proc.c - * * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar * * This file contains the /proc/irq/ handling code. diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 1d08f45135c2..95414ad3506a 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/irq/resend.c - * * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006, Thomas Gleixner * diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index ef2a47e0eab6..d867d6ddafdd 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/irq/spurious.c - * * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar * * This file contains spurious interrupt handling. @@ -10,7 +8,6 @@ #include <linux/jiffies.h> #include <linux/irq.h> #include <linux/module.h> -#include <linux/kallsyms.h> #include <linux/interrupt.h> #include <linux/moduleparam.h> #include <linux/timer.h> diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index e0923fa4927a..1e4cb63a5c82 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -1,13 +1,6 @@ -/* - * linux/kernel/irq/timings.c - * - * Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - */ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org> + #include <linux/kernel.h> #include <linux/percpu.h> #include <linux/slab.h> diff --git a/kernel/jump_label.c b/kernel/jump_label.c index b4517095db6a..01ebdf1f9f40 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -16,6 +16,7 @@ #include <linux/jump_label_ratelimit.h> #include <linux/bug.h> #include <linux/cpu.h> +#include <asm/sections.h> #ifdef HAVE_JUMP_LABEL @@ -366,12 +367,16 @@ static void __jump_label_update(struct static_key *key, { for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) { /* - * entry->code set to 0 invalidates module init text sections - * kernel_text_address() verifies we are not in core kernel - * init code, see jump_label_invalidate_module_init(). + * An entry->code of 0 indicates an entry which has been + * disabled because it was in an init text area. */ - if (entry->code && kernel_text_address(entry->code)) - arch_jump_label_transform(entry, jump_label_type(entry)); + if (entry->code) { + if (kernel_text_address(entry->code)) + arch_jump_label_transform(entry, jump_label_type(entry)); + else + WARN_ONCE(1, "can't patch jump_label at %pS", + (void *)(unsigned long)entry->code); + } } } @@ -417,6 +422,19 @@ void __init jump_label_init(void) cpus_read_unlock(); } +/* Disable any jump label entries in __init/__exit code */ +void __init jump_label_invalidate_initmem(void) +{ + struct jump_entry *iter_start = __start___jump_table; + struct jump_entry *iter_stop = __stop___jump_table; + struct jump_entry *iter; + + for (iter = iter_start; iter < iter_stop; iter++) { + if (init_section_contains((void *)(unsigned long)iter->code, 1)) + iter->code = 0; + } +} + #ifdef CONFIG_MODULES static enum jump_label_type jump_label_init_type(struct jump_entry *entry) @@ -633,6 +651,7 @@ static void jump_label_del_module(struct module *mod) } } +/* Disable any jump label entries in module init code */ 
static void jump_label_invalidate_module_init(struct module *mod) { struct jump_entry *iter_start = mod->jump_entries; diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index d5fa4116688a..a23e21ada81b 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -12,7 +12,6 @@ * compression (see scripts/kallsyms.c for a more complete description) */ #include <linux/kallsyms.h> -#include <linux/module.h> #include <linux/init.h> #include <linux/seq_file.h> #include <linux/fs.h> @@ -20,15 +19,12 @@ #include <linux/err.h> #include <linux/proc_fs.h> #include <linux/sched.h> /* for cond_resched */ -#include <linux/mm.h> #include <linux/ctype.h> #include <linux/slab.h> #include <linux/filter.h> #include <linux/ftrace.h> #include <linux/compiler.h> -#include <asm/sections.h> - /* * These will be re-linked against their real values * during the second link stage. @@ -52,37 +48,6 @@ extern const u16 kallsyms_token_index[] __weak; extern const unsigned long kallsyms_markers[] __weak; -static inline int is_kernel_inittext(unsigned long addr) -{ - if (addr >= (unsigned long)_sinittext - && addr <= (unsigned long)_einittext) - return 1; - return 0; -} - -static inline int is_kernel_text(unsigned long addr) -{ - if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || - arch_is_kernel_text(addr)) - return 1; - return in_gate_area_no_mm(addr); -} - -static inline int is_kernel(unsigned long addr) -{ - if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) - return 1; - return in_gate_area_no_mm(addr); -} - -static int is_ksym_addr(unsigned long addr) -{ - if (IS_ENABLED(CONFIG_KALLSYMS_ALL)) - return is_kernel(addr); - - return is_kernel_text(addr) || is_kernel_inittext(addr); -} - /* * Expand a compressed symbol data into the resulting uncompressed string, * if uncompressed string is too long (>= maxlen), it will be truncated, @@ -464,17 +429,6 @@ int sprint_backtrace(char *buffer, unsigned long address) return __sprint_symbol(buffer, address, -1, 1); } 
-/* Look up a kernel symbol and print it to the kernel messages. */ -void __print_symbol(const char *fmt, unsigned long address) -{ - char buffer[KSYM_SYMBOL_LEN]; - - sprint_symbol(buffer, address); - - printk(fmt, buffer); -} -EXPORT_SYMBOL(__print_symbol); - /* To avoid using get_symbol_offset for every symbol, we carry prefix along. */ struct kallsym_iter { loff_t pos; diff --git a/kernel/kcov.c b/kernel/kcov.c index 7594c033d98a..2c16f1ab5e10 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -358,7 +358,8 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, */ if (kcov->mode != KCOV_MODE_INIT || !kcov->area) return -EINVAL; - if (kcov->t != NULL) + t = current; + if (kcov->t != NULL || t->kcov != NULL) return -EBUSY; if (arg == KCOV_TRACE_PC) kcov->mode = KCOV_MODE_TRACE_PC; @@ -370,7 +371,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, #endif else return -EINVAL; - t = current; /* Cache in task struct for performance. */ t->kcov_size = kcov->size; t->kcov_area = kcov->area; diff --git a/kernel/kexec.c b/kernel/kexec.c index e62ec4dc6620..aed8fb2564b3 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -192,11 +192,9 @@ out: * that to happen you need to do that yourself. */ -SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, - struct kexec_segment __user *, segments, unsigned long, flags) +static inline int kexec_load_check(unsigned long nr_segments, + unsigned long flags) { - int result; - /* We only trust the superuser with rebooting the system. 
*/ if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) return -EPERM; @@ -208,17 +206,29 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK)) return -EINVAL; - /* Verify we are on the appropriate architecture */ - if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) && - ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) - return -EINVAL; - /* Put an artificial cap on the number * of segments passed to kexec_load. */ if (nr_segments > KEXEC_SEGMENT_MAX) return -EINVAL; + return 0; +} + +SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, + struct kexec_segment __user *, segments, unsigned long, flags) +{ + int result; + + result = kexec_load_check(nr_segments, flags); + if (result) + return result; + + /* Verify we are on the appropriate architecture */ + if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) && + ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) + return -EINVAL; + /* Because we write directly to the reserved memory * region when loading crash kernels we need a mutex here to * prevent multiple crash kernels from attempting to load @@ -247,15 +257,16 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, struct kexec_segment out, __user *ksegments; unsigned long i, result; + result = kexec_load_check(nr_segments, flags); + if (result) + return result; + /* Don't allow clients that don't understand the native * architecture to do anything. 
*/ if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) return -EINVAL; - if (nr_segments > KEXEC_SEGMENT_MAX) - return -EINVAL; - ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); for (i = 0; i < nr_segments; i++) { result = copy_from_user(&in, &segments[i], sizeof(in)); @@ -272,6 +283,21 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, return -EFAULT; } - return sys_kexec_load(entry, nr_segments, ksegments, flags); + /* Because we write directly to the reserved memory + * region when loading crash kernels we need a mutex here to + * prevent multiple crash kernels from attempting to load + * simultaneously, and to prevent a crash kernel from loading + * over the top of a in use crash kernel. + * + * KISS: always take the mutex. + */ + if (!mutex_trylock(&kexec_mutex)) + return -EBUSY; + + result = do_kexec_load(entry, nr_segments, ksegments, flags); + + mutex_unlock(&kexec_mutex); + + return result; } #endif diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index e5bcd94c1efb..75d8e7cf040e 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -22,50 +22,123 @@ #include <linux/ima.h> #include <crypto/hash.h> #include <crypto/sha.h> +#include <linux/elf.h> +#include <linux/elfcore.h> +#include <linux/kernel.h> +#include <linux/kexec.h> +#include <linux/slab.h> #include <linux/syscalls.h> #include <linux/vmalloc.h> #include "kexec_internal.h" static int kexec_calculate_store_digests(struct kimage *image); +/* + * Currently this is the only default function that is exported as some + * architectures need it to do additional handlings. + * In the future, other default functions may be exported too if required. 
+ */ +int kexec_image_probe_default(struct kimage *image, void *buf, + unsigned long buf_len) +{ + const struct kexec_file_ops * const *fops; + int ret = -ENOEXEC; + + for (fops = &kexec_file_loaders[0]; *fops && (*fops)->probe; ++fops) { + ret = (*fops)->probe(buf, buf_len); + if (!ret) { + image->fops = *fops; + return ret; + } + } + + return ret; +} + /* Architectures can provide this probe function */ int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, unsigned long buf_len) { - return -ENOEXEC; + return kexec_image_probe_default(image, buf, buf_len); +} + +static void *kexec_image_load_default(struct kimage *image) +{ + if (!image->fops || !image->fops->load) + return ERR_PTR(-ENOEXEC); + + return image->fops->load(image, image->kernel_buf, + image->kernel_buf_len, image->initrd_buf, + image->initrd_buf_len, image->cmdline_buf, + image->cmdline_buf_len); } void * __weak arch_kexec_kernel_image_load(struct kimage *image) { - return ERR_PTR(-ENOEXEC); + return kexec_image_load_default(image); +} + +static int kexec_image_post_load_cleanup_default(struct kimage *image) +{ + if (!image->fops || !image->fops->cleanup) + return 0; + + return image->fops->cleanup(image->image_loader_data); } int __weak arch_kimage_file_post_load_cleanup(struct kimage *image) { - return -EINVAL; + return kexec_image_post_load_cleanup_default(image); } #ifdef CONFIG_KEXEC_VERIFY_SIG +static int kexec_image_verify_sig_default(struct kimage *image, void *buf, + unsigned long buf_len) +{ + if (!image->fops || !image->fops->verify_sig) { + pr_debug("kernel loader does not support signature verification.\n"); + return -EKEYREJECTED; + } + + return image->fops->verify_sig(buf, buf_len); +} + int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, unsigned long buf_len) { - return -EKEYREJECTED; + return kexec_image_verify_sig_default(image, buf, buf_len); } #endif -/* Apply relocations of type RELA */ +/* + * arch_kexec_apply_relocations_add - apply 
relocations of type RELA + * @pi: Purgatory to be relocated. + * @section: Section relocations applying to. + * @relsec: Section containing RELAs. + * @symtab: Corresponding symtab. + * + * Return: 0 on success, negative errno on error. + */ int __weak -arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, - unsigned int relsec) +arch_kexec_apply_relocations_add(struct purgatory_info *pi, Elf_Shdr *section, + const Elf_Shdr *relsec, const Elf_Shdr *symtab) { pr_err("RELA relocation unsupported.\n"); return -ENOEXEC; } -/* Apply relocations of type REL */ +/* + * arch_kexec_apply_relocations - apply relocations of type REL + * @pi: Purgatory to be relocated. + * @section: Section relocations applying to. + * @relsec: Section containing RELs. + * @symtab: Corresponding symtab. + * + * Return: 0 on success, negative errno on error. + */ int __weak -arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, - unsigned int relsec) +arch_kexec_apply_relocations(struct purgatory_info *pi, Elf_Shdr *section, + const Elf_Shdr *relsec, const Elf_Shdr *symtab) { pr_err("REL relocation unsupported.\n"); return -ENOEXEC; @@ -532,6 +605,9 @@ static int kexec_calculate_store_digests(struct kimage *image) struct kexec_sha_region *sha_regions; struct purgatory_info *pi = &image->purgatory_info; + if (!IS_ENABLED(CONFIG_ARCH_HAS_KEXEC_PURGATORY)) + return 0; + zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT); zero_buf_sz = PAGE_SIZE; @@ -633,87 +709,29 @@ out: return ret; } -/* Actually load purgatory. Lot of code taken from kexec-tools */ -static int __kexec_load_purgatory(struct kimage *image, unsigned long min, - unsigned long max, int top_down) +#ifdef CONFIG_ARCH_HAS_KEXEC_PURGATORY +/* + * kexec_purgatory_setup_kbuf - prepare buffer to load purgatory. + * @pi: Purgatory to be loaded. + * @kbuf: Buffer to setup. + * + * Allocates the memory needed for the buffer. Caller is responsible to free + * the memory after use. 
+ * + * Return: 0 on success, negative errno on error. + */ +static int kexec_purgatory_setup_kbuf(struct purgatory_info *pi, + struct kexec_buf *kbuf) { - struct purgatory_info *pi = &image->purgatory_info; - unsigned long align, bss_align, bss_sz, bss_pad; - unsigned long entry, load_addr, curr_load_addr, bss_addr, offset; - unsigned char *buf_addr, *src; - int i, ret = 0, entry_sidx = -1; - const Elf_Shdr *sechdrs_c; - Elf_Shdr *sechdrs = NULL; - struct kexec_buf kbuf = { .image = image, .bufsz = 0, .buf_align = 1, - .buf_min = min, .buf_max = max, - .top_down = top_down }; - - /* - * sechdrs_c points to section headers in purgatory and are read - * only. No modifications allowed. - */ - sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff; - - /* - * We can not modify sechdrs_c[] and its fields. It is read only. - * Copy it over to a local copy where one can store some temporary - * data and free it at the end. We need to modify ->sh_addr and - * ->sh_offset fields to keep track of permanent and temporary - * locations of sections. - */ - sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); - if (!sechdrs) - return -ENOMEM; - - memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr)); - - /* - * We seem to have multiple copies of sections. First copy is which - * is embedded in kernel in read only section. Some of these sections - * will be copied to a temporary buffer and relocated. And these - * sections will finally be copied to their final destination at - * segment load time. - * - * Use ->sh_offset to reflect section address in memory. It will - * point to original read only copy if section is not allocatable. - * Otherwise it will point to temporary copy which will be relocated. - * - * Use ->sh_addr to contain final address of the section where it - * will go during execution time. 
- */ - for (i = 0; i < pi->ehdr->e_shnum; i++) { - if (sechdrs[i].sh_type == SHT_NOBITS) - continue; - - sechdrs[i].sh_offset = (unsigned long)pi->ehdr + - sechdrs[i].sh_offset; - } - - /* - * Identify entry point section and make entry relative to section - * start. - */ - entry = pi->ehdr->e_entry; - for (i = 0; i < pi->ehdr->e_shnum; i++) { - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) - continue; - - if (!(sechdrs[i].sh_flags & SHF_EXECINSTR)) - continue; - - /* Make entry section relative */ - if (sechdrs[i].sh_addr <= pi->ehdr->e_entry && - ((sechdrs[i].sh_addr + sechdrs[i].sh_size) > - pi->ehdr->e_entry)) { - entry_sidx = i; - entry -= sechdrs[i].sh_addr; - break; - } - } + const Elf_Shdr *sechdrs; + unsigned long bss_align; + unsigned long bss_sz; + unsigned long align; + int i, ret; - /* Determine how much memory is needed to load relocatable object. */ - bss_align = 1; - bss_sz = 0; + sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff; + kbuf->buf_align = bss_align = 1; + kbuf->bufsz = bss_sz = 0; for (i = 0; i < pi->ehdr->e_shnum; i++) { if (!(sechdrs[i].sh_flags & SHF_ALLOC)) @@ -721,111 +739,124 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min, align = sechdrs[i].sh_addralign; if (sechdrs[i].sh_type != SHT_NOBITS) { - if (kbuf.buf_align < align) - kbuf.buf_align = align; - kbuf.bufsz = ALIGN(kbuf.bufsz, align); - kbuf.bufsz += sechdrs[i].sh_size; + if (kbuf->buf_align < align) + kbuf->buf_align = align; + kbuf->bufsz = ALIGN(kbuf->bufsz, align); + kbuf->bufsz += sechdrs[i].sh_size; } else { - /* bss section */ if (bss_align < align) bss_align = align; bss_sz = ALIGN(bss_sz, align); bss_sz += sechdrs[i].sh_size; } } + kbuf->bufsz = ALIGN(kbuf->bufsz, bss_align); + kbuf->memsz = kbuf->bufsz + bss_sz; + if (kbuf->buf_align < bss_align) + kbuf->buf_align = bss_align; - /* Determine the bss padding required to align bss properly */ - bss_pad = 0; - if (kbuf.bufsz & (bss_align - 1)) - bss_pad = bss_align - (kbuf.bufsz & (bss_align - 1)); 
- - kbuf.memsz = kbuf.bufsz + bss_pad + bss_sz; + kbuf->buffer = vzalloc(kbuf->bufsz); + if (!kbuf->buffer) + return -ENOMEM; + pi->purgatory_buf = kbuf->buffer; - /* Allocate buffer for purgatory */ - kbuf.buffer = vzalloc(kbuf.bufsz); - if (!kbuf.buffer) { - ret = -ENOMEM; + ret = kexec_add_buffer(kbuf); + if (ret) goto out; - } - if (kbuf.buf_align < bss_align) - kbuf.buf_align = bss_align; + return 0; +out: + vfree(pi->purgatory_buf); + pi->purgatory_buf = NULL; + return ret; +} - /* Add buffer to segment list */ - ret = kexec_add_buffer(&kbuf); - if (ret) - goto out; - pi->purgatory_load_addr = kbuf.mem; +/* + * kexec_purgatory_setup_sechdrs - prepares the pi->sechdrs buffer. + * @pi: Purgatory to be loaded. + * @kbuf: Buffer prepared to store purgatory. + * + * Allocates the memory needed for the buffer. Caller is responsible to free + * the memory after use. + * + * Return: 0 on success, negative errno on error. + */ +static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, + struct kexec_buf *kbuf) +{ + unsigned long bss_addr; + unsigned long offset; + Elf_Shdr *sechdrs; + int i; + + /* + * The section headers in kexec_purgatory are read-only. In order to + * have them modifiable make a temporary copy. 
+ */ + sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + if (!sechdrs) + return -ENOMEM; + memcpy(sechdrs, (void *)pi->ehdr + pi->ehdr->e_shoff, + pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + pi->sechdrs = sechdrs; - /* Load SHF_ALLOC sections */ - buf_addr = kbuf.buffer; - load_addr = curr_load_addr = pi->purgatory_load_addr; - bss_addr = load_addr + kbuf.bufsz + bss_pad; + offset = 0; + bss_addr = kbuf->mem + kbuf->bufsz; + kbuf->image->start = pi->ehdr->e_entry; for (i = 0; i < pi->ehdr->e_shnum; i++) { + unsigned long align; + void *src, *dst; + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) continue; align = sechdrs[i].sh_addralign; - if (sechdrs[i].sh_type != SHT_NOBITS) { - curr_load_addr = ALIGN(curr_load_addr, align); - offset = curr_load_addr - load_addr; - /* We already modifed ->sh_offset to keep src addr */ - src = (char *) sechdrs[i].sh_offset; - memcpy(buf_addr + offset, src, sechdrs[i].sh_size); - - /* Store load address and source address of section */ - sechdrs[i].sh_addr = curr_load_addr; - - /* - * This section got copied to temporary buffer. Update - * ->sh_offset accordingly. 
- */ - sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset); - - /* Advance to the next address */ - curr_load_addr += sechdrs[i].sh_size; - } else { + if (sechdrs[i].sh_type == SHT_NOBITS) { bss_addr = ALIGN(bss_addr, align); sechdrs[i].sh_addr = bss_addr; bss_addr += sechdrs[i].sh_size; + continue; } - } - /* Update entry point based on load address of text section */ - if (entry_sidx >= 0) - entry += sechdrs[entry_sidx].sh_addr; + offset = ALIGN(offset, align); + if (sechdrs[i].sh_flags & SHF_EXECINSTR && + pi->ehdr->e_entry >= sechdrs[i].sh_addr && + pi->ehdr->e_entry < (sechdrs[i].sh_addr + + sechdrs[i].sh_size)) { + kbuf->image->start -= sechdrs[i].sh_addr; + kbuf->image->start += kbuf->mem + offset; + } - /* Make kernel jump to purgatory after shutdown */ - image->start = entry; + src = (void *)pi->ehdr + sechdrs[i].sh_offset; + dst = pi->purgatory_buf + offset; + memcpy(dst, src, sechdrs[i].sh_size); - /* Used later to get/set symbol values */ - pi->sechdrs = sechdrs; + sechdrs[i].sh_addr = kbuf->mem + offset; + sechdrs[i].sh_offset = offset; + offset += sechdrs[i].sh_size; + } - /* - * Used later to identify which section is purgatory and skip it - * from checksumming. 
- */ - pi->purgatory_buf = kbuf.buffer; - return ret; -out: - vfree(sechdrs); - vfree(kbuf.buffer); - return ret; + return 0; } static int kexec_apply_relocations(struct kimage *image) { int i, ret; struct purgatory_info *pi = &image->purgatory_info; - Elf_Shdr *sechdrs = pi->sechdrs; + const Elf_Shdr *sechdrs; + + sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff; - /* Apply relocations */ for (i = 0; i < pi->ehdr->e_shnum; i++) { - Elf_Shdr *section, *symtab; + const Elf_Shdr *relsec; + const Elf_Shdr *symtab; + Elf_Shdr *section; + + relsec = sechdrs + i; - if (sechdrs[i].sh_type != SHT_RELA && - sechdrs[i].sh_type != SHT_REL) + if (relsec->sh_type != SHT_RELA && + relsec->sh_type != SHT_REL) continue; /* @@ -834,12 +865,12 @@ static int kexec_apply_relocations(struct kimage *image) * symbol table. And ->sh_info contains section header * index of section to which relocations apply. */ - if (sechdrs[i].sh_info >= pi->ehdr->e_shnum || - sechdrs[i].sh_link >= pi->ehdr->e_shnum) + if (relsec->sh_info >= pi->ehdr->e_shnum || + relsec->sh_link >= pi->ehdr->e_shnum) return -ENOEXEC; - section = &sechdrs[sechdrs[i].sh_info]; - symtab = &sechdrs[sechdrs[i].sh_link]; + section = pi->sechdrs + relsec->sh_info; + symtab = sechdrs + relsec->sh_link; if (!(section->sh_flags & SHF_ALLOC)) continue; @@ -856,12 +887,12 @@ static int kexec_apply_relocations(struct kimage *image) * Respective architecture needs to provide support for applying * relocations of type SHT_RELA/SHT_REL. 
*/ - if (sechdrs[i].sh_type == SHT_RELA) - ret = arch_kexec_apply_relocations_add(pi->ehdr, - sechdrs, i); - else if (sechdrs[i].sh_type == SHT_REL) - ret = arch_kexec_apply_relocations(pi->ehdr, - sechdrs, i); + if (relsec->sh_type == SHT_RELA) + ret = arch_kexec_apply_relocations_add(pi, section, + relsec, symtab); + else if (relsec->sh_type == SHT_REL) + ret = arch_kexec_apply_relocations(pi, section, + relsec, symtab); if (ret) return ret; } @@ -869,10 +900,18 @@ static int kexec_apply_relocations(struct kimage *image) return 0; } -/* Load relocatable purgatory object and relocate it appropriately */ -int kexec_load_purgatory(struct kimage *image, unsigned long min, - unsigned long max, int top_down, - unsigned long *load_addr) +/* + * kexec_load_purgatory - Load and relocate the purgatory object. + * @image: Image to add the purgatory to. + * @kbuf: Memory parameters to use. + * + * Allocates the memory needed for image->purgatory_info.sechdrs and + * image->purgatory_info.purgatory_buf/kbuf->buffer. Caller is responsible + * to free the memory after use. + * + * Return: 0 on success, negative errno on error. 
+ */ +int kexec_load_purgatory(struct kimage *image, struct kexec_buf *kbuf) { struct purgatory_info *pi = &image->purgatory_info; int ret; @@ -880,55 +919,51 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min, if (kexec_purgatory_size <= 0) return -EINVAL; - if (kexec_purgatory_size < sizeof(Elf_Ehdr)) - return -ENOEXEC; - - pi->ehdr = (Elf_Ehdr *)kexec_purgatory; - - if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0 - || pi->ehdr->e_type != ET_REL - || !elf_check_arch(pi->ehdr) - || pi->ehdr->e_shentsize != sizeof(Elf_Shdr)) - return -ENOEXEC; - - if (pi->ehdr->e_shoff >= kexec_purgatory_size - || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) > - kexec_purgatory_size - pi->ehdr->e_shoff)) - return -ENOEXEC; + pi->ehdr = (const Elf_Ehdr *)kexec_purgatory; - ret = __kexec_load_purgatory(image, min, max, top_down); + ret = kexec_purgatory_setup_kbuf(pi, kbuf); if (ret) return ret; + ret = kexec_purgatory_setup_sechdrs(pi, kbuf); + if (ret) + goto out_free_kbuf; + ret = kexec_apply_relocations(image); if (ret) goto out; - *load_addr = pi->purgatory_load_addr; return 0; out: vfree(pi->sechdrs); pi->sechdrs = NULL; - +out_free_kbuf: vfree(pi->purgatory_buf); pi->purgatory_buf = NULL; return ret; } -static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, - const char *name) +/* + * kexec_purgatory_find_symbol - find a symbol in the purgatory + * @pi: Purgatory to search in. + * @name: Name of the symbol. + * + * Return: pointer to symbol in read-only symtab on success, NULL on error. 
+ */ +static const Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, + const char *name) { - Elf_Sym *syms; - Elf_Shdr *sechdrs; - Elf_Ehdr *ehdr; - int i, k; + const Elf_Shdr *sechdrs; + const Elf_Ehdr *ehdr; + const Elf_Sym *syms; const char *strtab; + int i, k; - if (!pi->sechdrs || !pi->ehdr) + if (!pi->ehdr) return NULL; - sechdrs = pi->sechdrs; ehdr = pi->ehdr; + sechdrs = (void *)ehdr + ehdr->e_shoff; for (i = 0; i < ehdr->e_shnum; i++) { if (sechdrs[i].sh_type != SHT_SYMTAB) @@ -937,8 +972,8 @@ static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, if (sechdrs[i].sh_link >= ehdr->e_shnum) /* Invalid strtab section number */ continue; - strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset; - syms = (Elf_Sym *)sechdrs[i].sh_offset; + strtab = (void *)ehdr + sechdrs[sechdrs[i].sh_link].sh_offset; + syms = (void *)ehdr + sechdrs[i].sh_offset; /* Go through symbols for a match */ for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) { @@ -966,7 +1001,7 @@ static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) { struct purgatory_info *pi = &image->purgatory_info; - Elf_Sym *sym; + const Elf_Sym *sym; Elf_Shdr *sechdr; sym = kexec_purgatory_find_symbol(pi, name); @@ -989,9 +1024,9 @@ void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, void *buf, unsigned int size, bool get_value) { - Elf_Sym *sym; - Elf_Shdr *sechdrs; struct purgatory_info *pi = &image->purgatory_info; + const Elf_Sym *sym; + Elf_Shdr *sec; char *sym_buf; sym = kexec_purgatory_find_symbol(pi, name); @@ -1004,16 +1039,15 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, return -EINVAL; } - sechdrs = pi->sechdrs; + sec = pi->sechdrs + sym->st_shndx; - if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { + if (sec->sh_type == SHT_NOBITS) { 
pr_err("symbol %s is in a bss section. Cannot %s\n", name, get_value ? "get" : "set"); return -EINVAL; } - sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset + - sym->st_value; + sym_buf = (char *)pi->purgatory_buf + sec->sh_offset + sym->st_value; if (get_value) memcpy((void *)buf, sym_buf, size); @@ -1022,3 +1056,174 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, return 0; } +#endif /* CONFIG_ARCH_HAS_KEXEC_PURGATORY */ + +int crash_exclude_mem_range(struct crash_mem *mem, + unsigned long long mstart, unsigned long long mend) +{ + int i, j; + unsigned long long start, end; + struct crash_mem_range temp_range = {0, 0}; + + for (i = 0; i < mem->nr_ranges; i++) { + start = mem->ranges[i].start; + end = mem->ranges[i].end; + + if (mstart > end || mend < start) + continue; + + /* Truncate any area outside of range */ + if (mstart < start) + mstart = start; + if (mend > end) + mend = end; + + /* Found completely overlapping range */ + if (mstart == start && mend == end) { + mem->ranges[i].start = 0; + mem->ranges[i].end = 0; + if (i < mem->nr_ranges - 1) { + /* Shift rest of the ranges to left */ + for (j = i; j < mem->nr_ranges - 1; j++) { + mem->ranges[j].start = + mem->ranges[j+1].start; + mem->ranges[j].end = + mem->ranges[j+1].end; + } + } + mem->nr_ranges--; + return 0; + } + + if (mstart > start && mend < end) { + /* Split original range */ + mem->ranges[i].end = mstart - 1; + temp_range.start = mend + 1; + temp_range.end = end; + } else if (mstart != start) + mem->ranges[i].end = mstart - 1; + else + mem->ranges[i].start = mend + 1; + break; + } + + /* If a split happened, add the split to array */ + if (!temp_range.end) + return 0; + + /* Split happened */ + if (i == mem->max_nr_ranges - 1) + return -ENOMEM; + + /* Location where new range should go */ + j = i + 1; + if (j < mem->nr_ranges) { + /* Move over all ranges one slot towards the end */ + for (i = mem->nr_ranges - 1; i >= j; i--) + mem->ranges[i + 1] = 
mem->ranges[i]; + } + + mem->ranges[j].start = temp_range.start; + mem->ranges[j].end = temp_range.end; + mem->nr_ranges++; + return 0; +} + +int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, + void **addr, unsigned long *sz) +{ + Elf64_Ehdr *ehdr; + Elf64_Phdr *phdr; + unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz; + unsigned char *buf; + unsigned int cpu, i; + unsigned long long notes_addr; + unsigned long mstart, mend; + + /* extra phdr for vmcoreinfo elf note */ + nr_phdr = nr_cpus + 1; + nr_phdr += mem->nr_ranges; + + /* + * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping + * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64). + * I think this is required by tools like gdb. So same physical + * memory will be mapped in two elf headers. One will contain kernel + * text virtual addresses and other will have __va(physical) addresses. + */ + + nr_phdr++; + elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr); + elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN); + + buf = vzalloc(elf_sz); + if (!buf) + return -ENOMEM; + + ehdr = (Elf64_Ehdr *)buf; + phdr = (Elf64_Phdr *)(ehdr + 1); + memcpy(ehdr->e_ident, ELFMAG, SELFMAG); + ehdr->e_ident[EI_CLASS] = ELFCLASS64; + ehdr->e_ident[EI_DATA] = ELFDATA2LSB; + ehdr->e_ident[EI_VERSION] = EV_CURRENT; + ehdr->e_ident[EI_OSABI] = ELF_OSABI; + memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD); + ehdr->e_type = ET_CORE; + ehdr->e_machine = ELF_ARCH; + ehdr->e_version = EV_CURRENT; + ehdr->e_phoff = sizeof(Elf64_Ehdr); + ehdr->e_ehsize = sizeof(Elf64_Ehdr); + ehdr->e_phentsize = sizeof(Elf64_Phdr); + + /* Prepare one phdr of type PT_NOTE for each present cpu */ + for_each_present_cpu(cpu) { + phdr->p_type = PT_NOTE; + notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu)); + phdr->p_offset = phdr->p_paddr = notes_addr; + phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t); + (ehdr->e_phnum)++; + phdr++; + } + + /* Prepare one PT_NOTE header for 
vmcoreinfo */ + phdr->p_type = PT_NOTE; + phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note(); + phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE; + (ehdr->e_phnum)++; + phdr++; + + /* Prepare PT_LOAD type program header for kernel text region */ + if (kernel_map) { + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R|PF_W|PF_X; + phdr->p_vaddr = (Elf64_Addr)_text; + phdr->p_filesz = phdr->p_memsz = _end - _text; + phdr->p_offset = phdr->p_paddr = __pa_symbol(_text); + ehdr->e_phnum++; + phdr++; + } + + /* Go through all the ranges in mem->ranges[] and prepare phdr */ + for (i = 0; i < mem->nr_ranges; i++) { + mstart = mem->ranges[i].start; + mend = mem->ranges[i].end; + + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R|PF_W|PF_X; + phdr->p_offset = mstart; + + phdr->p_paddr = mstart; + phdr->p_vaddr = (unsigned long long) __va(mstart); + phdr->p_filesz = phdr->p_memsz = mend - mstart + 1; + phdr->p_align = 0; + ehdr->e_phnum++; + phdr++; + pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n", + phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz, + ehdr->e_phnum, phdr->p_offset); + } + + *addr = buf; + *sz = elf_sz; + return 0; +} diff --git a/kernel/kprobes.c b/kernel/kprobes.c index da2ccf142358..102160ff5c66 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -978,67 +978,90 @@ static int prepare_kprobe(struct kprobe *p) } /* Caller must lock kprobe_mutex */ -static void arm_kprobe_ftrace(struct kprobe *p) +static int arm_kprobe_ftrace(struct kprobe *p) { - int ret; + int ret = 0; ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 0, 0); - WARN(ret < 0, "Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret); - kprobe_ftrace_enabled++; - if (kprobe_ftrace_enabled == 1) { + if (ret) { + pr_debug("Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret); + return ret; + } + + if (kprobe_ftrace_enabled == 0) { ret = register_ftrace_function(&kprobe_ftrace_ops); - WARN(ret < 0, 
"Failed to init kprobe-ftrace (%d)\n", ret); + if (ret) { + pr_debug("Failed to init kprobe-ftrace (%d)\n", ret); + goto err_ftrace; + } } + + kprobe_ftrace_enabled++; + return ret; + +err_ftrace: + /* + * Note: Since kprobe_ftrace_ops has IPMODIFY set, and ftrace requires a + * non-empty filter_hash for IPMODIFY ops, we're safe from an accidental + * empty filter_hash which would undesirably trace all functions. + */ + ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 1, 0); + return ret; } /* Caller must lock kprobe_mutex */ -static void disarm_kprobe_ftrace(struct kprobe *p) +static int disarm_kprobe_ftrace(struct kprobe *p) { - int ret; + int ret = 0; - kprobe_ftrace_enabled--; - if (kprobe_ftrace_enabled == 0) { + if (kprobe_ftrace_enabled == 1) { ret = unregister_ftrace_function(&kprobe_ftrace_ops); - WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret); + if (WARN(ret < 0, "Failed to unregister kprobe-ftrace (%d)\n", ret)) + return ret; } + + kprobe_ftrace_enabled--; + ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 1, 0); WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret); + return ret; } #else /* !CONFIG_KPROBES_ON_FTRACE */ #define prepare_kprobe(p) arch_prepare_kprobe(p) -#define arm_kprobe_ftrace(p) do {} while (0) -#define disarm_kprobe_ftrace(p) do {} while (0) +#define arm_kprobe_ftrace(p) (-ENODEV) +#define disarm_kprobe_ftrace(p) (-ENODEV) #endif /* Arm a kprobe with text_mutex */ -static void arm_kprobe(struct kprobe *kp) +static int arm_kprobe(struct kprobe *kp) { - if (unlikely(kprobe_ftrace(kp))) { - arm_kprobe_ftrace(kp); - return; - } + if (unlikely(kprobe_ftrace(kp))) + return arm_kprobe_ftrace(kp); + cpus_read_lock(); mutex_lock(&text_mutex); __arm_kprobe(kp); mutex_unlock(&text_mutex); cpus_read_unlock(); + + return 0; } /* Disarm a kprobe with text_mutex */ -static void disarm_kprobe(struct kprobe *kp, bool reopt) +static int disarm_kprobe(struct kprobe *kp, bool reopt) { 
- if (unlikely(kprobe_ftrace(kp))) { - disarm_kprobe_ftrace(kp); - return; - } + if (unlikely(kprobe_ftrace(kp))) + return disarm_kprobe_ftrace(kp); cpus_read_lock(); mutex_lock(&text_mutex); __disarm_kprobe(kp, reopt); mutex_unlock(&text_mutex); cpus_read_unlock(); + + return 0; } /* @@ -1362,9 +1385,15 @@ out: if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) { ap->flags &= ~KPROBE_FLAG_DISABLED; - if (!kprobes_all_disarmed) + if (!kprobes_all_disarmed) { /* Arm the breakpoint again. */ - arm_kprobe(ap); + ret = arm_kprobe(ap); + if (ret) { + ap->flags |= KPROBE_FLAG_DISABLED; + list_del_rcu(&p->list); + synchronize_sched(); + } + } } return ret; } @@ -1573,8 +1602,14 @@ int register_kprobe(struct kprobe *p) hlist_add_head_rcu(&p->hlist, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); - if (!kprobes_all_disarmed && !kprobe_disabled(p)) - arm_kprobe(p); + if (!kprobes_all_disarmed && !kprobe_disabled(p)) { + ret = arm_kprobe(p); + if (ret) { + hlist_del_rcu(&p->hlist); + synchronize_sched(); + goto out; + } + } /* Try to optimize kprobe */ try_to_optimize_kprobe(p); @@ -1608,11 +1643,12 @@ static int aggr_kprobe_disabled(struct kprobe *ap) static struct kprobe *__disable_kprobe(struct kprobe *p) { struct kprobe *orig_p; + int ret; /* Get an original kprobe for return */ orig_p = __get_valid_kprobe(p); if (unlikely(orig_p == NULL)) - return NULL; + return ERR_PTR(-EINVAL); if (!kprobe_disabled(p)) { /* Disable probe if it is a child probe */ @@ -1626,8 +1662,13 @@ static struct kprobe *__disable_kprobe(struct kprobe *p) * should have already been disarmed, so * skip unneed disarming process. */ - if (!kprobes_all_disarmed) - disarm_kprobe(orig_p, true); + if (!kprobes_all_disarmed) { + ret = disarm_kprobe(orig_p, true); + if (ret) { + p->flags &= ~KPROBE_FLAG_DISABLED; + return ERR_PTR(ret); + } + } orig_p->flags |= KPROBE_FLAG_DISABLED; } } @@ -1644,8 +1685,8 @@ static int __unregister_kprobe_top(struct kprobe *p) /* Disable kprobe. 
This will disarm it if needed. */ ap = __disable_kprobe(p); - if (ap == NULL) - return -EINVAL; + if (IS_ERR(ap)) + return PTR_ERR(ap); if (ap == p) /* @@ -2078,12 +2119,14 @@ static void kill_kprobe(struct kprobe *p) int disable_kprobe(struct kprobe *kp) { int ret = 0; + struct kprobe *p; mutex_lock(&kprobe_mutex); /* Disable this kprobe */ - if (__disable_kprobe(kp) == NULL) - ret = -EINVAL; + p = __disable_kprobe(kp); + if (IS_ERR(p)) + ret = PTR_ERR(p); mutex_unlock(&kprobe_mutex); return ret; @@ -2116,7 +2159,9 @@ int enable_kprobe(struct kprobe *kp) if (!kprobes_all_disarmed && kprobe_disabled(p)) { p->flags &= ~KPROBE_FLAG_DISABLED; - arm_kprobe(p); + ret = arm_kprobe(p); + if (ret) + p->flags |= KPROBE_FLAG_DISABLED; } out: mutex_unlock(&kprobe_mutex); @@ -2407,11 +2452,12 @@ static const struct file_operations debugfs_kprobe_blacklist_ops = { .release = seq_release, }; -static void arm_all_kprobes(void) +static int arm_all_kprobes(void) { struct hlist_head *head; struct kprobe *p; - unsigned int i; + unsigned int i, total = 0, errors = 0; + int err, ret = 0; mutex_lock(&kprobe_mutex); @@ -2428,46 +2474,74 @@ static void arm_all_kprobes(void) /* Arming kprobes doesn't optimize kprobe itself */ for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; - hlist_for_each_entry_rcu(p, head, hlist) - if (!kprobe_disabled(p)) - arm_kprobe(p); + /* Arm all kprobes on a best-effort basis */ + hlist_for_each_entry_rcu(p, head, hlist) { + if (!kprobe_disabled(p)) { + err = arm_kprobe(p); + if (err) { + errors++; + ret = err; + } + total++; + } + } } - printk(KERN_INFO "Kprobes globally enabled\n"); + if (errors) + pr_warn("Kprobes globally enabled, but failed to arm %d out of %d probes\n", + errors, total); + else + pr_info("Kprobes globally enabled\n"); already_enabled: mutex_unlock(&kprobe_mutex); - return; + return ret; } -static void disarm_all_kprobes(void) +static int disarm_all_kprobes(void) { struct hlist_head *head; struct kprobe *p; - unsigned int 
i; + unsigned int i, total = 0, errors = 0; + int err, ret = 0; mutex_lock(&kprobe_mutex); /* If kprobes are already disarmed, just return */ if (kprobes_all_disarmed) { mutex_unlock(&kprobe_mutex); - return; + return 0; } kprobes_all_disarmed = true; - printk(KERN_INFO "Kprobes globally disabled\n"); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; + /* Disarm all kprobes on a best-effort basis */ hlist_for_each_entry_rcu(p, head, hlist) { - if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) - disarm_kprobe(p, false); + if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) { + err = disarm_kprobe(p, false); + if (err) { + errors++; + ret = err; + } + total++; + } } } + + if (errors) + pr_warn("Kprobes globally disabled, but failed to disarm %d out of %d probes\n", + errors, total); + else + pr_info("Kprobes globally disabled\n"); + mutex_unlock(&kprobe_mutex); /* Wait for disarming all kprobes by optimizer */ wait_for_kprobe_optimizer(); + + return ret; } /* @@ -2494,6 +2568,7 @@ static ssize_t write_enabled_file_bool(struct file *file, { char buf[32]; size_t buf_size; + int ret = 0; buf_size = min(count, (sizeof(buf)-1)); if (copy_from_user(buf, user_buf, buf_size)) @@ -2504,17 +2579,20 @@ static ssize_t write_enabled_file_bool(struct file *file, case 'y': case 'Y': case '1': - arm_all_kprobes(); + ret = arm_all_kprobes(); break; case 'n': case 'N': case '0': - disarm_all_kprobes(); + ret = disarm_all_kprobes(); break; default: return -EINVAL; } + if (ret) + return ret; + return count; } diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 89b5f83f1969..023386338269 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -556,9 +556,9 @@ static void print_lock(struct held_lock *hlock) return; } + printk(KERN_CONT "%p", hlock->instance); print_lock_name(lock_classes + class_idx - 1); - printk(KERN_CONT ", at: [<%p>] %pS\n", - (void *)hlock->acquire_ip, (void *)hlock->acquire_ip); + printk(KERN_CONT ", at: 
%pS\n", (void *)hlock->acquire_ip); } static void lockdep_print_held_locks(struct task_struct *curr) @@ -808,7 +808,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) if (verbose(class)) { graph_unlock(); - printk("\nnew class %p: %s", class->key, class->name); + printk("\nnew class %px: %s", class->key, class->name); if (class->name_version > 1) printk(KERN_CONT "#%d", class->name_version); printk(KERN_CONT "\n"); @@ -1407,7 +1407,7 @@ static void print_lock_class_header(struct lock_class *class, int depth) } printk("%*s }\n", depth, ""); - printk("%*s ... key at: [<%p>] %pS\n", + printk("%*s ... key at: [<%px>] %pS\n", depth, "", class->key, class->key); } @@ -2340,7 +2340,7 @@ cache_hit: if (very_verbose(class)) { printk("\nhash chain already cached, key: " - "%016Lx tail class: [%p] %s\n", + "%016Lx tail class: [%px] %s\n", (unsigned long long)chain_key, class->key, class->name); } @@ -2349,7 +2349,7 @@ cache_hit: } if (very_verbose(class)) { - printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n", + printk("\nnew hash chain, key: %016Lx tail class: [%px] %s\n", (unsigned long long)chain_key, class->key, class->name); } @@ -2676,16 +2676,16 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, void print_irqtrace_events(struct task_struct *curr) { printk("irq event stamp: %u\n", curr->irq_events); - printk("hardirqs last enabled at (%u): [<%p>] %pS\n", + printk("hardirqs last enabled at (%u): [<%px>] %pS\n", curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip, (void *)curr->hardirq_enable_ip); - printk("hardirqs last disabled at (%u): [<%p>] %pS\n", + printk("hardirqs last disabled at (%u): [<%px>] %pS\n", curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip, (void *)curr->hardirq_disable_ip); - printk("softirqs last enabled at (%u): [<%p>] %pS\n", + printk("softirqs last enabled at (%u): [<%px>] %pS\n", curr->softirq_enable_event, (void *)curr->softirq_enable_ip, (void 
*)curr->softirq_enable_ip); - printk("softirqs last disabled at (%u): [<%p>] %pS\n", + printk("softirqs last disabled at (%u): [<%px>] %pS\n", curr->softirq_disable_event, (void *)curr->softirq_disable_ip, (void *)curr->softirq_disable_ip); } @@ -3207,7 +3207,7 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name, * Sanity check, the lock-class key must be persistent: */ if (!static_obj(key)) { - printk("BUG: key %p not in .data!\n", key); + printk("BUG: key %px not in .data!\n", key); /* * What it says above ^^^^^, I suggest you read it. */ @@ -3322,7 +3322,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, } atomic_inc((atomic_t *)&class->ops); if (very_verbose(class)) { - printk("\nacquire class [%p] %s", class->key, class->name); + printk("\nacquire class [%px] %s", class->key, class->name); if (class->name_version > 1) printk(KERN_CONT "#%d", class->name_version); printk(KERN_CONT "\n"); @@ -4376,7 +4376,7 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, pr_warn("WARNING: held lock freed!\n"); print_kernel_ident(); pr_warn("-------------------------\n"); - pr_warn("%s/%d is freeing memory %p-%p, with a lock still held there!\n", + pr_warn("%s/%d is freeing memory %px-%px, with a lock still held there!\n", curr->comm, task_pid_nr(curr), mem_from, mem_to-1); print_lock(hlock); lockdep_print_held_locks(curr); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 858a07590e39..2048359f33d2 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -1082,15 +1082,16 @@ static noinline int __sched __mutex_lock_interruptible_slowpath(struct mutex *lock); /** - * mutex_lock_interruptible - acquire the mutex, interruptible - * @lock: the mutex to be acquired + * mutex_lock_interruptible() - Acquire the mutex, interruptible by signals. + * @lock: The mutex to be acquired. 
* - * Lock the mutex like mutex_lock(), and return 0 if the mutex has - * been acquired or sleep until the mutex becomes available. If a - * signal arrives while waiting for the lock then this function - * returns -EINTR. + * Lock the mutex like mutex_lock(). If a signal is delivered while the + * process is sleeping, this function will return without acquiring the + * mutex. * - * This function is similar to (but not equivalent to) down_interruptible(). + * Context: Process context. + * Return: 0 if the lock was successfully acquired or %-EINTR if a + * signal arrived. */ int __sched mutex_lock_interruptible(struct mutex *lock) { @@ -1104,6 +1105,18 @@ int __sched mutex_lock_interruptible(struct mutex *lock) EXPORT_SYMBOL(mutex_lock_interruptible); +/** + * mutex_lock_killable() - Acquire the mutex, interruptible by fatal signals. + * @lock: The mutex to be acquired. + * + * Lock the mutex like mutex_lock(). If a signal which will be fatal to + * the current process is delivered while the process is sleeping, this + * function will return without acquiring the mutex. + * + * Context: Process context. + * Return: 0 if the lock was successfully acquired or %-EINTR if a + * fatal signal arrived. + */ int __sched mutex_lock_killable(struct mutex *lock) { might_sleep(); @@ -1115,6 +1128,16 @@ int __sched mutex_lock_killable(struct mutex *lock) } EXPORT_SYMBOL(mutex_lock_killable); +/** + * mutex_lock_io() - Acquire the mutex and mark the process as waiting for I/O + * @lock: The mutex to be acquired. + * + * Lock the mutex like mutex_lock(). While the task is waiting for this + * mutex, it will be accounted as being in the IO wait state by the + * scheduler. + * + * Context: Process context. 
+ */ void __sched mutex_lock_io(struct mutex *lock) { int token; diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 38ece035039e..d880296245c5 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -379,6 +379,14 @@ queue: tail = encode_tail(smp_processor_id(), idx); node += idx; + + /* + * Ensure that we increment the head node->count before initialising + * the actual node. If the compiler is kind enough to reorder these + * stores, then an IRQ could overwrite our assignments. + */ + barrier(); + node->locked = 0; node->next = NULL; pv_init_node(node); @@ -408,14 +416,15 @@ queue: */ if (old & _Q_TAIL_MASK) { prev = decode_tail(old); + /* - * The above xchg_tail() is also a load of @lock which - * generates, through decode_tail(), a pointer. The address - * dependency matches the RELEASE of xchg_tail() such that - * the subsequent access to @prev happens after. + * We must ensure that the stores to @node are observed before + * the write to prev->next. The address dependency from + * xchg_tail is not sufficient to ensure this because the read + * component of xchg_tail is unordered with respect to the + * initialisation of @node. 
*/ - - WRITE_ONCE(prev->next, node); + smp_store_release(&prev->next, node); pv_wait_node(node, prev); arch_mcs_spin_lock_contended(&node->locked); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 65cc0cb984e6..4f014be7a4b8 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1268,8 +1268,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, if (unlikely(ret)) { __set_current_state(TASK_RUNNING); - if (rt_mutex_has_waiters(lock)) - remove_waiter(lock, &waiter); + remove_waiter(lock, &waiter); rt_mutex_handle_deadlock(ret, chwalk, &waiter); } @@ -1616,11 +1615,12 @@ bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock, void __sched rt_mutex_futex_unlock(struct rt_mutex *lock) { DEFINE_WAKE_Q(wake_q); + unsigned long flags; bool postunlock; - raw_spin_lock_irq(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); postunlock = __rt_mutex_futex_unlock(lock, &wake_q); - raw_spin_unlock_irq(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); if (postunlock) rt_mutex_postunlock(&wake_q); diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 68686b3ec3c1..d1d62f942be2 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -52,12 +52,13 @@ static inline int rt_mutex_has_waiters(struct rt_mutex *lock) static inline struct rt_mutex_waiter * rt_mutex_top_waiter(struct rt_mutex *lock) { - struct rt_mutex_waiter *w; - - w = rb_entry(lock->waiters.rb_leftmost, - struct rt_mutex_waiter, tree_entry); - BUG_ON(w->lock != lock); + struct rb_node *leftmost = rb_first_cached(&lock->waiters); + struct rt_mutex_waiter *w = NULL; + if (leftmost) { + w = rb_entry(leftmost, struct rt_mutex_waiter, tree_entry); + BUG_ON(w->lock != lock); + } return w; } diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index f549c552dbf1..30465a2f2b6c 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -117,6 +117,7 @@ 
EXPORT_SYMBOL(down_write_trylock); void up_read(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); + DEBUG_RWSEMS_WARN_ON(sem->owner != RWSEM_READER_OWNED); __up_read(sem); } @@ -129,6 +130,7 @@ EXPORT_SYMBOL(up_read); void up_write(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); + DEBUG_RWSEMS_WARN_ON(sem->owner != current); rwsem_clear_owner(sem); __up_write(sem); @@ -142,6 +144,7 @@ EXPORT_SYMBOL(up_write); void downgrade_write(struct rw_semaphore *sem) { lock_downgrade(&sem->dep_map, _RET_IP_); + DEBUG_RWSEMS_WARN_ON(sem->owner != current); rwsem_set_reader_owned(sem); __downgrade_write(sem); @@ -211,6 +214,7 @@ EXPORT_SYMBOL(down_write_killable_nested); void up_read_non_owner(struct rw_semaphore *sem) { + DEBUG_RWSEMS_WARN_ON(sem->owner != RWSEM_READER_OWNED); __up_read(sem); } diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index a883b8f1fdc6..a17cba8d94bb 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -16,6 +16,12 @@ */ #define RWSEM_READER_OWNED ((struct task_struct *)1UL) +#ifdef CONFIG_DEBUG_RWSEMS +# define DEBUG_RWSEMS_WARN_ON(c) DEBUG_LOCKS_WARN_ON(c) +#else +# define DEBUG_RWSEMS_WARN_ON(c) +#endif + #ifdef CONFIG_RWSEM_SPIN_ON_OWNER /* * All writes to owner are protected by WRITE_ONCE() to make sure that @@ -41,7 +47,7 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) * do a write to the rwsem cacheline when it is really necessary * to minimize cacheline contention. 
*/ - if (sem->owner != RWSEM_READER_OWNED) + if (READ_ONCE(sem->owner) != RWSEM_READER_OWNED) WRITE_ONCE(sem->owner, RWSEM_READER_OWNED); } diff --git a/kernel/memremap.c b/kernel/memremap.c index 403ab9cdb949..895e6b76b25e 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -188,13 +188,6 @@ static RADIX_TREE(pgmap_radix, GFP_KERNEL); #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) #define SECTION_SIZE (1UL << PA_SECTION_SHIFT) -struct page_map { - struct resource res; - struct percpu_ref *ref; - struct dev_pagemap pgmap; - struct vmem_altmap altmap; -}; - static unsigned long order_at(struct resource *res, unsigned long pgoff) { unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff; @@ -248,50 +241,59 @@ int device_private_entry_fault(struct vm_area_struct *vma, EXPORT_SYMBOL(device_private_entry_fault); #endif /* CONFIG_DEVICE_PRIVATE */ -static void pgmap_radix_release(struct resource *res) +static void pgmap_radix_release(struct resource *res, unsigned long end_pgoff) { unsigned long pgoff, order; mutex_lock(&pgmap_lock); - foreach_order_pgoff(res, order, pgoff) + foreach_order_pgoff(res, order, pgoff) { + if (pgoff >= end_pgoff) + break; radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff); + } mutex_unlock(&pgmap_lock); synchronize_rcu(); } -static unsigned long pfn_first(struct page_map *page_map) +static unsigned long pfn_first(struct dev_pagemap *pgmap) { - struct dev_pagemap *pgmap = &page_map->pgmap; - const struct resource *res = &page_map->res; - struct vmem_altmap *altmap = pgmap->altmap; + const struct resource *res = &pgmap->res; + struct vmem_altmap *altmap = &pgmap->altmap; unsigned long pfn; pfn = res->start >> PAGE_SHIFT; - if (altmap) + if (pgmap->altmap_valid) pfn += vmem_altmap_offset(altmap); return pfn; } -static unsigned long pfn_end(struct page_map *page_map) +static unsigned long pfn_end(struct dev_pagemap *pgmap) { - const struct resource *res = &page_map->res; + const struct resource *res = &pgmap->res; return 
(res->start + resource_size(res)) >> PAGE_SHIFT; } +static unsigned long pfn_next(unsigned long pfn) +{ + if (pfn % 1024 == 0) + cond_resched(); + return pfn + 1; +} + #define for_each_device_pfn(pfn, map) \ - for (pfn = pfn_first(map); pfn < pfn_end(map); pfn++) + for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn)) -static void devm_memremap_pages_release(struct device *dev, void *data) +static void devm_memremap_pages_release(void *data) { - struct page_map *page_map = data; - struct resource *res = &page_map->res; + struct dev_pagemap *pgmap = data; + struct device *dev = pgmap->dev; + struct resource *res = &pgmap->res; resource_size_t align_start, align_size; - struct dev_pagemap *pgmap = &page_map->pgmap; unsigned long pfn; - for_each_device_pfn(pfn, page_map) + for_each_device_pfn(pfn, pgmap) put_page(pfn_to_page(pfn)); if (percpu_ref_tryget_live(pgmap->ref)) { @@ -301,56 +303,51 @@ static void devm_memremap_pages_release(struct device *dev, void *data) /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); - align_size = ALIGN(resource_size(res), SECTION_SIZE); + align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) + - align_start; mem_hotplug_begin(); - arch_remove_memory(align_start, align_size); + arch_remove_memory(align_start, align_size, pgmap->altmap_valid ? + &pgmap->altmap : NULL); mem_hotplug_done(); untrack_pfn(NULL, PHYS_PFN(align_start), align_size); - pgmap_radix_release(res); - dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, - "%s: failed to free all reserved pages\n", __func__); -} - -/* assumes rcu_read_lock() held at entry */ -struct dev_pagemap *find_dev_pagemap(resource_size_t phys) -{ - struct page_map *page_map; - - WARN_ON_ONCE(!rcu_read_lock_held()); - - page_map = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys)); - return page_map ? 
&page_map->pgmap : NULL; + pgmap_radix_release(res, -1); + dev_WARN_ONCE(dev, pgmap->altmap.alloc, + "%s: failed to free all reserved pages\n", __func__); } /** * devm_memremap_pages - remap and provide memmap backing for the given resource * @dev: hosting device for @res - * @res: "host memory" address range - * @ref: a live per-cpu reference count - * @altmap: optional descriptor for allocating the memmap from @res + * @pgmap: pointer to a struct dev_pgmap * * Notes: - * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time - * (or devm release event). The expected order of events is that @ref has + * 1/ At a minimum the res, ref and type members of @pgmap must be initialized + * by the caller before passing it to this function + * + * 2/ The altmap field may optionally be initialized, in which case altmap_valid + * must be set to true + * + * 3/ pgmap.ref must be 'live' on entry and 'dead' before devm_memunmap_pages() + * time (or devm release event). The expected order of events is that ref has * been through percpu_ref_kill() before devm_memremap_pages_release(). The * wait for the completion of all references being dropped and * percpu_ref_exit() must occur after devm_memremap_pages_release(). * - * 2/ @res is expected to be a host memory range that could feasibly be + * 4/ res is expected to be a host memory range that could feasibly be * treated as a "System RAM" range, i.e. not a device mmio range, but * this is not enforced. */ -void *devm_memremap_pages(struct device *dev, struct resource *res, - struct percpu_ref *ref, struct vmem_altmap *altmap) +void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) { resource_size_t align_start, align_size, align_end; + struct vmem_altmap *altmap = pgmap->altmap_valid ? 
+ &pgmap->altmap : NULL; + struct resource *res = &pgmap->res; unsigned long pfn, pgoff, order; pgprot_t pgprot = PAGE_KERNEL; - struct dev_pagemap *pgmap; - struct page_map *page_map; - int error, nid, is_ram, i = 0; + int error, nid, is_ram; align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) @@ -367,47 +364,18 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (is_ram == REGION_INTERSECTS) return __va(res->start); - if (!ref) + if (!pgmap->ref) return ERR_PTR(-EINVAL); - page_map = devres_alloc_node(devm_memremap_pages_release, - sizeof(*page_map), GFP_KERNEL, dev_to_node(dev)); - if (!page_map) - return ERR_PTR(-ENOMEM); - pgmap = &page_map->pgmap; - - memcpy(&page_map->res, res, sizeof(*res)); - pgmap->dev = dev; - if (altmap) { - memcpy(&page_map->altmap, altmap, sizeof(*altmap)); - pgmap->altmap = &page_map->altmap; - } - pgmap->ref = ref; - pgmap->res = &page_map->res; - pgmap->type = MEMORY_DEVICE_HOST; - pgmap->page_fault = NULL; - pgmap->page_free = NULL; - pgmap->data = NULL; mutex_lock(&pgmap_lock); error = 0; align_end = align_start + align_size - 1; foreach_order_pgoff(res, order, pgoff) { - struct dev_pagemap *dup; - - rcu_read_lock(); - dup = find_dev_pagemap(res->start + PFN_PHYS(pgoff)); - rcu_read_unlock(); - if (dup) { - dev_err(dev, "%s: %pr collides with mapping for %s\n", - __func__, res, dev_name(dup->dev)); - error = -EBUSY; - break; - } error = __radix_tree_insert(&pgmap_radix, - PHYS_PFN(res->start) + pgoff, order, page_map); + PHYS_PFN(res->start) + pgoff, order, pgmap); if (error) { dev_err(dev, "%s: failed: %d\n", __func__, error); break; @@ -427,16 +395,16 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, goto err_pfn_remap; mem_hotplug_begin(); - error = arch_add_memory(nid, align_start, align_size, false); + error = arch_add_memory(nid, align_start, align_size, altmap, false); if (!error) 
move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT); + align_size >> PAGE_SHIFT, altmap); mem_hotplug_done(); if (error) goto err_add_memory; - for_each_device_pfn(pfn, page_map) { + for_each_device_pfn(pfn, pgmap) { struct page *page = pfn_to_page(pfn); /* @@ -447,19 +415,18 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, */ list_del(&page->lru); page->pgmap = pgmap; - percpu_ref_get(ref); - if (!(++i % 1024)) - cond_resched(); + percpu_ref_get(pgmap->ref); } - devres_add(dev, page_map); + + devm_add_action(dev, devm_memremap_pages_release, pgmap); + return __va(res->start); err_add_memory: untrack_pfn(NULL, PHYS_PFN(align_start), align_size); err_pfn_remap: err_radix: - pgmap_radix_release(res); - devres_free(page_map); + pgmap_radix_release(res, pgoff); return ERR_PTR(error); } EXPORT_SYMBOL(devm_memremap_pages); @@ -475,34 +442,39 @@ void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) altmap->alloc -= nr_pfns; } -struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) +/** + * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn + * @pfn: page frame number to lookup page_map + * @pgmap: optional known pgmap that already has a reference + * + * If @pgmap is non-NULL and covers @pfn it will be returned as-is. If @pgmap + * is non-NULL but does not cover @pfn the reference to it will be released. + */ +struct dev_pagemap *get_dev_pagemap(unsigned long pfn, + struct dev_pagemap *pgmap) { - /* - * 'memmap_start' is the virtual address for the first "struct - * page" in this range of the vmemmap array. In the case of - * CONFIG_SPARSEMEM_VMEMMAP a page_to_pfn conversion is simple - * pointer arithmetic, so we can perform this to_vmem_altmap() - * conversion without concern for the initialization state of - * the struct page fields. 
- */ - struct page *page = (struct page *) memmap_start; - struct dev_pagemap *pgmap; + resource_size_t phys = PFN_PHYS(pfn); /* - * Unconditionally retrieve a dev_pagemap associated with the - * given physical address, this is only for use in the - * arch_{add|remove}_memory() for setting up and tearing down - * the memmap. + * In the cached case we're already holding a live reference. */ + if (pgmap) { + if (phys >= pgmap->res.start && phys <= pgmap->res.end) + return pgmap; + put_dev_pagemap(pgmap); + } + + /* fall back to slow path lookup */ rcu_read_lock(); - pgmap = find_dev_pagemap(__pfn_to_phys(page_to_pfn(page))); + pgmap = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys)); + if (pgmap && !percpu_ref_tryget_live(pgmap->ref)) + pgmap = NULL; rcu_read_unlock(); - return pgmap ? pgmap->altmap : NULL; + return pgmap; } #endif /* CONFIG_ZONE_DEVICE */ - #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) void put_zone_device_private_or_public_page(struct page *page) { diff --git a/kernel/module.c b/kernel/module.c index 09e48eee4d55..a6e43a5806a1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2181,10 +2181,6 @@ static void free_module(struct module *mod) /* Finally, free the core (containing the module structure) */ disable_ro_nx(&mod->core_layout); module_memfree(mod->core_layout.base); - -#ifdef CONFIG_MPU - update_protections(current->mm); -#endif } void *__symbol_get(const char *symbol) @@ -3129,7 +3125,11 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->ftrace_callsites), &mod->num_ftrace_callsites); #endif - +#ifdef CONFIG_FUNCTION_ERROR_INJECTION + mod->ei_funcs = section_objs(info, "_error_injection_whitelist", + sizeof(*mod->ei_funcs), + &mod->num_ei_funcs); +#endif mod->extable = section_objs(info, "__ex_table", sizeof(*mod->extable), &mod->num_exentries); @@ -3800,6 +3800,7 @@ static int load_module(struct load_info *info, const char __user *uargs, module_disable_nx(mod); 
ddebug_cleanup: + ftrace_release_mod(mod); dynamic_debug_remove(mod, info->debug); synchronize_sched(); kfree(mod->args); @@ -3819,12 +3820,6 @@ static int load_module(struct load_info *info, const char __user *uargs, synchronize_sched(); mutex_unlock(&module_mutex); free_module: - /* - * Ftrace needs to clean up what it initialized. - * This does nothing if ftrace_module_init() wasn't called, - * but it must be called outside of module_mutex. - */ - ftrace_release_mod(mod); /* Free lock-classes; relies on the preceding sync_rcu() */ lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); @@ -3949,6 +3944,12 @@ static const char *get_ksymbol(struct module *mod, return symname(kallsyms, best); } +void * __weak dereference_module_function_descriptor(struct module *mod, + void *ptr) +{ + return ptr; +} + /* For kallsyms to ask for address resolution. NULL means not found. Careful * not to lock to avoid deadlock on oopses, simply disable preemption. */ const char *module_address_lookup(unsigned long addr, @@ -4223,7 +4224,7 @@ static int modules_open(struct inode *inode, struct file *file) m->private = kallsyms_show_value() ? NULL : (void *)8ul; } - return 0; + return err; } static const struct file_operations proc_modules_operations = { diff --git a/kernel/padata.c b/kernel/padata.c index 57c0074d50cc..d568cc56405f 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * padata.c - generic interface to process data streams in parallel * diff --git a/kernel/panic.c b/kernel/panic.c index 2cfef408fec9..42e487488554 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -34,7 +34,8 @@ #define PANIC_BLINK_SPD 18 int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; -static unsigned long tainted_mask; +static unsigned long tainted_mask = + IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT) ? 
(1 << TAINT_RANDSTRUCT) : 0; static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); @@ -289,7 +290,7 @@ void panic(const char *fmt, ...) disabled_wait(caller); } #endif - pr_emerg("---[ end Kernel panic - not syncing: %s\n", buf); + pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf); local_irq_enable(); for (i = 0; ; i += PANIC_TIMER_STEP) { touch_softlockup_watchdog(); @@ -308,52 +309,40 @@ EXPORT_SYMBOL(panic); * is being removed anyway. */ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { - { 'P', 'G', true }, /* TAINT_PROPRIETARY_MODULE */ - { 'F', ' ', true }, /* TAINT_FORCED_MODULE */ - { 'S', ' ', false }, /* TAINT_CPU_OUT_OF_SPEC */ - { 'R', ' ', false }, /* TAINT_FORCED_RMMOD */ - { 'M', ' ', false }, /* TAINT_MACHINE_CHECK */ - { 'B', ' ', false }, /* TAINT_BAD_PAGE */ - { 'U', ' ', false }, /* TAINT_USER */ - { 'D', ' ', false }, /* TAINT_DIE */ - { 'A', ' ', false }, /* TAINT_OVERRIDDEN_ACPI_TABLE */ - { 'W', ' ', false }, /* TAINT_WARN */ - { 'C', ' ', true }, /* TAINT_CRAP */ - { 'I', ' ', false }, /* TAINT_FIRMWARE_WORKAROUND */ - { 'O', ' ', true }, /* TAINT_OOT_MODULE */ - { 'E', ' ', true }, /* TAINT_UNSIGNED_MODULE */ - { 'L', ' ', false }, /* TAINT_SOFTLOCKUP */ - { 'K', ' ', true }, /* TAINT_LIVEPATCH */ - { 'X', ' ', true }, /* TAINT_AUX */ + [ TAINT_PROPRIETARY_MODULE ] = { 'P', 'G', true }, + [ TAINT_FORCED_MODULE ] = { 'F', ' ', true }, + [ TAINT_CPU_OUT_OF_SPEC ] = { 'S', ' ', false }, + [ TAINT_FORCED_RMMOD ] = { 'R', ' ', false }, + [ TAINT_MACHINE_CHECK ] = { 'M', ' ', false }, + [ TAINT_BAD_PAGE ] = { 'B', ' ', false }, + [ TAINT_USER ] = { 'U', ' ', false }, + [ TAINT_DIE ] = { 'D', ' ', false }, + [ TAINT_OVERRIDDEN_ACPI_TABLE ] = { 'A', ' ', false }, + [ TAINT_WARN ] = { 'W', ' ', false }, + [ TAINT_CRAP ] = { 'C', ' ', true }, + [ TAINT_FIRMWARE_WORKAROUND ] = { 'I', ' ', false }, + [ TAINT_OOT_MODULE ] = { 'O', ' ', true }, + [ TAINT_UNSIGNED_MODULE ] = { 'E', ' 
', true }, + [ TAINT_SOFTLOCKUP ] = { 'L', ' ', false }, + [ TAINT_LIVEPATCH ] = { 'K', ' ', true }, + [ TAINT_AUX ] = { 'X', ' ', true }, + [ TAINT_RANDSTRUCT ] = { 'T', ' ', true }, }; /** - * print_tainted - return a string to represent the kernel taint state. + * print_tainted - return a string to represent the kernel taint state. * - * 'P' - Proprietary module has been loaded. - * 'F' - Module has been forcibly loaded. - * 'S' - SMP with CPUs not designed for SMP. - * 'R' - User forced a module unload. - * 'M' - System experienced a machine check exception. - * 'B' - System has hit bad_page. - * 'U' - Userspace-defined naughtiness. - * 'D' - Kernel has oopsed before - * 'A' - ACPI table overridden. - * 'W' - Taint on warning. - * 'C' - modules from drivers/staging are loaded. - * 'I' - Working around severe firmware bug. - * 'O' - Out-of-tree module has been loaded. - * 'E' - Unsigned module has been loaded. - * 'L' - A soft lockup has previously occurred. - * 'K' - Kernel has been live patched. - * 'X' - Auxiliary taint, for distros' use. + * For individual taint flag meanings, see Documentation/sysctl/kernel.txt * - * The string is overwritten by the next call to print_tainted(). + * The string is overwritten by the next call to print_tainted(), + * but is always NULL terminated. */ const char *print_tainted(void) { static char buf[TAINT_FLAGS_COUNT + sizeof("Tainted: ")]; + BUILD_BUG_ON(ARRAY_SIZE(taint_flags) != TAINT_FLAGS_COUNT); + if (tainted_mask) { char *s; int i; @@ -554,6 +543,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, else dump_stack(); + print_irqtrace_events(current); + print_oops_end_marker(); /* Just a warning, don't kill lockdep. 
*/ @@ -640,7 +631,7 @@ device_initcall(register_warn_debugfs); */ __visible void __stack_chk_fail(void) { - panic("stack-protector: Kernel stack is corrupted in: %p\n", + panic("stack-protector: Kernel stack is corrupted in: %pB\n", __builtin_return_address(0)); } EXPORT_SYMBOL(__stack_chk_fail); diff --git a/kernel/params.c b/kernel/params.c index cc9108c2a1fd..ce89f757e6da 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -111,8 +111,8 @@ bool parameq(const char *a, const char *b) static void param_check_unsafe(const struct kernel_param *kp) { if (kp->flags & KERNEL_PARAM_FL_UNSAFE) { - pr_warn("Setting dangerous option %s - tainting kernel\n", - kp->name); + pr_notice("Setting dangerous option %s - tainting kernel\n", + kp->name); add_taint(TAINT_USER, LOCKDEP_STILL_OK); } } diff --git a/kernel/pid.c b/kernel/pid.c index 5d30c87e3c42..157fe4b19971 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -70,7 +70,7 @@ int pid_max_max = PID_MAX_LIMIT; */ struct pid_namespace init_pid_ns = { .kref = KREF_INIT(2), - .idr = IDR_INIT, + .idr = IDR_INIT(init_pid_ns.idr), .pid_allocated = PIDNS_ADDING, .level = 0, .child_reaper = &init_task, @@ -343,6 +343,19 @@ struct task_struct *find_task_by_vpid(pid_t vnr) return find_task_by_pid_ns(vnr, task_active_pid_ns(current)); } +struct task_struct *find_get_task_by_vpid(pid_t nr) +{ + struct task_struct *task; + + rcu_read_lock(); + task = find_task_by_vpid(nr); + if (task) + get_task_struct(task); + rcu_read_unlock(); + + return task; +} + struct pid *get_task_pid(struct task_struct *task, enum pid_type type) { struct pid *pid; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 0b53eef7d34b..2a2ac53d8b8b 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -23,55 +23,39 @@ #include <linux/sched/signal.h> #include <linux/idr.h> -struct pid_cache { - int nr_ids; - char name[16]; - struct kmem_cache *cachep; - struct list_head list; -}; - -static LIST_HEAD(pid_caches_lh); static 
DEFINE_MUTEX(pid_caches_mutex); static struct kmem_cache *pid_ns_cachep; +/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ +#define MAX_PID_NS_LEVEL 32 +/* Write once array, filled from the beginning. */ +static struct kmem_cache *pid_cache[MAX_PID_NS_LEVEL]; /* * creates the kmem cache to allocate pids from. - * @nr_ids: the number of numerical ids this pid will have to carry + * @level: pid namespace level */ -static struct kmem_cache *create_pid_cachep(int nr_ids) +static struct kmem_cache *create_pid_cachep(unsigned int level) { - struct pid_cache *pcache; - struct kmem_cache *cachep; - + /* Level 0 is init_pid_ns.pid_cachep */ + struct kmem_cache **pkc = &pid_cache[level - 1]; + struct kmem_cache *kc; + char name[4 + 10 + 1]; + unsigned int len; + + kc = READ_ONCE(*pkc); + if (kc) + return kc; + + snprintf(name, sizeof(name), "pid_%u", level + 1); + len = sizeof(struct pid) + level * sizeof(struct upid); mutex_lock(&pid_caches_mutex); - list_for_each_entry(pcache, &pid_caches_lh, list) - if (pcache->nr_ids == nr_ids) - goto out; - - pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL); - if (pcache == NULL) - goto err_alloc; - - snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids); - cachep = kmem_cache_create(pcache->name, - sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid), - 0, SLAB_HWCACHE_ALIGN, NULL); - if (cachep == NULL) - goto err_cachep; - - pcache->nr_ids = nr_ids; - pcache->cachep = cachep; - list_add(&pcache->list, &pid_caches_lh); -out: + /* Name collision forces to do allocation under mutex. */ + if (!*pkc) + *pkc = kmem_cache_create(name, len, 0, SLAB_HWCACHE_ALIGN, 0); mutex_unlock(&pid_caches_mutex); - return pcache->cachep; - -err_cachep: - kfree(pcache); -err_alloc: - mutex_unlock(&pid_caches_mutex); - return NULL; + /* current can fail, but someone else can succeed. 
*/ + return READ_ONCE(*pkc); } static void proc_cleanup_work(struct work_struct *work) @@ -80,9 +64,6 @@ static void proc_cleanup_work(struct work_struct *work) pid_ns_release_proc(ns); } -/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ -#define MAX_PID_NS_LEVEL 32 - static struct ucounts *inc_pid_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES); @@ -119,7 +100,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns idr_init(&ns->idr); - ns->pid_cachep = create_pid_cachep(level + 1); + ns->pid_cachep = create_pid_cachep(level); if (ns->pid_cachep == NULL) goto out_free_idr; @@ -242,16 +223,16 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) /* * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD. - * sys_wait4() will also block until our children traced from the + * kernel_wait4() will also block until our children traced from the * parent namespace are detached and become EXIT_DEAD. */ do { clear_thread_flag(TIF_SIGPENDING); - rc = sys_wait4(-1, NULL, __WALL, NULL); + rc = kernel_wait4(-1, NULL, __WALL, NULL); } while (rc != -ECHILD); /* - * sys_wait4() above can't reap the EXIT_DEAD children but we do not + * kernel_wait4() above can't reap the EXIT_DEAD children but we do not * really care, we could reparent them to the global init. We could * exit and reap ->child_reaper even if it is not the last thread in * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(), diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index a5c36e9c56a6..5454cc639a8d 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -701,7 +701,7 @@ int hibernate(void) } pr_info("Syncing filesystems ... 
\n"); - sys_sync(); + ksys_sync(); pr_info("done.\n"); error = freeze_processes(); @@ -1053,7 +1053,7 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, lock_system_sleep(); swsusp_resume_device = res; unlock_system_sleep(); - pr_info("Starting manual resume from disk\n"); + pm_pr_dbg("Configured resume from disk to %u\n", swsusp_resume_device); noresume = 0; software_resume(); return n; @@ -1061,6 +1061,29 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, power_attr(resume); +static ssize_t resume_offset_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%llu\n", (unsigned long long)swsusp_resume_block); +} + +static ssize_t resume_offset_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, + size_t n) +{ + unsigned long long offset; + int rc; + + rc = kstrtoull(buf, 0, &offset); + if (rc) + return rc; + swsusp_resume_block = offset; + + return n; +} + +power_attr(resume_offset); + static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -1106,6 +1129,7 @@ power_attr(reserved_size); static struct attribute * g[] = { &disk_attr.attr, + &resume_offset_attr.attr, &resume_attr.attr, &image_size_attr.attr, &reserved_size_attr.attr, diff --git a/kernel/power/power.h b/kernel/power/power.h index f29cd178df90..9e58bdc8a562 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -104,9 +104,6 @@ extern int in_suspend; extern dev_t swsusp_resume_device; extern sector_t swsusp_resume_block; -extern asmlinkage int swsusp_arch_suspend(void); -extern asmlinkage int swsusp_arch_resume(void); - extern int create_basic_memory_bitmaps(void); extern void free_basic_memory_bitmaps(void); extern int hibernate_preallocate_memory(void); diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 9d7503910ce2..fa39092b7aea 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -295,6 +295,7 @@ int 
pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, * changed */ plist_del(node, &c->list); + /* fall through */ case PM_QOS_ADD_REQ: plist_node_init(node, new_value); plist_add(node, &c->list); @@ -367,6 +368,7 @@ bool pm_qos_update_flags(struct pm_qos_flags *pqf, break; case PM_QOS_UPDATE_REQ: pm_qos_flags_remove_req(pqf, req); + /* fall through */ case PM_QOS_ADD_REQ: req->flags = val; INIT_LIST_HEAD(&req->node); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 0685c4499431..4c10be0f4843 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -560,7 +560,7 @@ static int enter_state(suspend_state_t state) #ifndef CONFIG_SUSPEND_SKIP_SYNC trace_suspend_resume(TPS("sync_filesystems"), 0, true); pr_info("Syncing filesystems ... "); - sys_sync(); + ksys_sync(); pr_cont("done.\n"); trace_suspend_resume(TPS("sync_filesystems"), 0, false); #endif diff --git a/kernel/power/user.c b/kernel/power/user.c index 22df9f7ff672..75c959de4b29 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -224,7 +224,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, break; printk("Syncing filesystems ... 
"); - sys_sync(); + ksys_sync(); printk("done.\n"); error = freeze_processes(); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index c2e713f6ae2e..2f4af216bd6e 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -42,7 +42,6 @@ #include <linux/rculist.h> #include <linux/poll.h> #include <linux/irq_work.h> -#include <linux/utsname.h> #include <linux/ctype.h> #include <linux/uio.h> #include <linux/sched/clock.h> @@ -52,6 +51,7 @@ #include <linux/uaccess.h> #include <asm/sections.h> +#include <trace/events/initcall.h> #define CREATE_TRACE_POINTS #include <trace/events/printk.h> @@ -131,13 +131,10 @@ static int __init control_devkmsg(char *str) /* * Set sysctl string accordingly: */ - if (devkmsg_log == DEVKMSG_LOG_MASK_ON) { - memset(devkmsg_log_str, 0, DEVKMSG_STR_MAX_SIZE); - strncpy(devkmsg_log_str, "on", 2); - } else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF) { - memset(devkmsg_log_str, 0, DEVKMSG_STR_MAX_SIZE); - strncpy(devkmsg_log_str, "off", 3); - } + if (devkmsg_log == DEVKMSG_LOG_MASK_ON) + strcpy(devkmsg_log_str, "on"); + else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF) + strcpy(devkmsg_log_str, "off"); /* else "ratelimit" which is set by default. */ /* @@ -277,6 +274,13 @@ EXPORT_SYMBOL(console_set_on_cmdline); /* Flag: console code may call schedule() */ static int console_may_schedule; +enum con_msg_format_flags { + MSG_FORMAT_DEFAULT = 0, + MSG_FORMAT_SYSLOG = (1 << 0), +}; + +static int console_msg_format = MSG_FORMAT_DEFAULT; + /* * The printk log buffer consists of a chain of concatenated variable * length records. 
Every record starts with a record header, containing @@ -926,7 +930,7 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait) __poll_t ret = 0; if (!user) - return POLLERR|POLLNVAL; + return EPOLLERR|EPOLLNVAL; poll_wait(file, &log_wait, wait); @@ -934,9 +938,9 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait) if (user->seq < log_next_seq) { /* return error when data has vanished underneath us */ if (user->seq < log_first_seq) - ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; + ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; else - ret = POLLIN|POLLRDNORM; + ret = EPOLLIN|EPOLLRDNORM; } logbuf_unlock_irq(); @@ -1544,6 +1548,146 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) } /* + * Special console_lock variants that help to reduce the risk of soft-lockups. + * They allow to pass console_lock to another printk() call using a busy wait. + */ + +#ifdef CONFIG_LOCKDEP +static struct lockdep_map console_owner_dep_map = { + .name = "console_owner" +}; +#endif + +static DEFINE_RAW_SPINLOCK(console_owner_lock); +static struct task_struct *console_owner; +static bool console_waiter; + +/** + * console_lock_spinning_enable - mark beginning of code where another + * thread might safely busy wait + * + * This basically converts console_lock into a spinlock. This marks + * the section where the console_lock owner can not sleep, because + * there may be a waiter spinning (like a spinlock). Also it must be + * ready to hand over the lock at the end of the section. 
+ */ +static void console_lock_spinning_enable(void) +{ + raw_spin_lock(&console_owner_lock); + console_owner = current; + raw_spin_unlock(&console_owner_lock); + + /* The waiter may spin on us after setting console_owner */ + spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); +} + +/** + * console_lock_spinning_disable_and_check - mark end of code where another + * thread was able to busy wait and check if there is a waiter + * + * This is called at the end of the section where spinning is allowed. + * It has two functions. First, it is a signal that it is no longer + * safe to start busy waiting for the lock. Second, it checks if + * there is a busy waiter and passes the lock rights to her. + * + * Important: Callers lose the lock if there was a busy waiter. + * They must not touch items synchronized by console_lock + * in this case. + * + * Return: 1 if the lock rights were passed, 0 otherwise. + */ +static int console_lock_spinning_disable_and_check(void) +{ + int waiter; + + raw_spin_lock(&console_owner_lock); + waiter = READ_ONCE(console_waiter); + console_owner = NULL; + raw_spin_unlock(&console_owner_lock); + + if (!waiter) { + spin_release(&console_owner_dep_map, 1, _THIS_IP_); + return 0; + } + + /* The waiter is now free to continue */ + WRITE_ONCE(console_waiter, false); + + spin_release(&console_owner_dep_map, 1, _THIS_IP_); + + /* + * Hand off console_lock to waiter. The waiter will perform + * the up(). After this, the waiter is the console_lock owner. + */ + mutex_release(&console_lock_dep_map, 1, _THIS_IP_); + return 1; +} + +/** + * console_trylock_spinning - try to get console_lock by busy waiting + * + * This allows to busy wait for the console_lock when the current + * owner is running in specially marked sections. It means that + * the current owner is running and cannot reschedule until it + * is ready to lose the lock. 
+ * + * Return: 1 if we got the lock, 0 othrewise + */ +static int console_trylock_spinning(void) +{ + struct task_struct *owner = NULL; + bool waiter; + bool spin = false; + unsigned long flags; + + if (console_trylock()) + return 1; + + printk_safe_enter_irqsave(flags); + + raw_spin_lock(&console_owner_lock); + owner = READ_ONCE(console_owner); + waiter = READ_ONCE(console_waiter); + if (!waiter && owner && owner != current) { + WRITE_ONCE(console_waiter, true); + spin = true; + } + raw_spin_unlock(&console_owner_lock); + + /* + * If there is an active printk() writing to the + * consoles, instead of having it write our data too, + * see if we can offload that load from the active + * printer, and do some printing ourselves. + * Go into a spin only if there isn't already a waiter + * spinning, and there is an active printer, and + * that active printer isn't us (recursive printk?). + */ + if (!spin) { + printk_safe_exit_irqrestore(flags); + return 0; + } + + /* We spin waiting for the owner to release us */ + spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); + /* Owner will clear console_waiter on hand off */ + while (READ_ONCE(console_waiter)) + cpu_relax(); + spin_release(&console_owner_dep_map, 1, _THIS_IP_); + + printk_safe_exit_irqrestore(flags); + /* + * The owner passed the console lock to us. + * Since we did not spin on console lock, annotate + * this as a trylock. Otherwise lockdep will + * complain. + */ + mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_); + + return 1; +} + +/* * Call the console drivers, asking them to write out * log_buf[start] to log_buf[end - 1]. * The console_lock must be held. @@ -1749,12 +1893,19 @@ asmlinkage int vprintk_emit(int facility, int level, /* If called from the scheduler, we can not call up(). 
*/ if (!in_sched) { /* + * Disable preemption to avoid being preempted while holding + * console_sem which would prevent anyone from printing to + * console + */ + preempt_disable(); + /* * Try to acquire and then immediately release the console * semaphore. The release will print out buffers and wake up * /dev/kmsg and syslog() users. */ - if (console_trylock()) + if (console_trylock_spinning()) console_unlock(); + preempt_enable(); } return printed_len; @@ -1855,6 +2006,8 @@ static ssize_t msg_print_ext_header(char *buf, size_t size, static ssize_t msg_print_ext_body(char *buf, size_t size, char *dict, size_t dict_len, char *text, size_t text_len) { return 0; } +static void console_lock_spinning_enable(void) { } +static int console_lock_spinning_disable_and_check(void) { return 0; } static void call_console_drivers(const char *ext_text, size_t ext_len, const char *text, size_t len) {} static size_t msg_print_text(const struct printk_log *msg, @@ -1913,6 +2066,17 @@ static int __add_preferred_console(char *name, int idx, char *options, c->index = idx; return 0; } + +static int __init console_msg_format_setup(char *str) +{ + if (!strcmp(str, "syslog")) + console_msg_format = MSG_FORMAT_SYSLOG; + if (!strcmp(str, "default")) + console_msg_format = MSG_FORMAT_DEFAULT; + return 1; +} +__setup("console_msg_format=", console_msg_format_setup); + /* * Set up a console. Called via do_early_param() in init/main.c * for each "console=" parameter in the boot command line. @@ -1998,7 +2162,7 @@ void suspend_console(void) { if (!console_suspend_enabled) return; - printk("Suspending console(s) (use no_console_suspend to debug)\n"); + pr_info("Suspending console(s) (use no_console_suspend to debug)\n"); console_lock(); console_suspended = 1; up_console_sem(); @@ -2069,20 +2233,7 @@ int console_trylock(void) return 0; } console_locked = 1; - /* - * When PREEMPT_COUNT disabled we can't reliably detect if it's - * safe to schedule (e.g. 
calling printk while holding a spin_lock), - * because preempt_disable()/preempt_enable() are just barriers there - * and preempt_count() is always 0. - * - * RCU read sections have a separate preemption counter when - * PREEMPT_RCU enabled thus we must take extra care and check - * rcu_preempt_depth(), otherwise RCU read sections modify - * preempt_count(). - */ - console_may_schedule = !oops_in_progress && - preemptible() && - !rcu_preempt_depth(); + console_may_schedule = 0; return 1; } EXPORT_SYMBOL(console_trylock); @@ -2215,7 +2366,10 @@ skip: goto skip; } - len += msg_print_text(msg, false, text + len, sizeof(text) - len); + len += msg_print_text(msg, + console_msg_format & MSG_FORMAT_SYSLOG, + text + len, + sizeof(text) - len); if (nr_ext_console_drivers) { ext_len = msg_print_ext_header(ext_text, sizeof(ext_text), @@ -2229,14 +2383,29 @@ skip: console_seq++; raw_spin_unlock(&logbuf_lock); + /* + * While actively printing out messages, if another printk() + * were to occur on another CPU, it may wait for this one to + * finish. This task can not be preempted if there is a + * waiter waiting to take over. + */ + console_lock_spinning_enable(); + stop_critical_timings(); /* don't trace print latency */ call_console_drivers(ext_text, ext_len, text, len); start_critical_timings(); + + if (console_lock_spinning_disable_and_check()) { + printk_safe_exit_irqrestore(flags); + goto out; + } + printk_safe_exit_irqrestore(flags); if (do_cond_resched) cond_resched(); } + console_locked = 0; /* Release the exclusive_console once it is used */ @@ -2261,6 +2430,7 @@ skip: if (retry && console_trylock()) goto again; +out: if (wake_klogd) wake_up_klogd(); } @@ -2611,6 +2781,7 @@ EXPORT_SYMBOL(unregister_console); */ void __init console_init(void) { + int ret; initcall_t *call; /* Setup the default TTY line discipline. */ @@ -2621,8 +2792,11 @@ void __init console_init(void) * inform about problems etc.. 
*/ call = __con_initcall_start; + trace_initcall_level("console"); while (call < __con_initcall_end) { - (*call)(); + trace_initcall_start((*call)); + ret = (*call)(); + trace_initcall_finish((*call), ret); call++; } } @@ -3087,60 +3261,4 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper) } EXPORT_SYMBOL_GPL(kmsg_dump_rewind); -static char dump_stack_arch_desc_str[128]; - -/** - * dump_stack_set_arch_desc - set arch-specific str to show with task dumps - * @fmt: printf-style format string - * @...: arguments for the format string - * - * The configured string will be printed right after utsname during task - * dumps. Usually used to add arch-specific system identifiers. If an - * arch wants to make use of such an ID string, it should initialize this - * as soon as possible during boot. - */ -void __init dump_stack_set_arch_desc(const char *fmt, ...) -{ - va_list args; - - va_start(args, fmt); - vsnprintf(dump_stack_arch_desc_str, sizeof(dump_stack_arch_desc_str), - fmt, args); - va_end(args); -} - -/** - * dump_stack_print_info - print generic debug info for dump_stack() - * @log_lvl: log level - * - * Arch-specific dump_stack() implementations can use this function to - * print out the same debug information as the generic dump_stack(). - */ -void dump_stack_print_info(const char *log_lvl) -{ - printk("%sCPU: %d PID: %d Comm: %.20s %s %s %.*s\n", - log_lvl, raw_smp_processor_id(), current->pid, current->comm, - print_tainted(), init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - - if (dump_stack_arch_desc_str[0] != '\0') - printk("%sHardware name: %s\n", - log_lvl, dump_stack_arch_desc_str); - - print_worker_info(log_lvl, current); -} - -/** - * show_regs_print_info - print generic debug info for show_regs() - * @log_lvl: log level - * - * show_regs() implementations can use this function to print out generic - * debug information. 
- */ -void show_regs_print_info(const char *log_lvl) -{ - dump_stack_print_info(log_lvl); -} - #endif diff --git a/kernel/ptrace.c b/kernel/ptrace.c index f3c82e26b995..21fec73d45d4 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -1092,6 +1092,10 @@ int ptrace_request(struct task_struct *child, long request, ret = seccomp_get_filter(child, addr, datavp); break; + case PTRACE_SECCOMP_GET_METADATA: + ret = seccomp_get_metadata(child, addr, datavp); + break; + default: break; } @@ -1099,21 +1103,6 @@ int ptrace_request(struct task_struct *child, long request, return ret; } -static struct task_struct *ptrace_get_task_struct(pid_t pid) -{ - struct task_struct *child; - - rcu_read_lock(); - child = find_task_by_vpid(pid); - if (child) - get_task_struct(child); - rcu_read_unlock(); - - if (!child) - return ERR_PTR(-ESRCH); - return child; -} - #ifndef arch_ptrace_attach #define arch_ptrace_attach(child) do { } while (0) #endif @@ -1131,9 +1120,9 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, goto out; } - child = ptrace_get_task_struct(pid); - if (IS_ERR(child)) { - ret = PTR_ERR(child); + child = find_get_task_by_vpid(pid); + if (!child) { + ret = -ESRCH; goto out; } @@ -1277,9 +1266,9 @@ COMPAT_SYSCALL_DEFINE4(ptrace, compat_long_t, request, compat_long_t, pid, goto out; } - child = ptrace_get_task_struct(pid); - if (IS_ERR(child)) { - ret = PTR_ERR(child); + child = find_get_task_by_vpid(pid); + if (!child) { + ret = -ESRCH; goto out; } diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 6334f2c1abd0..7a693e31184a 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -77,12 +77,18 @@ static inline void rcu_seq_start(unsigned long *sp) WARN_ON_ONCE(rcu_seq_state(*sp) != 1); } +/* Compute the end-of-grace-period value for the specified sequence number. */ +static inline unsigned long rcu_seq_endval(unsigned long *sp) +{ + return (*sp | RCU_SEQ_STATE_MASK) + 1; +} + /* Adjust sequence number for end of update-side operation. 
*/ static inline void rcu_seq_end(unsigned long *sp) { smp_mb(); /* Ensure update-side operation before counter increment. */ WARN_ON_ONCE(!rcu_seq_state(*sp)); - WRITE_ONCE(*sp, (*sp | RCU_SEQ_STATE_MASK) + 1); + WRITE_ONCE(*sp, rcu_seq_endval(sp)); } /* Take a snapshot of the update side's sequence number. */ @@ -295,9 +301,19 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) * Iterate over all possible CPUs in a leaf RCU node. */ #define for_each_leaf_node_possible_cpu(rnp, cpu) \ - for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \ - cpu <= rnp->grphi; \ - cpu = cpumask_next((cpu), cpu_possible_mask)) + for ((cpu) = cpumask_next((rnp)->grplo - 1, cpu_possible_mask); \ + (cpu) <= rnp->grphi; \ + (cpu) = cpumask_next((cpu), cpu_possible_mask)) + +/* + * Iterate over all CPUs in a leaf RCU node's specified mask. + */ +#define rcu_find_next_bit(rnp, cpu, mask) \ + ((rnp)->grplo + find_next_bit(&(mask), BITS_PER_LONG, (cpu))) +#define for_each_leaf_node_cpu_mask(rnp, cpu, mask) \ + for ((cpu) = rcu_find_next_bit((rnp), 0, (mask)); \ + (cpu) <= rnp->grphi; \ + (cpu) = rcu_find_next_bit((rnp), (cpu) + 1 - (rnp->grplo), (mask))) /* * Wrappers for the rcu_node::lock acquire and release. 
@@ -337,7 +353,7 @@ do { \ } while (0) #define raw_spin_unlock_irqrestore_rcu_node(p, flags) \ - raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ + raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) #define raw_spin_trylock_rcu_node(p) \ ({ \ @@ -348,6 +364,9 @@ do { \ ___locked; \ }) +#define raw_lockdep_assert_held_rcu_node(p) \ + lockdep_assert_held(&ACCESS_PRIVATE(p, lock)) + #endif /* #if defined(SRCU) || !defined(TINY_RCU) */ #ifdef CONFIG_TINY_RCU @@ -356,24 +375,20 @@ static inline bool rcu_gp_is_normal(void) { return true; } static inline bool rcu_gp_is_expedited(void) { return false; } static inline void rcu_expedite_gp(void) { } static inline void rcu_unexpedite_gp(void) { } +static inline void rcu_request_urgent_qs_task(struct task_struct *t) { } #else /* #ifdef CONFIG_TINY_RCU */ bool rcu_gp_is_normal(void); /* Internal RCU use. */ bool rcu_gp_is_expedited(void); /* Internal RCU use. */ void rcu_expedite_gp(void); void rcu_unexpedite_gp(void); void rcupdate_announce_bootup_oddness(void); +void rcu_request_urgent_qs_task(struct task_struct *t); #endif /* #else #ifdef CONFIG_TINY_RCU */ #define RCU_SCHEDULER_INACTIVE 0 #define RCU_SCHEDULER_INIT 1 #define RCU_SCHEDULER_RUNNING 2 -#ifdef CONFIG_TINY_RCU -static inline void rcu_request_urgent_qs_task(struct task_struct *t) { } -#else /* #ifdef CONFIG_TINY_RCU */ -void rcu_request_urgent_qs_task(struct task_struct *t); -#endif /* #else #ifdef CONFIG_TINY_RCU */ - enum rcutorture_type { RCU_FLAVOR, RCU_BH_FLAVOR, @@ -470,6 +485,7 @@ void show_rcu_gp_kthreads(void); void rcu_force_quiescent_state(void); void rcu_bh_force_quiescent_state(void); void rcu_sched_force_quiescent_state(void); +extern struct workqueue_struct *rcu_gp_wq; #endif /* #else #ifdef CONFIG_TINY_RCU */ #ifdef CONFIG_RCU_NOCB_CPU diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index d1ebdf9868bb..777e7a6a0292 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -61,11 +61,30 @@ MODULE_AUTHOR("Paul E. 
McKenney <paulmck@linux.vnet.ibm.com>"); #define VERBOSE_PERFOUT_ERRSTRING(s) \ do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) +/* + * The intended use cases for the nreaders and nwriters module parameters + * are as follows: + * + * 1. Specify only the nr_cpus kernel boot parameter. This will + * set both nreaders and nwriters to the value specified by + * nr_cpus for a mixed reader/writer test. + * + * 2. Specify the nr_cpus kernel boot parameter, but set + * rcuperf.nreaders to zero. This will set nwriters to the + * value specified by nr_cpus for an update-only test. + * + * 3. Specify the nr_cpus kernel boot parameter, but set + * rcuperf.nwriters to zero. This will set nreaders to the + * value specified by nr_cpus for a read-only test. + * + * Various other use cases may of course be specified. + */ + torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); -torture_param(int, nreaders, 0, "Number of RCU reader threads"); +torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); torture_param(bool, shutdown, !IS_ENABLED(MODULE), "Shutdown at end of performance tests."); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 308e6fdbced8..680c96d8c00f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -909,34 +909,38 @@ rcu_torture_writer(void *arg) int nsynctypes = 0; VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); - if (!can_expedite) { + if (!can_expedite) pr_alert("%s" TORTURE_FLAG - " GP expediting controlled from boot/sysfs for %s,\n", + " GP expediting controlled from boot/sysfs for %s.\n", torture_type, cur_ops->name); - pr_alert("%s" TORTURE_FLAG - " Disabled 
dynamic grace-period expediting.\n", - torture_type); - } /* Initialize synctype[] array. If none set, take default. */ if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1) gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; - if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) + if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) { synctype[nsynctypes++] = RTWS_COND_GET; - else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) - pr_alert("rcu_torture_writer: gp_cond without primitives.\n"); - if (gp_exp1 && cur_ops->exp_sync) + pr_info("%s: Testing conditional GPs.\n", __func__); + } else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) { + pr_alert("%s: gp_cond without primitives.\n", __func__); + } + if (gp_exp1 && cur_ops->exp_sync) { synctype[nsynctypes++] = RTWS_EXP_SYNC; - else if (gp_exp && !cur_ops->exp_sync) - pr_alert("rcu_torture_writer: gp_exp without primitives.\n"); - if (gp_normal1 && cur_ops->deferred_free) + pr_info("%s: Testing expedited GPs.\n", __func__); + } else if (gp_exp && !cur_ops->exp_sync) { + pr_alert("%s: gp_exp without primitives.\n", __func__); + } + if (gp_normal1 && cur_ops->deferred_free) { synctype[nsynctypes++] = RTWS_DEF_FREE; - else if (gp_normal && !cur_ops->deferred_free) - pr_alert("rcu_torture_writer: gp_normal without primitives.\n"); - if (gp_sync1 && cur_ops->sync) + pr_info("%s: Testing asynchronous GPs.\n", __func__); + } else if (gp_normal && !cur_ops->deferred_free) { + pr_alert("%s: gp_normal without primitives.\n", __func__); + } + if (gp_sync1 && cur_ops->sync) { synctype[nsynctypes++] = RTWS_SYNC; - else if (gp_sync && !cur_ops->sync) - pr_alert("rcu_torture_writer: gp_sync without primitives.\n"); + pr_info("%s: Testing normal GPs.\n", __func__); + } else if (gp_sync && !cur_ops->sync) { + pr_alert("%s: gp_sync without primitives.\n", __func__); + } if (WARN_ONCE(nsynctypes == 0, "rcu_torture_writer: No update-side primitives.\n")) { /* @@ -1011,6 +1015,9 @@ 
rcu_torture_writer(void *arg) rcu_unexpedite_gp(); if (++expediting > 3) expediting = -expediting; + } else if (!can_expedite) { /* Disabled during boot, recheck. */ + can_expedite = !rcu_gp_is_expedited() && + !rcu_gp_is_normal(); } rcu_torture_writer_state = RTWS_STUTTER; stutter_wait("rcu_torture_writer"); @@ -1021,6 +1028,10 @@ rcu_torture_writer(void *arg) while (can_expedite && expediting++ < 0) rcu_unexpedite_gp(); WARN_ON_ONCE(can_expedite && rcu_gp_is_expedited()); + if (!can_expedite) + pr_alert("%s" TORTURE_FLAG + " Dynamic grace-period expediting was disabled.\n", + torture_type); rcu_torture_writer_state = RTWS_STOPPING; torture_kthread_stopping("rcu_torture_writer"); return 0; @@ -1045,13 +1056,13 @@ rcu_torture_fakewriter(void *arg) torture_random(&rand) % (nfakewriters * 8) == 0) { cur_ops->cb_barrier(); } else if (gp_normal == gp_exp) { - if (torture_random(&rand) & 0x80) + if (cur_ops->sync && torture_random(&rand) & 0x80) cur_ops->sync(); - else + else if (cur_ops->exp_sync) cur_ops->exp_sync(); - } else if (gp_normal) { + } else if (gp_normal && cur_ops->sync) { cur_ops->sync(); - } else { + } else if (cur_ops->exp_sync) { cur_ops->exp_sync(); } stutter_wait("rcu_torture_fakewriter"); @@ -1557,11 +1568,10 @@ static int rcu_torture_barrier_init(void) atomic_set(&barrier_cbs_count, 0); atomic_set(&barrier_cbs_invoked, 0); barrier_cbs_tasks = - kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]), + kcalloc(n_barrier_cbs, sizeof(barrier_cbs_tasks[0]), GFP_KERNEL); barrier_cbs_wq = - kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), - GFP_KERNEL); + kcalloc(n_barrier_cbs, sizeof(barrier_cbs_wq[0]), GFP_KERNEL); if (barrier_cbs_tasks == NULL || !barrier_cbs_wq) return -ENOMEM; for (i = 0; i < n_barrier_cbs; i++) { @@ -1674,7 +1684,7 @@ static void rcu_torture_err_cb(struct rcu_head *rhp) * next grace period. Unlikely, but can happen. If it * does happen, the debug-objects subsystem won't have splatted. 
*/ - pr_alert("rcutorture: duplicated callback was invoked.\n"); + pr_alert("%s: duplicated callback was invoked.\n", KBUILD_MODNAME); } #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ @@ -1691,7 +1701,7 @@ static void rcu_test_debug_objects(void) init_rcu_head_on_stack(&rh1); init_rcu_head_on_stack(&rh2); - pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n"); + pr_alert("%s: WARN: Duplicate call_rcu() test starting.\n", KBUILD_MODNAME); /* Try to queue the rh2 pair of callbacks for the same grace period. */ preempt_disable(); /* Prevent preemption from interrupting test. */ @@ -1706,11 +1716,11 @@ static void rcu_test_debug_objects(void) /* Wait for them all to get done so we can safely return. */ rcu_barrier(); - pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n"); + pr_alert("%s: WARN: Duplicate call_rcu() test complete.\n", KBUILD_MODNAME); destroy_rcu_head_on_stack(&rh1); destroy_rcu_head_on_stack(&rh2); #else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ - pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n"); + pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n", KBUILD_MODNAME); #endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ } @@ -1799,7 +1809,7 @@ rcu_torture_init(void) if (firsterr) goto unwind; if (nfakewriters > 0) { - fakewriter_tasks = kzalloc(nfakewriters * + fakewriter_tasks = kcalloc(nfakewriters, sizeof(fakewriter_tasks[0]), GFP_KERNEL); if (fakewriter_tasks == NULL) { @@ -1814,7 +1824,7 @@ rcu_torture_init(void) if (firsterr) goto unwind; } - reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), + reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]), GFP_KERNEL); if (reader_tasks == NULL) { VERBOSE_TOROUT_ERRSTRING("out of memory"); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index d5cea81378cc..fb560fca9ef4 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -386,7 +386,7 @@ void 
cleanup_srcu_struct(struct srcu_struct *sp) flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work); if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) || WARN_ON(srcu_readers_active(sp))) { - pr_info("cleanup_srcu_struct: Active srcu_struct %p state: %d\n", sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); + pr_info("%s: Active srcu_struct %p state: %d\n", __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); return; /* Caller forgot to stop doing call_srcu()? */ } free_percpu(sp->sda); @@ -439,7 +439,7 @@ static void srcu_gp_start(struct srcu_struct *sp) struct srcu_data *sdp = this_cpu_ptr(sp->sda); int state; - lockdep_assert_held(&sp->lock); + lockdep_assert_held(&ACCESS_PRIVATE(sp, lock)); WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&sp->srcu_gp_seq)); @@ -492,8 +492,7 @@ static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq, */ static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) { - srcu_queue_delayed_work_on(sdp->cpu, system_power_efficient_wq, - &sdp->work, delay); + srcu_queue_delayed_work_on(sdp->cpu, rcu_gp_wq, &sdp->work, delay); } /* @@ -527,11 +526,11 @@ static void srcu_gp_end(struct srcu_struct *sp) { unsigned long cbdelay; bool cbs; + bool last_lvl; int cpu; unsigned long flags; unsigned long gpseq; int idx; - int idxnext; unsigned long mask; struct srcu_data *sdp; struct srcu_node *snp; @@ -555,11 +554,11 @@ static void srcu_gp_end(struct srcu_struct *sp) /* Initiate callback invocation as needed. 
*/ idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); - idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs); rcu_for_each_node_breadth_first(sp, snp) { spin_lock_irq_rcu_node(snp); cbs = false; - if (snp >= sp->level[rcu_num_lvls - 1]) + last_lvl = snp >= sp->level[rcu_num_lvls - 1]; + if (last_lvl) cbs = snp->srcu_have_cbs[idx] == gpseq; snp->srcu_have_cbs[idx] = gpseq; rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); @@ -572,13 +571,16 @@ static void srcu_gp_end(struct srcu_struct *sp) srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); /* Occasionally prevent srcu_data counter wrap. */ - if (!(gpseq & counter_wrap_check)) + if (!(gpseq & counter_wrap_check) && last_lvl) for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { sdp = per_cpu_ptr(sp->sda, cpu); spin_lock_irqsave_rcu_node(sdp, flags); if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100)) sdp->srcu_gp_seq_needed = gpseq; + if (ULONG_CMP_GE(gpseq, + sdp->srcu_gp_seq_needed_exp + 100)) + sdp->srcu_gp_seq_needed_exp = gpseq; spin_unlock_irqrestore_rcu_node(sdp, flags); } } @@ -593,9 +595,7 @@ static void srcu_gp_end(struct srcu_struct *sp) ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { srcu_gp_start(sp); spin_unlock_irq_rcu_node(sp); - /* Throttle expedited grace periods: Should be rare! */ - srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff - ? 
0 : SRCU_INTERVAL); + srcu_reschedule(sp, 0); } else { spin_unlock_irq_rcu_node(sp); } @@ -626,7 +626,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, spin_unlock_irqrestore_rcu_node(snp, flags); } spin_lock_irqsave_rcu_node(sp, flags); - if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) + if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) sp->srcu_gp_seq_needed_exp = s; spin_unlock_irqrestore_rcu_node(sp, flags); } @@ -691,8 +691,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) { WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); srcu_gp_start(sp); - queue_delayed_work(system_power_efficient_wq, &sp->work, - srcu_get_delay(sp)); + queue_delayed_work(rcu_gp_wq, &sp->work, srcu_get_delay(sp)); } spin_unlock_irqrestore_rcu_node(sp, flags); } @@ -1225,7 +1224,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) spin_unlock_irq_rcu_node(sp); if (pushgp) - queue_delayed_work(system_power_efficient_wq, &sp->work, delay); + queue_delayed_work(rcu_gp_wq, &sp->work, delay); } /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 491bdf39f276..2a734692a581 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1161,7 +1161,7 @@ static int rcu_is_cpu_rrupt_from_idle(void) */ static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) { - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rnp->gpnum)) WRITE_ONCE(rdp->gpwrap, true); if (ULONG_CMP_LT(rdp->rcu_iw_gpnum + ULONG_MAX / 4, rnp->gpnum)) @@ -1350,6 +1350,7 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) rsp->gp_kthread ? rsp->gp_kthread->state : ~0, rsp->gp_kthread ? 
task_cpu(rsp->gp_kthread) : -1); if (rsp->gp_kthread) { + pr_err("RCU grace-period kthread stack dump:\n"); sched_show_task(rsp->gp_kthread); wake_up_process(rsp->gp_kthread); } @@ -1628,7 +1629,7 @@ void rcu_cpu_stall_reset(void) static unsigned long rcu_cbs_completed(struct rcu_state *rsp, struct rcu_node *rnp) { - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); /* * If RCU is idle, we just wait for the next grace period. @@ -1675,7 +1676,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, bool ret = false; struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); /* * Pick up grace-period number for new callbacks. If this @@ -1803,7 +1804,7 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, { bool ret = false; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ if (!rcu_segcblist_pend_cbs(&rdp->cblist)) @@ -1843,7 +1844,7 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ if (!rcu_segcblist_pend_cbs(&rdp->cblist)) @@ -1871,7 +1872,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, bool ret; bool need_gp; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); /* Handle the ends of any preceding grace periods first. 
*/ if (rdp->completed == rnp->completed && @@ -2296,7 +2297,7 @@ static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { /* * Either we have not yet spawned the grace-period @@ -2358,7 +2359,7 @@ static bool rcu_start_gp(struct rcu_state *rsp) static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) { - lockdep_assert_held(&rcu_get_root(rsp)->lock); + raw_lockdep_assert_held_rcu_node(rcu_get_root(rsp)); WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); @@ -2383,7 +2384,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, unsigned long oldmask = 0; struct rcu_node *rnp_c; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); /* Walk up the rcu_node hierarchy. */ for (;;) { @@ -2447,7 +2448,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, unsigned long mask; struct rcu_node *rnp_p; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -2592,7 +2593,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) long mask; struct rcu_node *rnp = rnp_leaf; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) return; @@ -2691,7 +2692,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* Update counts and requeue any remaining callbacks. 
*/ rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl); smp_mb(); /* List handling before counting for rcu_barrier(). */ - rdp->n_cbs_invoked += count; rcu_segcblist_insert_count(&rdp->cblist, &rcl); /* Reinstate batch limit if we have worked down the excess. */ @@ -2845,10 +2845,8 @@ static void force_quiescent_state(struct rcu_state *rsp) !raw_spin_trylock(&rnp->fqslock); if (rnp_old != NULL) raw_spin_unlock(&rnp_old->fqslock); - if (ret) { - rsp->n_force_qs_lh++; + if (ret) return; - } rnp_old = rnp; } /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ @@ -2857,7 +2855,6 @@ static void force_quiescent_state(struct rcu_state *rsp) raw_spin_lock_irqsave_rcu_node(rnp_old, flags); raw_spin_unlock(&rnp_old->fqslock); if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { - rsp->n_force_qs_lh++; raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); return; /* Someone beat us to it. */ } @@ -3355,8 +3352,6 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) { struct rcu_node *rnp = rdp->mynode; - rdp->n_rcu_pending++; - /* Check for CPU stalls, if enabled. */ check_cpu_stall(rsp, rdp); @@ -3365,48 +3360,31 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) return 0; /* Is the RCU core waiting for a quiescent state from this CPU? */ - if (rcu_scheduler_fully_active && - rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && - rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_dynticks.rcu_qs_ctr)) { - rdp->n_rp_core_needs_qs++; - } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) { - rdp->n_rp_report_qs++; + if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) return 1; - } /* Does this CPU have callbacks ready to invoke? */ - if (rcu_segcblist_ready_cbs(&rdp->cblist)) { - rdp->n_rp_cb_ready++; + if (rcu_segcblist_ready_cbs(&rdp->cblist)) return 1; - } /* Has RCU gone idle with this CPU needing another grace period? 
*/ - if (cpu_needs_another_gp(rsp, rdp)) { - rdp->n_rp_cpu_needs_gp++; + if (cpu_needs_another_gp(rsp, rdp)) return 1; - } /* Has another RCU grace period completed? */ - if (READ_ONCE(rnp->completed) != rdp->completed) { /* outside lock */ - rdp->n_rp_gp_completed++; + if (READ_ONCE(rnp->completed) != rdp->completed) /* outside lock */ return 1; - } /* Has a new RCU grace period started? */ if (READ_ONCE(rnp->gpnum) != rdp->gpnum || - unlikely(READ_ONCE(rdp->gpwrap))) { /* outside lock */ - rdp->n_rp_gp_started++; + unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */ return 1; - } /* Does this CPU need a deferred NOCB wakeup? */ - if (rcu_nocb_need_deferred_wakeup(rdp)) { - rdp->n_rp_nocb_defer_wakeup++; + if (rcu_nocb_need_deferred_wakeup(rdp)) return 1; - } /* nothing to do */ - rdp->n_rp_need_nothing++; return 0; } @@ -3618,7 +3596,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) long mask; struct rcu_node *rnp = rnp_leaf; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); for (;;) { mask = rnp->grpmask; rnp = rnp->parent; @@ -3636,12 +3614,9 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) static void __init rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) { - unsigned long flags; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. 
*/ - raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != 1); @@ -3649,7 +3624,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->cpu = cpu; rdp->rsp = rsp; rcu_boot_init_nocb_percpu_data(rdp); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* @@ -4193,6 +4167,8 @@ static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp) pr_cont("\n"); } +struct workqueue_struct *rcu_gp_wq; + void __init rcu_init(void) { int cpu; @@ -4219,6 +4195,10 @@ void __init rcu_init(void) rcu_cpu_starting(cpu); rcutree_online_cpu(cpu); } + + /* Create workqueue for expedited GPs and for Tree SRCU. */ + rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); + WARN_ON(!rcu_gp_wq); } #include "tree_exp.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 6488a3b0e729..f491ab4f2e8e 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -146,12 +146,6 @@ struct rcu_node { /* boosting for this rcu_node structure. */ unsigned int boost_kthread_status; /* State of boost_kthread_task for tracing. */ - unsigned long n_tasks_boosted; - /* Total number of tasks boosted. */ - unsigned long n_exp_boosts; - /* Number of tasks boosted for expedited GP. */ - unsigned long n_normal_boosts; - /* Number of tasks boosted for normal GP. */ #ifdef CONFIG_RCU_NOCB_CPU struct swait_queue_head nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ @@ -184,13 +178,6 @@ union rcu_noqs { u16 s; /* Set of bits, aggregate OR here. */ }; -/* Index values for nxttail array in struct rcu_data. */ -#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ -#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ -#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */ -#define RCU_NEXT_TAIL 3 -#define RCU_NEXT_SIZE 4 - /* Per-CPU data for read-copy update. 
*/ struct rcu_data { /* 1) quiescent-state and grace-period handling : */ @@ -217,8 +204,6 @@ struct rcu_data { /* different grace periods. */ long qlen_last_fqs_check; /* qlen at last check for QS forcing */ - unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ - unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */ unsigned long n_force_qs_snap; /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ @@ -234,18 +219,7 @@ struct rcu_data { /* Grace period that needs help */ /* from cond_resched(). */ - /* 5) __rcu_pending() statistics. */ - unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ - unsigned long n_rp_core_needs_qs; - unsigned long n_rp_report_qs; - unsigned long n_rp_cb_ready; - unsigned long n_rp_cpu_needs_gp; - unsigned long n_rp_gp_completed; - unsigned long n_rp_gp_started; - unsigned long n_rp_nocb_defer_wakeup; - unsigned long n_rp_need_nothing; - - /* 6) _rcu_barrier(), OOM callbacks, and expediting. */ + /* 5) _rcu_barrier(), OOM callbacks, and expediting. */ struct rcu_head barrier_head; #ifdef CONFIG_RCU_FAST_NO_HZ struct rcu_head oom_head; @@ -256,7 +230,7 @@ struct rcu_data { atomic_long_t exp_workdone3; /* # done by others #3. */ int exp_dynticks_snap; /* Double-check need for IPI. */ - /* 7) Callback offloading. */ + /* 6) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU struct rcu_head *nocb_head; /* CBs waiting for kthread. */ struct rcu_head **nocb_tail; @@ -283,7 +257,7 @@ struct rcu_data { /* Leader CPU takes GP-end wakeups. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ - /* 8) RCU CPU stall data. */ + /* 7) RCU CPU stall data. */ unsigned int softirq_snap; /* Snapshot of softirq activity. */ /* ->rcu_iw* fields protected by leaf rcu_node ->lock. */ struct irq_work rcu_iw; /* Check for non-irq activity. */ @@ -374,10 +348,6 @@ struct rcu_state { /* kthreads, if configured. */ unsigned long n_force_qs; /* Number of calls to */ /* force_quiescent_state(). 
*/ - unsigned long n_force_qs_lh; /* ~Number of calls leaving */ - /* due to lock unavailable. */ - unsigned long n_force_qs_ngp; /* Number of calls leaving */ - /* due to no GP active. */ unsigned long gp_start; /* Time at which GP started, */ /* but in jiffies. */ unsigned long gp_activity; /* Time of last GP kthread */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 46d61b597731..f72eefab8543 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -29,6 +29,15 @@ static void rcu_exp_gp_seq_start(struct rcu_state *rsp) } /* + * Return the value that expedited-grace-period counter will have + * at the end of the current grace period. + */ +static __maybe_unused unsigned long rcu_exp_gp_seq_endval(struct rcu_state *rsp) +{ + return rcu_seq_endval(&rsp->expedited_sequence); +} + +/* * Record the end of an expedited grace period. */ static void rcu_exp_gp_seq_end(struct rcu_state *rsp) @@ -366,21 +375,30 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, int ret; struct rcu_node *rnp; + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset")); sync_exp_reset_tree(rsp); + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select")); rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Each pass checks a CPU for identity, offline, and idle.
*/ mask_ofl_test = 0; - for_each_leaf_node_possible_cpu(rnp, cpu) { + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { + unsigned long mask = leaf_node_cpu_bit(rnp, cpu); struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu); + int snap; - rdp->exp_dynticks_snap = - rcu_dynticks_snap(rdp->dynticks); if (raw_smp_processor_id() == cpu || - rcu_dynticks_in_eqs(rdp->exp_dynticks_snap) || - !(rnp->qsmaskinitnext & rdp->grpmask)) - mask_ofl_test |= rdp->grpmask; + !(rnp->qsmaskinitnext & mask)) { + mask_ofl_test |= mask; + } else { + snap = rcu_dynticks_snap(rdtp); + if (rcu_dynticks_in_eqs(snap)) + mask_ofl_test |= mask; + else + rdp->exp_dynticks_snap = snap; + } } mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; @@ -394,7 +412,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* IPI the remaining CPUs for expedited quiescent state. */ - for_each_leaf_node_possible_cpu(rnp, cpu) { + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { unsigned long mask = leaf_node_cpu_bit(rnp, cpu); struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); @@ -417,6 +435,7 @@ retry_ipi: (rnp->expmask & mask)) { /* Online, so delay for a bit and try again. 
*/ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl")); schedule_timeout_uninterruptible(1); goto retry_ipi; } @@ -443,6 +462,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) struct rcu_node *rnp_root = rcu_get_root(rsp); int ret; + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("startwait")); jiffies_stall = rcu_jiffies_till_stall_check(); jiffies_start = jiffies; @@ -606,7 +626,7 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp, rew.rew_rsp = rsp; rew.rew_s = s; INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); - schedule_work(&rew.rew_work); + queue_work(rcu_gp_wq, &rew.rew_work); } /* Wait for expedited grace period to complete. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fb88a028deec..84fbee4686d3 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -180,7 +180,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0); struct task_struct *t = current; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); WARN_ON_ONCE(rdp->mynode != rnp); WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1); @@ -560,8 +560,14 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) } t = list_entry(rnp->gp_tasks->prev, struct task_struct, rcu_node_entry); - list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { + /* + * We could be printing a lot while holding a spinlock. + * Avoid triggering hard lockup. + */ + touch_nmi_watchdog(); sched_show_task(t); + } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } @@ -957,14 +963,10 @@ static int rcu_boost(struct rcu_node *rnp) * expedited grace period must boost all blocked tasks, including * those blocking the pre-existing normal grace period. 
*/ - if (rnp->exp_tasks != NULL) { + if (rnp->exp_tasks != NULL) tb = rnp->exp_tasks; - rnp->n_exp_boosts++; - } else { + else tb = rnp->boost_tasks; - rnp->n_normal_boosts++; - } - rnp->n_tasks_boosted++; /* * We boost task t by manufacturing an rt_mutex that appears to @@ -1042,7 +1044,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) { struct task_struct *t; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; @@ -1677,6 +1679,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) char *ticks_title; unsigned long ticks_value; + /* + * We could be printing a lot while holding a spinlock. Avoid + * triggering hard lockup. + */ + touch_nmi_watchdog(); + if (rsp->gpnum == rdp->gpnum) { ticks_title = "ticks this GP"; ticks_value = rdp->ticks_this_gp; @@ -2235,7 +2243,6 @@ static int rcu_nocb_kthread(void *arg) smp_mb__before_atomic(); /* _add after CB invocation. 
*/ atomic_long_add(-c, &rdp->nocb_q_count); atomic_long_add(-cl, &rdp->nocb_q_count_lazy); - rdp->n_nocbs_invoked += c; } return 0; } @@ -2312,8 +2319,11 @@ void __init rcu_init_nohz(void) cpumask_and(rcu_nocb_mask, cpu_possible_mask, rcu_nocb_mask); } - pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n", - cpumask_pr_args(rcu_nocb_mask)); + if (cpumask_empty(rcu_nocb_mask)) + pr_info("\tOffload RCU callbacks from CPUs: (none).\n"); + else + pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n", + cpumask_pr_args(rcu_nocb_mask)); if (rcu_nocb_poll) pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); diff --git a/kernel/relay.c b/kernel/relay.c index 41280033a4c5..c955b10c973c 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -163,7 +163,7 @@ static struct rchan_buf *relay_create_buf(struct rchan *chan) { struct rchan_buf *buf; - if (chan->n_subbufs > UINT_MAX / sizeof(size_t *)) + if (chan->n_subbufs > KMALLOC_MAX_SIZE / sizeof(size_t *)) return NULL; buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); @@ -611,7 +611,6 @@ free_bufs: kref_put(&chan->kref, relay_destroy_channel); mutex_unlock(&relay_channels_mutex); - kfree(chan); return NULL; } EXPORT_SYMBOL_GPL(relay_open); @@ -925,12 +924,12 @@ static __poll_t relay_file_poll(struct file *filp, poll_table *wait) struct rchan_buf *buf = filp->private_data; if (buf->finalized) - return POLLERR; + return EPOLLERR; if (filp->f_mode & FMODE_READ) { poll_wait(filp, &buf->read_wait, wait); if (!relay_buf_empty(buf)) - mask |= POLLIN | POLLRDNORM; + mask |= EPOLLIN | EPOLLRDNORM; } return mask; diff --git a/kernel/resource.c b/kernel/resource.c index 54ba6de3757c..2af6c03858b9 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -651,7 +651,8 @@ static int __find_resource(struct resource *root, struct resource *old, alloc.start = constraint->alignf(constraint->alignf_data, &avail, size, constraint->align); alloc.end = alloc.start + size - 1; - if (resource_contains(&avail, &alloc)) { + if (alloc.start <= 
alloc.end && + resource_contains(&avail, &alloc)) { new->start = alloc.start; new->end = alloc.end; return 0; @@ -1022,6 +1023,7 @@ static void __init __reserve_region_with_split(struct resource *root, struct resource *conflict; struct resource *res = alloc_resource(GFP_ATOMIC); struct resource *next_res = NULL; + int type = resource_type(root); if (!res) return; @@ -1029,7 +1031,7 @@ static void __init __reserve_region_with_split(struct resource *root, res->name = name; res->start = start; res->end = end; - res->flags = IORESOURCE_BUSY; + res->flags = type | IORESOURCE_BUSY; res->desc = IORES_DESC_NONE; while (1) { @@ -1064,7 +1066,7 @@ static void __init __reserve_region_with_split(struct resource *root, next_res->name = name; next_res->start = conflict->end + 1; next_res->end = end; - next_res->flags = IORESOURCE_BUSY; + next_res->flags = type | IORESOURCE_BUSY; next_res->desc = IORES_DESC_NONE; } } else { @@ -1478,7 +1480,7 @@ void __devm_release_region(struct device *dev, struct resource *parent, EXPORT_SYMBOL(__devm_release_region); /* - * Called from init/main.c to reserve IO ports. + * Reserve I/O ports or memory based on "reserve=" kernel parameter. */ #define MAXRESERVE 4 static int __init reserve_setup(char *str) @@ -1489,26 +1491,38 @@ static int __init reserve_setup(char *str) for (;;) { unsigned int io_start, io_num; int x = reserved; + struct resource *parent; - if (get_option (&str, &io_start) != 2) + if (get_option(&str, &io_start) != 2) break; - if (get_option (&str, &io_num) == 0) + if (get_option(&str, &io_num) == 0) break; if (x < MAXRESERVE) { struct resource *res = reserve + x; + + /* + * If the region starts below 0x10000, we assume it's + * I/O port space; otherwise assume it's memory. 
+ */ + if (io_start < 0x10000) { + res->flags = IORESOURCE_IO; + parent = &ioport_resource; + } else { + res->flags = IORESOURCE_MEM; + parent = &iomem_resource; + } res->name = "reserved"; res->start = io_start; res->end = io_start + io_num - 1; - res->flags = IORESOURCE_BUSY; + res->flags |= IORESOURCE_BUSY; res->desc = IORES_DESC_NONE; res->child = NULL; - if (request_resource(res->start >= 0x10000 ? &iomem_resource : &ioport_resource, res) == 0) + if (request_resource(parent, res) == 0) reserved = x+1; } } return 1; } - __setup("reserve=", reserve_setup); /* @@ -1563,17 +1577,17 @@ static int strict_iomem_checks; /* * check if an address is reserved in the iomem resource tree - * returns 1 if reserved, 0 if not reserved. + * returns true if reserved, false if not reserved. */ -int iomem_is_exclusive(u64 addr) +bool iomem_is_exclusive(u64 addr) { struct resource *p = &iomem_resource; - int err = 0; + bool err = false; loff_t l; int size = PAGE_SIZE; if (!strict_iomem_checks) - return 0; + return false; addr = addr & PAGE_MASK; @@ -1596,7 +1610,7 @@ int iomem_is_exclusive(u64 addr) continue; if (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM) || p->flags & IORESOURCE_EXCLUSIVE) { - err = 1; + err = true; break; } } diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index e2f9d4feff40..d9a02b318108 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -17,8 +17,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif obj-y += core.o loadavg.o clock.o cputime.o -obj-y += idle_task.o fair.o rt.o deadline.o -obj-y += wait.o wait_bit.o swait.o completion.o idle.o +obj-y += idle.o fair.o rt.o deadline.o +obj-y += wait.o wait_bit.o swait.o completion.o + obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index a43df5193538..6be6c575b6cd 100644 --- a/kernel/sched/autogroup.c +++ 
b/kernel/sched/autogroup.c @@ -1,13 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 +/* + * Auto-group scheduling implementation: + */ #include "sched.h" -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/kallsyms.h> -#include <linux/utsname.h> -#include <linux/security.h> -#include <linux/export.h> - unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; static struct autogroup autogroup_default; static atomic_t autogroup_seq_nr; @@ -169,18 +165,19 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) autogroup_kref_put(prev); } -/* Allocates GFP_KERNEL, cannot be called under any spinlock */ +/* Allocates GFP_KERNEL, cannot be called under any spinlock: */ void sched_autogroup_create_attach(struct task_struct *p) { struct autogroup *ag = autogroup_create(); autogroup_move_group(p, ag); - /* drop extra reference added by autogroup_create() */ + + /* Drop extra reference added by autogroup_create(): */ autogroup_kref_put(ag); } EXPORT_SYMBOL(sched_autogroup_create_attach); -/* Cannot be called under siglock. Currently has no users */ +/* Cannot be called under siglock. Currently has no users: */ void sched_autogroup_detach(struct task_struct *p) { autogroup_move_group(p, &autogroup_default); @@ -203,7 +200,6 @@ static int __init setup_autogroup(char *str) return 1; } - __setup("noautogroup", setup_autogroup); #ifdef CONFIG_PROC_FS @@ -225,7 +221,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) if (nice < 0 && !can_nice(current, nice)) return -EPERM; - /* this is a heavy operation taking global locks.. */ + /* This is a heavy operation, taking global locks.. 
*/ if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) return -EAGAIN; @@ -268,4 +264,4 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen) return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); } -#endif /* CONFIG_SCHED_DEBUG */ +#endif diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h index 27cd22b89824..b96419974a1f 100644 --- a/kernel/sched/autogroup.h +++ b/kernel/sched/autogroup.h @@ -1,15 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifdef CONFIG_SCHED_AUTOGROUP -#include <linux/kref.h> -#include <linux/rwsem.h> -#include <linux/sched/autogroup.h> - struct autogroup { /* - * reference doesn't mean how many thread attach to this - * autogroup now. It just stands for the number of task - * could use this autogroup. + * Reference doesn't mean how many threads attach to this + * autogroup now. It just stands for the number of tasks + * which could use this autogroup. */ struct kref kref; struct task_group *tg; @@ -56,11 +52,9 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg) return tg; } -#ifdef CONFIG_SCHED_DEBUG static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) { return 0; } -#endif #endif /* CONFIG_SCHED_AUTOGROUP */ diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index e086babe6c61..10c83e73837a 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -1,5 +1,5 @@ /* - * sched_clock for unstable cpu clocks + * sched_clock() for unstable CPU clocks * * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra * @@ -11,7 +11,7 @@ * Guillaume Chazarain <guichaz@gmail.com> * * - * What: + * What this file implements: * * cpu_clock(i) provides a fast (execution time) high resolution * clock with bounded drift between CPUs. The value of cpu_clock(i) @@ -26,11 +26,11 @@ * at 0 on boot (but people really shouldn't rely on that). * * cpu_clock(i) -- can be used from any context, including NMI. - * local_clock() -- is cpu_clock() on the current cpu. 
+ * local_clock() -- is cpu_clock() on the current CPU. * * sched_clock_cpu(i) * - * How: + * How it is implemented: * * The implementation either uses sched_clock() when * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the @@ -52,19 +52,7 @@ * that is otherwise invisible (TSC gets stopped). * */ -#include <linux/spinlock.h> -#include <linux/hardirq.h> -#include <linux/export.h> -#include <linux/percpu.h> -#include <linux/ktime.h> -#include <linux/sched.h> -#include <linux/nmi.h> -#include <linux/sched/clock.h> -#include <linux/static_key.h> -#include <linux/workqueue.h> -#include <linux/compiler.h> -#include <linux/tick.h> -#include <linux/init.h> +#include "sched.h" /* * Scheduler clock - returns current time in nanosec units. @@ -302,21 +290,21 @@ again: * cmpxchg64 below only protects one readout. * * We must reread via sched_clock_local() in the retry case on - * 32bit as an NMI could use sched_clock_local() via the + * 32-bit kernels as an NMI could use sched_clock_local() via the * tracer and hit between the readout of - * the low32bit and the high 32bit portion. + * the low 32-bit and the high 32-bit portion. */ this_clock = sched_clock_local(my_scd); /* - * We must enforce atomic readout on 32bit, otherwise the - * update on the remote cpu can hit inbetween the readout of - * the low32bit and the high 32bit portion. + * We must enforce atomic readout on 32-bit, otherwise the + * update on the remote CPU can hit inbetween the readout of + * the low 32-bit and the high 32-bit portion. */ remote_clock = cmpxchg64(&scd->clock, 0, 0); #else /* - * On 64bit the read of [my]scd->clock is atomic versus the - * update, so we can avoid the above 32bit dance. + * On 64-bit kernels the read of [my]scd->clock is atomic versus the + * update, so we can avoid the above 32-bit dance. 
*/ sched_clock_local(my_scd); again: diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 0926aef10dad..e426b0cb9ac6 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -11,10 +11,7 @@ * typically be used for exclusion which gives rise to priority inversion. * Waiting for completion is a typically sync point, but not an exclusion point. */ - -#include <linux/sched/signal.h> -#include <linux/sched/debug.h> -#include <linux/completion.h> +#include "sched.h" /** * complete: - signals a single thread waiting on this completion @@ -283,7 +280,7 @@ EXPORT_SYMBOL(wait_for_completion_killable_timeout); bool try_wait_for_completion(struct completion *x) { unsigned long flags; - int ret = 1; + bool ret = true; /* * Since x->done will need to be locked only @@ -292,11 +289,11 @@ bool try_wait_for_completion(struct completion *x) * return early in the blocking case. */ if (!READ_ONCE(x->done)) - return 0; + return false; spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) - ret = 0; + ret = false; else if (x->done != UINT_MAX) x->done--; spin_unlock_irqrestore(&x->wait.lock, flags); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3da7a2444a91..5e10aaeebfcc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5,37 +5,11 @@ * * Copyright (C) 1991-2002 Linus Torvalds */ -#include <linux/sched.h> -#include <linux/sched/clock.h> -#include <uapi/linux/sched/types.h> -#include <linux/sched/loadavg.h> -#include <linux/sched/hotplug.h> -#include <linux/wait_bit.h> -#include <linux/cpuset.h> -#include <linux/delayacct.h> -#include <linux/init_task.h> -#include <linux/context_tracking.h> -#include <linux/rcupdate_wait.h> -#include <linux/compat.h> - -#include <linux/blkdev.h> -#include <linux/kprobes.h> -#include <linux/mmu_context.h> -#include <linux/module.h> -#include <linux/nmi.h> -#include <linux/prefetch.h> -#include <linux/profile.h> -#include <linux/security.h> -#include <linux/syscalls.h> -#include 
<linux/sched/isolation.h> +#include "sched.h" #include <asm/switch_to.h> #include <asm/tlb.h> -#ifdef CONFIG_PARAVIRT -#include <asm/paravirt.h> -#endif -#include "sched.h" #include "../workqueue_internal.h" #include "../smpboot.h" @@ -135,7 +109,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) * [L] ->on_rq * RELEASE (rq->lock) * - * If we observe the old cpu in task_rq_lock, the acquire of + * If we observe the old CPU in task_rq_lock, the acquire of * the old rq->lock will fully serialize against the stores. * * If we observe the new CPU in task_rq_lock, the acquire will @@ -333,7 +307,7 @@ void hrtick_start(struct rq *rq, u64 delay) } #endif /* CONFIG_SMP */ -static void init_rq_hrtick(struct rq *rq) +static void hrtick_rq_init(struct rq *rq) { #ifdef CONFIG_SMP rq->hrtick_csd_pending = 0; @@ -351,7 +325,7 @@ static inline void hrtick_clear(struct rq *rq) { } -static inline void init_rq_hrtick(struct rq *rq) +static inline void hrtick_rq_init(struct rq *rq) { } #endif /* CONFIG_SCHED_HRTICK */ @@ -609,7 +583,7 @@ static inline bool got_nohz_idle_kick(void) { int cpu = smp_processor_id(); - if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu))) + if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK)) return false; if (idle_cpu(cpu) && !need_resched()) @@ -619,7 +593,7 @@ static inline bool got_nohz_idle_kick(void) * We can't run Idle Load Balance on this CPU for this time so we * cancel it and clear NOHZ_BALANCE_KICK */ - clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); + atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); return false; } @@ -900,7 +874,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) * this case, we can save a useless back to back clock update. 
*/ if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) - rq_clock_skip_update(rq, true); + rq_clock_skip_update(rq); } #ifdef CONFIG_SMP @@ -1457,7 +1431,7 @@ EXPORT_SYMBOL_GPL(kick_process); * * - cpu_active must be a subset of cpu_online * - * - on cpu-up we allow per-cpu kthreads on the online && !active cpu, + * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, * see __set_cpus_allowed_ptr(). At this point the newly online * CPU isn't yet part of the sched domains, and balancing will not * see it. @@ -1630,16 +1604,16 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) #ifdef CONFIG_SMP if (cpu == rq->cpu) { - schedstat_inc(rq->ttwu_local); - schedstat_inc(p->se.statistics.nr_wakeups_local); + __schedstat_inc(rq->ttwu_local); + __schedstat_inc(p->se.statistics.nr_wakeups_local); } else { struct sched_domain *sd; - schedstat_inc(p->se.statistics.nr_wakeups_remote); + __schedstat_inc(p->se.statistics.nr_wakeups_remote); rcu_read_lock(); for_each_domain(rq->cpu, sd) { if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { - schedstat_inc(sd->ttwu_wake_remote); + __schedstat_inc(sd->ttwu_wake_remote); break; } } @@ -1647,14 +1621,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) } if (wake_flags & WF_MIGRATED) - schedstat_inc(p->se.statistics.nr_wakeups_migrate); + __schedstat_inc(p->se.statistics.nr_wakeups_migrate); #endif /* CONFIG_SMP */ - schedstat_inc(rq->ttwu_count); - schedstat_inc(p->se.statistics.nr_wakeups); + __schedstat_inc(rq->ttwu_count); + __schedstat_inc(p->se.statistics.nr_wakeups); if (wake_flags & WF_SYNC) - schedstat_inc(p->se.statistics.nr_wakeups_sync); + __schedstat_inc(p->se.statistics.nr_wakeups_sync); } static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) @@ -2461,6 +2435,7 @@ void wake_up_new_task(struct task_struct *p) * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, * as we're not fully set-up yet. 
*/ + p->recent_used_cpu = task_cpu(p); __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif rq = __task_rq_lock(p, &rf); @@ -2487,17 +2462,17 @@ void wake_up_new_task(struct task_struct *p) #ifdef CONFIG_PREEMPT_NOTIFIERS -static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE; +static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); void preempt_notifier_inc(void) { - static_key_slow_inc(&preempt_notifier_key); + static_branch_inc(&preempt_notifier_key); } EXPORT_SYMBOL_GPL(preempt_notifier_inc); void preempt_notifier_dec(void) { - static_key_slow_dec(&preempt_notifier_key); + static_branch_dec(&preempt_notifier_key); } EXPORT_SYMBOL_GPL(preempt_notifier_dec); @@ -2507,7 +2482,7 @@ EXPORT_SYMBOL_GPL(preempt_notifier_dec); */ void preempt_notifier_register(struct preempt_notifier *notifier) { - if (!static_key_false(&preempt_notifier_key)) + if (!static_branch_unlikely(&preempt_notifier_key)) WARN(1, "registering preempt_notifier while notifiers disabled\n"); hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); @@ -2536,7 +2511,7 @@ static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) { - if (static_key_false(&preempt_notifier_key)) + if (static_branch_unlikely(&preempt_notifier_key)) __fire_sched_in_preempt_notifiers(curr); } @@ -2554,7 +2529,7 @@ static __always_inline void fire_sched_out_preempt_notifiers(struct task_struct *curr, struct task_struct *next) { - if (static_key_false(&preempt_notifier_key)) + if (static_branch_unlikely(&preempt_notifier_key)) __fire_sched_out_preempt_notifiers(curr, next); } @@ -2600,22 +2575,46 @@ static inline void finish_task(struct task_struct *prev) #endif } -static inline void finish_lock_switch(struct rq *rq) +static inline void +prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf) { + /* + * Since the runqueue lock will be released by the next + * task 
(which is an invalid locking op but in the case + * of the scheduler it's an obvious special-case), so we + * do an early lockdep release here: + */ + rq_unpin_lock(rq, rf); + spin_release(&rq->lock.dep_map, 1, _THIS_IP_); #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ - rq->lock.owner = current; + rq->lock.owner = next; #endif +} + +static inline void finish_lock_switch(struct rq *rq) +{ /* * If we are tracking spinlock dependencies then we have to * fix up the runqueue lock - which gets 'carried over' from * prev into current: */ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - raw_spin_unlock_irq(&rq->lock); } +/* + * NOP if the arch has not defined these: + */ + +#ifndef prepare_arch_switch +# define prepare_arch_switch(next) do { } while (0) +#endif + +#ifndef finish_arch_post_lock_switch +# define finish_arch_post_lock_switch() do { } while (0) +#endif + /** * prepare_task_switch - prepare to switch tasks * @rq: the runqueue preparing to switch @@ -2698,23 +2697,27 @@ static struct rq *finish_task_switch(struct task_struct *prev) prev_state = prev->state; vtime_task_switch(prev); perf_event_task_sched_in(prev, current); - /* - * The membarrier system call requires a full memory barrier - * after storing to rq->curr, before going back to user-space. - * - * TODO: This smp_mb__after_unlock_lock can go away if PPC end - * up adding a full barrier to switch_mm(), or we should figure - * out if a smp_mb__after_unlock_lock is really the proper API - * to use. - */ - smp_mb__after_unlock_lock(); finish_task(prev); finish_lock_switch(rq); finish_arch_post_lock_switch(); fire_sched_in_preempt_notifiers(current); - if (mm) + /* + * When switching through a kernel thread, the loop in + * membarrier_{private,global}_expedited() may have observed that + * kernel thread and not issued an IPI. It is therefore possible to + * schedule between user->kernel->user threads without passing though + * switch_mm(). 
Membarrier requires a barrier after storing to + * rq->curr, before returning to userspace, so provide them here: + * + * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly + * provided by mmdrop(), + * - a sync_core for SYNC_CORE. + */ + if (mm) { + membarrier_mm_sync_core_before_usermode(mm); mmdrop(mm); + } if (unlikely(prev_state == TASK_DEAD)) { if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); @@ -2818,6 +2821,13 @@ context_switch(struct rq *rq, struct task_struct *prev, */ arch_start_context_switch(prev); + /* + * If mm is non-NULL, we pass through switch_mm(). If mm is + * NULL, we will pass through mmdrop() in finish_task_switch(). + * Both of these contain the full memory barrier required by + * membarrier after storing to rq->curr, before returning to + * user-space. + */ if (!mm) { next->active_mm = oldmm; mmgrab(oldmm); @@ -2832,14 +2842,7 @@ context_switch(struct rq *rq, struct task_struct *prev, rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); - /* - * Since the runqueue lock will be released by the next - * task (which is an invalid locking op but in the case - * of the scheduler it's an obvious special-case), so we - * do an early lockdep release here: - */ - rq_unpin_lock(rq, rf); - spin_release(&rq->lock.dep_map, 1, _THIS_IP_); + prepare_lock_switch(rq, next, rf); /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); @@ -3020,7 +3023,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) /* - * 64-bit doesn't need locks to atomically read a 64bit value. + * 64-bit doesn't need locks to atomically read a 64-bit value. * So we have a optimization chance when the task's delta_exec is 0. * Reading ->on_cpu is racy, but this is ok. 
* @@ -3079,35 +3082,99 @@ void scheduler_tick(void) rq->idle_balance = idle_cpu(cpu); trigger_load_balance(rq); #endif - rq_last_tick_reset(rq); } #ifdef CONFIG_NO_HZ_FULL -/** - * scheduler_tick_max_deferment - * - * Keep at least one tick per second when a single - * active task is running because the scheduler doesn't - * yet completely support full dynticks environment. - * - * This makes sure that uptime, CFS vruntime, load - * balancing, etc... continue to move forward, even - * with a very low granularity. - * - * Return: Maximum deferment in nanoseconds. - */ -u64 scheduler_tick_max_deferment(void) + +struct tick_work { + int cpu; + struct delayed_work work; +}; + +static struct tick_work __percpu *tick_work_cpu; + +static void sched_tick_remote(struct work_struct *work) { - struct rq *rq = this_rq(); - unsigned long next, now = READ_ONCE(jiffies); + struct delayed_work *dwork = to_delayed_work(work); + struct tick_work *twork = container_of(dwork, struct tick_work, work); + int cpu = twork->cpu; + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; - next = rq->last_sched_tick + HZ; + /* + * Handle the tick only if it appears the remote CPU is running in full + * dynticks mode. The check is racy by nature, but missing a tick or + * having one too much is no big deal because the scheduler tick updates + * statistics and checks timeslices in a time-independent way, regardless + * of when exactly it is running. + */ + if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) { + struct task_struct *curr; + u64 delta; - if (time_before_eq(next, now)) - return 0; + rq_lock_irq(rq, &rf); + update_rq_clock(rq); + curr = rq->curr; + delta = rq_clock_task(rq) - curr->se.exec_start; + + /* + * Make sure the next tick runs within a reasonable + * amount of time. 
+ */ + WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + curr->sched_class->task_tick(rq, curr, 0); + rq_unlock_irq(rq, &rf); + } - return jiffies_to_nsecs(next - now); + /* + * Run the remote tick once per second (1Hz). This arbitrary + * frequency is large enough to avoid overload but short enough + * to keep scheduler internal stats reasonably up to date. + */ + queue_delayed_work(system_unbound_wq, dwork, HZ); } + +static void sched_tick_start(int cpu) +{ + struct tick_work *twork; + + if (housekeeping_cpu(cpu, HK_FLAG_TICK)) + return; + + WARN_ON_ONCE(!tick_work_cpu); + + twork = per_cpu_ptr(tick_work_cpu, cpu); + twork->cpu = cpu; + INIT_DELAYED_WORK(&twork->work, sched_tick_remote); + queue_delayed_work(system_unbound_wq, &twork->work, HZ); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void sched_tick_stop(int cpu) +{ + struct tick_work *twork; + + if (housekeeping_cpu(cpu, HK_FLAG_TICK)) + return; + + WARN_ON_ONCE(!tick_work_cpu); + + twork = per_cpu_ptr(tick_work_cpu, cpu); + cancel_delayed_work_sync(&twork->work); +} +#endif /* CONFIG_HOTPLUG_CPU */ + +int __init sched_tick_offload_init(void) +{ + tick_work_cpu = alloc_percpu(struct tick_work); + BUG_ON(!tick_work_cpu); + + return 0; +} + +#else /* !CONFIG_NO_HZ_FULL */ +static inline void sched_tick_start(int cpu) { } +static inline void sched_tick_stop(int cpu) { } #endif #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ @@ -3354,6 +3421,9 @@ static void __sched notrace __schedule(bool preempt) * Make sure that signal_pending_state()->signal_pending() below * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) * done by the caller to avoid the race with signal_wake_up(). + * + * The membarrier system call requires a full memory barrier + * after coming from user-space, before storing to rq->curr. 
*/ rq_lock(rq, &rf); smp_mb__after_spinlock(); @@ -3401,17 +3471,16 @@ static void __sched notrace __schedule(bool preempt) /* * The membarrier system call requires each architecture * to have a full memory barrier after updating - * rq->curr, before returning to user-space. For TSO - * (e.g. x86), the architecture must provide its own - * barrier in switch_mm(). For weakly ordered machines - * for which spin_unlock() acts as a full memory - * barrier, finish_lock_switch() in common code takes - * care of this barrier. For weakly ordered machines for - * which spin_unlock() acts as a RELEASE barrier (only - * arm64 and PowerPC), arm64 has a full barrier in - * switch_to(), and PowerPC has - * smp_mb__after_unlock_lock() before - * finish_lock_switch(). + * rq->curr, before returning to user-space. + * + * Here are the schemes providing that barrier on the + * various architectures: + * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. + * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. + * - finish_lock_switch() for weakly-ordered + * architectures where spin_unlock is a full barrier, + * - switch_to() for arm64 (weakly-ordered, spin_unlock + * is a RELEASE barrier), */ ++*switch_count; @@ -4853,7 +4922,7 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ret = sched_getaffinity(pid, mask); if (ret == 0) { - size_t retlen = min_t(size_t, len, cpumask_size()); + unsigned int retlen = min(len, cpumask_size()); if (copy_to_user(user_mask_ptr, mask, retlen)) ret = -EFAULT; @@ -4873,7 +4942,7 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, * * Return: 0. 
*/ -SYSCALL_DEFINE0(sched_yield) +static void do_sched_yield(void) { struct rq_flags rf; struct rq *rq; @@ -4894,7 +4963,11 @@ SYSCALL_DEFINE0(sched_yield) sched_preempt_enable_no_resched(); schedule(); +} +SYSCALL_DEFINE0(sched_yield) +{ + do_sched_yield(); return 0; } @@ -4978,7 +5051,7 @@ EXPORT_SYMBOL(__cond_resched_softirq); void __sched yield(void) { set_current_state(TASK_RUNNING); - sys_sched_yield(); + do_sched_yield(); } EXPORT_SYMBOL(yield); @@ -5487,6 +5560,7 @@ void idle_task_exit(void) if (mm != &init_mm) { switch_mm(mm, &init_mm, current); + current->active_mm = &init_mm; finish_arch_post_lock_switch(); } mmdrop(mm); @@ -5767,6 +5841,7 @@ int sched_cpu_starting(unsigned int cpu) { set_cpu_rq_start_time(cpu); sched_rq_cpu_starting(cpu); + sched_tick_start(cpu); return 0; } @@ -5778,6 +5853,7 @@ int sched_cpu_dying(unsigned int cpu) /* Handle pending wakeups and then migrate everything off */ sched_ttwu_pending(); + sched_tick_stop(cpu); rq_lock_irqsave(rq, &rf); if (rq->rd) { @@ -5790,7 +5866,7 @@ int sched_cpu_dying(unsigned int cpu) calc_load_migrate(rq); update_max_interval(); - nohz_balance_exit_idle(cpu); + nohz_balance_exit_idle(rq); hrtick_clear(rq); return 0; } @@ -6003,13 +6079,11 @@ void __init sched_init(void) rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ_COMMON rq->last_load_update_tick = jiffies; - rq->nohz_flags = 0; -#endif -#ifdef CONFIG_NO_HZ_FULL - rq->last_sched_tick = 0; + rq->last_blocked_load_update_tick = jiffies; + atomic_set(&rq->nohz_flags, 0); #endif #endif /* CONFIG_SMP */ - init_rq_hrtick(rq); + hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); } @@ -6664,13 +6738,18 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) parent_quota = parent_b->hierarchical_quota; /* - * Ensure max(child_quota) <= parent_quota, inherit when no + * Ensure max(child_quota) <= parent_quota. On cgroup2, + * always take the min. 
On cgroup1, only inherit when no * limit is set: */ - if (quota == RUNTIME_INF) - quota = parent_quota; - else if (parent_quota != RUNTIME_INF && quota > parent_quota) - return -EINVAL; + if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) { + quota = min(quota, parent_quota); + } else { + if (quota == RUNTIME_INF) + quota = parent_quota; + else if (parent_quota != RUNTIME_INF && quota > parent_quota) + return -EINVAL; + } } cfs_b->hierarchical_quota = quota; @@ -7003,3 +7082,5 @@ const u32 sched_prio_to_wmult[40] = { /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; + +#undef CREATE_TRACE_POINTS diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 44ab32a4fab6..9fbb10383434 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -1,24 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/cgroup.h> -#include <linux/slab.h> -#include <linux/percpu.h> -#include <linux/spinlock.h> -#include <linux/cpumask.h> -#include <linux/seq_file.h> -#include <linux/rcupdate.h> -#include <linux/kernel_stat.h> -#include <linux/err.h> - -#include "sched.h" - /* * CPU accounting code for task groups. * * Based on the work by Paul Menage (menage@google.com) and Balbir Singh * (balbir@in.ibm.com). */ +#include "sched.h" -/* Time spent by the tasks of the cpu accounting group executing in ... */ +/* Time spent by the tasks of the CPU accounting group executing in ... */ enum cpuacct_stat_index { CPUACCT_STAT_USER, /* ... user mode */ CPUACCT_STAT_SYSTEM, /* ... 
kernel mode */ @@ -35,12 +24,12 @@ struct cpuacct_usage { u64 usages[CPUACCT_STAT_NSTATS]; }; -/* track cpu usage of a group of tasks and its child groups */ +/* track CPU usage of a group of tasks and its child groups */ struct cpuacct { - struct cgroup_subsys_state css; - /* cpuusage holds pointer to a u64-type object on every cpu */ - struct cpuacct_usage __percpu *cpuusage; - struct kernel_cpustat __percpu *cpustat; + struct cgroup_subsys_state css; + /* cpuusage holds pointer to a u64-type object on every CPU */ + struct cpuacct_usage __percpu *cpuusage; + struct kernel_cpustat __percpu *cpustat; }; static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) @@ -48,7 +37,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) return css ? container_of(css, struct cpuacct, css) : NULL; } -/* return cpu accounting group to which this task belongs */ +/* Return CPU accounting group to which this task belongs */ static inline struct cpuacct *task_ca(struct task_struct *tsk) { return css_ca(task_css(tsk, cpuacct_cgrp_id)); @@ -65,7 +54,7 @@ static struct cpuacct root_cpuacct = { .cpuusage = &root_cpuacct_cpuusage, }; -/* create a new cpu accounting group */ +/* Create a new CPU accounting group */ static struct cgroup_subsys_state * cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) { @@ -96,7 +85,7 @@ out: return ERR_PTR(-ENOMEM); } -/* destroy an existing cpu accounting group */ +/* Destroy an existing CPU accounting group */ static void cpuacct_css_free(struct cgroup_subsys_state *css) { struct cpuacct *ca = css_ca(css); @@ -162,7 +151,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) #endif } -/* return total cpu usage (in nanoseconds) of a group */ +/* Return total CPU usage (in nanoseconds) of a group */ static u64 __cpuusage_read(struct cgroup_subsys_state *css, enum cpuacct_stat_index index) { diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 8d9562d890d3..50316455ea66 
100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -10,11 +10,7 @@ * as published by the Free Software Foundation; version 2 * of the License. */ - -#include <linux/gfp.h> -#include <linux/kernel.h> -#include <linux/slab.h> -#include "cpudeadline.h" +#include "sched.h" static inline int parent(int i) { @@ -42,8 +38,9 @@ static void cpudl_heapify_down(struct cpudl *cp, int idx) return; /* adapted from lib/prio_heap.c */ - while(1) { + while (1) { u64 largest_dl; + l = left_child(idx); r = right_child(idx); largest = idx; @@ -131,6 +128,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, return 1; } else { int best_cpu = cpudl_maximum(cp); + WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) && @@ -145,9 +143,9 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, } /* - * cpudl_clear - remove a cpu from the cpudl max-heap + * cpudl_clear - remove a CPU from the cpudl max-heap * @cp: the cpudl max-heap context - * @cpu: the target cpu + * @cpu: the target CPU * * Notes: assumes cpu_rq(cpu)->lock is locked * @@ -186,8 +184,8 @@ void cpudl_clear(struct cpudl *cp, int cpu) /* * cpudl_set - update the cpudl max-heap * @cp: the cpudl max-heap context - * @cpu: the target cpu - * @dl: the new earliest deadline for this cpu + * @cpu: the target CPU + * @dl: the new earliest deadline for this CPU * * Notes: assumes cpu_rq(cpu)->lock is locked * @@ -205,6 +203,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl) old_idx = cp->elements[cpu].idx; if (old_idx == IDX_INVALID) { int new_idx = cp->size++; + cp->elements[new_idx].dl = dl; cp->elements[new_idx].cpu = cpu; cp->elements[cpu].idx = new_idx; @@ -221,7 +220,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl) /* * cpudl_set_freecpu - Set the cpudl.free_cpus * @cp: the cpudl max-heap context - * @cpu: rd attached cpu + * @cpu: rd attached CPU */ void cpudl_set_freecpu(struct cpudl *cp, int cpu) { @@ -231,7 +230,7 @@ void 
cpudl_set_freecpu(struct cpudl *cp, int cpu) /* * cpudl_clear_freecpu - Clear the cpudl.free_cpus * @cp: the cpudl max-heap context - * @cpu: rd attached cpu + * @cpu: rd attached CPU */ void cpudl_clear_freecpu(struct cpudl *cp, int cpu) { diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index b010d26e108e..0adeda93b5fb 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -1,35 +1,26 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_CPUDL_H -#define _LINUX_CPUDL_H -#include <linux/sched.h> -#include <linux/sched/deadline.h> - -#define IDX_INVALID -1 +#define IDX_INVALID -1 struct cpudl_item { - u64 dl; - int cpu; - int idx; + u64 dl; + int cpu; + int idx; }; struct cpudl { - raw_spinlock_t lock; - int size; - cpumask_var_t free_cpus; - struct cpudl_item *elements; + raw_spinlock_t lock; + int size; + cpumask_var_t free_cpus; + struct cpudl_item *elements; }; - #ifdef CONFIG_SMP -int cpudl_find(struct cpudl *cp, struct task_struct *p, - struct cpumask *later_mask); +int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); void cpudl_set(struct cpudl *cp, int cpu, u64 dl); void cpudl_clear(struct cpudl *cp, int cpu); -int cpudl_init(struct cpudl *cp); +int cpudl_init(struct cpudl *cp); void cpudl_set_freecpu(struct cpudl *cp, int cpu); void cpudl_clear_freecpu(struct cpudl *cp, int cpu); void cpudl_cleanup(struct cpudl *cp); #endif /* CONFIG_SMP */ - -#endif /* _LINUX_CPUDL_H */ diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index dbc51442ecbc..5e54cbcae673 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -8,7 +8,6 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
*/ - #include "sched.h" DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index dd062a1c8cf0..d2c6083304b4 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -11,63 +11,56 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/cpufreq.h> -#include <linux/kthread.h> -#include <uapi/linux/sched/types.h> -#include <linux/slab.h> -#include <trace/events/power.h> - #include "sched.h" -#define SUGOV_KTHREAD_PRIORITY 50 +#include <trace/events/power.h> struct sugov_tunables { - struct gov_attr_set attr_set; - unsigned int rate_limit_us; + struct gov_attr_set attr_set; + unsigned int rate_limit_us; }; struct sugov_policy { - struct cpufreq_policy *policy; - - struct sugov_tunables *tunables; - struct list_head tunables_hook; - - raw_spinlock_t update_lock; /* For shared policies */ - u64 last_freq_update_time; - s64 freq_update_delay_ns; - unsigned int next_freq; - unsigned int cached_raw_freq; - - /* The next fields are only needed if fast switch cannot be used. 
*/ - struct irq_work irq_work; - struct kthread_work work; - struct mutex work_lock; - struct kthread_worker worker; - struct task_struct *thread; - bool work_in_progress; - - bool need_freq_update; + struct cpufreq_policy *policy; + + struct sugov_tunables *tunables; + struct list_head tunables_hook; + + raw_spinlock_t update_lock; /* For shared policies */ + u64 last_freq_update_time; + s64 freq_update_delay_ns; + unsigned int next_freq; + unsigned int cached_raw_freq; + + /* The next fields are only needed if fast switch cannot be used: */ + struct irq_work irq_work; + struct kthread_work work; + struct mutex work_lock; + struct kthread_worker worker; + struct task_struct *thread; + bool work_in_progress; + + bool need_freq_update; }; struct sugov_cpu { - struct update_util_data update_util; - struct sugov_policy *sg_policy; - unsigned int cpu; + struct update_util_data update_util; + struct sugov_policy *sg_policy; + unsigned int cpu; - bool iowait_boost_pending; - unsigned int iowait_boost; - unsigned int iowait_boost_max; + bool iowait_boost_pending; + unsigned int iowait_boost; + unsigned int iowait_boost_max; u64 last_update; - /* The fields below are only needed when sharing a policy. */ - unsigned long util_cfs; - unsigned long util_dl; - unsigned long max; - unsigned int flags; + /* The fields below are only needed when sharing a policy: */ + unsigned long util_cfs; + unsigned long util_dl; + unsigned long max; - /* The field below is for single-CPU policies only. */ + /* The field below is for single-CPU policies only: */ #ifdef CONFIG_NO_HZ_COMMON - unsigned long saved_idle_calls; + unsigned long saved_idle_calls; #endif }; @@ -81,9 +74,9 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) /* * Since cpufreq_update_util() is called with rq->lock held for - * the @target_cpu, our per-cpu data is fully serialized. + * the @target_cpu, our per-CPU data is fully serialized. 
* - * However, drivers cannot in general deal with cross-cpu + * However, drivers cannot in general deal with cross-CPU * requests, so while get_next_freq() will work, our * sugov_update_commit() call may not for the fast switching platforms. * @@ -113,6 +106,7 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) } delta_ns = time - sg_policy->last_freq_update_time; + return delta_ns >= sg_policy->freq_update_delay_ns; } @@ -188,17 +182,28 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) { + struct rq *rq = cpu_rq(sg_cpu->cpu); + unsigned long util; + + if (rq->rt.rt_nr_running) { + util = sg_cpu->max; + } else { + util = sg_cpu->util_dl; + if (rq->cfs.h_nr_running) + util += sg_cpu->util_cfs; + } + /* * Ideally we would like to set util_dl as min/guaranteed freq and * util_cfs + util_dl as requested freq. However, cpufreq is not yet * ready for such an interface. So, we only do the latter for now. */ - return min(sg_cpu->util_cfs + sg_cpu->util_dl, sg_cpu->max); + return min(util, sg_cpu->max); } -static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time) +static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags) { - if (sg_cpu->flags & SCHED_CPUFREQ_IOWAIT) { + if (flags & SCHED_CPUFREQ_IOWAIT) { if (sg_cpu->iowait_boost_pending) return; @@ -262,43 +267,51 @@ static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } #endif /* CONFIG_NO_HZ_COMMON */ +/* + * Make sugov_should_update_freq() ignore the rate limit when DL + * has increased the utilization. 
+ */ +static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) +{ + if (cpu_util_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->util_dl) + sg_policy->need_freq_update = true; +} + static void sugov_update_single(struct update_util_data *hook, u64 time, unsigned int flags) { struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); struct sugov_policy *sg_policy = sg_cpu->sg_policy; - struct cpufreq_policy *policy = sg_policy->policy; unsigned long util, max; unsigned int next_f; bool busy; - sugov_set_iowait_boost(sg_cpu, time); + sugov_set_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; + ignore_dl_rate_limit(sg_cpu, sg_policy); + if (!sugov_should_update_freq(sg_policy, time)) return; busy = sugov_cpu_is_busy(sg_cpu); - if (flags & SCHED_CPUFREQ_RT) { - next_f = policy->cpuinfo.max_freq; - } else { - sugov_get_util(sg_cpu); - max = sg_cpu->max; - util = sugov_aggregate_util(sg_cpu); - sugov_iowait_boost(sg_cpu, &util, &max); - next_f = get_next_freq(sg_policy, util, max); - /* - * Do not reduce the frequency if the CPU has not been idle - * recently, as the reduction is likely to be premature then. - */ - if (busy && next_f < sg_policy->next_freq) { - next_f = sg_policy->next_freq; + sugov_get_util(sg_cpu); + max = sg_cpu->max; + util = sugov_aggregate_util(sg_cpu); + sugov_iowait_boost(sg_cpu, &util, &max); + next_f = get_next_freq(sg_policy, util, max); + /* + * Do not reduce the frequency if the CPU has not been idle + * recently, as the reduction is likely to be premature then. 
+ */ + if (busy && next_f < sg_policy->next_freq) { + next_f = sg_policy->next_freq; - /* Reset cached freq as next_freq has changed */ - sg_policy->cached_raw_freq = 0; - } + /* Reset cached freq as next_freq has changed */ + sg_policy->cached_raw_freq = 0; } + sugov_update_commit(sg_policy, time, next_f); } @@ -314,6 +327,8 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) unsigned long j_util, j_max; s64 delta_ns; + sugov_get_util(j_sg_cpu); + /* * If the CFS CPU utilization was last updated before the * previous frequency update and the time elapsed between the @@ -327,28 +342,22 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) if (delta_ns > TICK_NSEC) { j_sg_cpu->iowait_boost = 0; j_sg_cpu->iowait_boost_pending = false; - j_sg_cpu->util_cfs = 0; - if (j_sg_cpu->util_dl == 0) - continue; } - if (j_sg_cpu->flags & SCHED_CPUFREQ_RT) - return policy->cpuinfo.max_freq; j_max = j_sg_cpu->max; j_util = sugov_aggregate_util(j_sg_cpu); + sugov_iowait_boost(j_sg_cpu, &j_util, &j_max); if (j_util * max > j_max * util) { util = j_util; max = j_max; } - - sugov_iowait_boost(j_sg_cpu, &util, &max); } return get_next_freq(sg_policy, util, max); } -static void sugov_update_shared(struct update_util_data *hook, u64 time, - unsigned int flags) +static void +sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags) { struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); struct sugov_policy *sg_policy = sg_cpu->sg_policy; @@ -356,18 +365,13 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, raw_spin_lock(&sg_policy->update_lock); - sugov_get_util(sg_cpu); - sg_cpu->flags = flags; - - sugov_set_iowait_boost(sg_cpu, time); + sugov_set_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; - if (sugov_should_update_freq(sg_policy, time)) { - if (flags & SCHED_CPUFREQ_RT) - next_f = sg_policy->policy->cpuinfo.max_freq; - else - next_f = 
sugov_next_freq_shared(sg_cpu, time); + ignore_dl_rate_limit(sg_cpu, sg_policy); + if (sugov_should_update_freq(sg_policy, time)) { + next_f = sugov_next_freq_shared(sg_cpu, time); sugov_update_commit(sg_policy, time, next_f); } @@ -425,8 +429,8 @@ static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf) return sprintf(buf, "%u\n", tunables->rate_limit_us); } -static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, - size_t count) +static ssize_t +rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count) { struct sugov_tunables *tunables = to_sugov_tunables(attr_set); struct sugov_policy *sg_policy; @@ -481,11 +485,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) { struct task_struct *thread; struct sched_attr attr = { - .size = sizeof(struct sched_attr), - .sched_policy = SCHED_DEADLINE, - .sched_flags = SCHED_FLAG_SUGOV, - .sched_nice = 0, - .sched_priority = 0, + .size = sizeof(struct sched_attr), + .sched_policy = SCHED_DEADLINE, + .sched_flags = SCHED_FLAG_SUGOV, + .sched_nice = 0, + .sched_priority = 0, /* * Fake (unused) bandwidth; workaround to "fix" * priority inheritance. 
@@ -627,10 +631,9 @@ fail: stop_kthread: sugov_kthread_stop(sg_policy); - -free_sg_policy: mutex_unlock(&global_tunables_lock); +free_sg_policy: sugov_policy_free(sg_policy); disable_fast_switch: @@ -665,21 +668,20 @@ static int sugov_start(struct cpufreq_policy *policy) struct sugov_policy *sg_policy = policy->governor_data; unsigned int cpu; - sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; - sg_policy->last_freq_update_time = 0; - sg_policy->next_freq = UINT_MAX; - sg_policy->work_in_progress = false; - sg_policy->need_freq_update = false; - sg_policy->cached_raw_freq = 0; + sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; + sg_policy->last_freq_update_time = 0; + sg_policy->next_freq = UINT_MAX; + sg_policy->work_in_progress = false; + sg_policy->need_freq_update = false; + sg_policy->cached_raw_freq = 0; for_each_cpu(cpu, policy->cpus) { struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); memset(sg_cpu, 0, sizeof(*sg_cpu)); - sg_cpu->cpu = cpu; - sg_cpu->sg_policy = sg_policy; - sg_cpu->flags = 0; - sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; + sg_cpu->cpu = cpu; + sg_cpu->sg_policy = sg_policy; + sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; } for_each_cpu(cpu, policy->cpus) { @@ -723,14 +725,14 @@ static void sugov_limits(struct cpufreq_policy *policy) } static struct cpufreq_governor schedutil_gov = { - .name = "schedutil", - .owner = THIS_MODULE, - .dynamic_switching = true, - .init = sugov_init, - .exit = sugov_exit, - .start = sugov_start, - .stop = sugov_stop, - .limits = sugov_limits, + .name = "schedutil", + .owner = THIS_MODULE, + .dynamic_switching = true, + .init = sugov_init, + .exit = sugov_exit, + .start = sugov_start, + .stop = sugov_stop, + .limits = sugov_limits, }; #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 2511aba36b89..daaadf939ccb 100644 --- a/kernel/sched/cpupri.c +++ 
b/kernel/sched/cpupri.c @@ -14,7 +14,7 @@ * * going from the lowest priority to the highest. CPUs in the INVALID state * are not eligible for routing. The system maintains this state with - * a 2 dimensional bitmap (the first for priority class, the second for cpus + * a 2 dimensional bitmap (the first for priority class, the second for CPUs * in that class). Therefore a typical application without affinity * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit * searches). For tasks with affinity restrictions, the algorithm has a @@ -26,12 +26,7 @@ * as published by the Free Software Foundation; version 2 * of the License. */ - -#include <linux/gfp.h> -#include <linux/sched.h> -#include <linux/sched/rt.h> -#include <linux/slab.h> -#include "cpupri.h" +#include "sched.h" /* Convert between a 140 based task->prio, and our 102 based cpupri */ static int convert_prio(int prio) @@ -128,9 +123,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, } /** - * cpupri_set - update the cpu priority setting + * cpupri_set - update the CPU priority setting * @cp: The cpupri context - * @cpu: The target cpu + * @cpu: The target CPU * @newpri: The priority (INVALID-RT99) to assign to this CPU * * Note: Assumes cpu_rq(cpu)->lock is locked @@ -151,7 +146,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) return; /* - * If the cpu was currently mapped to a different value, we + * If the CPU was currently mapped to a different value, we * need to map it to the new value then remove the old value. * Note, we must add the new value first, otherwise we risk the * cpu being missed by the priority loop in cpupri_find. 
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index bab050019071..7dc20a3232e7 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -1,32 +1,25 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_CPUPRI_H -#define _LINUX_CPUPRI_H - -#include <linux/sched.h> #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) -#define CPUPRI_INVALID -1 -#define CPUPRI_IDLE 0 -#define CPUPRI_NORMAL 1 +#define CPUPRI_INVALID -1 +#define CPUPRI_IDLE 0 +#define CPUPRI_NORMAL 1 /* values 2-101 are RT priorities 0-99 */ struct cpupri_vec { - atomic_t count; - cpumask_var_t mask; + atomic_t count; + cpumask_var_t mask; }; struct cpupri { - struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; - int *cpu_to_pri; + struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; + int *cpu_to_pri; }; #ifdef CONFIG_SMP -int cpupri_find(struct cpupri *cp, - struct task_struct *p, struct cpumask *lowest_mask); +int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask); void cpupri_set(struct cpupri *cp, int cpu, int pri); -int cpupri_init(struct cpupri *cp); +int cpupri_init(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp); #endif - -#endif /* _LINUX_CPUPRI_H */ diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index bac6ac9a4ec7..0796f938c4f0 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -1,10 +1,6 @@ -#include <linux/export.h> -#include <linux/sched.h> -#include <linux/tsacct_kern.h> -#include <linux/kernel_stat.h> -#include <linux/static_key.h> -#include <linux/context_tracking.h> -#include <linux/sched/cputime.h> +/* + * Simple CPU accounting cgroup controller + */ #include "sched.h" #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -113,9 +109,9 @@ static inline void task_group_account_field(struct task_struct *p, int index, } /* - * Account user cpu time to a process. 
- * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in user space since the last update + * Account user CPU time to a process. + * @p: the process that the CPU time gets accounted to + * @cputime: the CPU time spent in user space since the last update */ void account_user_time(struct task_struct *p, u64 cputime) { @@ -135,9 +131,9 @@ void account_user_time(struct task_struct *p, u64 cputime) } /* - * Account guest cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in virtual machine since the last update + * Account guest CPU time to a process. + * @p: the process that the CPU time gets accounted to + * @cputime: the CPU time spent in virtual machine since the last update */ void account_guest_time(struct task_struct *p, u64 cputime) { @@ -159,9 +155,9 @@ void account_guest_time(struct task_struct *p, u64 cputime) } /* - * Account system cpu time to a process and desired cpustat field - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in kernel space since the last update + * Account system CPU time to a process and desired cpustat field + * @p: the process that the CPU time gets accounted to + * @cputime: the CPU time spent in kernel space since the last update * @index: pointer to cpustat field that has to be updated */ void account_system_index_time(struct task_struct *p, @@ -179,10 +175,10 @@ void account_system_index_time(struct task_struct *p, } /* - * Account system cpu time to a process. - * @p: the process that the cpu time gets accounted to + * Account system CPU time to a process. 
+ * @p: the process that the CPU time gets accounted to * @hardirq_offset: the offset to subtract from hardirq_count() - * @cputime: the cpu time spent in kernel space since the last update + * @cputime: the CPU time spent in kernel space since the last update */ void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) { @@ -205,7 +201,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) /* * Account for involuntary wait time. - * @cputime: the cpu time spent in involuntary wait + * @cputime: the CPU time spent in involuntary wait */ void account_steal_time(u64 cputime) { @@ -216,7 +212,7 @@ void account_steal_time(u64 cputime) /* * Account for idle time. - * @cputime: the cpu time spent in idle wait + * @cputime: the CPU time spent in idle wait */ void account_idle_time(u64 cputime) { @@ -338,7 +334,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) #ifdef CONFIG_IRQ_TIME_ACCOUNTING /* * Account a tick to a process and cpustat - * @p: the process that the cpu time gets accounted to + * @p: the process that the CPU time gets accounted to * @user_tick: is the tick from userspace * @rq: the pointer to rq * @@ -400,17 +396,16 @@ static void irqtime_account_idle_ticks(int ticks) irqtime_account_process_tick(current, 0, rq, ticks); } #else /* CONFIG_IRQ_TIME_ACCOUNTING */ -static inline void irqtime_account_idle_ticks(int ticks) {} +static inline void irqtime_account_idle_ticks(int ticks) { } static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq, int nr_ticks) {} + struct rq *rq, int nr_ticks) { } #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ /* * Use precise platform statistics if available: */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING - -#ifndef __ARCH_HAS_VTIME_TASK_SWITCH +# ifndef __ARCH_HAS_VTIME_TASK_SWITCH void vtime_common_task_switch(struct task_struct *prev) { if (is_idle_task(prev)) @@ -421,8 +416,7 @@ void 
vtime_common_task_switch(struct task_struct *prev) vtime_flush(prev); arch_vtime_task_switch(prev); } -#endif - +# endif #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ @@ -469,10 +463,12 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) *ut = cputime.utime; *st = cputime.stime; } -#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ + +#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */ + /* - * Account a single tick of cpu time. - * @p: the process that the cpu time gets accounted to + * Account a single tick of CPU time. + * @p: the process that the CPU time gets accounted to * @user_tick: indicates if the tick is a user or a system tick */ void account_process_tick(struct task_struct *p, int user_tick) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9bb0e0c412ec..e7b3008b85bb 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -17,9 +17,6 @@ */ #include "sched.h" -#include <linux/slab.h> -#include <uapi/linux/sched/types.h> - struct dl_bandwidth def_dl_bandwidth; static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) @@ -87,7 +84,7 @@ void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); /* kick cpufreq (see the comment in kernel/sched/sched.h). */ - cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL); + cpufreq_update_util(rq_of_dl_rq(dl_rq), 0); } static inline @@ -101,7 +98,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) if (dl_rq->running_bw > old) dl_rq->running_bw = 0; /* kick cpufreq (see the comment in kernel/sched/sched.h). 
*/ - cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL); + cpufreq_update_util(rq_of_dl_rq(dl_rq), 0); } static inline @@ -514,7 +511,7 @@ static DEFINE_PER_CPU(struct callback_head, dl_pull_head); static void push_dl_tasks(struct rq *); static void pull_dl_task(struct rq *); -static inline void queue_push_tasks(struct rq *rq) +static inline void deadline_queue_push_tasks(struct rq *rq) { if (!has_pushable_dl_tasks(rq)) return; @@ -522,7 +519,7 @@ static inline void queue_push_tasks(struct rq *rq) queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks); } -static inline void queue_pull_task(struct rq *rq) +static inline void deadline_queue_pull_task(struct rq *rq) { queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task); } @@ -539,12 +536,12 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p /* * If we cannot preempt any rq, fall back to pick any - * online cpu. + * online CPU: */ cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); if (cpu >= nr_cpu_ids) { /* - * Fail to find any suitable cpu. + * Failed to find any suitable CPU. * The task will never come back! 
*/ BUG_ON(dl_bandwidth_enabled()); @@ -597,19 +594,18 @@ static inline void pull_dl_task(struct rq *rq) { } -static inline void queue_push_tasks(struct rq *rq) +static inline void deadline_queue_push_tasks(struct rq *rq) { } -static inline void queue_pull_task(struct rq *rq) +static inline void deadline_queue_pull_task(struct rq *rq) { } #endif /* CONFIG_SMP */ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); -static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, - int flags); +static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags); /* * We are being explicitly informed that a new instance is starting, @@ -1153,6 +1149,7 @@ static void update_curr_dl(struct rq *rq) struct sched_dl_entity *dl_se = &curr->dl; u64 delta_exec, scaled_delta_exec; int cpu = cpu_of(rq); + u64 now; if (!dl_task(curr) || !on_dl_rq(dl_se)) return; @@ -1165,7 +1162,8 @@ static void update_curr_dl(struct rq *rq) * natural solution, but the full ramifications of this * approach need further study. */ - delta_exec = rq_clock_task(rq) - curr->se.exec_start; + now = rq_clock_task(rq); + delta_exec = now - curr->se.exec_start; if (unlikely((s64)delta_exec <= 0)) { if (unlikely(dl_se->dl_yielded)) goto throttle; @@ -1178,7 +1176,7 @@ static void update_curr_dl(struct rq *rq) curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); - curr->se.exec_start = rq_clock_task(rq); + curr->se.exec_start = now; cgroup_account_cputime(curr, delta_exec); sched_rt_avg_update(rq, delta_exec); @@ -1562,7 +1560,7 @@ static void yield_task_dl(struct rq *rq) * so we don't do microscopic update in schedule() * and double the fastpath cost. 
*/ - rq_clock_skip_update(rq, true); + rq_clock_skip_update(rq); } #ifdef CONFIG_SMP @@ -1761,7 +1759,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (hrtick_enabled(rq)) start_hrtick_dl(rq, p); - queue_push_tasks(rq); + deadline_queue_push_tasks(rq); return p; } @@ -1774,6 +1772,14 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p) enqueue_pushable_dl_task(rq, p); } +/* + * scheduler tick hitting a task of our scheduling class. + * + * NOTE: This function can be called remotely by the tick offload that + * goes along full dynticks. Therefore no local assumption can be made + * and everything must be accessed through the @rq and @curr passed in + * parameters. + */ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) { update_curr_dl(rq); @@ -1863,7 +1869,7 @@ static int find_later_rq(struct task_struct *task) /* * We have to consider system topology and task affinity - * first, then we can look for a suitable cpu. + * first, then we can look for a suitable CPU. */ if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask)) return -1; @@ -1877,7 +1883,7 @@ static int find_later_rq(struct task_struct *task) * Now we check how well this matches with task's * affinity and system topology. * - * The last cpu where the task run is our first + * The last CPU where the task run is our first * guess, since it is most likely cache-hot there. */ if (cpumask_test_cpu(cpu, later_mask)) @@ -1907,9 +1913,9 @@ static int find_later_rq(struct task_struct *task) best_cpu = cpumask_first_and(later_mask, sched_domain_span(sd)); /* - * Last chance: if a cpu being in both later_mask + * Last chance: if a CPU being in both later_mask * and current sd span is valid, that becomes our - * choice. Of course, the latest possible cpu is + * choice. Of course, the latest possible CPU is * already under consideration through later_mask. 
*/ if (best_cpu < nr_cpu_ids) { @@ -2065,7 +2071,7 @@ retry: if (task == next_task) { /* * The task is still there. We don't try - * again, some other cpu will pull it when ready. + * again, some other CPU will pull it when ready. */ goto out; } @@ -2298,12 +2304,12 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) /* * Since this might be the only -deadline task on the rq, * this is the right place to try to pull some other one - * from an overloaded cpu, if any. + * from an overloaded CPU, if any. */ if (!task_on_rq_queued(p) || rq->dl.dl_nr_running) return; - queue_pull_task(rq); + deadline_queue_pull_task(rq); } /* @@ -2325,7 +2331,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) if (rq->curr != p) { #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) - queue_push_tasks(rq); + deadline_queue_push_tasks(rq); #endif if (dl_task(rq->curr)) check_preempt_curr_dl(rq, p, 0); @@ -2350,7 +2356,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, * or lowering its prio, so... 
*/ if (!rq->dl.overloaded) - queue_pull_task(rq); + deadline_queue_pull_task(rq); /* * If we now have a earlier deadline task than p, @@ -2624,17 +2630,17 @@ void __dl_clear_params(struct task_struct *p) { struct sched_dl_entity *dl_se = &p->dl; - dl_se->dl_runtime = 0; - dl_se->dl_deadline = 0; - dl_se->dl_period = 0; - dl_se->flags = 0; - dl_se->dl_bw = 0; - dl_se->dl_density = 0; + dl_se->dl_runtime = 0; + dl_se->dl_deadline = 0; + dl_se->dl_period = 0; + dl_se->flags = 0; + dl_se->dl_bw = 0; + dl_se->dl_density = 0; - dl_se->dl_throttled = 0; - dl_se->dl_yielded = 0; - dl_se->dl_non_contending = 0; - dl_se->dl_overrun = 0; + dl_se->dl_throttled = 0; + dl_se->dl_yielded = 0; + dl_se->dl_non_contending = 0; + dl_se->dl_overrun = 0; } bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) @@ -2653,21 +2659,22 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) #ifdef CONFIG_SMP int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) { - unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, - cs_cpus_allowed); + unsigned int dest_cpu; struct dl_bw *dl_b; bool overflow; int cpus, ret; unsigned long flags; + dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed); + rcu_read_lock_sched(); dl_b = dl_bw_of(dest_cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); cpus = dl_bw_cpus(dest_cpu); overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); - if (overflow) + if (overflow) { ret = -EBUSY; - else { + } else { /* * We reserve space for this task in the destination * root_domain, as we can't fail after this point. 
@@ -2679,6 +2686,7 @@ int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allo } raw_spin_unlock_irqrestore(&dl_b->lock, flags); rcu_read_unlock_sched(); + return ret; } @@ -2699,6 +2707,7 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, ret = 0; raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); rcu_read_unlock_sched(); + return ret; } @@ -2716,6 +2725,7 @@ bool dl_cpu_busy(unsigned int cpu) overflow = __dl_overflow(dl_b, cpus, 0, 0); raw_spin_unlock_irqrestore(&dl_b->lock, flags); rcu_read_unlock_sched(); + return overflow; } #endif diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 1ca0130ed4f9..15b10e210a6b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1,7 +1,7 @@ /* * kernel/sched/debug.c * - * Print the CFS rbtree + * Print the CFS rbtree and other debugging details * * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar * @@ -9,16 +9,6 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
*/ - -#include <linux/proc_fs.h> -#include <linux/sched/mm.h> -#include <linux/sched/task.h> -#include <linux/seq_file.h> -#include <linux/kallsyms.h> -#include <linux/utsname.h> -#include <linux/mempolicy.h> -#include <linux/debugfs.h> - #include "sched.h" static DEFINE_SPINLOCK(sched_debug_lock); @@ -32,7 +22,7 @@ static DEFINE_SPINLOCK(sched_debug_lock); if (m) \ seq_printf(m, x); \ else \ - printk(x); \ + pr_cont(x); \ } while (0) /* @@ -274,34 +264,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) if (table == NULL) return NULL; - set_table_entry(&table[0], "min_interval", &sd->min_interval, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[1], "max_interval", &sd->max_interval, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[2], "busy_idx", &sd->busy_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[3], "idle_idx", &sd->idle_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[5], "wake_idx", &sd->wake_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[7], "busy_factor", &sd->busy_factor, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[9], "cache_nice_tries", - &sd->cache_nice_tries, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[10], "flags", &sd->flags, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[11], "max_newidle_lb_cost", - &sd->max_newidle_lb_cost, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[12], "name", sd->name, - CORENAME_MAX_SIZE, 0444, 
proc_dostring, false); + set_table_entry(&table[0] , "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[1] , "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[2] , "busy_idx", &sd->busy_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); + set_table_entry(&table[3] , "idle_idx", &sd->idle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); + set_table_entry(&table[4] , "newidle_idx", &sd->newidle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); + set_table_entry(&table[5] , "wake_idx", &sd->wake_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); + set_table_entry(&table[6] , "forkexec_idx", &sd->forkexec_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); + set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false); + set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false); + set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false); + set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false); + set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false); /* &table[13] is terminator */ return table; @@ -332,8 +307,8 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) return table; } -static cpumask_var_t sd_sysctl_cpus; -static struct ctl_table_header *sd_sysctl_header; +static cpumask_var_t sd_sysctl_cpus; +static struct ctl_table_header *sd_sysctl_header; void register_sched_domain_sysctl(void) { @@ -413,14 +388,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group { struct sched_entity *se = tg->se[cpu]; -#define 
P(F) \ - SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) -#define P_SCHEDSTAT(F) \ - SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) -#define PN(F) \ - SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) -#define PN_SCHEDSTAT(F) \ - SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F))) +#define P(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) +#define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) +#define PN(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) +#define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F))) if (!se) return; @@ -428,6 +399,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group PN(se->exec_start); PN(se->vruntime); PN(se->sum_exec_runtime); + if (schedstat_enabled()) { PN_SCHEDSTAT(se->statistics.wait_start); PN_SCHEDSTAT(se->statistics.sleep_start); @@ -440,6 +412,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group PN_SCHEDSTAT(se->statistics.wait_sum); P_SCHEDSTAT(se->statistics.wait_count); } + P(se->load.weight); P(se->runnable_weight); #ifdef CONFIG_SMP @@ -464,6 +437,7 @@ static char *task_group_path(struct task_group *tg) return group_path; cgroup_path(tg->css.cgroup, group_path, PATH_MAX); + return group_path; } #endif @@ -501,12 +475,12 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) { struct task_struct *g, *p; - SEQ_printf(m, - "\nrunnable tasks:\n" - " S task PID tree-key switches prio" - " wait-time sum-exec sum-sleep\n" - "-------------------------------------------------------" - "----------------------------------------------------\n"); + SEQ_printf(m, "\n"); + SEQ_printf(m, "runnable tasks:\n"); + SEQ_printf(m, " S task PID tree-key switches prio" + " wait-time sum-exec sum-sleep\n"); + SEQ_printf(m, "-------------------------------------------------------" + 
"----------------------------------------------------\n"); rcu_read_lock(); for_each_process_thread(g, p) { @@ -527,9 +501,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) unsigned long flags; #ifdef CONFIG_FAIR_GROUP_SCHED - SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg)); + SEQ_printf(m, "\n"); + SEQ_printf(m, "cfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg)); #else - SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); + SEQ_printf(m, "\n"); + SEQ_printf(m, "cfs_rq[%d]:\n", cpu); #endif SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", SPLIT_NS(cfs_rq->exec_clock)); @@ -567,6 +543,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->avg.runnable_load_avg); SEQ_printf(m, " .%-30s: %lu\n", "util_avg", cfs_rq->avg.util_avg); + SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued", + cfs_rq->avg.util_est.enqueued); SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg", cfs_rq->removed.load_avg); SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg", @@ -595,9 +573,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) { #ifdef CONFIG_RT_GROUP_SCHED - SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg)); + SEQ_printf(m, "\n"); + SEQ_printf(m, "rt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg)); #else - SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); + SEQ_printf(m, "\n"); + SEQ_printf(m, "rt_rq[%d]:\n", cpu); #endif #define P(x) \ @@ -624,7 +604,8 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) { struct dl_bw *dl_bw; - SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); + SEQ_printf(m, "\n"); + SEQ_printf(m, "dl_rq[%d]:\n", cpu); #define PU(x) \ SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(dl_rq->x)) @@ -799,9 +780,9 @@ void sysrq_sched_debug_show(void) /* * This itererator needs some explanation. * It returns 1 for the header position. - * This means 2 is cpu 0. 
- * In a hotplugged system some cpus, including cpu 0, may be missing so we have - * to use cpumask_* to iterate over the cpus. + * This means 2 is CPU 0. + * In a hotplugged system some CPUs, including CPU 0, may be missing so we have + * to use cpumask_* to iterate over the CPUs. */ static void *sched_debug_start(struct seq_file *file, loff_t *offset) { @@ -821,6 +802,7 @@ static void *sched_debug_start(struct seq_file *file, loff_t *offset) if (n < nr_cpu_ids) return (void *)(unsigned long)(n + 2); + return NULL; } @@ -835,10 +817,10 @@ static void sched_debug_stop(struct seq_file *file, void *data) } static const struct seq_operations sched_debug_sops = { - .start = sched_debug_start, - .next = sched_debug_next, - .stop = sched_debug_stop, - .show = sched_debug_show, + .start = sched_debug_start, + .next = sched_debug_next, + .stop = sched_debug_stop, + .show = sched_debug_show, }; static int sched_debug_release(struct inode *inode, struct file *file) @@ -876,14 +858,10 @@ static int __init init_sched_debug_procfs(void) __initcall(init_sched_debug_procfs); -#define __P(F) \ - SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) -#define P(F) \ - SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) -#define __PN(F) \ - SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) -#define PN(F) \ - SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) +#define __P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) +#define P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) +#define __PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) +#define PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) #ifdef CONFIG_NUMA_BALANCING @@ -1018,6 +996,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P(se.avg.runnable_load_avg); P(se.avg.util_avg); P(se.avg.last_update_time); + P(se.avg.util_est.ewma); + P(se.avg.util_est.enqueued); #endif P(policy); P(prio); diff --git 
a/kernel/sched/fair.c b/kernel/sched/fair.c index 7b6535987500..54dc31e7ab9b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -20,25 +20,10 @@ * Adaptive scheduling granularity, math enhancements by Peter Zijlstra * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ - -#include <linux/sched/mm.h> -#include <linux/sched/topology.h> - -#include <linux/latencytop.h> -#include <linux/cpumask.h> -#include <linux/cpuidle.h> -#include <linux/slab.h> -#include <linux/profile.h> -#include <linux/interrupt.h> -#include <linux/mempolicy.h> -#include <linux/migrate.h> -#include <linux/task_work.h> -#include <linux/sched/isolation.h> +#include "sched.h" #include <trace/events/sched.h> -#include "sched.h" - /* * Targeted preemption latency for CPU-bound tasks: * @@ -103,7 +88,7 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL; #ifdef CONFIG_SMP /* - * For asym packing, by default the lower numbered cpu has higher priority. + * For asym packing, by default the lower numbered CPU has higher priority. */ int __weak arch_asym_cpu_priority(int cpu) { @@ -787,7 +772,7 @@ void post_init_entity_util_avg(struct sched_entity *se) * For !fair tasks do: * update_cfs_rq_load_avg(now, cfs_rq); - attach_entity_load_avg(cfs_rq, se); + attach_entity_load_avg(cfs_rq, se, 0); switched_from_fair(rq, p); * * such that the next switched_to_fair() has the @@ -871,7 +856,7 @@ update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) likely(wait_start > prev_wait_start)) wait_start -= prev_wait_start; - schedstat_set(se->statistics.wait_start, wait_start); + __schedstat_set(se->statistics.wait_start, wait_start); } static inline void @@ -893,17 +878,17 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) * time stamp can be adjusted to accumulate wait time * prior to migration. 
*/ - schedstat_set(se->statistics.wait_start, delta); + __schedstat_set(se->statistics.wait_start, delta); return; } trace_sched_stat_wait(p, delta); } - schedstat_set(se->statistics.wait_max, + __schedstat_set(se->statistics.wait_max, max(schedstat_val(se->statistics.wait_max), delta)); - schedstat_inc(se->statistics.wait_count); - schedstat_add(se->statistics.wait_sum, delta); - schedstat_set(se->statistics.wait_start, 0); + __schedstat_inc(se->statistics.wait_count); + __schedstat_add(se->statistics.wait_sum, delta); + __schedstat_set(se->statistics.wait_start, 0); } static inline void @@ -928,10 +913,10 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) delta = 0; if (unlikely(delta > schedstat_val(se->statistics.sleep_max))) - schedstat_set(se->statistics.sleep_max, delta); + __schedstat_set(se->statistics.sleep_max, delta); - schedstat_set(se->statistics.sleep_start, 0); - schedstat_add(se->statistics.sum_sleep_runtime, delta); + __schedstat_set(se->statistics.sleep_start, 0); + __schedstat_add(se->statistics.sum_sleep_runtime, delta); if (tsk) { account_scheduler_latency(tsk, delta >> 10, 1); @@ -945,15 +930,15 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) delta = 0; if (unlikely(delta > schedstat_val(se->statistics.block_max))) - schedstat_set(se->statistics.block_max, delta); + __schedstat_set(se->statistics.block_max, delta); - schedstat_set(se->statistics.block_start, 0); - schedstat_add(se->statistics.sum_sleep_runtime, delta); + __schedstat_set(se->statistics.block_start, 0); + __schedstat_add(se->statistics.sum_sleep_runtime, delta); if (tsk) { if (tsk->in_iowait) { - schedstat_add(se->statistics.iowait_sum, delta); - schedstat_inc(se->statistics.iowait_count); + __schedstat_add(se->statistics.iowait_sum, delta); + __schedstat_inc(se->statistics.iowait_count); trace_sched_stat_iowait(tsk, delta); } @@ -1012,10 +997,10 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, 
int flags) struct task_struct *tsk = task_of(se); if (tsk->state & TASK_INTERRUPTIBLE) - schedstat_set(se->statistics.sleep_start, + __schedstat_set(se->statistics.sleep_start, rq_clock(rq_of(cfs_rq))); if (tsk->state & TASK_UNINTERRUPTIBLE) - schedstat_set(se->statistics.block_start, + __schedstat_set(se->statistics.block_start, rq_clock(rq_of(cfs_rq))); } } @@ -1181,7 +1166,7 @@ pid_t task_numa_group_id(struct task_struct *p) } /* - * The averaged statistics, shared & private, memory & cpu, + * The averaged statistics, shared & private, memory & CPU, * occupy the first half of the array. The second half of the * array is for current counters, which are averaged into the * first set by task_numa_placement. @@ -1587,7 +1572,7 @@ static void task_numa_compare(struct task_numa_env *env, * be incurred if the tasks were swapped. */ if (cur) { - /* Skip this swap candidate if cannot move to the source cpu */ + /* Skip this swap candidate if cannot move to the source CPU: */ if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) goto unlock; @@ -1631,7 +1616,7 @@ static void task_numa_compare(struct task_numa_env *env, goto balance; } - /* Balance doesn't matter much if we're running a task per cpu */ + /* Balance doesn't matter much if we're running a task per CPU: */ if (imp > env->best_imp && src_rq->nr_running == 1 && dst_rq->nr_running == 1) goto assign; @@ -1676,7 +1661,7 @@ balance: */ if (!cur) { /* - * select_idle_siblings() uses an per-cpu cpumask that + * select_idle_siblings() uses an per-CPU cpumask that * can be used from IRQ context. 
*/ local_irq_disable(); @@ -1869,6 +1854,7 @@ static int task_numa_migrate(struct task_struct *p) static void numa_migrate_preferred(struct task_struct *p) { unsigned long interval = HZ; + unsigned long numa_migrate_retry; /* This task has no NUMA fault statistics yet */ if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) @@ -1876,7 +1862,18 @@ static void numa_migrate_preferred(struct task_struct *p) /* Periodically retry migrating the task to the preferred node */ interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); - p->numa_migrate_retry = jiffies + interval; + numa_migrate_retry = jiffies + interval; + + /* + * Check that the new retry threshold is after the current one. If + * the retry is in the future, it implies that wake_affine has + * temporarily asked NUMA balancing to backoff from placement. + */ + if (numa_migrate_retry > p->numa_migrate_retry) + return; + + /* Safe to try placing the task on the preferred node */ + p->numa_migrate_retry = numa_migrate_retry; /* Success if task is already running on preferred CPU */ if (task_node(p) == p->numa_preferred_nid) @@ -2823,7 +2820,7 @@ void reweight_task(struct task_struct *p, int prio) } #ifdef CONFIG_FAIR_GROUP_SCHED -# ifdef CONFIG_SMP +#ifdef CONFIG_SMP /* * All this does is approximate the hierarchical proportion which includes that * global sum we all love to hate. 
@@ -2974,7 +2971,7 @@ static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares) return clamp_t(long, runnable, MIN_SHARES, shares); } -# endif /* CONFIG_SMP */ +#endif /* CONFIG_SMP */ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); @@ -3012,11 +3009,11 @@ static inline void update_cfs_group(struct sched_entity *se) } #endif /* CONFIG_FAIR_GROUP_SCHED */ -static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) +static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) { struct rq *rq = rq_of(cfs_rq); - if (&rq->cfs == cfs_rq) { + if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) { /* * There are a few boundary cases this might miss but it should * get called often enough that that should (hopefully) not be @@ -3031,7 +3028,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) * * See cpu_util(). */ - cpufreq_update_util(rq, 0); + cpufreq_update_util(rq, flags); } } @@ -3246,6 +3243,32 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna } /* + * When a task is dequeued, its estimated utilization should not be update if + * its util_avg has not been updated at least once. + * This flag is used to synchronize util_avg updates with util_est updates. + * We map this information into the LSB bit of the utilization saved at + * dequeue time (i.e. util_est.dequeued). 
+ */ +#define UTIL_AVG_UNCHANGED 0x1 + +static inline void cfs_se_util_change(struct sched_avg *avg) +{ + unsigned int enqueued; + + if (!sched_feat(UTIL_EST)) + return; + + /* Avoid store if the flag has been already set */ + enqueued = avg->util_est.enqueued; + if (!(enqueued & UTIL_AVG_UNCHANGED)) + return; + + /* Reset flag to report util_avg has been updated */ + enqueued &= ~UTIL_AVG_UNCHANGED; + WRITE_ONCE(avg->util_est.enqueued, enqueued); +} + +/* * sched_entity: * * task: @@ -3296,6 +3319,7 @@ __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entit cfs_rq->curr == se)) { ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); + cfs_se_util_change(&se->avg); return 1; } @@ -3350,7 +3374,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) } /* - * Called within set_task_rq() right before setting a task's cpu. The + * Called within set_task_rq() right before setting a task's CPU. The * caller only guarantees p->pi_lock is held; no other assumptions, * including the state of rq->lock, should be made. */ @@ -3529,7 +3553,7 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf /* * runnable_sum can't be lower than running_sum - * As running sum is scale with cpu capacity wehreas the runnable sum + * As running sum is scale with CPU capacity wehreas the runnable sum * is not we rescale running_sum 1st */ running_sum = se->avg.util_sum / @@ -3689,7 +3713,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) #endif if (decayed) - cfs_rq_util_change(cfs_rq); + cfs_rq_util_change(cfs_rq, 0); return decayed; } @@ -3702,7 +3726,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) * Must call update_cfs_rq_load_avg() before this, since we rely on * cfs_rq->avg.last_update_time being current. 
*/ -static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; @@ -3738,7 +3762,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); - cfs_rq_util_change(cfs_rq); + cfs_rq_util_change(cfs_rq, flags); } /** @@ -3757,7 +3781,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); - cfs_rq_util_change(cfs_rq); + cfs_rq_util_change(cfs_rq, 0); } /* @@ -3787,7 +3811,14 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s if (!se->avg.last_update_time && (flags & DO_ATTACH)) { - attach_entity_load_avg(cfs_rq, se); + /* + * DO_ATTACH means we're here from enqueue_entity(). + * !last_update_time means we've passed through + * migrate_task_rq_fair() indicating we migrated. + * + * IOW we're enqueueing a task on a new CPU. 
+ */ + attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION); update_tg_load_avg(cfs_rq, 0); } else if (decayed && (flags & UPDATE_TG)) @@ -3869,6 +3900,120 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) static int idle_balance(struct rq *this_rq, struct rq_flags *rf); +static inline unsigned long task_util(struct task_struct *p) +{ + return READ_ONCE(p->se.avg.util_avg); +} + +static inline unsigned long _task_util_est(struct task_struct *p) +{ + struct util_est ue = READ_ONCE(p->se.avg.util_est); + + return max(ue.ewma, ue.enqueued); +} + +static inline unsigned long task_util_est(struct task_struct *p) +{ + return max(task_util(p), _task_util_est(p)); +} + +static inline void util_est_enqueue(struct cfs_rq *cfs_rq, + struct task_struct *p) +{ + unsigned int enqueued; + + if (!sched_feat(UTIL_EST)) + return; + + /* Update root cfs_rq's estimated utilization */ + enqueued = cfs_rq->avg.util_est.enqueued; + enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED); + WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); +} + +/* + * Check if a (signed) value is within a specified (unsigned) margin, + * based on the observation that: + * + * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1) + * + * NOTE: this only works when value + maring < INT_MAX. + */ +static inline bool within_margin(int value, int margin) +{ + return ((unsigned int)(value + margin - 1) < (2 * margin - 1)); +} + +static void +util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) +{ + long last_ewma_diff; + struct util_est ue; + + if (!sched_feat(UTIL_EST)) + return; + + /* + * Update root cfs_rq's estimated utilization + * + * If *p is the last task then the root cfs_rq's estimated utilization + * of a CPU is 0 by definition. 
+ */ + ue.enqueued = 0; + if (cfs_rq->nr_running) { + ue.enqueued = cfs_rq->avg.util_est.enqueued; + ue.enqueued -= min_t(unsigned int, ue.enqueued, + (_task_util_est(p) | UTIL_AVG_UNCHANGED)); + } + WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); + + /* + * Skip update of task's estimated utilization when the task has not + * yet completed an activation, e.g. being migrated. + */ + if (!task_sleep) + return; + + /* + * If the PELT values haven't changed since enqueue time, + * skip the util_est update. + */ + ue = p->se.avg.util_est; + if (ue.enqueued & UTIL_AVG_UNCHANGED) + return; + + /* + * Skip update of task's estimated utilization when its EWMA is + * already ~1% close to its last activation value. + */ + ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED); + last_ewma_diff = ue.enqueued - ue.ewma; + if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100))) + return; + + /* + * Update Task's estimated utilization + * + * When *p completes an activation we can consolidate another sample + * of the task size. 
This is done by storing the current PELT value + * as ue.enqueued and by using this value to update the Exponential + * Weighted Moving Average (EWMA): + * + * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1) + * = w * task_util(p) + ewma(t-1) - w * ewma(t-1) + * = w * (task_util(p) - ewma(t-1)) + ewma(t-1) + * = w * ( last_ewma_diff ) + ewma(t-1) + * = w * (last_ewma_diff + ewma(t-1) / w) + * + * Where 'w' is the weight of new samples, which is configured to be + * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT) + */ + ue.ewma <<= UTIL_EST_WEIGHT_SHIFT; + ue.ewma += last_ewma_diff; + ue.ewma >>= UTIL_EST_WEIGHT_SHIFT; + WRITE_ONCE(p->se.avg.util_est, ue); +} + #else /* CONFIG_SMP */ static inline int @@ -3883,13 +4028,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) { - cfs_rq_util_change(cfs_rq); + cfs_rq_util_change(cfs_rq, 0); } static inline void remove_entity_load_avg(struct sched_entity *se) {} static inline void -attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} +attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {} static inline void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} @@ -3898,6 +4043,13 @@ static inline int idle_balance(struct rq *rq, struct rq_flags *rf) return 0; } +static inline void +util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} + +static inline void +util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, + bool task_sleep) {} + #endif /* CONFIG_SMP */ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -4676,7 +4828,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) if (!se) add_nr_running(rq, task_delta); - /* determine whether we need to wake up potentially idle cpu */ + /* Determine whether we need to wake up potentially idle CPU: */ if (rq->curr == rq->idle && rq->cfs.nr_running) 
resched_curr(rq); } @@ -5041,7 +5193,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) } /* - * Both these cpu hotplug callbacks race against unregister_fair_sched_group() + * Both these CPU hotplug callbacks race against unregister_fair_sched_group() * * The race is harmless, since modifying bandwidth settings of unhooked group * bits doesn't do much. @@ -5086,7 +5238,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) */ cfs_rq->runtime_remaining = 1; /* - * Offline rq is schedulable till cpu is completely disabled + * Offline rq is schedulable till CPU is completely disabled * in take_cpu_down(), so we prevent new cfs throttling here. */ cfs_rq->runtime_enabled = 0; @@ -5245,6 +5397,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) add_nr_running(rq, 1); + util_est_enqueue(&rq->cfs, p); hrtick_update(rq); } @@ -5304,6 +5457,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) sub_nr_running(rq, 1); + util_est_dequeue(&rq->cfs, p, task_sleep); hrtick_update(rq); } @@ -5323,8 +5477,8 @@ DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); * * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load * - * If a cpu misses updates for n ticks (as it was idle) and update gets - * called on the n+1-th tick when cpu may be busy, then we have: + * If a CPU misses updates for n ticks (as it was idle) and update gets + * called on the n+1-th tick when CPU may be busy, then we have: * * load_n = (1 - 1/2^i)^n * load_0 * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load @@ -5379,6 +5533,15 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) } return load; } + +static struct { + cpumask_var_t idle_cpus_mask; + atomic_t nr_cpus; + int has_blocked; /* Idle CPUS has blocked load */ + unsigned long next_balance; /* in jiffy units */ + unsigned long next_blocked; /* Next update of blocked load in jiffies */ +} nohz ____cacheline_aligned; + #endif /* 
CONFIG_NO_HZ_COMMON */ /** @@ -5468,7 +5631,7 @@ static unsigned long weighted_cpuload(struct rq *rq) #ifdef CONFIG_NO_HZ_COMMON /* * There is no sane way to deal with nohz on smp when using jiffies because the - * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading + * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. * * Therefore we need to avoid the delta approach from the regular tick when @@ -5579,7 +5742,7 @@ void cpu_load_update_active(struct rq *this_rq) } /* - * Return a low guess at the load of a migration-source cpu weighted + * Return a low guess at the load of a migration-source CPU weighted * according to the scheduling class and "nice" value. * * We want to under-estimate the load of migration sources, to @@ -5597,7 +5760,7 @@ static unsigned long source_load(int cpu, int type) } /* - * Return a high guess at the load of a migration-target cpu weighted + * Return a high guess at the load of a migration-target CPU weighted * according to the scheduling class and "nice" value. */ static unsigned long target_load(int cpu, int type) @@ -5692,27 +5855,31 @@ static int wake_wide(struct task_struct *p) * scheduling latency of the CPUs. This seems to work * for the overloaded case. */ - -static bool -wake_affine_idle(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int prev_cpu, int sync) +static int +wake_affine_idle(int this_cpu, int prev_cpu, int sync) { /* * If this_cpu is idle, it implies the wakeup is from interrupt * context. Only allow the move if cache is shared. Otherwise an * interrupt intensive workload could force all tasks onto one * node depending on the IO topology or IRQ affinity settings. + * + * If the prev_cpu is idle and cache affine then avoid a migration. 
+ * There is no guarantee that the cache hot data from an interrupt + * is more important than cache hot data on the prev_cpu and from + * a cpufreq perspective, it's better to have higher utilisation + * on one CPU. */ if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu)) - return true; + return idle_cpu(prev_cpu) ? prev_cpu : this_cpu; if (sync && cpu_rq(this_cpu)->nr_running == 1) - return true; + return this_cpu; - return false; + return nr_cpumask_bits; } -static bool +static int wake_affine_weight(struct sched_domain *sd, struct task_struct *p, int this_cpu, int prev_cpu, int sync) { @@ -5720,13 +5887,12 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, unsigned long task_load; this_eff_load = target_load(this_cpu, sd->wake_idx); - prev_eff_load = source_load(prev_cpu, sd->wake_idx); if (sync) { unsigned long current_load = task_h_load(current); if (current_load > this_eff_load) - return true; + return this_cpu; this_eff_load -= current_load; } @@ -5738,36 +5904,87 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, this_eff_load *= 100; this_eff_load *= capacity_of(prev_cpu); + prev_eff_load = source_load(prev_cpu, sd->wake_idx); prev_eff_load -= task_load; if (sched_feat(WA_BIAS)) prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; prev_eff_load *= capacity_of(this_cpu); - return this_eff_load <= prev_eff_load; + /* + * If sync, adjust the weight of prev_eff_load such that if + * prev_eff == this_eff that select_idle_sibling() will consider + * stacking the wakee on top of the waker if no other CPU is + * idle. + */ + if (sync) + prev_eff_load += 1; + + return this_eff_load < prev_eff_load ? 
this_cpu : nr_cpumask_bits; } +#ifdef CONFIG_NUMA_BALANCING +static void +update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target) +{ + unsigned long interval; + + if (!static_branch_likely(&sched_numa_balancing)) + return; + + /* If balancing has no preference then continue gathering data */ + if (p->numa_preferred_nid == -1) + return; + + /* + * If the wakeup is not affecting locality then it is neutral from + * the perspective of NUMA balacing so continue gathering data. + */ + if (cpu_to_node(prev_cpu) == cpu_to_node(target)) + return; + + /* + * Temporarily prevent NUMA balancing trying to place waker/wakee after + * wakee has been moved by wake_affine. This will potentially allow + * related tasks to converge and update their data placement. The + * 4 * numa_scan_period is to allow the two-pass filter to migrate + * hot data to the wakers node. + */ + interval = max(sysctl_numa_balancing_scan_delay, + p->numa_scan_period << 2); + p->numa_migrate_retry = jiffies + msecs_to_jiffies(interval); + + interval = max(sysctl_numa_balancing_scan_delay, + current->numa_scan_period << 2); + current->numa_migrate_retry = jiffies + msecs_to_jiffies(interval); +} +#else +static void +update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target) +{ +} +#endif + static int wake_affine(struct sched_domain *sd, struct task_struct *p, - int prev_cpu, int sync) + int this_cpu, int prev_cpu, int sync) { - int this_cpu = smp_processor_id(); - bool affine = false; + int target = nr_cpumask_bits; - if (sched_feat(WA_IDLE) && !affine) - affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync); + if (sched_feat(WA_IDLE)) + target = wake_affine_idle(this_cpu, prev_cpu, sync); - if (sched_feat(WA_WEIGHT) && !affine) - affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync); + if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits) + target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync); schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); 
- if (affine) { - schedstat_inc(sd->ttwu_move_affine); - schedstat_inc(p->se.statistics.nr_wakeups_affine); - } + if (target == nr_cpumask_bits) + return prev_cpu; - return affine; + update_wa_numa_placement(p, prev_cpu, target); + schedstat_inc(sd->ttwu_move_affine); + schedstat_inc(p->se.statistics.nr_wakeups_affine); + return target; } -static inline unsigned long task_util(struct task_struct *p); static unsigned long cpu_util_wake(int cpu, struct task_struct *p); static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) @@ -5822,7 +6039,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, max_spare_cap = 0; for_each_cpu(i, sched_group_span(group)) { - /* Bias balancing toward cpus of our domain */ + /* Bias balancing toward CPUs of our domain */ if (local_group) load = source_load(i, load_idx); else @@ -5852,7 +6069,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, if (min_runnable_load > (runnable_load + imbalance)) { /* * The runnable load is significantly smaller - * so we can pick this new cpu + * so we can pick this new CPU: */ min_runnable_load = runnable_load; min_avg_load = avg_load; @@ -5861,7 +6078,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, (100*min_avg_load > imbalance_scale*avg_load)) { /* * The runnable loads are close so take the - * blocked load into account through avg_load. + * blocked load into account through avg_load: */ min_avg_load = avg_load; idlest = group; @@ -5899,6 +6116,18 @@ skip_spare: if (!idlest) return NULL; + /* + * When comparing groups across NUMA domains, it's possible for the + * local domain to be very lightly loaded relative to the remote + * domains but "imbalance" skews the comparison making remote CPUs + * look much more favourable. When considering cross-domain, add + * imbalance to the runnable load on the remote node and consider + * staying local. 
+ */ + if ((sd->flags & SD_NUMA) && + min_runnable_load + imbalance >= this_runnable_load) + return NULL; + if (min_runnable_load > (this_runnable_load + imbalance)) return NULL; @@ -5910,7 +6139,7 @@ skip_spare: } /* - * find_idlest_group_cpu - find the idlest cpu among the cpus in group. + * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group. */ static int find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) @@ -5988,12 +6217,12 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p new_cpu = find_idlest_group_cpu(group, p, cpu); if (new_cpu == cpu) { - /* Now try balancing at a lower domain level of cpu */ + /* Now try balancing at a lower domain level of 'cpu': */ sd = sd->child; continue; } - /* Now try balancing at a lower domain level of new_cpu */ + /* Now try balancing at a lower domain level of 'new_cpu': */ cpu = new_cpu; weight = sd->span_weight; sd = NULL; @@ -6003,7 +6232,6 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p if (tmp->flags & sd_flag) sd = tmp; } - /* while loop will break here if sd == NULL */ } return new_cpu; @@ -6193,17 +6421,32 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t static int select_idle_sibling(struct task_struct *p, int prev, int target) { struct sched_domain *sd; - int i; + int i, recent_used_cpu; if (idle_cpu(target)) return target; /* - * If the previous cpu is cache affine and idle, don't be stupid. 
+ * If the previous CPU is cache affine and idle, don't be stupid: */ if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) return prev; + /* Check a recently used CPU as a potential idle candidate: */ + recent_used_cpu = p->recent_used_cpu; + if (recent_used_cpu != prev && + recent_used_cpu != target && + cpus_share_cache(recent_used_cpu, target) && + idle_cpu(recent_used_cpu) && + cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { + /* + * Replace recent_used_cpu with prev as it is a potential + * candidate for the next wake: + */ + p->recent_used_cpu = prev; + return recent_used_cpu; + } + sd = rcu_dereference(per_cpu(sd_llc, target)); if (!sd) return target; @@ -6223,11 +6466,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) return target; } -/* - * cpu_util returns the amount of capacity of a CPU that is used by CFS - * tasks. The unit of the return value must be the one of capacity so we can - * compare the utilization with the capacity of the CPU that is available for - * CFS task (ie cpu_capacity). +/** + * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks + * @cpu: the CPU to get the utilization of + * + * The unit of the return value must be the one of capacity so we can compare + * the utilization with the capacity of the CPU that is available for CFS task + * (ie cpu_capacity). * * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the * recent utilization of currently non-runnable tasks on a CPU. It represents @@ -6238,6 +6483,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * current capacity (capacity_curr <= capacity_orig) of the CPU because it is * the running time on this CPU scaled by capacity_curr. * + * The estimated utilization of a CPU is defined to be the maximum between its + * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks + * currently RUNNABLE on that CPU. 
+ * This allows to properly represent the expected utilization of a CPU which + * has just got a big task running since a long sleep period. At the same time + * however it preserves the benefits of the "blocked utilization" in + * describing the potential for other tasks waking up on the same CPU. + * * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even * higher than capacity_orig because of unfortunate rounding in * cfs.avg.util_avg or just after migrating tasks and new task wakeups until @@ -6248,36 +6501,77 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * available capacity. We allow utilization to overshoot capacity_curr (but not * capacity_orig) as it useful for predicting the capacity required after task * migrations (scheduler-driven DVFS). + * + * Return: the (estimated) utilization for the specified CPU */ -static unsigned long cpu_util(int cpu) +static inline unsigned long cpu_util(int cpu) { - unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; - unsigned long capacity = capacity_orig_of(cpu); + struct cfs_rq *cfs_rq; + unsigned int util; - return (util >= capacity) ? capacity : util; -} + cfs_rq = &cpu_rq(cpu)->cfs; + util = READ_ONCE(cfs_rq->avg.util_avg); -static inline unsigned long task_util(struct task_struct *p) -{ - return p->se.avg.util_avg; + if (sched_feat(UTIL_EST)) + util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued)); + + return min_t(unsigned long, util, capacity_orig_of(cpu)); } /* - * cpu_util_wake: Compute cpu utilization with any contributions from + * cpu_util_wake: Compute CPU utilization with any contributions from * the waking task p removed. 
*/ static unsigned long cpu_util_wake(int cpu, struct task_struct *p) { - unsigned long util, capacity; + struct cfs_rq *cfs_rq; + unsigned int util; /* Task has no contribution or is new */ - if (cpu != task_cpu(p) || !p->se.avg.last_update_time) + if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) return cpu_util(cpu); - capacity = capacity_orig_of(cpu); - util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0); + cfs_rq = &cpu_rq(cpu)->cfs; + util = READ_ONCE(cfs_rq->avg.util_avg); + + /* Discount task's blocked util from CPU's util */ + util -= min_t(unsigned int, util, task_util(p)); - return (util >= capacity) ? capacity : util; + /* + * Covered cases: + * + * a) if *p is the only task sleeping on this CPU, then: + * cpu_util (== task_util) > util_est (== 0) + * and thus we return: + * cpu_util_wake = (cpu_util - task_util) = 0 + * + * b) if other tasks are SLEEPING on this CPU, which is now exiting + * IDLE, then: + * cpu_util >= task_util + * cpu_util > util_est (== 0) + * and thus we discount *p's blocked utilization to return: + * cpu_util_wake = (cpu_util - task_util) >= 0 + * + * c) if other tasks are RUNNABLE on that CPU and + * util_est > cpu_util + * then we use util_est since it returns a more restrictive + * estimation of the spare capacity on that CPU, by just + * considering the expected utilization of tasks already + * runnable on that CPU. + * + * Cases a) and b) are covered by the above code, while case c) is + * covered by the following code when estimated utilization is + * enabled. + */ + if (sched_feat(UTIL_EST)) + util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued)); + + /* + * Utilization (estimated) can exceed the CPU capacity, thus let's + * clamp to the maximum CPU capacity to ensure consistency with + * the cpu_util call. 
+ */ + return min_t(unsigned long, util, capacity_orig_of(cpu)); } /* @@ -6309,10 +6603,10 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, * SD_BALANCE_FORK, or SD_BALANCE_EXEC. * - * Balances load by selecting the idlest cpu in the idlest group, or under - * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set. + * Balances load by selecting the idlest CPU in the idlest group, or under + * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set. * - * Returns the target cpu number. + * Returns the target CPU number. * * preempt must be disabled. */ @@ -6323,7 +6617,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int cpu = smp_processor_id(); int new_cpu = prev_cpu; int want_affine = 0; - int sync = wake_flags & WF_SYNC; + int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); @@ -6337,7 +6631,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f break; /* - * If both cpu and prev_cpu are part of this domain, + * If both 'cpu' and 'prev_cpu' are part of this domain, * cpu is a valid SD_WAKE_AFFINE target. */ if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && @@ -6357,8 +6651,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (cpu == prev_cpu) goto pick_cpu; - if (wake_affine(affine_sd, p, prev_cpu, sync)) - new_cpu = cpu; + new_cpu = wake_affine(affine_sd, p, cpu, prev_cpu, sync); } if (sd && !(sd_flag & SD_BALANCE_FORK)) { @@ -6372,9 +6665,12 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (!sd) { pick_cpu: - if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ + if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? 
*/ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); + if (want_affine) + current->recent_used_cpu = cpu; + } } else { new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); } @@ -6386,9 +6682,9 @@ pick_cpu: static void detach_entity_cfs_rq(struct sched_entity *se); /* - * Called immediately before a task is migrated to a new cpu; task_cpu(p) and + * Called immediately before a task is migrated to a new CPU; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the - * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held. + * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held. */ static void migrate_task_rq_fair(struct task_struct *p) { @@ -6717,7 +7013,7 @@ simple: p = task_of(se); -done: __maybe_unused +done: __maybe_unused; #ifdef CONFIG_SMP /* * Move the next running task to the front of @@ -6793,7 +7089,7 @@ static void yield_task_fair(struct rq *rq) * so we don't do microscopic update in schedule() * and double the fastpath cost. */ - rq_clock_skip_update(rq, true); + rq_clock_skip_update(rq); } set_skip_buddy(se); @@ -6822,17 +7118,17 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * BASICS * * The purpose of load-balancing is to achieve the same basic fairness the - * per-cpu scheduler provides, namely provide a proportional amount of compute + * per-CPU scheduler provides, namely provide a proportional amount of compute * time to each task. This is expressed in the following equation: * * W_i,n/P_i == W_j,n/P_j for all i,j (1) * - * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight + * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight * W_i,0 is defined as: * * W_i,0 = \Sum_j w_i,j (2) * - * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight + * Where w_i,j is the weight of the j-th runnable task on CPU i. 
This weight * is derived from the nice value as per sched_prio_to_weight[]. * * The weight average is an exponential decay average of the instantaneous @@ -6840,7 +7136,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) * - * C_i is the compute capacity of cpu i, typically it is the + * C_i is the compute capacity of CPU i, typically it is the * fraction of 'recent' time available for SCHED_OTHER task execution. But it * can also include other factors [XXX]. * @@ -6861,11 +7157,11 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * SCHED DOMAINS * * In order to solve the imbalance equation (4), and avoid the obvious O(n^2) - * for all i,j solution, we create a tree of cpus that follows the hardware + * for all i,j solution, we create a tree of CPUs that follows the hardware * topology where each level pairs two lower groups (or better). This results - * in O(log n) layers. Furthermore we reduce the number of cpus going up the + * in O(log n) layers. Furthermore we reduce the number of CPUs going up the * tree to only the first of the previous level and we decrease the frequency - * of load-balance at each level inv. proportional to the number of cpus in + * of load-balance at each level inv. proportional to the number of CPUs in * the groups. * * This yields: @@ -6874,7 +7170,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * \Sum { --- * --- * 2^i } = O(n) (5) * i = 0 2^i 2^i * `- size of each group - * | | `- number of cpus doing load-balance + * | | `- number of CPUs doing load-balance * | `- freq * `- sum over all levels * @@ -6882,7 +7178,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * this makes (5) the runtime complexity of the balancer. 
* * An important property here is that each CPU is still (indirectly) connected - * to every other cpu in at most O(log n) steps: + * to every other CPU in at most O(log n) steps: * * The adjacency matrix of the resulting graph is given by: * @@ -6894,7 +7190,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * * A^(log_2 n)_i,j != 0 for all i,j (7) * - * Showing there's indeed a path between every cpu in at most O(log n) steps. + * Showing there's indeed a path between every CPU in at most O(log n) steps. * The task movement gives a factor of O(m), giving a convergence complexity * of: * @@ -6904,7 +7200,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * WORK CONSERVING * * In order to avoid CPUs going idle while there's still work to do, new idle - * balancing is more aggressive and has the newly idle cpu iterate up the domain + * balancing is more aggressive and has the newly idle CPU iterate up the domain * tree itself instead of relying on other CPUs to bring it work. * * This adds some complexity to both (5) and (8) but it reduces the total idle @@ -6925,7 +7221,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10) * - * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i. + * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i. * * The big problem is S_k, its a global sum needed to compute a local (W_i) * property. 
@@ -6942,6 +7238,8 @@ enum fbq_type { regular, remote, all }; #define LBF_NEED_BREAK 0x02 #define LBF_DST_PINNED 0x04 #define LBF_SOME_PINNED 0x08 +#define LBF_NOHZ_STATS 0x10 +#define LBF_NOHZ_AGAIN 0x20 struct lb_env { struct sched_domain *sd; @@ -7089,7 +7387,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) env->flags |= LBF_SOME_PINNED; /* - * Remember if this task can be migrated to any other cpu in + * Remember if this task can be migrated to any other CPU in * our sched_group. We may want to revisit it if we couldn't * meet load balance goals by pulling other tasks on src_cpu. * @@ -7099,7 +7397,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) return 0; - /* Prevent to re-select dst_cpu via env's cpus */ + /* Prevent to re-select dst_cpu via env's CPUs: */ for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { env->flags |= LBF_DST_PINNED; @@ -7326,6 +7624,17 @@ static void attach_tasks(struct lb_env *env) rq_unlock(env->dst_rq, &rf); } +static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->avg.load_avg) + return true; + + if (cfs_rq->avg.util_avg) + return true; + + return false; +} + #ifdef CONFIG_FAIR_GROUP_SCHED static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) @@ -7350,6 +7659,7 @@ static void update_blocked_averages(int cpu) struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq, *pos; struct rq_flags rf; + bool done = true; rq_lock_irqsave(rq, &rf); update_rq_clock(rq); @@ -7379,7 +7689,17 @@ static void update_blocked_averages(int cpu) */ if (cfs_rq_is_decayed(cfs_rq)) list_del_leaf_cfs_rq(cfs_rq); + + /* Don't need periodic decay once load/util_avg are null */ + if (cfs_rq_has_blocked(cfs_rq)) + done = false; } + +#ifdef CONFIG_NO_HZ_COMMON + rq->last_blocked_load_update_tick = jiffies; + if (done) + rq->has_blocked_load = 0; +#endif rq_unlock_irqrestore(rq, &rf); } @@ 
-7439,6 +7759,11 @@ static inline void update_blocked_averages(int cpu) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); +#ifdef CONFIG_NO_HZ_COMMON + rq->last_blocked_load_update_tick = jiffies; + if (!cfs_rq_has_blocked(cfs_rq)) + rq->has_blocked_load = 0; +#endif rq_unlock_irqrestore(rq, &rf); } @@ -7673,8 +7998,8 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) * Group imbalance indicates (and tries to solve) the problem where balancing * groups is inadequate due to ->cpus_allowed constraints. * - * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a - * cpumask covering 1 cpu of the first group and 3 cpus of the second group. + * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a + * cpumask covering 1 CPU of the first group and 3 CPUs of the second group. * Something like: * * { 0 1 2 3 } { 4 5 6 7 } @@ -7682,7 +8007,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) * * If we were to balance group-wise we'd place two tasks in the first group and * two tasks in the second group. Clearly this is undesired as it will overload - * cpu 3 and leave one of the cpus in the second group unused. + * cpu 3 and leave one of the CPUs in the second group unused. 
* * The current solution to this issue is detecting the skew in the first group * by noticing the lower domain failed to reach balance and had difficulty @@ -7773,6 +8098,28 @@ group_type group_classify(struct sched_group *group, return group_other; } +static bool update_nohz_stats(struct rq *rq, bool force) +{ +#ifdef CONFIG_NO_HZ_COMMON + unsigned int cpu = rq->cpu; + + if (!rq->has_blocked_load) + return false; + + if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) + return false; + + if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick)) + return true; + + update_blocked_averages(cpu); + + return rq->has_blocked_load; +#else + return false; +#endif +} + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. @@ -7795,7 +8142,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, for_each_cpu_and(i, sched_group_span(group), env->cpus) { struct rq *rq = cpu_rq(i); - /* Bias balancing toward cpus of our domain */ + if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) + env->flags |= LBF_NOHZ_AGAIN; + + /* Bias balancing toward CPUs of our domain: */ if (local_group) load = target_load(i, load_idx); else @@ -7881,7 +8231,7 @@ asym_packing: if (!(env->sd->flags & SD_ASYM_PACKING)) return true; - /* No ASYM_PACKING if target cpu is already busy */ + /* No ASYM_PACKING if target CPU is already busy */ if (env->idle == CPU_NOT_IDLE) return true; /* @@ -7894,7 +8244,7 @@ asym_packing: if (!sds->busiest) return true; - /* Prefer to move from lowest priority cpu's work */ + /* Prefer to move from lowest priority CPU's work */ if (sched_asym_prefer(sds->busiest->asym_prefer_cpu, sg->asym_prefer_cpu)) return true; @@ -7950,6 +8300,11 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1; +#ifdef CONFIG_NO_HZ_COMMON + if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked)) + 
env->flags |= LBF_NOHZ_STATS; +#endif + load_idx = get_sd_load_idx(env->sd, env->idle); do { @@ -8003,6 +8358,15 @@ next_group: sg = sg->next; } while (sg != env->sd->groups); +#ifdef CONFIG_NO_HZ_COMMON + if ((env->flags & LBF_NOHZ_AGAIN) && + cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) { + + WRITE_ONCE(nohz.next_blocked, + jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD)); + } +#endif + if (env->sd->flags & SD_NUMA) env->fbq_type = fbq_classify_group(&sds->busiest_stat); @@ -8147,7 +8511,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s if (busiest->group_type == group_imbalanced) { /* * In the group_imb case we cannot rely on group-wide averages - * to ensure cpu-load equilibrium, look at wider averages. XXX + * to ensure CPU-load equilibrium, look at wider averages. XXX */ busiest->load_per_task = min(busiest->load_per_task, sds->avg_load); @@ -8166,7 +8530,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s } /* - * If there aren't any idle cpus, avoid creating some. + * If there aren't any idle CPUs, avoid creating some. */ if (busiest->group_type == group_overloaded && local->group_type == group_overloaded) { @@ -8180,9 +8544,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s } /* - * We're trying to get all the cpus to the average_load, so we don't + * We're trying to get all the CPUs to the average_load, so we don't * want to push ourselves above the average load, nor do we wish to - * reduce the max loaded cpu below the average load. At the same time, + * reduce the max loaded CPU below the average load. At the same time, * we also don't want to reduce the group load below the group * capacity. Thus we look for the minimum possible imbalance. */ @@ -8276,9 +8640,9 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (env->idle == CPU_IDLE) { /* - * This cpu is idle. 
If the busiest group is not overloaded + * This CPU is idle. If the busiest group is not overloaded * and there is no imbalance between this and busiest group - * wrt idle cpus, it is balanced. The imbalance becomes + * wrt idle CPUs, it is balanced. The imbalance becomes * significant if the diff is greater than 1 otherwise we * might end up to just move the imbalance on another group */ @@ -8306,7 +8670,7 @@ out_balanced: } /* - * find_busiest_queue - find the busiest runqueue among the cpus in group. + * find_busiest_queue - find the busiest runqueue among the CPUs in the group. */ static struct rq *find_busiest_queue(struct lb_env *env, struct sched_group *group) @@ -8350,7 +8714,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, /* * When comparing with imbalance, use weighted_cpuload() - * which is not scaled with the cpu capacity. + * which is not scaled with the CPU capacity. */ if (rq->nr_running == 1 && wl > env->imbalance && @@ -8358,9 +8722,9 @@ static struct rq *find_busiest_queue(struct lb_env *env, continue; /* - * For the load comparisons with the other cpu's, consider - * the weighted_cpuload() scaled with the cpu capacity, so - * that the load can be moved away from the cpu that is + * For the load comparisons with the other CPU's, consider + * the weighted_cpuload() scaled with the CPU capacity, so + * that the load can be moved away from the CPU that is * potentially running at a lower capacity. * * Thus we're looking for max(wl_i / capacity_i), crosswise @@ -8431,13 +8795,13 @@ static int should_we_balance(struct lb_env *env) return 0; /* - * In the newly idle case, we will allow all the cpu's + * In the newly idle case, we will allow all the CPUs * to do the newly idle load balance. 
*/ if (env->idle == CPU_NEWLY_IDLE) return 1; - /* Try to find first idle cpu */ + /* Try to find first idle CPU */ for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { if (!idle_cpu(cpu)) continue; @@ -8450,7 +8814,7 @@ static int should_we_balance(struct lb_env *env) balance_cpu = group_balance_cpu(sg); /* - * First idle cpu or the first cpu(busiest) in this sched group + * First idle CPU or the first CPU(busiest) in this sched group * is eligible for doing load balancing at this and above domains. */ return balance_cpu == env->dst_cpu; @@ -8559,7 +8923,7 @@ more_balance: * Revisit (affine) tasks on src_cpu that couldn't be moved to * us and move them to an alternate dst_cpu in our sched_group * where they can run. The upper limit on how many times we - * iterate on same src_cpu is dependent on number of cpus in our + * iterate on same src_cpu is dependent on number of CPUs in our * sched_group. * * This changes load balance semantics a bit on who can move @@ -8576,7 +8940,7 @@ more_balance: */ if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { - /* Prevent to re-select dst_cpu via env's cpus */ + /* Prevent to re-select dst_cpu via env's CPUs */ cpumask_clear_cpu(env.dst_cpu, env.cpus); env.dst_rq = cpu_rq(env.new_dst_cpu); @@ -8638,9 +9002,10 @@ more_balance: raw_spin_lock_irqsave(&busiest->lock, flags); - /* don't kick the active_load_balance_cpu_stop, - * if the curr task on busiest cpu can't be - * moved to this_cpu + /* + * Don't kick the active_load_balance_cpu_stop, + * if the curr task on busiest CPU can't be + * moved to this_cpu: */ if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { raw_spin_unlock_irqrestore(&busiest->lock, @@ -8752,121 +9117,7 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance) } /* - * idle_balance is called by schedule() if this_cpu is about to become - * idle. Attempts to pull tasks from other CPUs. 
- */ -static int idle_balance(struct rq *this_rq, struct rq_flags *rf) -{ - unsigned long next_balance = jiffies + HZ; - int this_cpu = this_rq->cpu; - struct sched_domain *sd; - int pulled_task = 0; - u64 curr_cost = 0; - - /* - * We must set idle_stamp _before_ calling idle_balance(), such that we - * measure the duration of idle_balance() as idle time. - */ - this_rq->idle_stamp = rq_clock(this_rq); - - /* - * Do not pull tasks towards !active CPUs... - */ - if (!cpu_active(this_cpu)) - return 0; - - /* - * This is OK, because current is on_cpu, which avoids it being picked - * for load-balance and preemption/IRQs are still disabled avoiding - * further scheduler activity on it and we're being very careful to - * re-start the picking loop. - */ - rq_unpin_lock(this_rq, rf); - - if (this_rq->avg_idle < sysctl_sched_migration_cost || - !this_rq->rd->overload) { - rcu_read_lock(); - sd = rcu_dereference_check_sched_domain(this_rq->sd); - if (sd) - update_next_balance(sd, &next_balance); - rcu_read_unlock(); - - goto out; - } - - raw_spin_unlock(&this_rq->lock); - - update_blocked_averages(this_cpu); - rcu_read_lock(); - for_each_domain(this_cpu, sd) { - int continue_balancing = 1; - u64 t0, domain_cost; - - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; - - if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { - update_next_balance(sd, &next_balance); - break; - } - - if (sd->flags & SD_BALANCE_NEWIDLE) { - t0 = sched_clock_cpu(this_cpu); - - pulled_task = load_balance(this_cpu, this_rq, - sd, CPU_NEWLY_IDLE, - &continue_balancing); - - domain_cost = sched_clock_cpu(this_cpu) - t0; - if (domain_cost > sd->max_newidle_lb_cost) - sd->max_newidle_lb_cost = domain_cost; - - curr_cost += domain_cost; - } - - update_next_balance(sd, &next_balance); - - /* - * Stop searching for tasks to pull if there are - * now runnable tasks on this rq. 
- */ - if (pulled_task || this_rq->nr_running > 0) - break; - } - rcu_read_unlock(); - - raw_spin_lock(&this_rq->lock); - - if (curr_cost > this_rq->max_idle_balance_cost) - this_rq->max_idle_balance_cost = curr_cost; - - /* - * While browsing the domains, we released the rq lock, a task could - * have been enqueued in the meantime. Since we're not going idle, - * pretend we pulled a task. - */ - if (this_rq->cfs.h_nr_running && !pulled_task) - pulled_task = 1; - -out: - /* Move the next balance forward */ - if (time_after(this_rq->next_balance, next_balance)) - this_rq->next_balance = next_balance; - - /* Is there a task of a high priority class? */ - if (this_rq->nr_running != this_rq->cfs.h_nr_running) - pulled_task = -1; - - if (pulled_task) - this_rq->idle_stamp = 0; - - rq_repin_lock(this_rq, rf); - - return pulled_task; -} - -/* - * active_load_balance_cpu_stop is run by cpu stopper. It pushes + * active_load_balance_cpu_stop is run by the CPU stopper. It pushes * running tasks off the busiest CPU onto idle CPUs. It requires at * least 1 task to be running on each physical CPU where possible, and * avoids physical / logical imbalances. @@ -8890,7 +9141,7 @@ static int active_load_balance_cpu_stop(void *data) if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu)) goto out_unlock; - /* make sure the requested cpu hasn't gone down in the meantime */ + /* Make sure the requested CPU hasn't gone down in the meantime: */ if (unlikely(busiest_cpu != smp_processor_id() || !busiest_rq->active_balance)) goto out_unlock; @@ -8902,7 +9153,7 @@ static int active_load_balance_cpu_stop(void *data) /* * This condition is "impossible", if it occurs * we need to fix it. Originally reported by - * Bjorn Helgaas on a 128-cpu setup. + * Bjorn Helgaas on a 128-CPU setup. 
*/ BUG_ON(busiest_rq == target_rq); @@ -8956,141 +9207,6 @@ out_unlock: return 0; } -static inline int on_null_domain(struct rq *rq) -{ - return unlikely(!rcu_dereference_sched(rq->sd)); -} - -#ifdef CONFIG_NO_HZ_COMMON -/* - * idle load balancing details - * - When one of the busy CPUs notice that there may be an idle rebalancing - * needed, they will kick the idle load balancer, which then does idle - * load balancing for all the idle CPUs. - */ -static struct { - cpumask_var_t idle_cpus_mask; - atomic_t nr_cpus; - unsigned long next_balance; /* in jiffy units */ -} nohz ____cacheline_aligned; - -static inline int find_new_ilb(void) -{ - int ilb = cpumask_first(nohz.idle_cpus_mask); - - if (ilb < nr_cpu_ids && idle_cpu(ilb)) - return ilb; - - return nr_cpu_ids; -} - -/* - * Kick a CPU to do the nohz balancing, if it is time for it. We pick the - * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle - * CPU (if there is one). - */ -static void nohz_balancer_kick(void) -{ - int ilb_cpu; - - nohz.next_balance++; - - ilb_cpu = find_new_ilb(); - - if (ilb_cpu >= nr_cpu_ids) - return; - - if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) - return; - /* - * Use smp_send_reschedule() instead of resched_cpu(). - * This way we generate a sched IPI on the target cpu which - * is idle. And the softirq performing nohz idle load balance - * will be run before returning from the IPI. - */ - smp_send_reschedule(ilb_cpu); - return; -} - -void nohz_balance_exit_idle(unsigned int cpu) -{ - if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { - /* - * Completely isolated CPUs don't ever set, so we must test. 
- */ - if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); - atomic_dec(&nohz.nr_cpus); - } - clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); - } -} - -static inline void set_cpu_sd_state_busy(void) -{ - struct sched_domain *sd; - int cpu = smp_processor_id(); - - rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_llc, cpu)); - - if (!sd || !sd->nohz_idle) - goto unlock; - sd->nohz_idle = 0; - - atomic_inc(&sd->shared->nr_busy_cpus); -unlock: - rcu_read_unlock(); -} - -void set_cpu_sd_state_idle(void) -{ - struct sched_domain *sd; - int cpu = smp_processor_id(); - - rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_llc, cpu)); - - if (!sd || sd->nohz_idle) - goto unlock; - sd->nohz_idle = 1; - - atomic_dec(&sd->shared->nr_busy_cpus); -unlock: - rcu_read_unlock(); -} - -/* - * This routine will record that the cpu is going idle with tick stopped. - * This info will be used in performing idle load balancing in the future. - */ -void nohz_balance_enter_idle(int cpu) -{ - /* - * If this cpu is going down, then nothing needs to be done. - */ - if (!cpu_active(cpu)) - return; - - /* Spare idle load balancing on CPUs that don't want to be disturbed: */ - if (!housekeeping_cpu(cpu, HK_FLAG_SCHED)) - return; - - if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) - return; - - /* - * If we're a completely isolated CPU, we don't play. 
- */ - if (on_null_domain(cpu_rq(cpu))) - return; - - cpumask_set_cpu(cpu, nohz.idle_cpus_mask); - atomic_inc(&nohz.nr_cpus); - set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); -} -#endif - static DEFINE_SPINLOCK(balancing); /* @@ -9120,8 +9236,6 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) int need_serialize, need_decay = 0; u64 max_cost = 0; - update_blocked_averages(cpu); - rcu_read_lock(); for_each_domain(cpu, sd) { /* @@ -9211,68 +9325,56 @@ out: } } +static inline int on_null_domain(struct rq *rq) +{ + return unlikely(!rcu_dereference_sched(rq->sd)); +} + #ifdef CONFIG_NO_HZ_COMMON /* - * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the - * rebalancing for all the cpus for whom scheduler ticks are stopped. + * idle load balancing details + * - When one of the busy CPUs notice that there may be an idle rebalancing + * needed, they will kick the idle load balancer, which then does idle + * load balancing for all the idle CPUs. */ -static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) -{ - int this_cpu = this_rq->cpu; - struct rq *rq; - int balance_cpu; - /* Earliest time when we have to do rebalance again */ - unsigned long next_balance = jiffies + 60*HZ; - int update_next_balance = 0; - if (idle != CPU_IDLE || - !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) - goto end; +static inline int find_new_ilb(void) +{ + int ilb = cpumask_first(nohz.idle_cpus_mask); - for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { - if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) - continue; + if (ilb < nr_cpu_ids && idle_cpu(ilb)) + return ilb; - /* - * If this cpu gets work to do, stop the load balancing - * work being done for other cpus. Next load - * balancing owner will pick it up. - */ - if (need_resched()) - break; + return nr_cpu_ids; +} - rq = cpu_rq(balance_cpu); +/* + * Kick a CPU to do the nohz balancing, if it is time for it. 
We pick the + * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle + * CPU (if there is one). + */ +static void kick_ilb(unsigned int flags) +{ + int ilb_cpu; - /* - * If time for next balance is due, - * do the balance. - */ - if (time_after_eq(jiffies, rq->next_balance)) { - struct rq_flags rf; + nohz.next_balance++; - rq_lock_irq(rq, &rf); - update_rq_clock(rq); - cpu_load_update_idle(rq); - rq_unlock_irq(rq, &rf); + ilb_cpu = find_new_ilb(); - rebalance_domains(rq, CPU_IDLE); - } + if (ilb_cpu >= nr_cpu_ids) + return; - if (time_after(next_balance, rq->next_balance)) { - next_balance = rq->next_balance; - update_next_balance = 1; - } - } + flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu)); + if (flags & NOHZ_KICK_MASK) + return; /* - * next_balance will be updated only when there is a need. - * When the CPU is attached to null domain for ex, it will not be - * updated. + * Use smp_send_reschedule() instead of resched_cpu(). + * This way we generate a sched IPI on the target CPU which + * is idle. And the softirq performing nohz idle load balance + * will be run before returning from the IPI. */ - if (likely(update_next_balance)) - nohz.next_balance = next_balance; -end: - clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); + smp_send_reschedule(ilb_cpu); } /* @@ -9286,36 +9388,41 @@ end: * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler * domain span are idle. */ -static inline bool nohz_kick_needed(struct rq *rq) +static void nohz_balancer_kick(struct rq *rq) { unsigned long now = jiffies; struct sched_domain_shared *sds; struct sched_domain *sd; int nr_busy, i, cpu = rq->cpu; - bool kick = false; + unsigned int flags = 0; if (unlikely(rq->idle_balance)) - return false; + return; - /* - * We may be recently in ticked or tickless idle mode. At the first - * busy tick after returning from idle, we will update the busy stats. 
- */ - set_cpu_sd_state_busy(); - nohz_balance_exit_idle(cpu); + /* + * We may be recently in ticked or tickless idle mode. At the first + * busy tick after returning from idle, we will update the busy stats. + */ + nohz_balance_exit_idle(rq); /* * None are in tickless mode and hence no need for NOHZ idle load * balancing. */ if (likely(!atomic_read(&nohz.nr_cpus))) - return false; + return; + + if (READ_ONCE(nohz.has_blocked) && + time_after(now, READ_ONCE(nohz.next_blocked))) + flags = NOHZ_STATS_KICK; if (time_before(now, nohz.next_balance)) - return false; + goto out; - if (rq->nr_running >= 2) - return true; + if (rq->nr_running >= 2) { + flags = NOHZ_KICK_MASK; + goto out; + } rcu_read_lock(); sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); @@ -9326,7 +9433,7 @@ static inline bool nohz_kick_needed(struct rq *rq) */ nr_busy = atomic_read(&sds->nr_busy_cpus); if (nr_busy > 1) { - kick = true; + flags = NOHZ_KICK_MASK; goto unlock; } @@ -9336,7 +9443,7 @@ static inline bool nohz_kick_needed(struct rq *rq) if (sd) { if ((rq->cfs.h_nr_running >= 1) && check_cpu_capacity(rq, sd)) { - kick = true; + flags = NOHZ_KICK_MASK; goto unlock; } } @@ -9349,18 +9456,421 @@ static inline bool nohz_kick_needed(struct rq *rq) continue; if (sched_asym_prefer(i, cpu)) { - kick = true; + flags = NOHZ_KICK_MASK; goto unlock; } } } unlock: rcu_read_unlock(); - return kick; +out: + if (flags) + kick_ilb(flags); +} + +static void set_cpu_sd_state_busy(int cpu) +{ + struct sched_domain *sd; + + rcu_read_lock(); + sd = rcu_dereference(per_cpu(sd_llc, cpu)); + + if (!sd || !sd->nohz_idle) + goto unlock; + sd->nohz_idle = 0; + + atomic_inc(&sd->shared->nr_busy_cpus); +unlock: + rcu_read_unlock(); +} + +void nohz_balance_exit_idle(struct rq *rq) +{ + SCHED_WARN_ON(rq != this_rq()); + + if (likely(!rq->nohz_tick_stopped)) + return; + + rq->nohz_tick_stopped = 0; + cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask); + atomic_dec(&nohz.nr_cpus); + + set_cpu_sd_state_busy(rq->cpu); +} + 
+static void set_cpu_sd_state_idle(int cpu) +{ + struct sched_domain *sd; + + rcu_read_lock(); + sd = rcu_dereference(per_cpu(sd_llc, cpu)); + + if (!sd || sd->nohz_idle) + goto unlock; + sd->nohz_idle = 1; + + atomic_dec(&sd->shared->nr_busy_cpus); +unlock: + rcu_read_unlock(); +} + +/* + * This routine will record that the CPU is going idle with tick stopped. + * This info will be used in performing idle load balancing in the future. + */ +void nohz_balance_enter_idle(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + SCHED_WARN_ON(cpu != smp_processor_id()); + + /* If this CPU is going down, then nothing needs to be done: */ + if (!cpu_active(cpu)) + return; + + /* Spare idle load balancing on CPUs that don't want to be disturbed: */ + if (!housekeeping_cpu(cpu, HK_FLAG_SCHED)) + return; + + /* + * Can be set safely without rq->lock held + * If a clear happens, it will have evaluated last additions because + * rq->lock is held during the check and the clear + */ + rq->has_blocked_load = 1; + + /* + * The tick is still stopped but load could have been added in the + * meantime. We set the nohz.has_blocked flag to trig a check of the + * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear + * of nohz.has_blocked can only happen after checking the new load + */ + if (rq->nohz_tick_stopped) + goto out; + + /* If we're a completely isolated CPU, we don't play: */ + if (on_null_domain(rq)) + return; + + rq->nohz_tick_stopped = 1; + + cpumask_set_cpu(cpu, nohz.idle_cpus_mask); + atomic_inc(&nohz.nr_cpus); + + /* + * Ensures that if nohz_idle_balance() fails to observe our + * @idle_cpus_mask store, it must observe the @has_blocked + * store. + */ + smp_mb__after_atomic(); + + set_cpu_sd_state_idle(cpu); + +out: + /* + * Each time a cpu enter idle, we assume that it has blocked load and + * enable the periodic update of the load of idle cpus + */ + WRITE_ONCE(nohz.has_blocked, 1); +} + +/* + * Internal function that runs load balance for all idle cpus. 
The load balance + * can be a simple update of blocked load or a complete load balance with + * tasks movement depending of flags. + * The function returns false if the loop has stopped before running + * through all idle CPUs. + */ +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, + enum cpu_idle_type idle) +{ + /* Earliest time when we have to do rebalance again */ + unsigned long now = jiffies; + unsigned long next_balance = now + 60*HZ; + bool has_blocked_load = false; + int update_next_balance = 0; + int this_cpu = this_rq->cpu; + int balance_cpu; + int ret = false; + struct rq *rq; + + SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); + + /* + * We assume there will be no idle load after this update and clear + * the has_blocked flag. If a cpu enters idle in the mean time, it will + * set the has_blocked flag and trig another update of idle load. + * Because a cpu that becomes idle, is added to idle_cpus_mask before + * setting the flag, we are sure to not clear the state and not + * check the load of an idle cpu. + */ + WRITE_ONCE(nohz.has_blocked, 0); + + /* + * Ensures that if we miss the CPU, we must see the has_blocked + * store from nohz_balance_enter_idle(). + */ + smp_mb(); + + for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { + if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) + continue; + + /* + * If this CPU gets work to do, stop the load balancing + * work being done for other CPUs. Next load + * balancing owner will pick it up. + */ + if (need_resched()) { + has_blocked_load = true; + goto abort; + } + + rq = cpu_rq(balance_cpu); + + has_blocked_load |= update_nohz_stats(rq, true); + + /* + * If time for next balance is due, + * do the balance. 
+ */ + if (time_after_eq(jiffies, rq->next_balance)) { + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + update_rq_clock(rq); + cpu_load_update_idle(rq); + rq_unlock_irqrestore(rq, &rf); + + if (flags & NOHZ_BALANCE_KICK) + rebalance_domains(rq, CPU_IDLE); + } + + if (time_after(next_balance, rq->next_balance)) { + next_balance = rq->next_balance; + update_next_balance = 1; + } + } + + /* Newly idle CPU doesn't need an update */ + if (idle != CPU_NEWLY_IDLE) { + update_blocked_averages(this_cpu); + has_blocked_load |= this_rq->has_blocked_load; + } + + if (flags & NOHZ_BALANCE_KICK) + rebalance_domains(this_rq, CPU_IDLE); + + WRITE_ONCE(nohz.next_blocked, + now + msecs_to_jiffies(LOAD_AVG_PERIOD)); + + /* The full idle balance loop has been done */ + ret = true; + +abort: + /* There is still blocked load, enable periodic update */ + if (has_blocked_load) + WRITE_ONCE(nohz.has_blocked, 1); + + /* + * next_balance will be updated only when there is a need. + * When the CPU is attached to null domain for ex, it will not be + * updated. + */ + if (likely(update_next_balance)) + nohz.next_balance = next_balance; + + return ret; +} + +/* + * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the + * rebalancing for all the cpus for whom scheduler ticks are stopped. + */ +static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) +{ + int this_cpu = this_rq->cpu; + unsigned int flags; + + if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK)) + return false; + + if (idle != CPU_IDLE) { + atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); + return false; + } + + /* + * barrier, pairs with nohz_balance_enter_idle(), ensures ... 
+ */ + flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); + if (!(flags & NOHZ_KICK_MASK)) + return false; + + _nohz_idle_balance(this_rq, flags, idle); + + return true; +} + +static void nohz_newidle_balance(struct rq *this_rq) +{ + int this_cpu = this_rq->cpu; + + /* + * This CPU doesn't want to be disturbed by scheduler + * housekeeping + */ + if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED)) + return; + + /* Will wake up very soon. No time for doing anything else*/ + if (this_rq->avg_idle < sysctl_sched_migration_cost) + return; + + /* Don't need to update blocked load of idle CPUs*/ + if (!READ_ONCE(nohz.has_blocked) || + time_before(jiffies, READ_ONCE(nohz.next_blocked))) + return; + + raw_spin_unlock(&this_rq->lock); + /* + * This CPU is going to be idle and blocked load of idle CPUs + * need to be updated. Run the ilb locally as it is a good + * candidate for ilb instead of waking up another idle CPU. + * Kick an normal ilb if we failed to do the update. + */ + if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)) + kick_ilb(NOHZ_STATS_KICK); + raw_spin_lock(&this_rq->lock); +} + +#else /* !CONFIG_NO_HZ_COMMON */ +static inline void nohz_balancer_kick(struct rq *rq) { } + +static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) +{ + return false; +} + +static inline void nohz_newidle_balance(struct rq *this_rq) { } +#endif /* CONFIG_NO_HZ_COMMON */ + +/* + * idle_balance is called by schedule() if this_cpu is about to become + * idle. Attempts to pull tasks from other CPUs. + */ +static int idle_balance(struct rq *this_rq, struct rq_flags *rf) +{ + unsigned long next_balance = jiffies + HZ; + int this_cpu = this_rq->cpu; + struct sched_domain *sd; + int pulled_task = 0; + u64 curr_cost = 0; + + /* + * We must set idle_stamp _before_ calling idle_balance(), such that we + * measure the duration of idle_balance() as idle time. 
+ */ + this_rq->idle_stamp = rq_clock(this_rq); + + /* + * Do not pull tasks towards !active CPUs... + */ + if (!cpu_active(this_cpu)) + return 0; + + /* + * This is OK, because current is on_cpu, which avoids it being picked + * for load-balance and preemption/IRQs are still disabled avoiding + * further scheduler activity on it and we're being very careful to + * re-start the picking loop. + */ + rq_unpin_lock(this_rq, rf); + + if (this_rq->avg_idle < sysctl_sched_migration_cost || + !this_rq->rd->overload) { + + rcu_read_lock(); + sd = rcu_dereference_check_sched_domain(this_rq->sd); + if (sd) + update_next_balance(sd, &next_balance); + rcu_read_unlock(); + + nohz_newidle_balance(this_rq); + + goto out; + } + + raw_spin_unlock(&this_rq->lock); + + update_blocked_averages(this_cpu); + rcu_read_lock(); + for_each_domain(this_cpu, sd) { + int continue_balancing = 1; + u64 t0, domain_cost; + + if (!(sd->flags & SD_LOAD_BALANCE)) + continue; + + if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { + update_next_balance(sd, &next_balance); + break; + } + + if (sd->flags & SD_BALANCE_NEWIDLE) { + t0 = sched_clock_cpu(this_cpu); + + pulled_task = load_balance(this_cpu, this_rq, + sd, CPU_NEWLY_IDLE, + &continue_balancing); + + domain_cost = sched_clock_cpu(this_cpu) - t0; + if (domain_cost > sd->max_newidle_lb_cost) + sd->max_newidle_lb_cost = domain_cost; + + curr_cost += domain_cost; + } + + update_next_balance(sd, &next_balance); + + /* + * Stop searching for tasks to pull if there are + * now runnable tasks on this rq. + */ + if (pulled_task || this_rq->nr_running > 0) + break; + } + rcu_read_unlock(); + + raw_spin_lock(&this_rq->lock); + + if (curr_cost > this_rq->max_idle_balance_cost) + this_rq->max_idle_balance_cost = curr_cost; + + /* + * While browsing the domains, we released the rq lock, a task could + * have been enqueued in the meantime. Since we're not going idle, + * pretend we pulled a task. 
+ */ + if (this_rq->cfs.h_nr_running && !pulled_task) + pulled_task = 1; + +out: + /* Move the next balance forward */ + if (time_after(this_rq->next_balance, next_balance)) + this_rq->next_balance = next_balance; + + /* Is there a task of a high priority class? */ + if (this_rq->nr_running != this_rq->cfs.h_nr_running) + pulled_task = -1; + + if (pulled_task) + this_rq->idle_stamp = 0; + + rq_repin_lock(this_rq, rf); + + return pulled_task; } -#else -static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } -#endif /* * run_rebalance_domains is triggered when needed from the scheduler tick. @@ -9373,14 +9883,18 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) CPU_IDLE : CPU_NOT_IDLE; /* - * If this cpu has a pending nohz_balance_kick, then do the - * balancing on behalf of the other idle cpus whose ticks are + * If this CPU has a pending nohz_balance_kick, then do the + * balancing on behalf of the other idle CPUs whose ticks are * stopped. Do nohz_idle_balance *before* rebalance_domains to - * give the idle cpus a chance to load balance. Else we may + * give the idle CPUs a chance to load balance. Else we may * load balance only within the local sched_domain hierarchy * and abort nohz_idle_balance altogether if we pull some load. 
*/ - nohz_idle_balance(this_rq, idle); + if (nohz_idle_balance(this_rq, idle)) + return; + + /* normal load balance */ + update_blocked_averages(this_rq->cpu); rebalance_domains(this_rq, idle); } @@ -9395,10 +9909,8 @@ void trigger_load_balance(struct rq *rq) if (time_after_eq(jiffies, rq->next_balance)) raise_softirq(SCHED_SOFTIRQ); -#ifdef CONFIG_NO_HZ_COMMON - if (nohz_kick_needed(rq)) - nohz_balancer_kick(); -#endif + + nohz_balancer_kick(rq); } static void rq_online_fair(struct rq *rq) @@ -9419,7 +9931,12 @@ static void rq_offline_fair(struct rq *rq) #endif /* CONFIG_SMP */ /* - * scheduler tick hitting a task of our scheduling class: + * scheduler tick hitting a task of our scheduling class. + * + * NOTE: This function can be called remotely by the tick offload that + * goes along full dynticks. Therefore no local assumption can be made + * and everything must be accessed through the @rq and @curr passed in + * parameters. */ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) { @@ -9570,7 +10087,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se) /* Synchronize entity with its cfs_rq */ update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); - attach_entity_load_avg(cfs_rq, se); + attach_entity_load_avg(cfs_rq, se, 0); update_tg_load_avg(cfs_rq, false); propagate_entity_cfs_rq(se); } @@ -9972,6 +10489,7 @@ __init void init_sched_fair_class(void) #ifdef CONFIG_NO_HZ_COMMON nohz.next_balance = jiffies; + nohz.next_blocked = jiffies; zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); #endif #endif /* SMP */ diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 9552fd5854bf..85ae8488039c 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -85,3 +85,8 @@ SCHED_FEAT(ATTACH_AGE_LOAD, true) SCHED_FEAT(WA_IDLE, true) SCHED_FEAT(WA_WEIGHT, true) SCHED_FEAT(WA_BIAS, true) + +/* + * UtilEstimation. Use estimated CPU utilization. 
+ */ +SCHED_FEAT(UTIL_EST, true) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 7dae9eb8c042..1a3e9bddd17b 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -1,23 +1,14 @@ /* - * Generic entry point for the idle threads + * Generic entry points for the idle threads and + * implementation of the idle task scheduling class. + * + * (NOTE: these are not related to SCHED_IDLE batch scheduled + * tasks which are handled in sched/fair.c ) */ -#include <linux/sched.h> -#include <linux/sched/idle.h> -#include <linux/cpu.h> -#include <linux/cpuidle.h> -#include <linux/cpuhotplug.h> -#include <linux/tick.h> -#include <linux/mm.h> -#include <linux/stackprotector.h> -#include <linux/suspend.h> -#include <linux/livepatch.h> - -#include <asm/tlb.h> +#include "sched.h" #include <trace/events/power.h> -#include "sched.h" - /* Linker adds these: start and end of __cpuidle functions */ extern char __cpuidle_text_start[], __cpuidle_text_end[]; @@ -46,6 +37,7 @@ void cpu_idle_poll_ctrl(bool enable) static int __init cpu_idle_poll_setup(char *__unused) { cpu_idle_force_poll = 1; + return 1; } __setup("nohlt", cpu_idle_poll_setup); @@ -53,6 +45,7 @@ __setup("nohlt", cpu_idle_poll_setup); static int __init cpu_idle_nopoll_setup(char *__unused) { cpu_idle_force_poll = 0; + return 1; } __setup("hlt", cpu_idle_nopoll_setup); @@ -64,12 +57,14 @@ static noinline int __cpuidle cpu_idle_poll(void) trace_cpu_idle_rcuidle(0, smp_processor_id()); local_irq_enable(); stop_critical_timings(); + while (!tif_need_resched() && (cpu_idle_force_poll || tick_check_broadcast_expired())) cpu_relax(); start_critical_timings(); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); rcu_idle_exit(); + return 1; } @@ -146,13 +141,15 @@ static void cpuidle_idle_call(void) } /* - * Tell the RCU framework we are entering an idle section, - * so no more rcu read side critical sections and one more + * The RCU framework needs to be told that we are entering an idle + * section, so no 
more rcu read side critical sections and one more * step to the grace period */ - rcu_idle_enter(); if (cpuidle_not_available(drv, dev)) { + tick_nohz_idle_stop_tick(); + rcu_idle_enter(); + default_idle_call(); goto exit_idle; } @@ -169,20 +166,37 @@ static void cpuidle_idle_call(void) if (idle_should_enter_s2idle() || dev->use_deepest_state) { if (idle_should_enter_s2idle()) { + rcu_idle_enter(); + entered_state = cpuidle_enter_s2idle(drv, dev); if (entered_state > 0) { local_irq_enable(); goto exit_idle; } + + rcu_idle_exit(); } + tick_nohz_idle_stop_tick(); + rcu_idle_enter(); + next_state = cpuidle_find_deepest_state(drv, dev); call_cpuidle(drv, dev, next_state); } else { + bool stop_tick = true; + /* * Ask the cpuidle framework to choose a convenient idle state. */ - next_state = cpuidle_select(drv, dev); + next_state = cpuidle_select(drv, dev, &stop_tick); + + if (stop_tick) + tick_nohz_idle_stop_tick(); + else + tick_nohz_idle_retain_tick(); + + rcu_idle_enter(); + entered_state = call_cpuidle(drv, dev, next_state); /* * Give the governor an opportunity to reflect on the outcome @@ -227,6 +241,7 @@ static void do_idle(void) rmb(); if (cpu_is_offline(cpu)) { + tick_nohz_idle_stop_tick_protected(); cpuhp_report_idle_dead(); arch_cpu_idle_dead(); } @@ -240,10 +255,12 @@ static void do_idle(void) * broadcast device expired for us, we don't want to go deep * idle as we know that the IPI is going to arrive right away. */ - if (cpu_idle_force_poll || tick_check_broadcast_expired()) + if (cpu_idle_force_poll || tick_check_broadcast_expired()) { + tick_nohz_idle_restart_tick(); cpu_idle_poll(); - else + } else { cpuidle_idle_call(); + } arch_cpu_idle_exit(); } @@ -332,8 +349,8 @@ void cpu_startup_entry(enum cpuhp_state state) { /* * This #ifdef needs to die, but it's too late in the cycle to - * make this generic (arm and sh have never invoked the canary - * init for the non boot cpus!). 
Will be fixed in 3.11 + * make this generic (ARM and SH have never invoked the canary + * init for the non boot CPUs!). Will be fixed in 3.11 */ #ifdef CONFIG_X86 /* @@ -350,3 +367,116 @@ void cpu_startup_entry(enum cpuhp_state state) while (1) do_idle(); } + +/* + * idle-task scheduling class. + */ + +#ifdef CONFIG_SMP +static int +select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) +{ + return task_cpu(p); /* IDLE tasks as never migrated */ +} +#endif + +/* + * Idle tasks are unconditionally rescheduled: + */ +static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) +{ + resched_curr(rq); +} + +static struct task_struct * +pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + put_prev_task(rq, prev); + update_idle_core(rq); + schedstat_inc(rq->sched_goidle); + + return rq->idle; +} + +/* + * It is not legal to sleep in the idle task - print a warning + * message if some code attempts to do it: + */ +static void +dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) +{ + raw_spin_unlock_irq(&rq->lock); + printk(KERN_ERR "bad: scheduling from the idle thread!\n"); + dump_stack(); + raw_spin_lock_irq(&rq->lock); +} + +static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) +{ +} + +/* + * scheduler tick hitting a task of our scheduling class. + * + * NOTE: This function can be called remotely by the tick offload that + * goes along full dynticks. Therefore no local assumption can be made + * and everything must be accessed through the @rq and @curr passed in + * parameters. 
+ */ +static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) +{ +} + +static void set_curr_task_idle(struct rq *rq) +{ +} + +static void switched_to_idle(struct rq *rq, struct task_struct *p) +{ + BUG(); +} + +static void +prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) +{ + BUG(); +} + +static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) +{ + return 0; +} + +static void update_curr_idle(struct rq *rq) +{ +} + +/* + * Simple, special scheduling class for the per-CPU idle tasks: + */ +const struct sched_class idle_sched_class = { + /* .next is NULL */ + /* no enqueue/yield_task for idle tasks */ + + /* dequeue is not valid, we print a debug message there: */ + .dequeue_task = dequeue_task_idle, + + .check_preempt_curr = check_preempt_curr_idle, + + .pick_next_task = pick_next_task_idle, + .put_prev_task = put_prev_task_idle, + +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_idle, + .set_cpus_allowed = set_cpus_allowed_common, +#endif + + .set_curr_task = set_curr_task_idle, + .task_tick = task_tick_idle, + + .get_rr_interval = get_rr_interval_idle, + + .prio_changed = prio_changed_idle, + .switched_to = switched_to_idle, + .update_curr = update_curr_idle, +}; diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c deleted file mode 100644 index d518664cce4f..000000000000 --- a/kernel/sched/idle_task.c +++ /dev/null @@ -1,110 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "sched.h" - -/* - * idle-task scheduling class. 
- * - * (NOTE: these are not related to SCHED_IDLE tasks which are - * handled in sched/fair.c) - */ - -#ifdef CONFIG_SMP -static int -select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) -{ - return task_cpu(p); /* IDLE tasks as never migrated */ -} -#endif /* CONFIG_SMP */ - -/* - * Idle tasks are unconditionally rescheduled: - */ -static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) -{ - resched_curr(rq); -} - -static struct task_struct * -pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) -{ - put_prev_task(rq, prev); - update_idle_core(rq); - schedstat_inc(rq->sched_goidle); - return rq->idle; -} - -/* - * It is not legal to sleep in the idle task - print a warning - * message if some code attempts to do it: - */ -static void -dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) -{ - raw_spin_unlock_irq(&rq->lock); - printk(KERN_ERR "bad: scheduling from the idle thread!\n"); - dump_stack(); - raw_spin_lock_irq(&rq->lock); -} - -static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) -{ - rq_last_tick_reset(rq); -} - -static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) -{ -} - -static void set_curr_task_idle(struct rq *rq) -{ -} - -static void switched_to_idle(struct rq *rq, struct task_struct *p) -{ - BUG(); -} - -static void -prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) -{ - BUG(); -} - -static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) -{ - return 0; -} - -static void update_curr_idle(struct rq *rq) -{ -} - -/* - * Simple, special scheduling class for the per-CPU idle tasks: - */ -const struct sched_class idle_sched_class = { - /* .next is NULL */ - /* no enqueue/yield_task for idle tasks */ - - /* dequeue is not valid, we print a debug message there: */ - .dequeue_task = dequeue_task_idle, - - .check_preempt_curr = check_preempt_curr_idle, - - .pick_next_task 
= pick_next_task_idle, - .put_prev_task = put_prev_task_idle, - -#ifdef CONFIG_SMP - .select_task_rq = select_task_rq_idle, - .set_cpus_allowed = set_cpus_allowed_common, -#endif - - .set_curr_task = set_curr_task_idle, - .task_tick = task_tick_idle, - - .get_rr_interval = get_rr_interval_idle, - - .prio_changed = prio_changed_idle, - .switched_to = switched_to_idle, - .update_curr = update_curr_idle, -}; diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index b71b436f59f2..e6802181900f 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -3,15 +3,10 @@ * any CPU: unbound workqueues, timers, kthreads and any offloadable work. * * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker + * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker * */ - -#include <linux/sched/isolation.h> -#include <linux/tick.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/static_key.h> -#include <linux/ctype.h> +#include "sched.h" DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); EXPORT_SYMBOL_GPL(housekeeping_overriden); @@ -60,6 +55,9 @@ void __init housekeeping_init(void) static_branch_enable(&housekeeping_overriden); + if (housekeeping_flags & HK_FLAG_TICK) + sched_tick_offload_init(); + /* We need at least one CPU to handle housekeeping work */ WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); } @@ -119,7 +117,7 @@ static int __init housekeeping_nohz_full_setup(char *str) { unsigned int flags; - flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; + flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; return housekeeping_setup(str, flags); } diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index 89a989e4d758..a171c1258109 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -6,10 +6,6 @@ * figure. Its a silly number but people think its important. We go through * great pains to make it work on big machines and tickless kernels. 
*/ - -#include <linux/export.h> -#include <linux/sched/loadavg.h> - #include "sched.h" /* @@ -32,29 +28,29 @@ * Due to a number of reasons the above turns in the mess below: * * - for_each_possible_cpu() is prohibitively expensive on machines with - * serious number of cpus, therefore we need to take a distributed approach + * serious number of CPUs, therefore we need to take a distributed approach * to calculating nr_active. * * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } * * So assuming nr_active := 0 when we start out -- true per definition, we - * can simply take per-cpu deltas and fold those into a global accumulate + * can simply take per-CPU deltas and fold those into a global accumulate * to obtain the same result. See calc_load_fold_active(). * - * Furthermore, in order to avoid synchronizing all per-cpu delta folding + * Furthermore, in order to avoid synchronizing all per-CPU delta folding * across the machine, we assume 10 ticks is sufficient time for every - * cpu to have completed this task. + * CPU to have completed this task. * * This places an upper-bound on the IRQ-off latency of the machine. Then * again, being late doesn't loose the delta, just wrecks the sample. * - * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because - * this would add another cross-cpu cacheline miss and atomic operation - * to the wakeup path. Instead we increment on whatever cpu the task ran - * when it went into uninterruptible state and decrement on whatever cpu + * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because + * this would add another cross-CPU cacheline miss and atomic operation + * to the wakeup path. Instead we increment on whatever CPU the task ran + * when it went into uninterruptible state and decrement on whatever CPU * did the wakeup. This means that only the sum of nr_uninterruptible over - * all cpus yields the correct result. 
+ * all CPUs yields the correct result. * * This covers the NO_HZ=n code, for extra head-aches, see the comment below. */ @@ -115,11 +111,11 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) * Handle NO_HZ for the global load-average. * * Since the above described distributed algorithm to compute the global - * load-average relies on per-cpu sampling from the tick, it is affected by + * load-average relies on per-CPU sampling from the tick, it is affected by * NO_HZ. * * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon - * entering NO_HZ state such that we can include this as an 'extra' cpu delta + * entering NO_HZ state such that we can include this as an 'extra' CPU delta * when we read the global state. * * Obviously reality has to ruin such a delightfully simple scheme: @@ -146,9 +142,9 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) * busy state. * * This is solved by pushing the window forward, and thus skipping the - * sample, for this cpu (effectively using the NO_HZ-delta for this cpu which + * sample, for this CPU (effectively using the NO_HZ-delta for this CPU which * was in effect at the time the window opened). This also solves the issue - * of having to deal with a cpu having been in NO_HZ for multiple LOAD_FREQ + * of having to deal with a CPU having been in NO_HZ for multiple LOAD_FREQ * intervals. * * When making the ILB scale, we should try to pull this in as well. @@ -299,7 +295,7 @@ calc_load_n(unsigned long load, unsigned long exp, } /* - * NO_HZ can leave us missing all per-cpu ticks calling + * NO_HZ can leave us missing all per-CPU ticks calling * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary. 
@@ -363,7 +359,7 @@ void calc_global_load(unsigned long ticks) return; /* - * Fold the 'old' NO_HZ-delta to include all NO_HZ cpus. + * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs. */ delta = calc_load_nohz_fold(); if (delta) diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 9bcbacba82a8..76e0eaf4654e 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -13,37 +13,117 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ - -#include <linux/syscalls.h> -#include <linux/membarrier.h> -#include <linux/tick.h> -#include <linux/cpumask.h> -#include <linux/atomic.h> - -#include "sched.h" /* for cpu_rq(). */ +#include "sched.h" /* * Bitmask made from a "or" of all commands within enum membarrier_cmd, * except MEMBARRIER_CMD_QUERY. */ -#define MEMBARRIER_CMD_BITMASK \ - (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ - | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED) +#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE +#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \ + (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \ + | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE) +#else +#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0 +#endif + +#define MEMBARRIER_CMD_BITMASK \ + (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \ + | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ + | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ + | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \ + | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK) static void ipi_mb(void *info) { smp_mb(); /* IPIs should be serializing but paranoid. 
*/ } -static int membarrier_private_expedited(void) +static int membarrier_global_expedited(void) { int cpu; bool fallback = false; cpumask_var_t tmpmask; - if (!(atomic_read(&current->mm->membarrier_state) - & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) - return -EPERM; + if (num_online_cpus() == 1) + return 0; + + /* + * Matches memory barriers around rq->curr modification in + * scheduler. + */ + smp_mb(); /* system call entry is not a mb. */ + + /* + * Expedited membarrier commands guarantee that they won't + * block, hence the GFP_NOWAIT allocation flag and fallback + * implementation. + */ + if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) { + /* Fallback for OOM. */ + fallback = true; + } + + cpus_read_lock(); + for_each_online_cpu(cpu) { + struct task_struct *p; + + /* + * Skipping the current CPU is OK even through we can be + * migrated at any point. The current CPU, at the point + * where we read raw_smp_processor_id(), is ensured to + * be in program order with respect to the caller + * thread. Therefore, we can skip this CPU from the + * iteration. + */ + if (cpu == raw_smp_processor_id()) + continue; + + rcu_read_lock(); + p = task_rcu_dereference(&cpu_rq(cpu)->curr); + if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & + MEMBARRIER_STATE_GLOBAL_EXPEDITED)) { + if (!fallback) + __cpumask_set_cpu(cpu, tmpmask); + else + smp_call_function_single(cpu, ipi_mb, NULL, 1); + } + rcu_read_unlock(); + } + if (!fallback) { + preempt_disable(); + smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + preempt_enable(); + free_cpumask_var(tmpmask); + } + cpus_read_unlock(); + + /* + * Memory barrier on the caller thread _after_ we finished + * waiting for the last IPI. Matches memory barriers around + * rq->curr modification in scheduler. 
+ */ + smp_mb(); /* exit from system call is not a mb */ + return 0; +} + +static int membarrier_private_expedited(int flags) +{ + int cpu; + bool fallback = false; + cpumask_var_t tmpmask; + + if (flags & MEMBARRIER_FLAG_SYNC_CORE) { + if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) + return -EINVAL; + if (!(atomic_read(&current->mm->membarrier_state) & + MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY)) + return -EPERM; + } else { + if (!(atomic_read(&current->mm->membarrier_state) & + MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) + return -EPERM; + } if (num_online_cpus() == 1) return 0; @@ -102,24 +182,75 @@ static int membarrier_private_expedited(void) * rq->curr modification in scheduler. */ smp_mb(); /* exit from system call is not a mb */ + return 0; } -static void membarrier_register_private_expedited(void) +static int membarrier_register_global_expedited(void) { struct task_struct *p = current; struct mm_struct *mm = p->mm; + if (atomic_read(&mm->membarrier_state) & + MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY) + return 0; + atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state); + if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) { + /* + * For single mm user, single threaded process, we can + * simply issue a memory barrier after setting + * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that + * no memory access following registration is reordered + * before registration. + */ + smp_mb(); + } else { + /* + * For multi-mm user threads, we need to ensure all + * future scheduler executions will observe the new + * thread flag state for this mm. 
+ */ + synchronize_sched(); + } + atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, + &mm->membarrier_state); + + return 0; +} + +static int membarrier_register_private_expedited(int flags) +{ + struct task_struct *p = current; + struct mm_struct *mm = p->mm; + int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY; + + if (flags & MEMBARRIER_FLAG_SYNC_CORE) { + if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) + return -EINVAL; + state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY; + } + /* * We need to consider threads belonging to different thread * groups, which use the same mm. (CLONE_VM but not * CLONE_THREAD). */ - if (atomic_read(&mm->membarrier_state) - & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY) - return; - atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY, - &mm->membarrier_state); + if (atomic_read(&mm->membarrier_state) & state) + return 0; + atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state); + if (flags & MEMBARRIER_FLAG_SYNC_CORE) + atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE, + &mm->membarrier_state); + if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) { + /* + * Ensure all future scheduler executions will observe the + * new thread flag state for this process. + */ + synchronize_sched(); + } + atomic_or(state, &mm->membarrier_state); + + return 0; } /** @@ -159,21 +290,28 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) int cmd_mask = MEMBARRIER_CMD_BITMASK; if (tick_nohz_full_enabled()) - cmd_mask &= ~MEMBARRIER_CMD_SHARED; + cmd_mask &= ~MEMBARRIER_CMD_GLOBAL; return cmd_mask; } - case MEMBARRIER_CMD_SHARED: - /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */ + case MEMBARRIER_CMD_GLOBAL: + /* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. 
*/ if (tick_nohz_full_enabled()) return -EINVAL; if (num_online_cpus() > 1) synchronize_sched(); return 0; + case MEMBARRIER_CMD_GLOBAL_EXPEDITED: + return membarrier_global_expedited(); + case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED: + return membarrier_register_global_expedited(); case MEMBARRIER_CMD_PRIVATE_EXPEDITED: - return membarrier_private_expedited(); + return membarrier_private_expedited(0); case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED: - membarrier_register_private_expedited(); - return 0; + return membarrier_register_private_expedited(0); + case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE: + return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE); + case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE: + return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE); default: return -EINVAL; } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 862a513adca3..7aef6b4e885a 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -3,12 +3,8 @@ * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR * policies) */ - #include "sched.h" -#include <linux/slab.h> -#include <linux/irq_work.h> - int sched_rr_timeslice = RR_TIMESLICE; int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; @@ -359,7 +355,7 @@ static DEFINE_PER_CPU(struct callback_head, rt_pull_head); static void push_rt_tasks(struct rq *); static void pull_rt_task(struct rq *); -static inline void queue_push_tasks(struct rq *rq) +static inline void rt_queue_push_tasks(struct rq *rq) { if (!has_pushable_tasks(rq)) return; @@ -367,7 +363,7 @@ static inline void queue_push_tasks(struct rq *rq) queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks); } -static inline void queue_pull_task(struct rq *rq) +static inline void rt_queue_pull_task(struct rq *rq) { queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task); } @@ -425,7 +421,7 @@ static inline void pull_rt_task(struct rq *this_rq) { } -static inline void 
queue_push_tasks(struct rq *rq) +static inline void rt_queue_push_tasks(struct rq *rq) { } #endif /* CONFIG_SMP */ @@ -843,6 +839,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) continue; raw_spin_lock(&rq->lock); + update_rq_clock(rq); + if (rt_rq->rt_time) { u64 runtime; @@ -863,7 +861,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) * 'runtime'. */ if (rt_rq->rt_nr_running && rq->curr == rq->idle) - rq_clock_skip_update(rq, false); + rq_clock_cancel_skipupdate(rq); } if (rt_rq->rt_time || rt_rq->rt_nr_running) idle = 0; @@ -951,24 +949,23 @@ static void update_curr_rt(struct rq *rq) struct task_struct *curr = rq->curr; struct sched_rt_entity *rt_se = &curr->rt; u64 delta_exec; + u64 now; if (curr->sched_class != &rt_sched_class) return; - delta_exec = rq_clock_task(rq) - curr->se.exec_start; + now = rq_clock_task(rq); + delta_exec = now - curr->se.exec_start; if (unlikely((s64)delta_exec <= 0)) return; - /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ - cpufreq_update_util(rq, SCHED_CPUFREQ_RT); - schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); - curr->se.exec_start = rq_clock_task(rq); + curr->se.exec_start = now; cgroup_account_cputime(curr, delta_exec); sched_rt_avg_update(rq, delta_exec); @@ -1003,6 +1000,9 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq) sub_nr_running(rq, rt_rq->rt_nr_running); rt_rq->rt_queued = 0; + + /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ + cpufreq_update_util(rq, 0); } static void @@ -1019,6 +1019,9 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq) add_nr_running(rq, rt_rq->rt_nr_running); rt_rq->rt_queued = 1; + + /* Kick cpufreq (see the comment in kernel/sched/sched.h). 
*/ + cpufreq_update_util(rq, 0); } #if defined CONFIG_SMP @@ -1451,9 +1454,9 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) return; /* - * There appears to be other cpus that can accept - * current and none to run 'p', so lets reschedule - * to try and push current away: + * There appear to be other CPUs that can accept + * the current task but none can run 'p', so lets reschedule + * to try and push the current task away: */ requeue_task_rt(rq, p, 1); resched_curr(rq); @@ -1567,7 +1570,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* The running task is never eligible for pushing */ dequeue_pushable_task(rq, p); - queue_push_tasks(rq); + rt_queue_push_tasks(rq); return p; } @@ -1594,12 +1597,13 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) if (!task_running(rq, p) && cpumask_test_cpu(cpu, &p->cpus_allowed)) return 1; + return 0; } /* * Return the highest pushable rq's task, which is suitable to be executed - * on the cpu, NULL otherwise + * on the CPU, NULL otherwise */ static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) { @@ -1637,11 +1641,11 @@ static int find_lowest_rq(struct task_struct *task) return -1; /* No targets found */ /* - * At this point we have built a mask of cpus representing the + * At this point we have built a mask of CPUs representing the * lowest priority tasks in the system. Now we want to elect * the best one based on our affinity and topology. * - * We prioritize the last cpu that the task executed on since + * We prioritize the last CPU that the task executed on since * it is most likely cache-hot in that location. */ if (cpumask_test_cpu(cpu, lowest_mask)) @@ -1649,7 +1653,7 @@ static int find_lowest_rq(struct task_struct *task) /* * Otherwise, we consult the sched_domains span maps to figure - * out which cpu is logically closest to our hot cache data. + * out which CPU is logically closest to our hot cache data. 
*/ if (!cpumask_test_cpu(this_cpu, lowest_mask)) this_cpu = -1; /* Skip this_cpu opt if not among lowest */ @@ -1690,6 +1694,7 @@ static int find_lowest_rq(struct task_struct *task) cpu = cpumask_any(lowest_mask); if (cpu < nr_cpu_ids) return cpu; + return -1; } @@ -1825,7 +1830,7 @@ retry: * The task hasn't migrated, and is still the next * eligible task, but we failed to find a run-queue * to push it to. Do not retry in this case, since - * other cpus will pull from us when ready. + * other CPUs will pull from us when ready. */ goto out; } @@ -1907,9 +1912,8 @@ static void push_rt_tasks(struct rq *rq) * the rt_loop_next will cause the iterator to perform another scan. * */ -static int rto_next_cpu(struct rq *rq) +static int rto_next_cpu(struct root_domain *rd) { - struct root_domain *rd = rq->rd; int next; int cpu; @@ -1918,7 +1922,7 @@ static int rto_next_cpu(struct rq *rq) * rt_next_cpu() will simply return the first CPU found in * the rto_mask. * - * If rto_next_cpu() is called with rto_cpu is a valid cpu, it + * If rto_next_cpu() is called with rto_cpu is a valid CPU, it * will return the next CPU found in the rto_mask. * * If there are no more CPUs left in the rto_mask, then a check is made @@ -1979,25 +1983,30 @@ static void tell_cpu_to_push(struct rq *rq) raw_spin_lock(&rq->rd->rto_lock); /* - * The rto_cpu is updated under the lock, if it has a valid cpu + * The rto_cpu is updated under the lock, if it has a valid CPU * then the IPI is still running and will continue due to the * update to loop_next, and nothing needs to be done here. * Otherwise it is finishing up and an ipi needs to be sent. 
*/ if (rq->rd->rto_cpu < 0) - cpu = rto_next_cpu(rq); + cpu = rto_next_cpu(rq->rd); raw_spin_unlock(&rq->rd->rto_lock); rto_start_unlock(&rq->rd->rto_loop_start); - if (cpu >= 0) + if (cpu >= 0) { + /* Make sure the rd does not get freed while pushing */ + sched_get_rd(rq->rd); irq_work_queue_on(&rq->rd->rto_push_work, cpu); + } } /* Called from hardirq context */ void rto_push_irq_work_func(struct irq_work *work) { + struct root_domain *rd = + container_of(work, struct root_domain, rto_push_work); struct rq *rq; int cpu; @@ -2013,18 +2022,20 @@ void rto_push_irq_work_func(struct irq_work *work) raw_spin_unlock(&rq->lock); } - raw_spin_lock(&rq->rd->rto_lock); + raw_spin_lock(&rd->rto_lock); /* Pass the IPI to the next rt overloaded queue */ - cpu = rto_next_cpu(rq); + cpu = rto_next_cpu(rd); - raw_spin_unlock(&rq->rd->rto_lock); + raw_spin_unlock(&rd->rto_lock); - if (cpu < 0) + if (cpu < 0) { + sched_put_rd(rd); return; + } /* Try the next RT overloaded CPU */ - irq_work_queue_on(&rq->rd->rto_push_work, cpu); + irq_work_queue_on(&rd->rto_push_work, cpu); } #endif /* HAVE_RT_PUSH_IPI */ @@ -2097,7 +2108,7 @@ static void pull_rt_task(struct rq *this_rq) /* * There's a chance that p is higher in priority - * than what's currently running on its cpu. + * than what's currently running on its CPU. * This is just that p is wakeing up and hasn't * had a chance to schedule. 
We only pull * p if it is lower in priority than the @@ -2179,7 +2190,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) return; - queue_pull_task(rq); + rt_queue_pull_task(rq); } void __init init_sched_rt_class(void) @@ -2210,7 +2221,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) - queue_push_tasks(rq); + rt_queue_push_tasks(rq); #endif /* CONFIG_SMP */ if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) resched_curr(rq); @@ -2234,7 +2245,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) * may need to pull tasks to this runqueue. */ if (oldprio < p->prio) - queue_pull_task(rq); + rt_queue_pull_task(rq); /* * If there's a higher priority task waiting to run @@ -2284,6 +2295,14 @@ static void watchdog(struct rq *rq, struct task_struct *p) static inline void watchdog(struct rq *rq, struct task_struct *p) { } #endif +/* + * scheduler tick hitting a task of our scheduling class. + * + * NOTE: This function can be called remotely by the tick offload that + * goes along full dynticks. Therefore no local assumption can be made + * and everything must be accessed through the @rq and @curr passed in + * parameters. 
+ */ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) { struct sched_rt_entity *rt_se = &p->rt; @@ -2677,6 +2696,7 @@ int sched_rr_handler(struct ctl_table *table, int write, msecs_to_jiffies(sysctl_sched_rr_timeslice); } mutex_unlock(&mutex); + return ret; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2e95505e23c6..15750c222ca2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1,39 +1,73 @@ /* SPDX-License-Identifier: GPL-2.0 */ - +/* + * Scheduler internal types and methods: + */ #include <linux/sched.h> + #include <linux/sched/autogroup.h> -#include <linux/sched/sysctl.h> -#include <linux/sched/topology.h> -#include <linux/sched/rt.h> -#include <linux/sched/deadline.h> #include <linux/sched/clock.h> -#include <linux/sched/wake_q.h> -#include <linux/sched/signal.h> -#include <linux/sched/numa_balancing.h> -#include <linux/sched/mm.h> +#include <linux/sched/coredump.h> #include <linux/sched/cpufreq.h> -#include <linux/sched/stat.h> -#include <linux/sched/nohz.h> +#include <linux/sched/cputime.h> +#include <linux/sched/deadline.h> #include <linux/sched/debug.h> #include <linux/sched/hotplug.h> +#include <linux/sched/idle.h> +#include <linux/sched/init.h> +#include <linux/sched/isolation.h> +#include <linux/sched/jobctl.h> +#include <linux/sched/loadavg.h> +#include <linux/sched/mm.h> +#include <linux/sched/nohz.h> +#include <linux/sched/numa_balancing.h> +#include <linux/sched/prio.h> +#include <linux/sched/rt.h> +#include <linux/sched/signal.h> +#include <linux/sched/stat.h> +#include <linux/sched/sysctl.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> -#include <linux/sched/cputime.h> -#include <linux/sched/init.h> +#include <linux/sched/topology.h> +#include <linux/sched/user.h> +#include <linux/sched/wake_q.h> +#include <linux/sched/xacct.h> + +#include <uapi/linux/sched/types.h> -#include <linux/u64_stats_sync.h> -#include <linux/kernel_stat.h> #include <linux/binfmts.h> -#include 
<linux/mutex.h> -#include <linux/spinlock.h> +#include <linux/blkdev.h> +#include <linux/compat.h> +#include <linux/context_tracking.h> +#include <linux/cpufreq.h> +#include <linux/cpuidle.h> +#include <linux/cpuset.h> +#include <linux/ctype.h> +#include <linux/debugfs.h> +#include <linux/delayacct.h> +#include <linux/init_task.h> +#include <linux/kprobes.h> +#include <linux/kthread.h> +#include <linux/membarrier.h> +#include <linux/migrate.h> +#include <linux/mmu_context.h> +#include <linux/nmi.h> +#include <linux/proc_fs.h> +#include <linux/prefetch.h> +#include <linux/profile.h> +#include <linux/rcupdate_wait.h> +#include <linux/security.h> +#include <linux/stackprotector.h> #include <linux/stop_machine.h> -#include <linux/irq_work.h> -#include <linux/tick.h> -#include <linux/slab.h> -#include <linux/cgroup.h> +#include <linux/suspend.h> +#include <linux/swait.h> +#include <linux/syscalls.h> +#include <linux/task_work.h> +#include <linux/tsacct_kern.h> + +#include <asm/tlb.h> #ifdef CONFIG_PARAVIRT -#include <asm/paravirt.h> +# include <asm/paravirt.h> #endif #include "cpupri.h" @@ -79,11 +113,11 @@ static inline void cpu_load_update_active(struct rq *this_rq) { } * and does not change the user-interface for setting shares/weights. * * We increase resolution only if we have enough bits to allow this increased - * resolution (i.e. 64bit). The costs for increasing resolution when 32bit are - * pretty high and the returns do not justify the increased costs. + * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit + * are pretty high and the returns do not justify the increased costs. * - * Really only required when CONFIG_FAIR_GROUP_SCHED is also set, but to - * increase coverage and consistency always enable it on 64bit platforms. + * Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to + * increase coverage and consistency always enable it on 64-bit platforms. 
*/ #ifdef CONFIG_64BIT # define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) @@ -111,16 +145,12 @@ static inline void cpu_load_update_active(struct rq *this_rq) { } * 10 -> just above 1us * 9 -> just above 0.5us */ -#define DL_SCALE (10) +#define DL_SCALE 10 /* - * These are the 'tuning knobs' of the scheduler: + * Single value that denotes runtime == period, ie unlimited time. */ - -/* - * single value that denotes runtime == period, ie unlimited time. - */ -#define RUNTIME_INF ((u64)~0ULL) +#define RUNTIME_INF ((u64)~0ULL) static inline int idle_policy(int policy) { @@ -235,9 +265,9 @@ void __dl_clear_params(struct task_struct *p); * control. */ struct dl_bandwidth { - raw_spinlock_t dl_runtime_lock; - u64 dl_runtime; - u64 dl_period; + raw_spinlock_t dl_runtime_lock; + u64 dl_runtime; + u64 dl_period; }; static inline int dl_bandwidth_enabled(void) @@ -246,8 +276,9 @@ static inline int dl_bandwidth_enabled(void) } struct dl_bw { - raw_spinlock_t lock; - u64 bw, total_bw; + raw_spinlock_t lock; + u64 bw; + u64 total_bw; }; static inline void __dl_update(struct dl_bw *dl_b, s64 bw); @@ -273,20 +304,17 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; } -void dl_change_utilization(struct task_struct *p, u64 new_bw); +extern void dl_change_utilization(struct task_struct *p, u64 new_bw); extern void init_dl_bw(struct dl_bw *dl_b); -extern int sched_dl_global_validate(void); +extern int sched_dl_global_validate(void); extern void sched_dl_do_global(void); -extern int sched_dl_overflow(struct task_struct *p, int policy, - const struct sched_attr *attr); +extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr); extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); extern bool __checkparam_dl(const struct sched_attr *attr); extern 
bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); -extern int dl_task_can_attach(struct task_struct *p, - const struct cpumask *cs_cpus_allowed); -extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, - const struct cpumask *trial); +extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed); +extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); extern bool dl_cpu_busy(unsigned int cpu); #ifdef CONFIG_CGROUP_SCHED @@ -300,32 +328,36 @@ extern struct list_head task_groups; struct cfs_bandwidth { #ifdef CONFIG_CFS_BANDWIDTH - raw_spinlock_t lock; - ktime_t period; - u64 quota, runtime; - s64 hierarchical_quota; - u64 runtime_expires; - - int idle, period_active; - struct hrtimer period_timer, slack_timer; - struct list_head throttled_cfs_rq; - - /* statistics */ - int nr_periods, nr_throttled; - u64 throttled_time; + raw_spinlock_t lock; + ktime_t period; + u64 quota; + u64 runtime; + s64 hierarchical_quota; + u64 runtime_expires; + + int idle; + int period_active; + struct hrtimer period_timer; + struct hrtimer slack_timer; + struct list_head throttled_cfs_rq; + + /* Statistics: */ + int nr_periods; + int nr_throttled; + u64 throttled_time; #endif }; -/* task group related information */ +/* Task group related information */ struct task_group { struct cgroup_subsys_state css; #ifdef CONFIG_FAIR_GROUP_SCHED - /* schedulable entities of this group on each cpu */ - struct sched_entity **se; - /* runqueue "owned" by this group on each cpu */ - struct cfs_rq **cfs_rq; - unsigned long shares; + /* schedulable entities of this group on each CPU */ + struct sched_entity **se; + /* runqueue "owned" by this group on each CPU */ + struct cfs_rq **cfs_rq; + unsigned long shares; #ifdef CONFIG_SMP /* @@ -333,29 +365,29 @@ struct task_group { * it in its own cacheline separated from the fields above which * will also be accessed at each tick. 
*/ - atomic_long_t load_avg ____cacheline_aligned; + atomic_long_t load_avg ____cacheline_aligned; #endif #endif #ifdef CONFIG_RT_GROUP_SCHED - struct sched_rt_entity **rt_se; - struct rt_rq **rt_rq; + struct sched_rt_entity **rt_se; + struct rt_rq **rt_rq; - struct rt_bandwidth rt_bandwidth; + struct rt_bandwidth rt_bandwidth; #endif - struct rcu_head rcu; - struct list_head list; + struct rcu_head rcu; + struct list_head list; - struct task_group *parent; - struct list_head siblings; - struct list_head children; + struct task_group *parent; + struct list_head siblings; + struct list_head children; #ifdef CONFIG_SCHED_AUTOGROUP - struct autogroup *autogroup; + struct autogroup *autogroup; #endif - struct cfs_bandwidth cfs_bandwidth; + struct cfs_bandwidth cfs_bandwidth; }; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -369,8 +401,8 @@ struct task_group { * (The default weight is 1024 - so there's no practical * limitation from this.) */ -#define MIN_SHARES (1UL << 1) -#define MAX_SHARES (1UL << 18) +#define MIN_SHARES (1UL << 1) +#define MAX_SHARES (1UL << 18) #endif typedef int (*tg_visitor)(struct task_group *, void *); @@ -443,35 +475,39 @@ struct cfs_bandwidth { }; /* CFS-related fields in a runqueue */ struct cfs_rq { - struct load_weight load; - unsigned long runnable_weight; - unsigned int nr_running, h_nr_running; + struct load_weight load; + unsigned long runnable_weight; + unsigned int nr_running; + unsigned int h_nr_running; - u64 exec_clock; - u64 min_vruntime; + u64 exec_clock; + u64 min_vruntime; #ifndef CONFIG_64BIT - u64 min_vruntime_copy; + u64 min_vruntime_copy; #endif - struct rb_root_cached tasks_timeline; + struct rb_root_cached tasks_timeline; /* * 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). 
*/ - struct sched_entity *curr, *next, *last, *skip; + struct sched_entity *curr; + struct sched_entity *next; + struct sched_entity *last; + struct sched_entity *skip; #ifdef CONFIG_SCHED_DEBUG - unsigned int nr_spread_over; + unsigned int nr_spread_over; #endif #ifdef CONFIG_SMP /* * CFS load tracking */ - struct sched_avg avg; + struct sched_avg avg; #ifndef CONFIG_64BIT - u64 load_last_update_time_copy; + u64 load_last_update_time_copy; #endif struct { raw_spinlock_t lock ____cacheline_aligned; @@ -482,9 +518,9 @@ struct cfs_rq { } removed; #ifdef CONFIG_FAIR_GROUP_SCHED - unsigned long tg_load_avg_contrib; - long propagate; - long prop_runnable_sum; + unsigned long tg_load_avg_contrib; + long propagate; + long prop_runnable_sum; /* * h_load = weight * f(tg) @@ -492,36 +528,38 @@ struct cfs_rq { * Where f(tg) is the recursive weight fraction assigned to * this group. */ - unsigned long h_load; - u64 last_h_load_update; - struct sched_entity *h_load_next; + unsigned long h_load; + u64 last_h_load_update; + struct sched_entity *h_load_next; #endif /* CONFIG_FAIR_GROUP_SCHED */ #endif /* CONFIG_SMP */ #ifdef CONFIG_FAIR_GROUP_SCHED - struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ + struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */ /* * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in * a hierarchy). Non-leaf lrqs hold other higher schedulable entities * (like users, containers etc.) * - * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This - * list is used during load balance. + * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU. + * This list is used during load balance. 
*/ - int on_list; - struct list_head leaf_cfs_rq_list; - struct task_group *tg; /* group that "owns" this runqueue */ + int on_list; + struct list_head leaf_cfs_rq_list; + struct task_group *tg; /* group that "owns" this runqueue */ #ifdef CONFIG_CFS_BANDWIDTH - int runtime_enabled; - u64 runtime_expires; - s64 runtime_remaining; - - u64 throttled_clock, throttled_clock_task; - u64 throttled_clock_task_time; - int throttled, throttle_count; - struct list_head throttled_list; + int runtime_enabled; + u64 runtime_expires; + s64 runtime_remaining; + + u64 throttled_clock; + u64 throttled_clock_task; + u64 throttled_clock_task_time; + int throttled; + int throttle_count; + struct list_head throttled_list; #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ }; @@ -538,45 +576,45 @@ static inline int rt_bandwidth_enabled(void) /* Real-Time classes' related field in a runqueue: */ struct rt_rq { - struct rt_prio_array active; - unsigned int rt_nr_running; - unsigned int rr_nr_running; + struct rt_prio_array active; + unsigned int rt_nr_running; + unsigned int rr_nr_running; #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED struct { - int curr; /* highest queued rt task prio */ + int curr; /* highest queued rt task prio */ #ifdef CONFIG_SMP - int next; /* next highest */ + int next; /* next highest */ #endif } highest_prio; #endif #ifdef CONFIG_SMP - unsigned long rt_nr_migratory; - unsigned long rt_nr_total; - int overloaded; - struct plist_head pushable_tasks; + unsigned long rt_nr_migratory; + unsigned long rt_nr_total; + int overloaded; + struct plist_head pushable_tasks; #endif /* CONFIG_SMP */ - int rt_queued; + int rt_queued; - int rt_throttled; - u64 rt_time; - u64 rt_runtime; + int rt_throttled; + u64 rt_time; + u64 rt_runtime; /* Nests inside the rq lock: */ - raw_spinlock_t rt_runtime_lock; + raw_spinlock_t rt_runtime_lock; #ifdef CONFIG_RT_GROUP_SCHED - unsigned long rt_nr_boosted; + unsigned long rt_nr_boosted; - struct rq *rq; - 
struct task_group *tg; + struct rq *rq; + struct task_group *tg; #endif }; /* Deadline class' related fields in a runqueue */ struct dl_rq { /* runqueue is an rbtree, ordered by deadline */ - struct rb_root_cached root; + struct rb_root_cached root; - unsigned long dl_nr_running; + unsigned long dl_nr_running; #ifdef CONFIG_SMP /* @@ -586,28 +624,28 @@ struct dl_rq { * should migrate somewhere else. */ struct { - u64 curr; - u64 next; + u64 curr; + u64 next; } earliest_dl; - unsigned long dl_nr_migratory; - int overloaded; + unsigned long dl_nr_migratory; + int overloaded; /* * Tasks on this rq that can be pushed away. They are kept in * an rb-tree, ordered by tasks' deadlines, with caching * of the leftmost (earliest deadline) element. */ - struct rb_root_cached pushable_dl_tasks_root; + struct rb_root_cached pushable_dl_tasks_root; #else - struct dl_bw dl_bw; + struct dl_bw dl_bw; #endif /* * "Active utilization" for this runqueue: increased when a * task wakes up (becomes TASK_RUNNING) and decreased when a * task blocks */ - u64 running_bw; + u64 running_bw; /* * Utilization of the tasks "assigned" to this runqueue (including @@ -618,14 +656,14 @@ struct dl_rq { * This is needed to compute the "inactive utilization" for the * runqueue (inactive utilization = this_bw - running_bw). */ - u64 this_bw; - u64 extra_bw; + u64 this_bw; + u64 extra_bw; /* * Inverse of the fraction of CPU utilization that can be reclaimed * by the GRUB algorithm. */ - u64 bw_ratio; + u64 bw_ratio; }; #ifdef CONFIG_SMP @@ -638,51 +676,51 @@ static inline bool sched_asym_prefer(int a, int b) /* * We add the notion of a root-domain which will be used to define per-domain * variables. Each exclusive cpuset essentially defines an island domain by - * fully partitioning the member cpus from any other cpuset. Whenever a new + * fully partitioning the member CPUs from any other cpuset. Whenever a new * exclusive cpuset is created, we also create and attach a new root-domain * object. 
* */ struct root_domain { - atomic_t refcount; - atomic_t rto_count; - struct rcu_head rcu; - cpumask_var_t span; - cpumask_var_t online; + atomic_t refcount; + atomic_t rto_count; + struct rcu_head rcu; + cpumask_var_t span; + cpumask_var_t online; /* Indicate more than one runnable task for any CPU */ - bool overload; + bool overload; /* * The bit corresponding to a CPU gets set here if such CPU has more * than one runnable -deadline task (as it is below for RT tasks). */ - cpumask_var_t dlo_mask; - atomic_t dlo_count; - struct dl_bw dl_bw; - struct cpudl cpudl; + cpumask_var_t dlo_mask; + atomic_t dlo_count; + struct dl_bw dl_bw; + struct cpudl cpudl; #ifdef HAVE_RT_PUSH_IPI /* * For IPI pull requests, loop across the rto_mask. */ - struct irq_work rto_push_work; - raw_spinlock_t rto_lock; + struct irq_work rto_push_work; + raw_spinlock_t rto_lock; /* These are only updated and read within rto_lock */ - int rto_loop; - int rto_cpu; + int rto_loop; + int rto_cpu; /* These atomics are updated outside of a lock */ - atomic_t rto_loop_next; - atomic_t rto_loop_start; + atomic_t rto_loop_next; + atomic_t rto_loop_start; #endif /* * The "RT overload" flag: it gets set if a CPU has more than * one runnable RT task. 
*/ - cpumask_var_t rto_mask; - struct cpupri cpupri; + cpumask_var_t rto_mask; + struct cpupri cpupri; - unsigned long max_cpu_capacity; + unsigned long max_cpu_capacity; }; extern struct root_domain def_root_domain; @@ -691,6 +729,8 @@ extern struct mutex sched_domains_mutex; extern void init_defrootdomain(void); extern int sched_init_domains(const struct cpumask *cpu_map); extern void rq_attach_root(struct rq *rq, struct root_domain *rd); +extern void sched_get_rd(struct root_domain *rd); +extern void sched_put_rd(struct root_domain *rd); #ifdef HAVE_RT_PUSH_IPI extern void rto_push_irq_work_func(struct irq_work *work); @@ -706,41 +746,42 @@ extern void rto_push_irq_work_func(struct irq_work *work); */ struct rq { /* runqueue lock: */ - raw_spinlock_t lock; + raw_spinlock_t lock; /* * nr_running and cpu_load should be in the same cacheline because * remote CPUs use both these fields when doing load calculation. */ - unsigned int nr_running; + unsigned int nr_running; #ifdef CONFIG_NUMA_BALANCING - unsigned int nr_numa_running; - unsigned int nr_preferred_running; + unsigned int nr_numa_running; + unsigned int nr_preferred_running; #endif #define CPU_LOAD_IDX_MAX 5 - unsigned long cpu_load[CPU_LOAD_IDX_MAX]; + unsigned long cpu_load[CPU_LOAD_IDX_MAX]; #ifdef CONFIG_NO_HZ_COMMON #ifdef CONFIG_SMP - unsigned long last_load_update_tick; + unsigned long last_load_update_tick; + unsigned long last_blocked_load_update_tick; + unsigned int has_blocked_load; #endif /* CONFIG_SMP */ - unsigned long nohz_flags; + unsigned int nohz_tick_stopped; + atomic_t nohz_flags; #endif /* CONFIG_NO_HZ_COMMON */ -#ifdef CONFIG_NO_HZ_FULL - unsigned long last_sched_tick; -#endif - /* capture load from *all* tasks on this cpu: */ - struct load_weight load; - unsigned long nr_load_updates; - u64 nr_switches; - struct cfs_rq cfs; - struct rt_rq rt; - struct dl_rq dl; + /* capture load from *all* tasks on this CPU: */ + struct load_weight load; + unsigned long nr_load_updates; + u64 
nr_switches; + + struct cfs_rq cfs; + struct rt_rq rt; + struct dl_rq dl; #ifdef CONFIG_FAIR_GROUP_SCHED - /* list of leaf cfs_rq on this cpu: */ - struct list_head leaf_cfs_rq_list; - struct list_head *tmp_alone_branch; + /* list of leaf cfs_rq on this CPU: */ + struct list_head leaf_cfs_rq_list; + struct list_head *tmp_alone_branch; #endif /* CONFIG_FAIR_GROUP_SCHED */ /* @@ -749,94 +790,98 @@ struct rq { * one CPU and if it got migrated afterwards it may decrease * it on another CPU. Always updated under the runqueue lock: */ - unsigned long nr_uninterruptible; + unsigned long nr_uninterruptible; - struct task_struct *curr, *idle, *stop; - unsigned long next_balance; - struct mm_struct *prev_mm; + struct task_struct *curr; + struct task_struct *idle; + struct task_struct *stop; + unsigned long next_balance; + struct mm_struct *prev_mm; - unsigned int clock_update_flags; - u64 clock; - u64 clock_task; + unsigned int clock_update_flags; + u64 clock; + u64 clock_task; - atomic_t nr_iowait; + atomic_t nr_iowait; #ifdef CONFIG_SMP - struct root_domain *rd; - struct sched_domain *sd; + struct root_domain *rd; + struct sched_domain *sd; - unsigned long cpu_capacity; - unsigned long cpu_capacity_orig; + unsigned long cpu_capacity; + unsigned long cpu_capacity_orig; - struct callback_head *balance_callback; + struct callback_head *balance_callback; + + unsigned char idle_balance; - unsigned char idle_balance; /* For active balancing */ - int active_balance; - int push_cpu; - struct cpu_stop_work active_balance_work; - /* cpu of this runqueue: */ - int cpu; - int online; + int active_balance; + int push_cpu; + struct cpu_stop_work active_balance_work; + + /* CPU of this runqueue: */ + int cpu; + int online; struct list_head cfs_tasks; - u64 rt_avg; - u64 age_stamp; - u64 idle_stamp; - u64 avg_idle; + u64 rt_avg; + u64 age_stamp; + u64 idle_stamp; + u64 avg_idle; /* This is used to determine avg_idle's max value */ - u64 max_idle_balance_cost; + u64 max_idle_balance_cost; 
#endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING - u64 prev_irq_time; + u64 prev_irq_time; #endif #ifdef CONFIG_PARAVIRT - u64 prev_steal_time; + u64 prev_steal_time; #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING - u64 prev_steal_time_rq; + u64 prev_steal_time_rq; #endif /* calc_load related fields */ - unsigned long calc_load_update; - long calc_load_active; + unsigned long calc_load_update; + long calc_load_active; #ifdef CONFIG_SCHED_HRTICK #ifdef CONFIG_SMP - int hrtick_csd_pending; - call_single_data_t hrtick_csd; + int hrtick_csd_pending; + call_single_data_t hrtick_csd; #endif - struct hrtimer hrtick_timer; + struct hrtimer hrtick_timer; #endif #ifdef CONFIG_SCHEDSTATS /* latency stats */ - struct sched_info rq_sched_info; - unsigned long long rq_cpu_time; + struct sched_info rq_sched_info; + unsigned long long rq_cpu_time; /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ /* sys_sched_yield() stats */ - unsigned int yld_count; + unsigned int yld_count; /* schedule() stats */ - unsigned int sched_count; - unsigned int sched_goidle; + unsigned int sched_count; + unsigned int sched_goidle; /* try_to_wake_up() stats */ - unsigned int ttwu_count; - unsigned int ttwu_local; + unsigned int ttwu_count; + unsigned int ttwu_local; #endif #ifdef CONFIG_SMP - struct llist_head wake_list; + struct llist_head wake_list; #endif #ifdef CONFIG_CPU_IDLE /* Must be inspected within a rcu lock section */ - struct cpuidle_state *idle_state; + struct cpuidle_state *idle_state; #endif }; @@ -902,9 +947,9 @@ static inline u64 __rq_clock_broken(struct rq *rq) * one position though, because the next rq_unpin_lock() will shift it * back. 
*/ -#define RQCF_REQ_SKIP 0x01 -#define RQCF_ACT_SKIP 0x02 -#define RQCF_UPDATED 0x04 +#define RQCF_REQ_SKIP 0x01 +#define RQCF_ACT_SKIP 0x02 +#define RQCF_UPDATED 0x04 static inline void assert_clock_updated(struct rq *rq) { @@ -931,13 +976,20 @@ static inline u64 rq_clock_task(struct rq *rq) return rq->clock_task; } -static inline void rq_clock_skip_update(struct rq *rq, bool skip) +static inline void rq_clock_skip_update(struct rq *rq) { lockdep_assert_held(&rq->lock); - if (skip) - rq->clock_update_flags |= RQCF_REQ_SKIP; - else - rq->clock_update_flags &= ~RQCF_REQ_SKIP; + rq->clock_update_flags |= RQCF_REQ_SKIP; +} + +/* + * See rt task throttoling, which is the only time a skip + * request is cancelled. + */ +static inline void rq_clock_cancel_skipupdate(struct rq *rq) +{ + lockdep_assert_held(&rq->lock); + rq->clock_update_flags &= ~RQCF_REQ_SKIP; } struct rq_flags { @@ -1057,12 +1109,12 @@ extern void sched_ttwu_pending(void); /** * highest_flag_domain - Return highest sched_domain containing flag. - * @cpu: The cpu whose highest level of sched domain is to + * @cpu: The CPU whose highest level of sched domain is to * be returned. * @flag: The flag to check for the highest sched_domain - * for the given cpu. + * for the given CPU. * - * Returns the highest sched_domain of a cpu which contains the given flag. + * Returns the highest sched_domain of a CPU which contains the given flag. */ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) { @@ -1097,30 +1149,30 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa); DECLARE_PER_CPU(struct sched_domain *, sd_asym); struct sched_group_capacity { - atomic_t ref; + atomic_t ref; /* * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity * for a single CPU. 
*/ - unsigned long capacity; - unsigned long min_capacity; /* Min per-CPU capacity in group */ - unsigned long next_update; - int imbalance; /* XXX unrelated to capacity but shared group state */ + unsigned long capacity; + unsigned long min_capacity; /* Min per-CPU capacity in group */ + unsigned long next_update; + int imbalance; /* XXX unrelated to capacity but shared group state */ #ifdef CONFIG_SCHED_DEBUG - int id; + int id; #endif - unsigned long cpumask[0]; /* balance mask */ + unsigned long cpumask[0]; /* Balance mask */ }; struct sched_group { - struct sched_group *next; /* Must be a circular list */ - atomic_t ref; + struct sched_group *next; /* Must be a circular list */ + atomic_t ref; - unsigned int group_weight; + unsigned int group_weight; struct sched_group_capacity *sgc; - int asym_prefer_cpu; /* cpu of highest priority in group */ + int asym_prefer_cpu; /* CPU of highest priority in group */ /* * The CPUs this group covers. @@ -1129,7 +1181,7 @@ struct sched_group { * by attaching extra space to the end of the structure, * depending on how many CPUs the kernel has booted up with) */ - unsigned long cpumask[0]; + unsigned long cpumask[0]; }; static inline struct cpumask *sched_group_span(struct sched_group *sg) @@ -1146,8 +1198,8 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg) } /** - * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. - * @group: The group whose first cpu is to be returned. + * group_first_cpu - Returns the first CPU in the cpumask of a sched_group. + * @group: The group whose first CPU is to be returned. 
*/ static inline unsigned int group_first_cpu(struct sched_group *group) { @@ -1347,19 +1399,12 @@ static inline int task_on_rq_migrating(struct task_struct *p) return p->on_rq == TASK_ON_RQ_MIGRATING; } -#ifndef prepare_arch_switch -# define prepare_arch_switch(next) do { } while (0) -#endif -#ifndef finish_arch_post_lock_switch -# define finish_arch_post_lock_switch() do { } while (0) -#endif - /* * wake flags */ -#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ -#define WF_FORK 0x02 /* child wakeup after fork */ -#define WF_MIGRATED 0x4 /* internal use, task got migrated */ +#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */ +#define WF_FORK 0x02 /* Child wakeup after fork */ +#define WF_MIGRATED 0x4 /* Internal use, task got migrated */ /* * To aid in avoiding the subversion of "niceness" due to uneven distribution @@ -1370,11 +1415,11 @@ static inline int task_on_rq_migrating(struct task_struct *p) * slice expiry etc. */ -#define WEIGHT_IDLEPRIO 3 -#define WMULT_IDLEPRIO 1431655765 +#define WEIGHT_IDLEPRIO 3 +#define WMULT_IDLEPRIO 1431655765 -extern const int sched_prio_to_weight[40]; -extern const u32 sched_prio_to_wmult[40]; +extern const int sched_prio_to_weight[40]; +extern const u32 sched_prio_to_wmult[40]; /* * {de,en}queue flags: @@ -1396,9 +1441,9 @@ extern const u32 sched_prio_to_wmult[40]; */ #define DEQUEUE_SLEEP 0x01 -#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ -#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ -#define DEQUEUE_NOCLOCK 0x08 /* matches ENQUEUE_NOCLOCK */ +#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ +#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ +#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ #define ENQUEUE_WAKEUP 0x01 #define ENQUEUE_RESTORE 0x02 @@ -1420,10 +1465,10 @@ struct sched_class { void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); - void (*yield_task) (struct rq 
*rq); - bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); + void (*yield_task) (struct rq *rq); + bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt); - void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); + void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags); /* * It is the responsibility of the pick_next_task() method that will @@ -1433,16 +1478,16 @@ struct sched_class { * May return RETRY_TASK when it finds a higher prio class has runnable * tasks. */ - struct task_struct * (*pick_next_task) (struct rq *rq, - struct task_struct *prev, - struct rq_flags *rf); - void (*put_prev_task) (struct rq *rq, struct task_struct *p); + struct task_struct * (*pick_next_task)(struct rq *rq, + struct task_struct *prev, + struct rq_flags *rf); + void (*put_prev_task)(struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); void (*migrate_task_rq)(struct task_struct *p); - void (*task_woken) (struct rq *this_rq, struct task_struct *task); + void (*task_woken)(struct rq *this_rq, struct task_struct *task); void (*set_cpus_allowed)(struct task_struct *p, const struct cpumask *newmask); @@ -1451,31 +1496,31 @@ struct sched_class { void (*rq_offline)(struct rq *rq); #endif - void (*set_curr_task) (struct rq *rq); - void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); - void (*task_fork) (struct task_struct *p); - void (*task_dead) (struct task_struct *p); + void (*set_curr_task)(struct rq *rq); + void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); + void (*task_fork)(struct task_struct *p); + void (*task_dead)(struct task_struct *p); /* * The switched_from() call is allowed to drop rq->lock, therefore we * cannot assume the switched_from/switched_to pair is serliazed by * rq->lock. They are however serialized by p->pi_lock. 
*/ - void (*switched_from) (struct rq *this_rq, struct task_struct *task); - void (*switched_to) (struct rq *this_rq, struct task_struct *task); + void (*switched_from)(struct rq *this_rq, struct task_struct *task); + void (*switched_to) (struct rq *this_rq, struct task_struct *task); void (*prio_changed) (struct rq *this_rq, struct task_struct *task, - int oldprio); + int oldprio); - unsigned int (*get_rr_interval) (struct rq *rq, - struct task_struct *task); + unsigned int (*get_rr_interval)(struct rq *rq, + struct task_struct *task); - void (*update_curr) (struct rq *rq); + void (*update_curr)(struct rq *rq); -#define TASK_SET_GROUP 0 -#define TASK_MOVE_GROUP 1 +#define TASK_SET_GROUP 0 +#define TASK_MOVE_GROUP 1 #ifdef CONFIG_FAIR_GROUP_SCHED - void (*task_change_group) (struct task_struct *p, int type); + void (*task_change_group)(struct task_struct *p, int type); #endif }; @@ -1524,6 +1569,7 @@ static inline void idle_set_state(struct rq *rq, static inline struct cpuidle_state *idle_get_state(struct rq *rq) { SCHED_WARN_ON(!rcu_read_lock_held()); + return rq->idle_state; } #else @@ -1562,9 +1608,9 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se); extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); -#define BW_SHIFT 20 -#define BW_UNIT (1 << BW_SHIFT) -#define RATIO_SHIFT 8 +#define BW_SHIFT 20 +#define BW_UNIT (1 << BW_SHIFT) +#define RATIO_SHIFT 8 unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); @@ -1572,6 +1618,7 @@ extern void post_init_entity_util_avg(struct sched_entity *se); #ifdef CONFIG_NO_HZ_FULL extern bool sched_can_stop_tick(struct rq *rq); +extern int __init sched_tick_offload_init(void); /* * Tick may be needed by tasks in the runqueue depending on their policy and @@ -1596,6 +1643,7 @@ static inline void sched_update_tick_dependency(struct rq *rq) tick_nohz_dep_set_cpu(cpu, 
TICK_DEP_BIT_SCHED); } #else +static inline int sched_tick_offload_init(void) { return 0; } static inline void sched_update_tick_dependency(struct rq *rq) { } #endif @@ -1622,13 +1670,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) sched_update_tick_dependency(rq); } -static inline void rq_last_tick_reset(struct rq *rq) -{ -#ifdef CONFIG_NO_HZ_FULL - rq->last_sched_tick = jiffies; -#endif -} - extern void update_rq_clock(struct rq *rq); extern void activate_task(struct rq *rq, struct task_struct *p, int flags); @@ -1819,8 +1860,8 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) /* * Unfair double_lock_balance: Optimizes throughput at the expense of * latency by eliminating extra atomic operations when the locks are - * already in proper order on entry. This favors lower cpu-ids and will - * grant the double lock to lower cpus over higher ids under contention, + * already in proper order on entry. This favors lower CPU-ids and will + * grant the double lock to lower CPUs over higher ids under contention, * regardless of entry order into the function. 
*/ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) @@ -1852,7 +1893,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) { if (unlikely(!irqs_disabled())) { - /* printk() doesn't work good under rq->lock */ + /* printk() doesn't work well under rq->lock */ raw_spin_unlock(&this_rq->lock); BUG_ON(1); } @@ -2003,16 +2044,19 @@ extern void cfs_bandwidth_usage_inc(void); extern void cfs_bandwidth_usage_dec(void); #ifdef CONFIG_NO_HZ_COMMON -enum rq_nohz_flag_bits { - NOHZ_TICK_STOPPED, - NOHZ_BALANCE_KICK, -}; +#define NOHZ_BALANCE_KICK_BIT 0 +#define NOHZ_STATS_KICK_BIT 1 + +#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT) +#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT) + +#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK) #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) -extern void nohz_balance_exit_idle(unsigned int cpu); +extern void nohz_balance_exit_idle(struct rq *rq); #else -static inline void nohz_balance_exit_idle(unsigned int cpu) { } +static inline void nohz_balance_exit_idle(struct rq *rq) { } #endif @@ -2111,15 +2155,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif /* CONFIG_CPU_FREQ */ #ifdef arch_scale_freq_capacity -#ifndef arch_scale_freq_invariant -#define arch_scale_freq_invariant() (true) -#endif -#else /* arch_scale_freq_capacity */ -#define arch_scale_freq_invariant() (false) +# ifndef arch_scale_freq_invariant +# define arch_scale_freq_invariant() true +# endif +#else +# define arch_scale_freq_invariant() false #endif #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL - static inline unsigned long cpu_util_dl(struct rq *rq) { return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; @@ -2127,7 +2170,13 @@ static inline unsigned long cpu_util_dl(struct rq *rq) static inline unsigned long cpu_util_cfs(struct rq *rq) { - return rq->cfs.avg.util_avg; -} + 
unsigned long util = READ_ONCE(rq->cfs.avg.util_avg); + + if (sched_feat(UTIL_EST)) { + util = max_t(unsigned long, util, + READ_ONCE(rq->cfs.avg.util_est.enqueued)); + } + return util; +} #endif diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 940b1fa1d2ce..ab112cbfd7c8 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -1,14 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 - -#include <linux/slab.h> -#include <linux/fs.h> -#include <linux/seq_file.h> -#include <linux/proc_fs.h> - +/* + * /proc/schedstat implementation + */ #include "sched.h" /* - * bump this up when changing the output format or the meaning of an existing + * Current schedstat API version. + * + * Bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ #define SCHEDSTAT_VERSION 15 @@ -78,8 +77,8 @@ static int show_schedstat(struct seq_file *seq, void *v) * This itererator needs some explanation. * It returns 1 for the header position. * This means 2 is cpu 0. - * In a hotplugged system some cpus, including cpu 0, may be missing so we have - * to use cpumask_* to iterate over the cpus. + * In a hotplugged system some CPUs, including cpu 0, may be missing so we have + * to use cpumask_* to iterate over the CPUs. 
*/ static void *schedstat_start(struct seq_file *file, loff_t *offset) { @@ -99,12 +98,14 @@ static void *schedstat_start(struct seq_file *file, loff_t *offset) if (n < nr_cpu_ids) return (void *)(unsigned long)(n + 2); + return NULL; } static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset) { (*offset)++; + return schedstat_start(file, offset); } @@ -134,6 +135,7 @@ static const struct file_operations proc_schedstat_operations = { static int __init proc_schedstat_init(void) { proc_create("schedstat", 0, NULL, &proc_schedstat_operations); + return 0; } subsys_initcall(proc_schedstat_init); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index baf500d12b7c..8aea199a39b4 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -30,29 +30,29 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) if (rq) rq->rq_sched_info.run_delay += delta; } -#define schedstat_enabled() static_branch_unlikely(&sched_schedstats) -#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) -#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) -#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) -#define schedstat_val(var) (var) -#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? 
(var) : 0) - -#else /* !CONFIG_SCHEDSTATS */ -static inline void -rq_sched_info_arrive(struct rq *rq, unsigned long long delta) -{} -static inline void -rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) -{} -static inline void -rq_sched_info_depart(struct rq *rq, unsigned long long delta) -{} -#define schedstat_enabled() 0 -#define schedstat_inc(var) do { } while (0) -#define schedstat_add(var, amt) do { } while (0) -#define schedstat_set(var, val) do { } while (0) -#define schedstat_val(var) 0 -#define schedstat_val_or_zero(var) 0 +#define schedstat_enabled() static_branch_unlikely(&sched_schedstats) +#define __schedstat_inc(var) do { var++; } while (0) +#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) +#define __schedstat_add(var, amt) do { var += (amt); } while (0) +#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) +#define __schedstat_set(var, val) do { var = (val); } while (0) +#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) +#define schedstat_val(var) (var) +#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? 
(var) : 0) + +#else /* !CONFIG_SCHEDSTATS: */ +static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { } +static inline void rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) { } +static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { } +# define schedstat_enabled() 0 +# define __schedstat_inc(var) do { } while (0) +# define schedstat_inc(var) do { } while (0) +# define __schedstat_add(var, amt) do { } while (0) +# define schedstat_add(var, amt) do { } while (0) +# define __schedstat_set(var, val) do { } while (0) +# define schedstat_set(var, val) do { } while (0) +# define schedstat_val(var) 0 +# define schedstat_val_or_zero(var) 0 #endif /* CONFIG_SCHEDSTATS */ #ifdef CONFIG_SCHED_INFO @@ -63,9 +63,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t) /* * We are interested in knowing how long it was from the *first* time a - * task was queued to the time that it finally hit a cpu, we call this routine - * from dequeue_task() to account for possible rq->clock skew across cpus. The - * delta taken on each cpu would annul the skew. + * task was queued to the time that it finally hit a CPU, we call this routine + * from dequeue_task() to account for possible rq->clock skew across CPUs. The + * delta taken on each CPU would annul the skew. */ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) { @@ -81,7 +81,7 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) } /* - * Called when a task finally hits the cpu. We can now calculate how + * Called when a task finally hits the CPU. We can now calculate how * long it was waiting to run. We also note when it began so that we * can keep stats on how long its timeslice is. 
*/ @@ -106,9 +106,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t) */ static inline void sched_info_queued(struct rq *rq, struct task_struct *t) { - if (unlikely(sched_info_on())) + if (unlikely(sched_info_on())) { if (!t->sched_info.last_queued) t->sched_info.last_queued = rq_clock(rq); + } } /* @@ -121,8 +122,7 @@ static inline void sched_info_queued(struct rq *rq, struct task_struct *t) */ static inline void sched_info_depart(struct rq *rq, struct task_struct *t) { - unsigned long long delta = rq_clock(rq) - - t->sched_info.last_arrival; + unsigned long long delta = rq_clock(rq) - t->sched_info.last_arrival; rq_sched_info_depart(rq, delta); @@ -136,11 +136,10 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t) * the idle task.) We are only called when prev != next. */ static inline void -__sched_info_switch(struct rq *rq, - struct task_struct *prev, struct task_struct *next) +__sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { /* - * prev now departs the cpu. It's not interesting to record + * prev now departs the CPU. It's not interesting to record * stats about how efficient we were at scheduling the idle * process, however. 
*/ @@ -150,18 +149,19 @@ __sched_info_switch(struct rq *rq, if (next != rq->idle) sched_info_arrive(rq, next); } + static inline void -sched_info_switch(struct rq *rq, - struct task_struct *prev, struct task_struct *next) +sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { if (unlikely(sched_info_on())) __sched_info_switch(rq, prev, next); } -#else -#define sched_info_queued(rq, t) do { } while (0) -#define sched_info_reset_dequeued(t) do { } while (0) -#define sched_info_dequeued(rq, t) do { } while (0) -#define sched_info_depart(rq, t) do { } while (0) -#define sched_info_arrive(rq, next) do { } while (0) -#define sched_info_switch(rq, t, next) do { } while (0) + +#else /* !CONFIG_SCHED_INFO: */ +# define sched_info_queued(rq, t) do { } while (0) +# define sched_info_reset_dequeued(t) do { } while (0) +# define sched_info_dequeued(rq, t) do { } while (0) +# define sched_info_depart(rq, t) do { } while (0) +# define sched_info_arrive(rq, next) do { } while (0) +# define sched_info_switch(rq, t, next) do { } while (0) #endif /* CONFIG_SCHED_INFO */ diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 210b1f2146ff..c183b790ca54 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -1,6 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -#include "sched.h" - /* * stop-task scheduling class. * @@ -9,6 +7,7 @@ * * See kernel/stop_machine.c */ +#include "sched.h" #ifdef CONFIG_SMP static int @@ -75,6 +74,14 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) cgroup_account_cputime(curr, delta_exec); } +/* + * scheduler tick hitting a task of our scheduling class. + * + * NOTE: This function can be called remotely by the tick offload that + * goes along full dynticks. Therefore no local assumption can be made + * and everything must be accessed through the @rq and @curr passed in + * parameters. 
+ */ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) { } diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index 9ff1555341ed..b6fb2c3b3ff7 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/sched/signal.h> -#include <linux/swait.h> +/* + * <linux/swait.h> (simple wait queues ) implementation: + */ +#include "sched.h" void __init_swait_queue_head(struct swait_queue_head *q, const char *name, struct lock_class_key *key) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 034cbed7f88b..64cc564f5255 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2,10 +2,6 @@ /* * Scheduler topology setup/handling methods */ -#include <linux/sched.h> -#include <linux/mutex.h> -#include <linux/sched/isolation.h> - #include "sched.h" DEFINE_MUTEX(sched_domains_mutex); @@ -41,8 +37,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, if (!(sd->flags & SD_LOAD_BALANCE)) { printk("does not load-balance\n"); if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" - " has parent"); + printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); return -1; } @@ -50,12 +45,10 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_pr_args(sched_domain_span(sd)), sd->name); if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { - printk(KERN_ERR "ERROR: domain->span does not contain " - "CPU%d\n", cpu); + printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); } if (!cpumask_test_cpu(cpu, sched_group_span(group))) { - printk(KERN_ERR "ERROR: domain->groups does not contain" - " CPU%d\n", cpu); + printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); } printk(KERN_DEBUG "%*s groups:", level + 1, ""); @@ -115,8 +108,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, if (sd->parent && 
!cpumask_subset(groupmask, sched_domain_span(sd->parent))) - printk(KERN_ERR "ERROR: parent span is not a superset " - "of domain->span\n"); + printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); return 0; } @@ -259,6 +251,19 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) call_rcu_sched(&old_rd->rcu, free_rootdomain); } +void sched_get_rd(struct root_domain *rd) +{ + atomic_inc(&rd->refcount); +} + +void sched_put_rd(struct root_domain *rd) +{ + if (!atomic_dec_and_test(&rd->refcount)) + return; + + call_rcu_sched(&rd->rcu, free_rootdomain); +} + static int init_rootdomain(struct root_domain *rd) { if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) @@ -582,7 +587,7 @@ int group_balance_cpu(struct sched_group *sg) * are not. * * This leads to a few particularly weird cases where the sched_domain's are - * not of the same number for each cpu. Consider: + * not of the same number for each CPU. Consider: * * NUMA-2 0-3 0-3 * groups: {0-2},{1-3} {1-3},{0-2} @@ -767,7 +772,7 @@ fail: * ^ ^ ^ ^ * `-' `-' * - * The sched_domains are per-cpu and have a two way link (parent & child) and + * The sched_domains are per-CPU and have a two way link (parent & child) and * denote the ever growing mask of CPUs belonging to that level of topology. 
* * Each sched_domain has a circular (double) linked list of sched_group's, each @@ -1008,6 +1013,7 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) d->rd = alloc_rootdomain(); if (!d->rd) return sa_sd; + return sa_rootdomain; } @@ -1034,12 +1040,14 @@ static void claim_allocations(int cpu, struct sched_domain *sd) } #ifdef CONFIG_NUMA -static int sched_domains_numa_levels; enum numa_topology_type sched_numa_topology_type; -static int *sched_domains_numa_distance; -int sched_max_numa_distance; -static struct cpumask ***sched_domains_numa_masks; -static int sched_domains_curr_level; + +static int sched_domains_numa_levels; +static int sched_domains_curr_level; + +int sched_max_numa_distance; +static int *sched_domains_numa_distance; +static struct cpumask ***sched_domains_numa_masks; #endif /* @@ -1061,11 +1069,11 @@ static int sched_domains_curr_level; * SD_ASYM_PACKING - describes SMT quirks */ #define TOPOLOGY_SD_FLAGS \ - (SD_SHARE_CPUCAPACITY | \ + (SD_SHARE_CPUCAPACITY | \ SD_SHARE_PKG_RESOURCES | \ - SD_NUMA | \ - SD_ASYM_PACKING | \ - SD_ASYM_CPUCAPACITY | \ + SD_NUMA | \ + SD_ASYM_PACKING | \ + SD_ASYM_CPUCAPACITY | \ SD_SHARE_POWERDOMAIN) static struct sched_domain * @@ -1615,7 +1623,7 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve pr_err(" the %s domain not a subset of the %s domain\n", child->name, sd->name); #endif - /* Fixup, ensure @sd has at least @child cpus. */ + /* Fixup, ensure @sd has at least @child CPUs. */ cpumask_or(sched_domain_span(sd), sched_domain_span(sd), sched_domain_span(child)); @@ -1707,6 +1715,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att ret = 0; error: __free_domain_allocs(&d, alloc_state, cpu_map); + return ret; } @@ -1811,6 +1820,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, return 1; tmp = SD_ATTR_INIT; + return !memcmp(cur ? (cur + idx_cur) : &tmp, new ? 
(new + idx_new) : &tmp, sizeof(struct sched_domain_attr)); @@ -1916,4 +1926,3 @@ match2: mutex_unlock(&sched_domains_mutex); } - diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 929ecb7d6b78..928be527477e 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -3,14 +3,7 @@ * * (C) 2004 Nadia Yvette Chambers, Oracle */ -#include <linux/init.h> -#include <linux/export.h> -#include <linux/sched/signal.h> -#include <linux/sched/debug.h> -#include <linux/mm.h> -#include <linux/wait.h> -#include <linux/hash.h> -#include <linux/kthread.h> +#include "sched.h" void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) { @@ -107,6 +100,7 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, break; } } + return nr_exclusive; } @@ -317,6 +311,7 @@ int do_wait_intr(wait_queue_head_t *wq, wait_queue_entry_t *wait) spin_unlock(&wq->lock); schedule(); spin_lock(&wq->lock); + return 0; } EXPORT_SYMBOL(do_wait_intr); @@ -333,6 +328,7 @@ int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait) spin_unlock_irq(&wq->lock); schedule(); spin_lock_irq(&wq->lock); + return 0; } EXPORT_SYMBOL(do_wait_intr_irq); @@ -378,6 +374,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i if (ret) list_del_init(&wq_entry->entry); + return ret; } EXPORT_SYMBOL(autoremove_wake_function); diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index 84cb3acd9260..c67c6d24adc2 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -1,10 +1,7 @@ /* * The implementation of the wait_bit*() and related waiting APIs: */ -#include <linux/wait_bit.h> -#include <linux/sched/signal.h> -#include <linux/sched/debug.h> -#include <linux/hash.h> +#include "sched.h" #define WAIT_TABLE_BITS 8 #define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) @@ -29,8 +26,8 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync wait_bit->key.bit_nr != 
key->bit_nr || test_bit(key->bit_nr, key->flags)) return 0; - else - return autoremove_wake_function(wq_entry, mode, sync, key); + + return autoremove_wake_function(wq_entry, mode, sync, key); } EXPORT_SYMBOL(wake_bit_function); @@ -50,7 +47,9 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_ if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) ret = (*action)(&wbq_entry->key, mode); } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); + finish_wait(wq_head, &wbq_entry->wq_entry); + return ret; } EXPORT_SYMBOL(__wait_on_bit); @@ -73,6 +72,7 @@ int __sched out_of_line_wait_on_bit_timeout( DEFINE_WAIT_BIT(wq_entry, word, bit); wq_entry.key.timeout = jiffies + timeout; + return __wait_on_bit(wq_head, &wq_entry, action, mode); } EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); @@ -120,6 +120,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) { struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); + if (waitqueue_active(wq_head)) __wake_up(wq_head, TASK_NORMAL, 1, &key); } @@ -148,108 +149,55 @@ void wake_up_bit(void *word, int bit) } EXPORT_SYMBOL(wake_up_bit); -/* - * Manipulate the atomic_t address to produce a better bit waitqueue table hash - * index (we're keying off bit -1, but that would produce a horrible hash - * value). 
- */ -static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) +wait_queue_head_t *__var_waitqueue(void *p) { - if (BITS_PER_LONG == 64) { - unsigned long q = (unsigned long)p; - return bit_waitqueue((void *)(q & ~1), q & 1); - } - return bit_waitqueue(p, 0); + return bit_wait_table + hash_ptr(p, WAIT_TABLE_BITS); } +EXPORT_SYMBOL(__var_waitqueue); -static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, - void *arg) +static int +var_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode, + int sync, void *arg) { struct wait_bit_key *key = arg; - struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); - atomic_t *val = key->flags; + struct wait_bit_queue_entry *wbq_entry = + container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); - if (wait_bit->key.flags != key->flags || - wait_bit->key.bit_nr != key->bit_nr || - atomic_read(val) != 0) + if (wbq_entry->key.flags != key->flags || + wbq_entry->key.bit_nr != key->bit_nr) return 0; - return autoremove_wake_function(wq_entry, mode, sync, key); -} -/* - * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting, - * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero - * return codes halt waiting and return. 
- */ -static __sched -int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, - wait_atomic_t_action_f action, unsigned int mode) -{ - atomic_t *val; - int ret = 0; - - do { - prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode); - val = wbq_entry->key.flags; - if (atomic_read(val) == 0) - break; - ret = (*action)(val, mode); - } while (!ret && atomic_read(val) != 0); - finish_wait(wq_head, &wbq_entry->wq_entry); - return ret; + return autoremove_wake_function(wq_entry, mode, sync, key); } -#define DEFINE_WAIT_ATOMIC_T(name, p) \ - struct wait_bit_queue_entry name = { \ - .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \ - .wq_entry = { \ - .private = current, \ - .func = wake_atomic_t_function, \ - .entry = \ - LIST_HEAD_INIT((name).wq_entry.entry), \ - }, \ - } - -__sched int out_of_line_wait_on_atomic_t(atomic_t *p, - wait_atomic_t_action_f action, - unsigned int mode) +void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags) { - struct wait_queue_head *wq_head = atomic_t_waitqueue(p); - DEFINE_WAIT_ATOMIC_T(wq_entry, p); - - return __wait_on_atomic_t(wq_head, &wq_entry, action, mode); + *wbq_entry = (struct wait_bit_queue_entry){ + .key = { + .flags = (var), + .bit_nr = -1, + }, + .wq_entry = { + .private = current, + .func = var_wake_function, + .entry = LIST_HEAD_INIT(wbq_entry->wq_entry.entry), + }, + }; } -EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); +EXPORT_SYMBOL(init_wait_var_entry); -__sched int atomic_t_wait(atomic_t *counter, unsigned int mode) +void wake_up_var(void *var) { - schedule(); - if (signal_pending_state(mode, current)) - return -EINTR; - return 0; + __wake_up_bit(__var_waitqueue(var), var, -1); } -EXPORT_SYMBOL(atomic_t_wait); - -/** - * wake_up_atomic_t - Wake up a waiter on a atomic_t - * @p: The atomic_t being waited on, a kernel virtual address - * - * Wake up anyone waiting for the atomic_t to go to zero. 
- * - * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t - * check is done by the waiter's wake function, not the by the waker itself). - */ -void wake_up_atomic_t(atomic_t *p) -{ - __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); -} -EXPORT_SYMBOL(wake_up_atomic_t); +EXPORT_SYMBOL(wake_up_var); __sched int bit_wait(struct wait_bit_key *word, int mode) { schedule(); if (signal_pending_state(mode, current)) return -EINTR; + return 0; } EXPORT_SYMBOL(bit_wait); @@ -259,6 +207,7 @@ __sched int bit_wait_io(struct wait_bit_key *word, int mode) io_schedule(); if (signal_pending_state(mode, current)) return -EINTR; + return 0; } EXPORT_SYMBOL(bit_wait_io); @@ -266,11 +215,13 @@ EXPORT_SYMBOL(bit_wait_io); __sched int bit_wait_timeout(struct wait_bit_key *word, int mode) { unsigned long now = READ_ONCE(jiffies); + if (time_after_eq(now, word->timeout)) return -EAGAIN; schedule_timeout(word->timeout - now); if (signal_pending_state(mode, current)) return -EINTR; + return 0; } EXPORT_SYMBOL_GPL(bit_wait_timeout); @@ -278,11 +229,13 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout); __sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) { unsigned long now = READ_ONCE(jiffies); + if (time_after_eq(now, word->timeout)) return -EAGAIN; io_schedule_timeout(word->timeout - now); if (signal_pending_state(mode, current)) return -EINTR; + return 0; } EXPORT_SYMBOL_GPL(bit_wait_io_timeout); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 3153c9ea51bf..dc77548167ef 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -978,49 +978,68 @@ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) } #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE) -long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, - void __user *data) +static struct seccomp_filter *get_nth_filter(struct task_struct *task, + unsigned long filter_off) { - struct seccomp_filter *filter; - struct 
sock_fprog_kern *fprog; - long ret; - unsigned long count = 0; - - if (!capable(CAP_SYS_ADMIN) || - current->seccomp.mode != SECCOMP_MODE_DISABLED) { - return -EACCES; - } + struct seccomp_filter *orig, *filter; + unsigned long count; + /* + * Note: this is only correct because the caller should be the (ptrace) + * tracer of the task, otherwise lock_task_sighand is needed. + */ spin_lock_irq(&task->sighand->siglock); + if (task->seccomp.mode != SECCOMP_MODE_FILTER) { - ret = -EINVAL; - goto out; + spin_unlock_irq(&task->sighand->siglock); + return ERR_PTR(-EINVAL); } - filter = task->seccomp.filter; - while (filter) { - filter = filter->prev; + orig = task->seccomp.filter; + __get_seccomp_filter(orig); + spin_unlock_irq(&task->sighand->siglock); + + count = 0; + for (filter = orig; filter; filter = filter->prev) count++; - } if (filter_off >= count) { - ret = -ENOENT; + filter = ERR_PTR(-ENOENT); goto out; } - count -= filter_off; - filter = task->seccomp.filter; - while (filter && count > 1) { - filter = filter->prev; + count -= filter_off; + for (filter = orig; filter && count > 1; filter = filter->prev) count--; - } if (WARN_ON(count != 1 || !filter)) { - /* The filter tree shouldn't shrink while we're using it. 
*/ - ret = -ENOENT; + filter = ERR_PTR(-ENOENT); goto out; } + __get_seccomp_filter(filter); + +out: + __put_seccomp_filter(orig); + return filter; +} + +long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, + void __user *data) +{ + struct seccomp_filter *filter; + struct sock_fprog_kern *fprog; + long ret; + + if (!capable(CAP_SYS_ADMIN) || + current->seccomp.mode != SECCOMP_MODE_DISABLED) { + return -EACCES; + } + + filter = get_nth_filter(task, filter_off); + if (IS_ERR(filter)) + return PTR_ERR(filter); + fprog = filter->prog->orig_prog; if (!fprog) { /* This must be a new non-cBPF filter, since we save @@ -1035,17 +1054,46 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, if (!data) goto out; - __get_seccomp_filter(filter); - spin_unlock_irq(&task->sighand->siglock); - if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog))) ret = -EFAULT; +out: __put_seccomp_filter(filter); return ret; +} -out: - spin_unlock_irq(&task->sighand->siglock); +long seccomp_get_metadata(struct task_struct *task, + unsigned long size, void __user *data) +{ + long ret; + struct seccomp_filter *filter; + struct seccomp_metadata kmd = {}; + + if (!capable(CAP_SYS_ADMIN) || + current->seccomp.mode != SECCOMP_MODE_DISABLED) { + return -EACCES; + } + + size = min_t(unsigned long, size, sizeof(kmd)); + + if (size < sizeof(kmd.filter_off)) + return -EINVAL; + + if (copy_from_user(&kmd.filter_off, data, sizeof(kmd.filter_off))) + return -EFAULT; + + filter = get_nth_filter(task, kmd.filter_off); + if (IS_ERR(filter)) + return PTR_ERR(filter); + + if (filter->log) + kmd.flags |= SECCOMP_FILTER_FLAG_LOG; + + ret = size; + if (copy_to_user(data, &kmd, size)) + ret = -EFAULT; + + __put_seccomp_filter(filter); return ret; } #endif diff --git a/kernel/signal.c b/kernel/signal.c index c6e4c83dc090..d4ccea599692 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -770,7 +770,7 @@ static int check_kill_permission(int sig, struct siginfo 
*info, } } - return security_task_kill(t, info, sig, 0); + return security_task_kill(t, info, sig, NULL); } /** @@ -1361,7 +1361,7 @@ static int kill_as_cred_perm(const struct cred *cred, /* like kill_pid_info(), but doesn't use uid/euid of "current" */ int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid, - const struct cred *cred, u32 secid) + const struct cred *cred) { int ret = -EINVAL; struct task_struct *p; @@ -1380,7 +1380,7 @@ int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid, ret = -EPERM; goto out_unlock; } - ret = security_task_kill(p, info, sig, secid); + ret = security_task_kill(p, info, sig, cred); if (ret) goto out_unlock; @@ -2844,10 +2844,6 @@ enum siginfo_layout siginfo_layout(int sig, int si_code) if ((sig == SIGFPE) && (si_code == FPE_FIXME)) layout = SIL_FAULT; #endif -#ifdef BUS_FIXME - if ((sig == SIGBUS) && (si_code == BUS_FIXME)) - layout = SIL_FAULT; -#endif } return layout; } @@ -3573,9 +3569,8 @@ int __save_altstack(stack_t __user *uss, unsigned long sp) } #ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE2(sigaltstack, - const compat_stack_t __user *, uss_ptr, - compat_stack_t __user *, uoss_ptr) +static int do_compat_sigaltstack(const compat_stack_t __user *uss_ptr, + compat_stack_t __user *uoss_ptr) { stack_t uss, uoss; int ret; @@ -3602,9 +3597,16 @@ COMPAT_SYSCALL_DEFINE2(sigaltstack, return ret; } +COMPAT_SYSCALL_DEFINE2(sigaltstack, + const compat_stack_t __user *, uss_ptr, + compat_stack_t __user *, uoss_ptr) +{ + return do_compat_sigaltstack(uss_ptr, uoss_ptr); +} + int compat_restore_altstack(const compat_stack_t __user *uss) { - int err = compat_sys_sigaltstack(uss, NULL); + int err = do_compat_sigaltstack(uss, NULL); /* squash all but -EFAULT for now */ return err == -EFAULT ? 
err : 0; } @@ -3629,11 +3631,20 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp) /** * sys_sigpending - examine pending signals - * @set: where mask of pending signal is returned + * @uset: where mask of pending signal is returned */ -SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) +SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, uset) { - return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t)); + sigset_t set; + int err; + + if (sizeof(old_sigset_t) > sizeof(*uset)) + return -EINVAL; + + err = do_sigpending(&set); + if (!err && copy_to_user(uset, &set, sizeof(old_sigset_t))) + err = -EFAULT; + return err; } #ifdef CONFIG_COMPAT diff --git a/kernel/softirq.c b/kernel/softirq.c index 24d243ef8e71..177de3640c78 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -460,40 +460,46 @@ struct tasklet_head { static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec); static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec); -void __tasklet_schedule(struct tasklet_struct *t) +static void __tasklet_schedule_common(struct tasklet_struct *t, + struct tasklet_head __percpu *headp, + unsigned int softirq_nr) { + struct tasklet_head *head; unsigned long flags; local_irq_save(flags); + head = this_cpu_ptr(headp); t->next = NULL; - *__this_cpu_read(tasklet_vec.tail) = t; - __this_cpu_write(tasklet_vec.tail, &(t->next)); - raise_softirq_irqoff(TASKLET_SOFTIRQ); + *head->tail = t; + head->tail = &(t->next); + raise_softirq_irqoff(softirq_nr); local_irq_restore(flags); } + +void __tasklet_schedule(struct tasklet_struct *t) +{ + __tasklet_schedule_common(t, &tasklet_vec, + TASKLET_SOFTIRQ); +} EXPORT_SYMBOL(__tasklet_schedule); void __tasklet_hi_schedule(struct tasklet_struct *t) { - unsigned long flags; - - local_irq_save(flags); - t->next = NULL; - *__this_cpu_read(tasklet_hi_vec.tail) = t; - __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); - raise_softirq_irqoff(HI_SOFTIRQ); - local_irq_restore(flags); + 
__tasklet_schedule_common(t, &tasklet_hi_vec, + HI_SOFTIRQ); } EXPORT_SYMBOL(__tasklet_hi_schedule); -static __latent_entropy void tasklet_action(struct softirq_action *a) +static void tasklet_action_common(struct softirq_action *a, + struct tasklet_head *tl_head, + unsigned int softirq_nr) { struct tasklet_struct *list; local_irq_disable(); - list = __this_cpu_read(tasklet_vec.head); - __this_cpu_write(tasklet_vec.head, NULL); - __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head)); + list = tl_head->head; + tl_head->head = NULL; + tl_head->tail = &tl_head->head; local_irq_enable(); while (list) { @@ -515,47 +521,21 @@ static __latent_entropy void tasklet_action(struct softirq_action *a) local_irq_disable(); t->next = NULL; - *__this_cpu_read(tasklet_vec.tail) = t; - __this_cpu_write(tasklet_vec.tail, &(t->next)); - __raise_softirq_irqoff(TASKLET_SOFTIRQ); + *tl_head->tail = t; + tl_head->tail = &t->next; + __raise_softirq_irqoff(softirq_nr); local_irq_enable(); } } -static __latent_entropy void tasklet_hi_action(struct softirq_action *a) +static __latent_entropy void tasklet_action(struct softirq_action *a) { - struct tasklet_struct *list; - - local_irq_disable(); - list = __this_cpu_read(tasklet_hi_vec.head); - __this_cpu_write(tasklet_hi_vec.head, NULL); - __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head)); - local_irq_enable(); - - while (list) { - struct tasklet_struct *t = list; - - list = list->next; - - if (tasklet_trylock(t)) { - if (!atomic_read(&t->count)) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, - &t->state)) - BUG(); - t->func(t->data); - tasklet_unlock(t); - continue; - } - tasklet_unlock(t); - } + tasklet_action_common(a, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ); +} - local_irq_disable(); - t->next = NULL; - *__this_cpu_read(tasklet_hi_vec.tail) = t; - __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); - __raise_softirq_irqoff(HI_SOFTIRQ); - local_irq_enable(); - } +static __latent_entropy void 
tasklet_hi_action(struct softirq_action *a) +{ + tasklet_action_common(a, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ); } void tasklet_init(struct tasklet_struct *t, diff --git a/kernel/sys.c b/kernel/sys.c index f2289de20e19..ad692183dfe9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -69,6 +69,8 @@ #include <asm/io.h> #include <asm/unistd.h> +#include "uid16.h" + #ifndef SET_UNALIGN_CTL # define SET_UNALIGN_CTL(a, b) (-EINVAL) #endif @@ -340,7 +342,7 @@ out_unlock: * operations (as far as semantic preservation is concerned). */ #ifdef CONFIG_MULTIUSER -SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) +long __sys_setregid(gid_t rgid, gid_t egid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; @@ -392,12 +394,17 @@ error: return retval; } +SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) +{ + return __sys_setregid(rgid, egid); +} + /* * setgid() is implemented like SysV w/ SAVED_IDS * * SMP: Same implicit races as above. */ -SYSCALL_DEFINE1(setgid, gid_t, gid) +long __sys_setgid(gid_t gid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; @@ -429,6 +436,11 @@ error: return retval; } +SYSCALL_DEFINE1(setgid, gid_t, gid) +{ + return __sys_setgid(gid); +} + /* * change the user struct in a credentials set to match the new UID */ @@ -473,7 +485,7 @@ static int set_user(struct cred *new) * 100% compatible with BSD. A program which uses just setuid() will be * 100% compatible with POSIX with saved IDs. 
*/ -SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) +long __sys_setreuid(uid_t ruid, uid_t euid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; @@ -533,6 +545,11 @@ error: return retval; } +SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) +{ + return __sys_setreuid(ruid, euid); +} + /* * setuid() is implemented like SysV with SAVED_IDS * @@ -544,7 +561,7 @@ error: * will allow a root program to temporarily drop privileges and be able to * regain them by swapping the real and effective uid. */ -SYSCALL_DEFINE1(setuid, uid_t, uid) +long __sys_setuid(uid_t uid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; @@ -586,12 +603,17 @@ error: return retval; } +SYSCALL_DEFINE1(setuid, uid_t, uid) +{ + return __sys_setuid(uid); +} + /* * This function implements a generic ability to update ruid, euid, * and suid. This allows you to implement the 4.4 compatible seteuid(). */ -SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) +long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; @@ -656,6 +678,11 @@ error: return retval; } +SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) +{ + return __sys_setresuid(ruid, euid, suid); +} + SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp) { const struct cred *cred = current_cred(); @@ -678,7 +705,7 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t _ /* * Same as above, but for rgid, egid, sgid. 
*/ -SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) +long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; @@ -730,6 +757,11 @@ error: return retval; } +SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) +{ + return __sys_setresgid(rgid, egid, sgid); +} + SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp) { const struct cred *cred = current_cred(); @@ -757,7 +789,7 @@ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t _ * whatever uid it wants to). It normally shadows "euid", except when * explicitly set by setfsuid() or for access.. */ -SYSCALL_DEFINE1(setfsuid, uid_t, uid) +long __sys_setfsuid(uid_t uid) { const struct cred *old; struct cred *new; @@ -793,10 +825,15 @@ change_okay: return old_fsuid; } +SYSCALL_DEFINE1(setfsuid, uid_t, uid) +{ + return __sys_setfsuid(uid); +} + /* * Samma på svenska.. */ -SYSCALL_DEFINE1(setfsgid, gid_t, gid) +long __sys_setfsgid(gid_t gid) { const struct cred *old; struct cred *new; @@ -830,6 +867,11 @@ change_okay: commit_creds(new); return old_fsgid; } + +SYSCALL_DEFINE1(setfsgid, gid_t, gid) +{ + return __sys_setfsgid(gid); +} #endif /* CONFIG_MULTIUSER */ /** @@ -1027,7 +1069,7 @@ out: return err; } -SYSCALL_DEFINE1(getpgid, pid_t, pid) +static int do_getpgid(pid_t pid) { struct task_struct *p; struct pid *grp; @@ -1055,11 +1097,16 @@ out: return retval; } +SYSCALL_DEFINE1(getpgid, pid_t, pid) +{ + return do_getpgid(pid); +} + #ifdef __ARCH_WANT_SYS_GETPGRP SYSCALL_DEFINE0(getpgrp) { - return sys_getpgid(0); + return do_getpgid(0); } #endif @@ -1103,7 +1150,7 @@ static void set_special_pids(struct pid *pid) change_pid(curr, PIDTYPE_PGID, pid); } -SYSCALL_DEFINE0(setsid) +int ksys_setsid(void) { struct task_struct *group_leader = current->group_leader; struct pid *sid = task_pid(group_leader); @@ -1136,6 +1183,11 @@ out: return err; } 
+SYSCALL_DEFINE0(setsid) +{ + return ksys_setsid(); +} + DECLARE_RWSEM(uts_sem); #ifdef COMPAT_UTS_MACHINE diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index b5189762d275..9791364925dc 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -5,6 +5,11 @@ #include <asm/unistd.h> +#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER +/* Architectures may override COND_SYSCALL and COND_SYSCALL_COMPAT */ +#include <asm/syscall_wrapper.h> +#endif /* CONFIG_ARCH_HAS_SYSCALL_WRAPPER */ + /* we can't #include <linux/syscalls.h> here, but tell gcc to not warn with -Wmissing-prototypes */ asmlinkage long sys_ni_syscall(void); @@ -17,245 +22,411 @@ asmlinkage long sys_ni_syscall(void) return -ENOSYS; } -cond_syscall(sys_quotactl); -cond_syscall(sys32_quotactl); -cond_syscall(sys_acct); -cond_syscall(sys_lookup_dcookie); -cond_syscall(compat_sys_lookup_dcookie); -cond_syscall(sys_swapon); -cond_syscall(sys_swapoff); -cond_syscall(sys_kexec_load); -cond_syscall(compat_sys_kexec_load); -cond_syscall(sys_kexec_file_load); -cond_syscall(sys_init_module); -cond_syscall(sys_finit_module); -cond_syscall(sys_delete_module); -cond_syscall(sys_socketpair); -cond_syscall(sys_bind); -cond_syscall(sys_listen); -cond_syscall(sys_accept); -cond_syscall(sys_accept4); -cond_syscall(sys_connect); -cond_syscall(sys_getsockname); -cond_syscall(sys_getpeername); -cond_syscall(sys_sendto); -cond_syscall(sys_send); -cond_syscall(sys_recvfrom); -cond_syscall(sys_recv); -cond_syscall(sys_socket); -cond_syscall(sys_setsockopt); -cond_syscall(compat_sys_setsockopt); -cond_syscall(sys_getsockopt); -cond_syscall(compat_sys_getsockopt); -cond_syscall(sys_shutdown); -cond_syscall(sys_sendmsg); -cond_syscall(sys_sendmmsg); -cond_syscall(compat_sys_sendmsg); -cond_syscall(compat_sys_sendmmsg); -cond_syscall(sys_recvmsg); -cond_syscall(sys_recvmmsg); -cond_syscall(compat_sys_recvmsg); -cond_syscall(compat_sys_recv); -cond_syscall(compat_sys_recvfrom); -cond_syscall(compat_sys_recvmmsg); -cond_syscall(sys_socketcall); 
-cond_syscall(sys_futex); -cond_syscall(compat_sys_futex); -cond_syscall(sys_set_robust_list); -cond_syscall(compat_sys_set_robust_list); -cond_syscall(sys_get_robust_list); -cond_syscall(compat_sys_get_robust_list); -cond_syscall(sys_epoll_create); -cond_syscall(sys_epoll_create1); -cond_syscall(sys_epoll_ctl); -cond_syscall(sys_epoll_wait); -cond_syscall(sys_epoll_pwait); -cond_syscall(compat_sys_epoll_pwait); -cond_syscall(sys_semget); -cond_syscall(sys_semop); -cond_syscall(sys_semtimedop); -cond_syscall(compat_sys_semtimedop); -cond_syscall(sys_semctl); -cond_syscall(compat_sys_semctl); -cond_syscall(sys_msgget); -cond_syscall(sys_msgsnd); -cond_syscall(compat_sys_msgsnd); -cond_syscall(sys_msgrcv); -cond_syscall(compat_sys_msgrcv); -cond_syscall(sys_msgctl); -cond_syscall(compat_sys_msgctl); -cond_syscall(sys_shmget); -cond_syscall(sys_shmat); -cond_syscall(compat_sys_shmat); -cond_syscall(sys_shmdt); -cond_syscall(sys_shmctl); -cond_syscall(compat_sys_shmctl); -cond_syscall(sys_mq_open); -cond_syscall(sys_mq_unlink); -cond_syscall(sys_mq_timedsend); -cond_syscall(sys_mq_timedreceive); -cond_syscall(sys_mq_notify); -cond_syscall(sys_mq_getsetattr); -cond_syscall(compat_sys_mq_open); -cond_syscall(compat_sys_mq_timedsend); -cond_syscall(compat_sys_mq_timedreceive); -cond_syscall(compat_sys_mq_notify); -cond_syscall(compat_sys_mq_getsetattr); -cond_syscall(sys_mbind); -cond_syscall(sys_get_mempolicy); -cond_syscall(sys_set_mempolicy); -cond_syscall(compat_sys_mbind); -cond_syscall(compat_sys_get_mempolicy); -cond_syscall(compat_sys_set_mempolicy); -cond_syscall(sys_add_key); -cond_syscall(sys_request_key); -cond_syscall(sys_keyctl); -cond_syscall(compat_sys_keyctl); -cond_syscall(compat_sys_socketcall); -cond_syscall(sys_inotify_init); -cond_syscall(sys_inotify_init1); -cond_syscall(sys_inotify_add_watch); -cond_syscall(sys_inotify_rm_watch); -cond_syscall(sys_migrate_pages); -cond_syscall(sys_move_pages); -cond_syscall(sys_chown16); 
-cond_syscall(sys_fchown16); -cond_syscall(sys_getegid16); -cond_syscall(sys_geteuid16); -cond_syscall(sys_getgid16); -cond_syscall(sys_getgroups16); -cond_syscall(sys_getresgid16); -cond_syscall(sys_getresuid16); -cond_syscall(sys_getuid16); -cond_syscall(sys_lchown16); -cond_syscall(sys_setfsgid16); -cond_syscall(sys_setfsuid16); -cond_syscall(sys_setgid16); -cond_syscall(sys_setgroups16); -cond_syscall(sys_setregid16); -cond_syscall(sys_setresgid16); -cond_syscall(sys_setresuid16); -cond_syscall(sys_setreuid16); -cond_syscall(sys_setuid16); -cond_syscall(sys_sgetmask); -cond_syscall(sys_ssetmask); -cond_syscall(sys_vm86old); -cond_syscall(sys_vm86); -cond_syscall(sys_modify_ldt); -cond_syscall(sys_ipc); -cond_syscall(compat_sys_ipc); -cond_syscall(compat_sys_sysctl); -cond_syscall(sys_flock); -cond_syscall(sys_io_setup); -cond_syscall(sys_io_destroy); -cond_syscall(sys_io_submit); -cond_syscall(sys_io_cancel); -cond_syscall(sys_io_getevents); -cond_syscall(compat_sys_io_setup); -cond_syscall(compat_sys_io_submit); -cond_syscall(compat_sys_io_getevents); -cond_syscall(sys_sysfs); -cond_syscall(sys_syslog); -cond_syscall(sys_process_vm_readv); -cond_syscall(sys_process_vm_writev); -cond_syscall(compat_sys_process_vm_readv); -cond_syscall(compat_sys_process_vm_writev); -cond_syscall(sys_uselib); -cond_syscall(sys_fadvise64); -cond_syscall(sys_fadvise64_64); -cond_syscall(sys_madvise); -cond_syscall(sys_setuid); -cond_syscall(sys_setregid); -cond_syscall(sys_setgid); -cond_syscall(sys_setreuid); -cond_syscall(sys_setresuid); -cond_syscall(sys_getresuid); -cond_syscall(sys_setresgid); -cond_syscall(sys_getresgid); -cond_syscall(sys_setgroups); -cond_syscall(sys_getgroups); -cond_syscall(sys_setfsuid); -cond_syscall(sys_setfsgid); -cond_syscall(sys_capget); -cond_syscall(sys_capset); -cond_syscall(sys_copy_file_range); - -/* arch-specific weak syscall entries */ -cond_syscall(sys_pciconfig_read); -cond_syscall(sys_pciconfig_write); -cond_syscall(sys_pciconfig_iobase); 
-cond_syscall(compat_sys_s390_ipc); -cond_syscall(ppc_rtas); -cond_syscall(sys_spu_run); -cond_syscall(sys_spu_create); -cond_syscall(sys_subpage_prot); -cond_syscall(sys_s390_pci_mmio_read); -cond_syscall(sys_s390_pci_mmio_write); - -/* mmu depending weak syscall entries */ -cond_syscall(sys_mprotect); -cond_syscall(sys_msync); -cond_syscall(sys_mlock); -cond_syscall(sys_munlock); -cond_syscall(sys_mlockall); -cond_syscall(sys_munlockall); -cond_syscall(sys_mlock2); -cond_syscall(sys_mincore); -cond_syscall(sys_madvise); -cond_syscall(sys_mremap); -cond_syscall(sys_remap_file_pages); -cond_syscall(compat_sys_move_pages); -cond_syscall(compat_sys_migrate_pages); - -/* block-layer dependent */ -cond_syscall(sys_bdflush); -cond_syscall(sys_ioprio_set); -cond_syscall(sys_ioprio_get); - -/* New file descriptors */ -cond_syscall(sys_signalfd); -cond_syscall(sys_signalfd4); -cond_syscall(compat_sys_signalfd); -cond_syscall(compat_sys_signalfd4); -cond_syscall(sys_timerfd_create); -cond_syscall(sys_timerfd_settime); -cond_syscall(sys_timerfd_gettime); -cond_syscall(compat_sys_timerfd_settime); -cond_syscall(compat_sys_timerfd_gettime); -cond_syscall(sys_eventfd); -cond_syscall(sys_eventfd2); -cond_syscall(sys_memfd_create); -cond_syscall(sys_userfaultfd); - -/* performance counters: */ -cond_syscall(sys_perf_event_open); - -/* fanotify! */ -cond_syscall(sys_fanotify_init); -cond_syscall(sys_fanotify_mark); -cond_syscall(compat_sys_fanotify_mark); +#ifndef COND_SYSCALL +#define COND_SYSCALL(name) cond_syscall(sys_##name) +#endif /* COND_SYSCALL */ + +#ifndef COND_SYSCALL_COMPAT +#define COND_SYSCALL_COMPAT(name) cond_syscall(compat_sys_##name) +#endif /* COND_SYSCALL_COMPAT */ + +/* + * This list is kept in the same order as include/uapi/asm-generic/unistd.h. + * Architecture specific entries go below, followed by deprecated or obsolete + * system calls. 
+ */ + +COND_SYSCALL(io_setup); +COND_SYSCALL_COMPAT(io_setup); +COND_SYSCALL(io_destroy); +COND_SYSCALL(io_submit); +COND_SYSCALL_COMPAT(io_submit); +COND_SYSCALL(io_cancel); +COND_SYSCALL(io_getevents); +COND_SYSCALL_COMPAT(io_getevents); + +/* fs/xattr.c */ + +/* fs/dcache.c */ + +/* fs/cookies.c */ +COND_SYSCALL(lookup_dcookie); +COND_SYSCALL_COMPAT(lookup_dcookie); + +/* fs/eventfd.c */ +COND_SYSCALL(eventfd2); + +/* fs/eventpoll.c */ +COND_SYSCALL(epoll_create1); +COND_SYSCALL(epoll_ctl); +COND_SYSCALL(epoll_pwait); +COND_SYSCALL_COMPAT(epoll_pwait); + +/* fs/fcntl.c */ + +/* fs/inotify_user.c */ +COND_SYSCALL(inotify_init1); +COND_SYSCALL(inotify_add_watch); +COND_SYSCALL(inotify_rm_watch); + +/* fs/ioctl.c */ + +/* fs/ioprio.c */ +COND_SYSCALL(ioprio_set); +COND_SYSCALL(ioprio_get); + +/* fs/locks.c */ +COND_SYSCALL(flock); + +/* fs/namei.c */ + +/* fs/namespace.c */ + +/* fs/nfsctl.c */ + +/* fs/open.c */ + +/* fs/pipe.c */ + +/* fs/quota.c */ +COND_SYSCALL(quotactl); + +/* fs/readdir.c */ + +/* fs/read_write.c */ + +/* fs/sendfile.c */ + +/* fs/select.c */ + +/* fs/signalfd.c */ +COND_SYSCALL(signalfd4); +COND_SYSCALL_COMPAT(signalfd4); + +/* fs/splice.c */ + +/* fs/stat.c */ + +/* fs/sync.c */ + +/* fs/timerfd.c */ +COND_SYSCALL(timerfd_create); +COND_SYSCALL(timerfd_settime); +COND_SYSCALL_COMPAT(timerfd_settime); +COND_SYSCALL(timerfd_gettime); +COND_SYSCALL_COMPAT(timerfd_gettime); + +/* fs/utimes.c */ + +/* kernel/acct.c */ +COND_SYSCALL(acct); + +/* kernel/capability.c */ +COND_SYSCALL(capget); +COND_SYSCALL(capset); + +/* kernel/exec_domain.c */ + +/* kernel/exit.c */ + +/* kernel/fork.c */ + +/* kernel/futex.c */ +COND_SYSCALL(futex); +COND_SYSCALL_COMPAT(futex); +COND_SYSCALL(set_robust_list); +COND_SYSCALL_COMPAT(set_robust_list); +COND_SYSCALL(get_robust_list); +COND_SYSCALL_COMPAT(get_robust_list); + +/* kernel/hrtimer.c */ + +/* kernel/itimer.c */ + +/* kernel/kexec.c */ +COND_SYSCALL(kexec_load); +COND_SYSCALL_COMPAT(kexec_load); + +/* 
kernel/module.c */ +COND_SYSCALL(init_module); +COND_SYSCALL(delete_module); + +/* kernel/posix-timers.c */ + +/* kernel/printk.c */ +COND_SYSCALL(syslog); + +/* kernel/ptrace.c */ + +/* kernel/sched/core.c */ + +/* kernel/signal.c */ + +/* kernel/sys.c */ +COND_SYSCALL(setregid); +COND_SYSCALL(setgid); +COND_SYSCALL(setreuid); +COND_SYSCALL(setuid); +COND_SYSCALL(setresuid); +COND_SYSCALL(getresuid); +COND_SYSCALL(setresgid); +COND_SYSCALL(getresgid); +COND_SYSCALL(setfsuid); +COND_SYSCALL(setfsgid); +COND_SYSCALL(setgroups); +COND_SYSCALL(getgroups); + +/* kernel/time.c */ + +/* kernel/timer.c */ + +/* ipc/mqueue.c */ +COND_SYSCALL(mq_open); +COND_SYSCALL_COMPAT(mq_open); +COND_SYSCALL(mq_unlink); +COND_SYSCALL(mq_timedsend); +COND_SYSCALL_COMPAT(mq_timedsend); +COND_SYSCALL(mq_timedreceive); +COND_SYSCALL_COMPAT(mq_timedreceive); +COND_SYSCALL(mq_notify); +COND_SYSCALL_COMPAT(mq_notify); +COND_SYSCALL(mq_getsetattr); +COND_SYSCALL_COMPAT(mq_getsetattr); + +/* ipc/msg.c */ +COND_SYSCALL(msgget); +COND_SYSCALL(msgctl); +COND_SYSCALL_COMPAT(msgctl); +COND_SYSCALL(msgrcv); +COND_SYSCALL_COMPAT(msgrcv); +COND_SYSCALL(msgsnd); +COND_SYSCALL_COMPAT(msgsnd); + +/* ipc/sem.c */ +COND_SYSCALL(semget); +COND_SYSCALL(semctl); +COND_SYSCALL_COMPAT(semctl); +COND_SYSCALL(semtimedop); +COND_SYSCALL_COMPAT(semtimedop); +COND_SYSCALL(semop); + +/* ipc/shm.c */ +COND_SYSCALL(shmget); +COND_SYSCALL(shmctl); +COND_SYSCALL_COMPAT(shmctl); +COND_SYSCALL(shmat); +COND_SYSCALL_COMPAT(shmat); +COND_SYSCALL(shmdt); + +/* net/socket.c */ +COND_SYSCALL(socket); +COND_SYSCALL(socketpair); +COND_SYSCALL(bind); +COND_SYSCALL(listen); +COND_SYSCALL(accept); +COND_SYSCALL(connect); +COND_SYSCALL(getsockname); +COND_SYSCALL(getpeername); +COND_SYSCALL(setsockopt); +COND_SYSCALL_COMPAT(setsockopt); +COND_SYSCALL(getsockopt); +COND_SYSCALL_COMPAT(getsockopt); +COND_SYSCALL(sendto); +COND_SYSCALL(shutdown); +COND_SYSCALL(recvfrom); +COND_SYSCALL_COMPAT(recvfrom); +COND_SYSCALL(sendmsg); 
+COND_SYSCALL_COMPAT(sendmsg); +COND_SYSCALL(recvmsg); +COND_SYSCALL_COMPAT(recvmsg); + +/* mm/filemap.c */ + +/* mm/nommu.c, also with MMU */ +COND_SYSCALL(mremap); + +/* security/keys/keyctl.c */ +COND_SYSCALL(add_key); +COND_SYSCALL(request_key); +COND_SYSCALL(keyctl); +COND_SYSCALL_COMPAT(keyctl); + +/* arch/example/kernel/sys_example.c */ + +/* mm/fadvise.c */ +COND_SYSCALL(fadvise64_64); + +/* mm/, CONFIG_MMU only */ +COND_SYSCALL(swapon); +COND_SYSCALL(swapoff); +COND_SYSCALL(mprotect); +COND_SYSCALL(msync); +COND_SYSCALL(mlock); +COND_SYSCALL(munlock); +COND_SYSCALL(mlockall); +COND_SYSCALL(munlockall); +COND_SYSCALL(mincore); +COND_SYSCALL(madvise); +COND_SYSCALL(remap_file_pages); +COND_SYSCALL(mbind); +COND_SYSCALL_COMPAT(mbind); +COND_SYSCALL(get_mempolicy); +COND_SYSCALL_COMPAT(get_mempolicy); +COND_SYSCALL(set_mempolicy); +COND_SYSCALL_COMPAT(set_mempolicy); +COND_SYSCALL(migrate_pages); +COND_SYSCALL_COMPAT(migrate_pages); +COND_SYSCALL(move_pages); +COND_SYSCALL_COMPAT(move_pages); + +COND_SYSCALL(perf_event_open); +COND_SYSCALL(accept4); +COND_SYSCALL(recvmmsg); +COND_SYSCALL_COMPAT(recvmmsg); + +/* + * Architecture specific syscalls: see further below + */ + +/* fanotify */ +COND_SYSCALL(fanotify_init); +COND_SYSCALL(fanotify_mark); /* open by handle */ -cond_syscall(sys_name_to_handle_at); -cond_syscall(sys_open_by_handle_at); -cond_syscall(compat_sys_open_by_handle_at); +COND_SYSCALL(name_to_handle_at); +COND_SYSCALL(open_by_handle_at); +COND_SYSCALL_COMPAT(open_by_handle_at); + +COND_SYSCALL(sendmmsg); +COND_SYSCALL_COMPAT(sendmmsg); +COND_SYSCALL(process_vm_readv); +COND_SYSCALL_COMPAT(process_vm_readv); +COND_SYSCALL(process_vm_writev); +COND_SYSCALL_COMPAT(process_vm_writev); /* compare kernel pointers */ -cond_syscall(sys_kcmp); +COND_SYSCALL(kcmp); + +COND_SYSCALL(finit_module); /* operate on Secure Computing state */ -cond_syscall(sys_seccomp); +COND_SYSCALL(seccomp); + +COND_SYSCALL(memfd_create); /* access BPF programs and maps */ 
-cond_syscall(sys_bpf); +COND_SYSCALL(bpf); /* execveat */ -cond_syscall(sys_execveat); +COND_SYSCALL(execveat); + +COND_SYSCALL(userfaultfd); /* membarrier */ -cond_syscall(sys_membarrier); +COND_SYSCALL(membarrier); + +COND_SYSCALL(mlock2); + +COND_SYSCALL(copy_file_range); /* memory protection keys */ -cond_syscall(sys_pkey_mprotect); -cond_syscall(sys_pkey_alloc); -cond_syscall(sys_pkey_free); +COND_SYSCALL(pkey_mprotect); +COND_SYSCALL(pkey_alloc); +COND_SYSCALL(pkey_free); + + +/* + * Architecture specific weak syscall entries. + */ + +/* pciconfig: alpha, arm, arm64, ia64, sparc */ +COND_SYSCALL(pciconfig_read); +COND_SYSCALL(pciconfig_write); +COND_SYSCALL(pciconfig_iobase); + +/* sys_socketcall: arm, mips, x86, ... */ +COND_SYSCALL(socketcall); +COND_SYSCALL_COMPAT(socketcall); + +/* compat syscalls for arm64, x86, ... */ +COND_SYSCALL_COMPAT(sysctl); +COND_SYSCALL_COMPAT(fanotify_mark); + +/* x86 */ +COND_SYSCALL(vm86old); +COND_SYSCALL(modify_ldt); +COND_SYSCALL_COMPAT(quotactl32); +COND_SYSCALL(vm86); +COND_SYSCALL(kexec_file_load); + +/* s390 */ +COND_SYSCALL(s390_pci_mmio_read); +COND_SYSCALL(s390_pci_mmio_write); +COND_SYSCALL_COMPAT(s390_ipc); + +/* powerpc */ +cond_syscall(ppc_rtas); +COND_SYSCALL(spu_run); +COND_SYSCALL(spu_create); +COND_SYSCALL(subpage_prot); + + +/* + * Deprecated system calls which are still defined in + * include/uapi/asm-generic/unistd.h and wanted by >= 1 arch + */ + +/* __ARCH_WANT_SYSCALL_NO_FLAGS */ +COND_SYSCALL(epoll_create); +COND_SYSCALL(inotify_init); +COND_SYSCALL(eventfd); +COND_SYSCALL(signalfd); +COND_SYSCALL_COMPAT(signalfd); + +/* __ARCH_WANT_SYSCALL_OFF_T */ +COND_SYSCALL(fadvise64); + +/* __ARCH_WANT_SYSCALL_DEPRECATED */ +COND_SYSCALL(epoll_wait); +COND_SYSCALL(recv); +COND_SYSCALL_COMPAT(recv); +COND_SYSCALL(send); +COND_SYSCALL(bdflush); +COND_SYSCALL(uselib); + + +/* + * The syscalls below are not found in include/uapi/asm-generic/unistd.h + */ + +/* obsolete: SGETMASK_SYSCALL */ +COND_SYSCALL(sgetmask); 
+COND_SYSCALL(ssetmask); + +/* obsolete: SYSFS_SYSCALL */ +COND_SYSCALL(sysfs); + +/* obsolete: __ARCH_WANT_SYS_IPC */ +COND_SYSCALL(ipc); +COND_SYSCALL_COMPAT(ipc); + +/* obsolete: UID16 */ +COND_SYSCALL(chown16); +COND_SYSCALL(fchown16); +COND_SYSCALL(getegid16); +COND_SYSCALL(geteuid16); +COND_SYSCALL(getgid16); +COND_SYSCALL(getgroups16); +COND_SYSCALL(getresgid16); +COND_SYSCALL(getresuid16); +COND_SYSCALL(getuid16); +COND_SYSCALL(lchown16); +COND_SYSCALL(setfsgid16); +COND_SYSCALL(setfsuid16); +COND_SYSCALL(setgid16); +COND_SYSCALL(setgroups16); +COND_SYSCALL(setregid16); +COND_SYSCALL(setresgid16); +COND_SYSCALL(setresuid16); +COND_SYSCALL(setreuid16); +COND_SYSCALL(setuid16); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 557d46728577..6a78cf70761d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -218,6 +218,8 @@ static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, static int proc_dostring_coredump(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif +static int proc_dopipe_max_size(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); #ifdef CONFIG_MAGIC_SYSRQ /* Note: sysrq code uses it's own private copy */ @@ -251,6 +253,10 @@ extern struct ctl_table random_table[]; extern struct ctl_table epoll_table[]; #endif +#ifdef CONFIG_FW_LOADER_USER_HELPER +extern struct ctl_table firmware_config_table[]; +#endif + #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT int sysctl_legacy_va_layout; #endif @@ -746,6 +752,13 @@ static struct ctl_table kern_table[] = { .mode = 0555, .child = usermodehelper_table, }, +#ifdef CONFIG_FW_LOADER_USER_HELPER + { + .procname = "firmware_config", + .mode = 0555, + .child = firmware_config_table, + }, +#endif { .procname = "overflowuid", .data = &overflowuid, @@ -1327,7 +1340,7 @@ static struct ctl_table vm_table[] = { { .procname = "dirtytime_expire_seconds", .data = &dirtytime_expire_interval, - .maxlen = sizeof(dirty_expire_interval), 
+ .maxlen = sizeof(dirtytime_expire_interval), .mode = 0644, .proc_handler = dirtytime_interval_handler, .extra1 = &zero, @@ -1374,13 +1387,6 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "hugepages_treat_as_movable", - .data = &hugepages_treat_as_movable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "nr_overcommit_hugepages", .data = NULL, @@ -1819,8 +1825,7 @@ static struct ctl_table fs_table[] = { .data = &pipe_max_size, .maxlen = sizeof(pipe_max_size), .mode = 0644, - .proc_handler = &pipe_proc_fn, - .extra1 = &pipe_min_size, + .proc_handler = proc_dopipe_max_size, }, { .procname = "pipe-user-pages-hard", @@ -2506,6 +2511,15 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, } #endif +/** + * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure + * @min: pointer to minimum allowable value + * @max: pointer to maximum allowable value + * + * The do_proc_dointvec_minmax_conv_param structure provides the + * minimum and maximum values for doing range checking for those sysctl + * parameters that use the proc_dointvec_minmax() handler. + */ struct do_proc_dointvec_minmax_conv_param { int *min; int *max; @@ -2549,7 +2563,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, * This routine will ensure the values are within the range specified by * table->extra1 (min) and table->extra2 (max). * - * Returns 0 on success. + * Returns 0 on success or -EINVAL on write when the range check fails. 
*/ int proc_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -2562,6 +2576,15 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, do_proc_dointvec_minmax_conv, &param); } +/** + * struct do_proc_douintvec_minmax_conv_param - proc_douintvec_minmax() range checking structure + * @min: pointer to minimum allowable value + * @max: pointer to maximum allowable value + * + * The do_proc_douintvec_minmax_conv_param structure provides the + * minimum and maximum values for doing range checking for those sysctl + * parameters that use the proc_douintvec_minmax() handler. + */ struct do_proc_douintvec_minmax_conv_param { unsigned int *min; unsigned int *max; @@ -2609,7 +2632,7 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, * check for UINT_MAX to avoid having to support wrap around uses from * userspace. * - * Returns 0 on success. + * Returns 0 on success or -ERANGE on write when the range check fails. */ int proc_douintvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -2622,29 +2645,17 @@ int proc_douintvec_minmax(struct ctl_table *table, int write, do_proc_douintvec_minmax_conv, &param); } -struct do_proc_dopipe_max_size_conv_param { - unsigned int *min; -}; - static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, unsigned int *valp, int write, void *data) { - struct do_proc_dopipe_max_size_conv_param *param = data; - if (write) { unsigned int val; - if (*lvalp > UINT_MAX) - return -EINVAL; - val = round_pipe_size(*lvalp); if (val == 0) return -EINVAL; - if (param->min && *param->min > val) - return -ERANGE; - *valp = val; } else { unsigned int val = *valp; @@ -2654,14 +2665,11 @@ static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, return 0; } -int proc_dopipe_max_size(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int proc_dopipe_max_size(struct ctl_table *table, int write, + void 
__user *buffer, size_t *lenp, loff_t *ppos) { - struct do_proc_dopipe_max_size_conv_param param = { - .min = (unsigned int *) table->extra1, - }; return do_proc_douintvec(table, write, buffer, lenp, ppos, - do_proc_dopipe_max_size_conv, &param); + do_proc_dopipe_max_size_conv, NULL); } static void validate_coredump_safety(void) @@ -3167,12 +3175,6 @@ int proc_douintvec_minmax(struct ctl_table *table, int write, return -ENOSYS; } -int proc_dopipe_max_size(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - int proc_dointvec_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -3216,7 +3218,6 @@ EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_jiffies); EXPORT_SYMBOL(proc_dointvec_minmax); EXPORT_SYMBOL_GPL(proc_douintvec_minmax); -EXPORT_SYMBOL_GPL(proc_dopipe_max_size); EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 4559e914452b..4e62a4a8fa91 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -194,11 +194,7 @@ static int fill_stats_for_pid(pid_t pid, struct taskstats *stats) { struct task_struct *tsk; - rcu_read_lock(); - tsk = find_task_by_vpid(pid); - if (tsk) - get_task_struct(tsk); - rcu_read_unlock(); + tsk = find_get_task_by_vpid(pid); if (!tsk) return -ESRCH; fill_stats(current_user_ns(), task_active_pid_ns(current), tsk, stats); diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index f6b5f19223d6..78eabc41eaa6 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -113,16 +113,6 @@ config NO_HZ_FULL endchoice -config NO_HZ_FULL_ALL - bool "Full dynticks system on all CPUs by default (except CPU 0)" - depends on NO_HZ_FULL - help - If the user doesn't pass the nohz_full boot option to - define the range of full dynticks CPUs, consider that all - CPUs in the system are full dynticks by default. 
- Note the boot CPU will still be kept outside the range to - handle the timekeeping duty. - config NO_HZ bool "Old Idle dynticks config" depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index ec09ce9a6012..639321bf2e39 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -326,6 +326,17 @@ static int alarmtimer_resume(struct device *dev) } #endif +static void +__alarm_init(struct alarm *alarm, enum alarmtimer_type type, + enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) +{ + timerqueue_init(&alarm->node); + alarm->timer.function = alarmtimer_fired; + alarm->function = function; + alarm->type = type; + alarm->state = ALARMTIMER_STATE_INACTIVE; +} + /** * alarm_init - Initialize an alarm structure * @alarm: ptr to alarm to be initialized @@ -335,13 +346,9 @@ static int alarmtimer_resume(struct device *dev) void alarm_init(struct alarm *alarm, enum alarmtimer_type type, enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) { - timerqueue_init(&alarm->node); hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid, - HRTIMER_MODE_ABS); - alarm->timer.function = alarmtimer_fired; - alarm->function = function; - alarm->type = type; - alarm->state = ALARMTIMER_STATE_INACTIVE; + HRTIMER_MODE_ABS); + __alarm_init(alarm, type, function); } EXPORT_SYMBOL_GPL(alarm_init); @@ -719,6 +726,8 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, __set_current_state(TASK_RUNNING); + destroy_hrtimer_on_stack(&alarm->timer); + if (!alarm->data) return 0; @@ -740,6 +749,15 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, return -ERESTART_RESTARTBLOCK; } +static void +alarm_init_on_stack(struct alarm *alarm, enum alarmtimer_type type, + enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) +{ + hrtimer_init_on_stack(&alarm->timer, alarm_bases[type].base_clockid, + HRTIMER_MODE_ABS); + __alarm_init(alarm, type, function); 
+} + /** * alarm_timer_nsleep_restart - restartblock alarmtimer nsleep * @restart: ptr to restart block @@ -752,7 +770,7 @@ static long __sched alarm_timer_nsleep_restart(struct restart_block *restart) ktime_t exp = restart->nanosleep.expires; struct alarm alarm; - alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); + alarm_init_on_stack(&alarm, type, alarmtimer_nsleep_wakeup); return alarmtimer_do_nsleep(&alarm, exp, type); } @@ -784,7 +802,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, if (!capable(CAP_WAKE_ALARM)) return -EPERM; - alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); + alarm_init_on_stack(&alarm, type, alarmtimer_nsleep_wakeup); exp = timespec64_to_ktime(*tsreq); /* Convert (if necessary) to absolute time */ diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 65f9e3f24dde..0e974cface0b 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -594,6 +594,9 @@ static void __clocksource_select(bool skipcur) if (!best) return; + if (!strlen(override_name)) + goto found; + /* Check for the override clocksource. */ list_for_each_entry(cs, &clocksource_list, list) { if (skipcur && cs == curr_clocksource) @@ -625,6 +628,7 @@ static void __clocksource_select(bool skipcur) break; } +found: if (curr_clocksource != best && !timekeeping_notify(best)) { pr_info("Switched to clocksource %s\n", best->name); curr_clocksource = best; @@ -853,16 +857,16 @@ EXPORT_SYMBOL(clocksource_unregister); #ifdef CONFIG_SYSFS /** - * sysfs_show_current_clocksources - sysfs interface for current clocksource + * current_clocksource_show - sysfs interface for current clocksource * @dev: unused * @attr: unused * @buf: char buffer to be filled with clocksource list * * Provides sysfs interface for listing current clocksource. 
*/ -static ssize_t -sysfs_show_current_clocksources(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t current_clocksource_show(struct device *dev, + struct device_attribute *attr, + char *buf) { ssize_t count = 0; @@ -891,7 +895,7 @@ ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) } /** - * sysfs_override_clocksource - interface for manually overriding clocksource + * current_clocksource_store - interface for manually overriding clocksource * @dev: unused * @attr: unused * @buf: name of override clocksource @@ -900,9 +904,9 @@ ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) * Takes input from sysfs interface for manually overriding the default * clocksource selection. */ -static ssize_t sysfs_override_clocksource(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t current_clocksource_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) { ssize_t ret; @@ -916,9 +920,10 @@ static ssize_t sysfs_override_clocksource(struct device *dev, return ret; } +static DEVICE_ATTR_RW(current_clocksource); /** - * sysfs_unbind_current_clocksource - interface for manually unbinding clocksource + * unbind_clocksource_store - interface for manually unbinding clocksource * @dev: unused * @attr: unused * @buf: unused @@ -926,7 +931,7 @@ static ssize_t sysfs_override_clocksource(struct device *dev, * * Takes input from sysfs interface for manually unbinding a clocksource. */ -static ssize_t sysfs_unbind_clocksource(struct device *dev, +static ssize_t unbind_clocksource_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { @@ -950,19 +955,19 @@ static ssize_t sysfs_unbind_clocksource(struct device *dev, return ret ? 
ret : count; } +static DEVICE_ATTR_WO(unbind_clocksource); /** - * sysfs_show_available_clocksources - sysfs interface for listing clocksource + * available_clocksource_show - sysfs interface for listing clocksource * @dev: unused * @attr: unused * @buf: char buffer to be filled with clocksource list * * Provides sysfs interface for listing registered clocksources */ -static ssize_t -sysfs_show_available_clocksources(struct device *dev, - struct device_attribute *attr, - char *buf) +static ssize_t available_clocksource_show(struct device *dev, + struct device_attribute *attr, + char *buf) { struct clocksource *src; ssize_t count = 0; @@ -986,17 +991,15 @@ sysfs_show_available_clocksources(struct device *dev, return count; } +static DEVICE_ATTR_RO(available_clocksource); -/* - * Sysfs setup bits: - */ -static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, - sysfs_override_clocksource); - -static DEVICE_ATTR(unbind_clocksource, 0200, NULL, sysfs_unbind_clocksource); - -static DEVICE_ATTR(available_clocksource, 0444, - sysfs_show_available_clocksources, NULL); +static struct attribute *clocksource_attrs[] = { + &dev_attr_current_clocksource.attr, + &dev_attr_unbind_clocksource.attr, + &dev_attr_available_clocksource.attr, + NULL +}; +ATTRIBUTE_GROUPS(clocksource); static struct bus_type clocksource_subsys = { .name = "clocksource", @@ -1006,6 +1009,7 @@ static struct bus_type clocksource_subsys = { static struct device device_clocksource = { .id = 0, .bus = &clocksource_subsys, + .groups = clocksource_groups, }; static int __init init_clocksource_sysfs(void) @@ -1014,17 +1018,7 @@ static int __init init_clocksource_sysfs(void) if (!error) error = device_register(&device_clocksource); - if (!error) - error = device_create_file( - &device_clocksource, - &dev_attr_current_clocksource); - if (!error) - error = device_create_file(&device_clocksource, - &dev_attr_unbind_clocksource); - if (!error) - error = device_create_file( - &device_clocksource, 
- &dev_attr_available_clocksource); + return error; } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index ae0c8a411fe7..eda1210ce50f 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -37,7 +37,6 @@ #include <linux/hrtimer.h> #include <linux/notifier.h> #include <linux/syscalls.h> -#include <linux/kallsyms.h> #include <linux/interrupt.h> #include <linux/tick.h> #include <linux/seq_file.h> @@ -92,11 +91,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .get_time = &ktime_get_real, }, { - .index = HRTIMER_BASE_BOOTTIME, - .clockid = CLOCK_BOOTTIME, - .get_time = &ktime_get_boottime, - }, - { .index = HRTIMER_BASE_TAI, .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, @@ -112,11 +106,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .get_time = &ktime_get_real, }, { - .index = HRTIMER_BASE_BOOTTIME_SOFT, - .clockid = CLOCK_BOOTTIME, - .get_time = &ktime_get_boottime, - }, - { .index = HRTIMER_BASE_TAI_SOFT, .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, @@ -130,7 +119,7 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, - [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, + [CLOCK_BOOTTIME] = HRTIMER_BASE_MONOTONIC, [CLOCK_TAI] = HRTIMER_BASE_TAI, }; @@ -491,6 +480,7 @@ __next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active) while ((base = __next_base((cpu_base), &(active)))) static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, + const struct hrtimer *exclude, unsigned int active, ktime_t expires_next) { @@ -503,9 +493,22 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, next = timerqueue_getnext(&base->active); timer = container_of(next, struct hrtimer, node); + if (timer == exclude) { + /* Get to the next timer in the queue. 
*/ + next = timerqueue_iterate_next(next); + if (!next) + continue; + + timer = container_of(next, struct hrtimer, node); + } expires = ktime_sub(hrtimer_get_expires(timer), base->offset); if (expires < expires_next) { expires_next = expires; + + /* Skip cpu_base update if a timer is being excluded. */ + if (exclude) + continue; + if (timer->is_soft) cpu_base->softirq_next_timer = timer; else @@ -549,7 +552,8 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_ if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) { active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; cpu_base->softirq_next_timer = NULL; - expires_next = __hrtimer_next_event_base(cpu_base, active, KTIME_MAX); + expires_next = __hrtimer_next_event_base(cpu_base, NULL, + active, KTIME_MAX); next_timer = cpu_base->softirq_next_timer; } @@ -557,7 +561,8 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_ if (active_mask & HRTIMER_ACTIVE_HARD) { active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; cpu_base->next_timer = next_timer; - expires_next = __hrtimer_next_event_base(cpu_base, active, expires_next); + expires_next = __hrtimer_next_event_base(cpu_base, NULL, active, + expires_next); } return expires_next; @@ -566,14 +571,12 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; - ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, - offs_real, offs_boot, offs_tai); + offs_real, offs_tai); base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real; - base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot; base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai; return now; @@ 
-1203,6 +1206,39 @@ u64 hrtimer_get_next_event(void) return expires; } + +/** + * hrtimer_next_event_without - time until next expiry event w/o one timer + * @exclude: timer to exclude + * + * Returns the next expiry time over all timers except for the @exclude one or + * KTIME_MAX if none of them is pending. + */ +u64 hrtimer_next_event_without(const struct hrtimer *exclude) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + u64 expires = KTIME_MAX; + unsigned long flags; + + raw_spin_lock_irqsave(&cpu_base->lock, flags); + + if (__hrtimer_hres_active(cpu_base)) { + unsigned int active; + + if (!cpu_base->softirq_activated) { + active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; + expires = __hrtimer_next_event_base(cpu_base, exclude, + active, KTIME_MAX); + } + active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; + expires = __hrtimer_next_event_base(cpu_base, exclude, active, + expires); + } + + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + + return expires; +} #endif static inline int hrtimer_clockid_to_base(clockid_t clock_id) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 8d70da1b9a0d..a09ded765f6c 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -31,7 +31,7 @@ /* USER_HZ period (usecs): */ -unsigned long tick_usec = TICK_USEC; +unsigned long tick_usec = USER_TICK_USEC; /* SHIFTED_HZ period (nsecs): */ unsigned long tick_nsec; diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 94ad46d50b56..fe56c4e06c51 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -74,7 +74,7 @@ static __poll_t posix_clock_poll(struct file *fp, poll_table *wait) __poll_t result = 0; if (!clk) - return POLLERR; + return EPOLLERR; if (clk->ops.poll) result = clk->ops.poll(clk, fp, wait); diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index b258bee13b02..e0dbae98db9d 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -19,6 +19,11 @@ #include 
<linux/posix-timers.h> #include <linux/compat.h> +#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER +/* Architectures may override SYS_NI and COMPAT_SYS_NI */ +#include <asm/syscall_wrapper.h> +#endif + asmlinkage long sys_ni_posix_timers(void) { pr_err_once("process %d (%s) attempted a POSIX timer syscall " @@ -27,8 +32,13 @@ asmlinkage long sys_ni_posix_timers(void) return -ENOSYS; } +#ifndef SYS_NI #define SYS_NI(name) SYSCALL_ALIAS(sys_##name, sys_ni_posix_timers) +#endif + +#ifndef COMPAT_SYS_NI #define COMPAT_SYS_NI(name) SYSCALL_ALIAS(compat_sys_##name, sys_ni_posix_timers) +#endif SYS_NI(timer_create); SYS_NI(timer_gettime); @@ -73,6 +83,8 @@ int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp) case CLOCK_BOOTTIME: get_monotonic_boottime64(tp); break; + case CLOCK_MONOTONIC_ACTIVE: + ktime_get_active_ts64(tp); default: return -EINVAL; } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 75043046914e..b6899b5060bd 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -50,6 +50,7 @@ #include <linux/export.h> #include <linux/hashtable.h> #include <linux/compat.h> +#include <linux/nospec.h> #include "timekeeping.h" #include "posix-timers.h" @@ -251,15 +252,16 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 * return 0; } -static int posix_get_boottime(const clockid_t which_clock, struct timespec64 *tp) +static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp) { - get_monotonic_boottime64(tp); + timekeeping_clocktai64(tp); return 0; } -static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp) +static int posix_get_monotonic_active(clockid_t which_clock, + struct timespec64 *tp) { - timekeeping_clocktai64(tp); + ktime_get_active_ts64(tp); return 0; } @@ -1315,19 +1317,9 @@ static const struct k_clock clock_tai = { .timer_arm = common_hrtimer_arm, }; -static const struct k_clock clock_boottime = { +static const struct k_clock clock_monotonic_active = { 
.clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_boottime, - .nsleep = common_nsleep, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - .timer_rearm = common_hrtimer_rearm, - .timer_forward = common_hrtimer_forward, - .timer_remaining = common_hrtimer_remaining, - .timer_try_to_cancel = common_hrtimer_try_to_cancel, - .timer_arm = common_hrtimer_arm, + .clock_get = posix_get_monotonic_active, }; static const struct k_clock * const posix_clocks[] = { @@ -1338,19 +1330,24 @@ static const struct k_clock * const posix_clocks[] = { [CLOCK_MONOTONIC_RAW] = &clock_monotonic_raw, [CLOCK_REALTIME_COARSE] = &clock_realtime_coarse, [CLOCK_MONOTONIC_COARSE] = &clock_monotonic_coarse, - [CLOCK_BOOTTIME] = &clock_boottime, + [CLOCK_BOOTTIME] = &clock_monotonic, [CLOCK_REALTIME_ALARM] = &alarm_clock, [CLOCK_BOOTTIME_ALARM] = &alarm_clock, [CLOCK_TAI] = &clock_tai, + [CLOCK_MONOTONIC_ACTIVE] = &clock_monotonic_active, }; static const struct k_clock *clockid_to_kclock(const clockid_t id) { - if (id < 0) + clockid_t idx = id; + + if (id < 0) { return (id & CLOCKFD_MASK) == CLOCKFD ? 
&clock_posix_dynamic : &clock_posix_cpu; + } - if (id >= ARRAY_SIZE(posix_clocks) || !posix_clocks[id]) + if (id >= ARRAY_SIZE(posix_clocks)) return NULL; - return posix_clocks[id]; + + return posix_clocks[array_index_nospec(idx, ARRAY_SIZE(posix_clocks))]; } diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 49edc1c4f3e6..099572ca4a8f 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -419,6 +419,19 @@ void tick_suspend_local(void) clockevents_shutdown(td->evtdev); } +static void tick_forward_next_period(void) +{ + ktime_t delta, now = ktime_get(); + u64 n; + + delta = ktime_sub(now, tick_next_period); + n = ktime_divns(delta, tick_period); + tick_next_period += n * tick_period; + if (tick_next_period < now) + tick_next_period += tick_period; + tick_sched_forward_next_period(); +} + /** * tick_resume_local - Resume the local tick device * @@ -431,6 +444,8 @@ void tick_resume_local(void) struct tick_device *td = this_cpu_ptr(&tick_cpu_device); bool broadcast = tick_resume_check_broadcast(); + tick_forward_next_period(); + clockevents_tick_resume(td->evtdev); if (!broadcast) { if (td->mode == TICKDEV_MODE_PERIODIC) diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index e277284c2831..21efab7485ca 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -141,6 +141,12 @@ static inline void tick_check_oneshot_broadcast_this_cpu(void) { } static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); } #endif /* !(BROADCAST && ONESHOT) */ +#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) +extern void tick_sched_forward_next_period(void); +#else +static inline void tick_sched_forward_next_period(void) { } +#endif + /* NO_HZ_FULL internal */ #ifdef CONFIG_NO_HZ_FULL extern void tick_nohz_init(void); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 29a5733eff83..646645e981f9 100644 --- a/kernel/time/tick-sched.c +++ 
b/kernel/time/tick-sched.c @@ -52,6 +52,15 @@ struct tick_sched *tick_get_tick_sched(int cpu) static ktime_t last_jiffies_update; /* + * Called after resume. Make sure that jiffies are not fast forwarded due to + * clock monotonic being forwarded by the suspended time. + */ +void tick_sched_forward_next_period(void) +{ + last_jiffies_update = tick_next_period; +} + +/* * Must be called with interrupts disabled ! */ static void tick_do_update_jiffies64(ktime_t now) @@ -113,8 +122,7 @@ static ktime_t tick_init_jiffy_update(void) return period; } - -static void tick_sched_do_timer(ktime_t now) +static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) { int cpu = smp_processor_id(); @@ -134,6 +142,9 @@ static void tick_sched_do_timer(ktime_t now) /* Check, if the jiffies need an update */ if (tick_do_timer_cpu == cpu) tick_do_update_jiffies64(now); + + if (ts->inidle) + ts->got_idle_tick = 1; } static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) @@ -405,30 +416,12 @@ static int tick_nohz_cpu_down(unsigned int cpu) return 0; } -static int tick_nohz_init_all(void) -{ - int err = -1; - -#ifdef CONFIG_NO_HZ_FULL_ALL - if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) { - WARN(1, "NO_HZ: Can't allocate full dynticks cpumask\n"); - return err; - } - err = 0; - cpumask_setall(tick_nohz_full_mask); - tick_nohz_full_running = true; -#endif - return err; -} - void __init tick_nohz_init(void) { int cpu, ret; - if (!tick_nohz_full_running) { - if (tick_nohz_init_all() < 0) - return; - } + if (!tick_nohz_full_running) + return; /* * Full dynticks uses irq work to drive the tick rescheduling on safe @@ -481,9 +474,18 @@ static int __init setup_tick_nohz(char *str) __setup("nohz=", setup_tick_nohz); -int tick_nohz_tick_stopped(void) +bool tick_nohz_tick_stopped(void) { - return __this_cpu_read(tick_cpu_sched.tick_stopped); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + return ts->tick_stopped; +} + +bool 
tick_nohz_tick_stopped_cpu(int cpu) +{ + struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); + + return ts->tick_stopped; } /** @@ -539,14 +541,11 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) sched_clock_idle_wakeup_event(); } -static ktime_t tick_nohz_start_idle(struct tick_sched *ts) +static void tick_nohz_start_idle(struct tick_sched *ts) { - ktime_t now = ktime_get(); - - ts->idle_entrytime = now; + ts->idle_entrytime = ktime_get(); ts->idle_active = 1; sched_clock_idle_sleep_event(); - return now; } /** @@ -655,13 +654,10 @@ static inline bool local_timer_softirq_pending(void) return local_softirq_pending() & TIMER_SOFTIRQ; } -static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, - ktime_t now, int cpu) +static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) { - struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); u64 basemono, next_tick, next_tmr, next_rcu, delta, expires; unsigned long seq, basejiff; - ktime_t tick; /* Read jiffies and the time when jiffies were updated last */ do { @@ -670,6 +666,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, basejiff = jiffies; } while (read_seqretry(&jiffies_lock, seq)); ts->last_jiffies = basejiff; + ts->timer_expires_base = basemono; /* * Keep the periodic tick, when RCU, architecture or irq_work @@ -714,53 +711,63 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, * next period, so no point in stopping it either, bail. */ if (!ts->tick_stopped) { - tick = 0; + ts->timer_expires = 0; goto out; } } /* + * If this CPU is the one which had the do_timer() duty last, we limit + * the sleep time to the timekeeping max_deferment value. + * Otherwise we can sleep as long as we want. 
+ */ + delta = timekeeping_max_deferment(); + if (cpu != tick_do_timer_cpu && + (tick_do_timer_cpu != TICK_DO_TIMER_NONE || !ts->do_timer_last)) + delta = KTIME_MAX; + + /* Calculate the next expiry time */ + if (delta < (KTIME_MAX - basemono)) + expires = basemono + delta; + else + expires = KTIME_MAX; + + ts->timer_expires = min_t(u64, expires, next_tick); + +out: + return ts->timer_expires; +} + +static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) +{ + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); + u64 basemono = ts->timer_expires_base; + u64 expires = ts->timer_expires; + ktime_t tick = expires; + + /* Make sure we won't be trying to stop it twice in a row. */ + ts->timer_expires_base = 0; + + /* * If this CPU is the one which updates jiffies, then give up * the assignment and let it be taken by the CPU which runs * the tick timer next, which might be this CPU as well. If we * don't drop this here the jiffies might be stale and * do_timer() never invoked. Keep track of the fact that it - * was the one which had the do_timer() duty last. If this CPU - * is the one which had the do_timer() duty last, we limit the - * sleep time to the timekeeping max_deferment value. - * Otherwise we can sleep as long as we want. + * was the one which had the do_timer() duty last. 
*/ - delta = timekeeping_max_deferment(); if (cpu == tick_do_timer_cpu) { tick_do_timer_cpu = TICK_DO_TIMER_NONE; ts->do_timer_last = 1; } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { - delta = KTIME_MAX; ts->do_timer_last = 0; - } else if (!ts->do_timer_last) { - delta = KTIME_MAX; } -#ifdef CONFIG_NO_HZ_FULL - /* Limit the tick delta to the maximum scheduler deferment */ - if (!ts->inidle) - delta = min(delta, scheduler_tick_max_deferment()); -#endif - - /* Calculate the next expiry time */ - if (delta < (KTIME_MAX - basemono)) - expires = basemono + delta; - else - expires = KTIME_MAX; - - expires = min_t(u64, expires, next_tick); - tick = expires; - /* Skip reprogram of event if its not changed */ if (ts->tick_stopped && (expires == ts->next_tick)) { /* Sanity check: make sure clockevent is actually programmed */ if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) - goto out; + return; WARN_ON_ONCE(1); printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n", @@ -794,7 +801,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, if (unlikely(expires == KTIME_MAX)) { if (ts->nohz_mode == NOHZ_MODE_HIGHRES) hrtimer_cancel(&ts->sched_timer); - goto out; + return; } hrtimer_set_expires(&ts->sched_timer, tick); @@ -803,15 +810,23 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); else tick_program_event(tick, 1); -out: - /* - * Update the estimated sleep length until the next timer - * (not only the tick). 
- */ - ts->sleep_length = ktime_sub(dev->next_event, now); - return tick; } +static void tick_nohz_retain_tick(struct tick_sched *ts) +{ + ts->timer_expires_base = 0; +} + +#ifdef CONFIG_NO_HZ_FULL +static void tick_nohz_stop_sched_tick(struct tick_sched *ts, int cpu) +{ + if (tick_nohz_next_event(ts, cpu)) + tick_nohz_stop_tick(ts, cpu); + else + tick_nohz_retain_tick(ts); +} +#endif /* CONFIG_NO_HZ_FULL */ + static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { /* Update jiffies first */ @@ -847,7 +862,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) return; if (can_stop_full_tick(cpu, ts)) - tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); + tick_nohz_stop_sched_tick(ts, cpu); else if (ts->tick_stopped) tick_nohz_restart_sched_tick(ts, ktime_get()); #endif @@ -873,10 +888,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return false; } - if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) { - ts->sleep_length = NSEC_PER_SEC / HZ; + if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) return false; - } if (need_resched()) return false; @@ -911,61 +924,80 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return true; } -static void __tick_nohz_idle_enter(struct tick_sched *ts) +static void __tick_nohz_idle_stop_tick(struct tick_sched *ts) { - ktime_t now, expires; + ktime_t expires; int cpu = smp_processor_id(); - now = tick_nohz_start_idle(ts); + /* + * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the + * tick timer expiration time is known already. 
+ */ + if (ts->timer_expires_base) + expires = ts->timer_expires; + else if (can_stop_idle_tick(cpu, ts)) + expires = tick_nohz_next_event(ts, cpu); + else + return; - if (can_stop_idle_tick(cpu, ts)) { + ts->idle_calls++; + + if (expires > 0LL) { int was_stopped = ts->tick_stopped; - ts->idle_calls++; + tick_nohz_stop_tick(ts, cpu); - expires = tick_nohz_stop_sched_tick(ts, now, cpu); - if (expires > 0LL) { - ts->idle_sleeps++; - ts->idle_expires = expires; - } + ts->idle_sleeps++; + ts->idle_expires = expires; if (!was_stopped && ts->tick_stopped) { ts->idle_jiffies = ts->last_jiffies; nohz_balance_enter_idle(cpu); } + } else { + tick_nohz_retain_tick(ts); } } /** - * tick_nohz_idle_enter - stop the idle tick from the idle task + * tick_nohz_idle_stop_tick - stop the idle tick from the idle task * * When the next event is more than a tick into the future, stop the idle tick - * Called when we start the idle loop. - * - * The arch is responsible of calling: + */ +void tick_nohz_idle_stop_tick(void) +{ + __tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched)); +} + +void tick_nohz_idle_retain_tick(void) +{ + tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched)); + /* + * Undo the effect of get_next_timer_interrupt() called from + * tick_nohz_next_event(). + */ + timer_clear_idle(); +} + +/** + * tick_nohz_idle_enter - prepare for entering idle on the current CPU * - * - rcu_idle_enter() after its last use of RCU before the CPU is put - * to sleep. - * - rcu_idle_exit() before the first use of RCU after the CPU is woken up. + * Called when we start the idle loop. */ void tick_nohz_idle_enter(void) { struct tick_sched *ts; lockdep_assert_irqs_enabled(); - /* - * Update the idle state in the scheduler domain hierarchy - * when tick_nohz_stop_sched_tick() is called from the idle loop. - * State will be updated to busy during the first busy tick after - * exiting idle. 
- */ - set_cpu_sd_state_idle(); local_irq_disable(); ts = this_cpu_ptr(&tick_cpu_sched); + + WARN_ON_ONCE(ts->timer_expires_base); + ts->inidle = 1; - __tick_nohz_idle_enter(ts); + tick_nohz_start_idle(ts); local_irq_enable(); } @@ -983,21 +1015,62 @@ void tick_nohz_irq_exit(void) struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (ts->inidle) - __tick_nohz_idle_enter(ts); + tick_nohz_start_idle(ts); else tick_nohz_full_update_tick(ts); } /** - * tick_nohz_get_sleep_length - return the length of the current sleep + * tick_nohz_idle_got_tick - Check whether or not the tick handler has run + */ +bool tick_nohz_idle_got_tick(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + if (ts->got_idle_tick) { + ts->got_idle_tick = 0; + return true; + } + return false; +} + +/** + * tick_nohz_get_sleep_length - return the expected length of the current sleep + * @delta_next: duration until the next event if the tick cannot be stopped * * Called from power state control code with interrupts disabled */ -ktime_t tick_nohz_get_sleep_length(void) +ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) { + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + int cpu = smp_processor_id(); + /* + * The idle entry time is expected to be a sufficient approximation of + * the current time at this point. + */ + ktime_t now = ts->idle_entrytime; + ktime_t next_event; + + WARN_ON_ONCE(!ts->inidle); + + *delta_next = ktime_sub(dev->next_event, now); - return ts->sleep_length; + if (!can_stop_idle_tick(cpu, ts)) + return *delta_next; + + next_event = tick_nohz_next_event(ts, cpu); + if (!next_event) + return *delta_next; + + /* + * If the next highres timer to expire is earlier than next_event, the + * idle governor needs to know that. 
+ */ + next_event = min_t(u64, next_event, + hrtimer_next_event_without(&ts->sched_timer)); + + return ktime_sub(next_event, now); } /** @@ -1046,6 +1119,20 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts) #endif } +static void __tick_nohz_idle_restart_tick(struct tick_sched *ts, ktime_t now) +{ + tick_nohz_restart_sched_tick(ts, now); + tick_nohz_account_idle_ticks(ts); +} + +void tick_nohz_idle_restart_tick(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + if (ts->tick_stopped) + __tick_nohz_idle_restart_tick(ts, ktime_get()); +} + /** * tick_nohz_idle_exit - restart the idle tick from the idle task * @@ -1056,24 +1143,26 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts) void tick_nohz_idle_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + bool idle_active, tick_stopped; ktime_t now; local_irq_disable(); WARN_ON_ONCE(!ts->inidle); + WARN_ON_ONCE(ts->timer_expires_base); ts->inidle = 0; + idle_active = ts->idle_active; + tick_stopped = ts->tick_stopped; - if (ts->idle_active || ts->tick_stopped) + if (idle_active || tick_stopped) now = ktime_get(); - if (ts->idle_active) + if (idle_active) tick_nohz_stop_idle(ts, now); - if (ts->tick_stopped) { - tick_nohz_restart_sched_tick(ts, now); - tick_nohz_account_idle_ticks(ts); - } + if (tick_stopped) + __tick_nohz_idle_restart_tick(ts, now); local_irq_enable(); } @@ -1089,7 +1178,7 @@ static void tick_nohz_handler(struct clock_event_device *dev) dev->next_event = KTIME_MAX; - tick_sched_do_timer(now); + tick_sched_do_timer(ts, now); tick_sched_handle(ts, regs); /* No need to reprogram if we are running tickless */ @@ -1184,7 +1273,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); - tick_sched_do_timer(now); + tick_sched_do_timer(ts, now); /* * Do not call, when we are not in irq context and have diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h 
index 954b43dbf21c..6de959a854b2 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -38,31 +38,37 @@ enum tick_nohz_mode { * @idle_exittime: Time when the idle state was left * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding - * @sleep_length: Duration of the current idle sleep + * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped) + * @timer_expires_base: Base time clock monotonic for @timer_expires * @do_timer_lst: CPU was the last one doing do_timer before going idle + * @got_idle_tick: Tick timer function has run with @inidle set */ struct tick_sched { struct hrtimer sched_timer; unsigned long check_clocks; enum tick_nohz_mode nohz_mode; + + unsigned int inidle : 1; + unsigned int tick_stopped : 1; + unsigned int idle_active : 1; + unsigned int do_timer_last : 1; + unsigned int got_idle_tick : 1; + ktime_t last_tick; ktime_t next_tick; - int inidle; - int tick_stopped; unsigned long idle_jiffies; unsigned long idle_calls; unsigned long idle_sleeps; - int idle_active; ktime_t idle_entrytime; ktime_t idle_waketime; ktime_t idle_exittime; ktime_t idle_sleeptime; ktime_t iowait_sleeptime; - ktime_t sleep_length; unsigned long last_jiffies; + u64 timer_expires; + u64 timer_expires_base; u64 next_timer; ktime_t idle_expires; - int do_timer_last; atomic_t tick_dep_mask; }; diff --git a/kernel/time/time.c b/kernel/time/time.c index bd4e6c7dd689..3044d48ebe56 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -488,6 +488,18 @@ struct timeval ns_to_timeval(const s64 nsec) } EXPORT_SYMBOL(ns_to_timeval); +struct __kernel_old_timeval ns_to_kernel_old_timeval(const s64 nsec) +{ + struct timespec64 ts = ns_to_timespec64(nsec); + struct __kernel_old_timeval tv; + + tv.tv_sec = ts.tv_sec; + tv.tv_usec = (suseconds_t)ts.tv_nsec / 1000; + + return tv; +} +EXPORT_SYMBOL(ns_to_kernel_old_timeval); + /** 
* set_normalized_timespec - set timespec sec and nsec parts and normalize * diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index cd03317e7b57..ca90219a1e73 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -138,7 +138,12 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) { - tk->offs_boot = ktime_add(tk->offs_boot, delta); + /* Update both bases so mono and raw stay coupled. */ + tk->tkr_mono.base += delta; + tk->tkr_raw.base += delta; + + /* Accumulate time spent in suspend */ + tk->time_suspended += delta; } /* @@ -332,6 +337,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) tk->tkr_mono.mult = clock->mult; tk->tkr_raw.mult = clock->mult; tk->ntp_err_mult = 0; + tk->skip_second_overflow = 0; } /* Timekeeper helper functions. */ @@ -467,36 +473,6 @@ u64 ktime_get_raw_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); -/** - * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock. - * - * To keep it NMI safe since we're accessing from tracing, we're not using a - * separate timekeeper with updates to monotonic clock and boot offset - * protected with seqlocks. This has the following minor side effects: - * - * (1) Its possible that a timestamp be taken after the boot offset is updated - * but before the timekeeper is updated. If this happens, the new boot offset - * is added to the old timekeeping making the clock appear to update slightly - * earlier: - * CPU 0 CPU 1 - * timekeeping_inject_sleeptime64() - * __timekeeping_inject_sleeptime(tk, delta); - * timestamp(); - * timekeeping_update(tk, TK_CLEAR_NTP...); - * - * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be - * partially updated. Since the tk->offs_boot update is a rare event, this - * should be a rare occurrence which postprocessing should be able to handle. 
- */ -u64 notrace ktime_get_boot_fast_ns(void) -{ - struct timekeeper *tk = &tk_core.timekeeper; - - return (ktime_get_mono_fast_ns() + ktime_to_ns(tk->offs_boot)); -} -EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); - - /* * See comment for __ktime_get_fast_ns() vs. timestamp ordering */ @@ -788,7 +764,6 @@ EXPORT_SYMBOL_GPL(ktime_get_resolution_ns); static ktime_t *offsets[TK_OFFS_MAX] = { [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real, - [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot, [TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai, }; @@ -886,6 +861,39 @@ void ktime_get_ts64(struct timespec64 *ts) EXPORT_SYMBOL_GPL(ktime_get_ts64); /** + * ktime_get_active_ts64 - Get the active non-suspended monotonic clock + * @ts: pointer to timespec variable + * + * The function calculates the monotonic clock from the realtime clock and + * the wall_to_monotonic offset, subtracts the accumulated suspend time and + * stores the result in normalized timespec64 format in the variable + * pointed to by @ts. + */ +void ktime_get_active_ts64(struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + struct timespec64 tomono, tsusp; + u64 nsec, nssusp; + unsigned int seq; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&tk_core.seq); + ts->tv_sec = tk->xtime_sec; + nsec = timekeeping_get_ns(&tk->tkr_mono); + tomono = tk->wall_to_monotonic; + nssusp = tk->time_suspended; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + ts->tv_sec += tomono.tv_sec; + ts->tv_nsec = 0; + timespec64_add_ns(ts, nsec + tomono.tv_nsec); + tsusp = ns_to_timespec64(nssusp); + *ts = timespec64_sub(*ts, tsusp); +} + +/** * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC * * Returns the seconds portion of CLOCK_MONOTONIC with a single non @@ -1585,7 +1593,6 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, return; } tk_xtime_add(tk, delta); - tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta)); tk_update_sleep_time(tk, 
timespec64_to_ktime(*delta)); tk_debug_account_sleep_time(delta); } @@ -1799,20 +1806,19 @@ device_initcall(timekeeping_init_ops); */ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, s64 offset, - bool negative, - int adj_scale) + s32 mult_adj) { s64 interval = tk->cycle_interval; - s32 mult_adj = 1; - if (negative) { - mult_adj = -mult_adj; + if (mult_adj == 0) { + return; + } else if (mult_adj == -1) { interval = -interval; - offset = -offset; + offset = -offset; + } else if (mult_adj != 1) { + interval *= mult_adj; + offset *= mult_adj; } - mult_adj <<= adj_scale; - interval <<= adj_scale; - offset <<= adj_scale; /* * So the following can be confusing. @@ -1860,8 +1866,6 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, * xtime_nsec_2 = xtime_nsec_1 - offset * Which simplfies to: * xtime_nsec -= offset - * - * XXX - TODO: Doc ntp_error calculation. */ if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) { /* NTP adjustment caused clocksource mult overflow */ @@ -1872,89 +1876,38 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, tk->tkr_mono.mult += mult_adj; tk->xtime_interval += interval; tk->tkr_mono.xtime_nsec -= offset; - tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; } /* - * Calculate the multiplier adjustment needed to match the frequency - * specified by NTP + * Adjust the timekeeper's multiplier to the correct frequency + * and also to reduce the accumulated error value. 
*/ -static __always_inline void timekeeping_freqadjust(struct timekeeper *tk, - s64 offset) +static void timekeeping_adjust(struct timekeeper *tk, s64 offset) { - s64 interval = tk->cycle_interval; - s64 xinterval = tk->xtime_interval; - u32 base = tk->tkr_mono.clock->mult; - u32 max = tk->tkr_mono.clock->maxadj; - u32 cur_adj = tk->tkr_mono.mult; - s64 tick_error; - bool negative; - u32 adj_scale; - - /* Remove any current error adj from freq calculation */ - if (tk->ntp_err_mult) - xinterval -= tk->cycle_interval; - - tk->ntp_tick = ntp_tick_length(); - - /* Calculate current error per tick */ - tick_error = ntp_tick_length() >> tk->ntp_error_shift; - tick_error -= (xinterval + tk->xtime_remainder); - - /* Don't worry about correcting it if its small */ - if (likely((tick_error >= 0) && (tick_error <= interval))) - return; - - /* preserve the direction of correction */ - negative = (tick_error < 0); + u32 mult; - /* If any adjustment would pass the max, just return */ - if (negative && (cur_adj - 1) <= (base - max)) - return; - if (!negative && (cur_adj + 1) >= (base + max)) - return; /* - * Sort out the magnitude of the correction, but - * avoid making so large a correction that we go - * over the max adjustment. + * Determine the multiplier from the current NTP tick length. + * Avoid expensive division when the tick length doesn't change. 
*/ - adj_scale = 0; - tick_error = abs(tick_error); - while (tick_error > interval) { - u32 adj = 1 << (adj_scale + 1); - - /* Check if adjustment gets us within 1 unit from the max */ - if (negative && (cur_adj - adj) <= (base - max)) - break; - if (!negative && (cur_adj + adj) >= (base + max)) - break; - - adj_scale++; - tick_error >>= 1; + if (likely(tk->ntp_tick == ntp_tick_length())) { + mult = tk->tkr_mono.mult - tk->ntp_err_mult; + } else { + tk->ntp_tick = ntp_tick_length(); + mult = div64_u64((tk->ntp_tick >> tk->ntp_error_shift) - + tk->xtime_remainder, tk->cycle_interval); } - /* scale the corrections */ - timekeeping_apply_adjustment(tk, offset, negative, adj_scale); -} + /* + * If the clock is behind the NTP time, increase the multiplier by 1 + * to catch up with it. If it's ahead and there was a remainder in the + * tick division, the clock will slow down. Otherwise it will stay + * ahead until the tick length changes to a non-divisible value. + */ + tk->ntp_err_mult = tk->ntp_error > 0 ? 1 : 0; + mult += tk->ntp_err_mult; -/* - * Adjust the timekeeper's multiplier to the correct frequency - * and also to reduce the accumulated error value. 
- */ -static void timekeeping_adjust(struct timekeeper *tk, s64 offset) -{ - /* Correct for the current frequency error */ - timekeeping_freqadjust(tk, offset); - - /* Next make a small adjustment to fix any cumulative error */ - if (!tk->ntp_err_mult && (tk->ntp_error > 0)) { - tk->ntp_err_mult = 1; - timekeeping_apply_adjustment(tk, offset, 0, 0); - } else if (tk->ntp_err_mult && (tk->ntp_error <= 0)) { - /* Undo any existing error adjustment */ - timekeeping_apply_adjustment(tk, offset, 1, 0); - tk->ntp_err_mult = 0; - } + timekeeping_apply_adjustment(tk, offset, mult - tk->tkr_mono.mult); if (unlikely(tk->tkr_mono.clock->maxadj && (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult) @@ -1971,18 +1924,15 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) * in the code above, its possible the required corrective factor to * xtime_nsec could cause it to underflow. * - * Now, since we already accumulated the second, cannot simply roll - * the accumulated second back, since the NTP subsystem has been - * notified via second_overflow. So instead we push xtime_nsec forward - * by the amount we underflowed, and add that amount into the error. - * - * We'll correct this error next time through this function, when - * xtime_nsec is not as small. + * Now, since we have already accumulated the second and the NTP + * subsystem has been notified via second_overflow(), we need to skip + * the next update. */ if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) { - s64 neg = -(s64)tk->tkr_mono.xtime_nsec; - tk->tkr_mono.xtime_nsec = 0; - tk->ntp_error += neg << tk->ntp_error_shift; + tk->tkr_mono.xtime_nsec += (u64)NSEC_PER_SEC << + tk->tkr_mono.shift; + tk->xtime_sec--; + tk->skip_second_overflow = 1; } } @@ -2005,6 +1955,15 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) tk->tkr_mono.xtime_nsec -= nsecps; tk->xtime_sec++; + /* + * Skip NTP update if this second was accumulated before, + * i.e. 
xtime_nsec underflowed in timekeeping_adjust() + */ + if (unlikely(tk->skip_second_overflow)) { + tk->skip_second_overflow = 0; + continue; + } + /* Figure out if its a leap sec and apply if needed */ leap = second_overflow(tk->xtime_sec); if (unlikely(leap)) { @@ -2121,7 +2080,7 @@ void update_wall_time(void) shift--; } - /* correct the clock when NTP error is too big */ + /* Adjust the multiplier to correct NTP error */ timekeeping_adjust(tk, offset); /* @@ -2166,7 +2125,7 @@ out: void getboottime64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; - ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); + ktime_t t = ktime_sub(tk->offs_real, tk->time_suspended); *ts = ktime_to_timespec64(t); } @@ -2236,7 +2195,6 @@ void do_timer(unsigned long ticks) * ktime_get_update_offsets_now - hrtimer helper * @cwsseq: pointer to check and store the clock was set sequence number * @offs_real: pointer to storage for monotonic -> realtime offset - * @offs_boot: pointer to storage for monotonic -> boottime offset * @offs_tai: pointer to storage for monotonic -> clock tai offset * * Returns current monotonic time and updates the offsets if the @@ -2246,7 +2204,7 @@ void do_timer(unsigned long ticks) * Called from hrtimer_interrupt() or retrigger_next_event() */ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, - ktime_t *offs_boot, ktime_t *offs_tai) + ktime_t *offs_tai) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; @@ -2263,7 +2221,6 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, if (*cwsseq != tk->clock_was_set_seq) { *cwsseq = tk->clock_was_set_seq; *offs_real = tk->offs_real; - *offs_boot = tk->offs_boot; *offs_tai = tk->offs_tai; } diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 7a9b4eb7a1d5..79b67f5e0343 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -6,7 +6,6 @@ */ extern ktime_t ktime_get_update_offsets_now(unsigned int 
*cwsseq, ktime_t *offs_real, - ktime_t *offs_boot, ktime_t *offs_tai); extern int timekeeping_valid_for_hres(void); diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index fdbeeb02dde9..cf5c0828ee31 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -31,6 +31,4 @@ static inline u64 clocksource_delta(u64 now, u64 last, u64 mask) } #endif -extern time64_t __ktime_get_real_seconds(void); - #endif /* _TIMEKEEPING_INTERNAL_H */ diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 48150ab42de9..4a4fd567fb26 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1894,6 +1894,12 @@ int timers_dead_cpu(unsigned int cpu) raw_spin_lock_irq(&new_base->lock); raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + /* + * The current CPUs base clock might be stale. Update it + * before moving the timers over. + */ + forward_timer_base(new_base); + BUG_ON(old_base->running_timer); for (i = 0; i < WHEEL_SIZE; i++) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f54dc62b599c..c4f0f2e4126e 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -530,6 +530,15 @@ config FUNCTION_PROFILER If in doubt, say N. +config BPF_KPROBE_OVERRIDE + bool "Enable BPF programs to override a kprobed function" + depends on BPF_EVENTS + depends on FUNCTION_ERROR_INJECTION + default n + help + Allows BPF to override the execution of a probed function and + set a different return value. This is used for error injection. + config FTRACE_MCOUNT_RECORD def_bool y depends on DYNAMIC_FTRACE @@ -597,7 +606,10 @@ config HIST_TRIGGERS event activity as an initial guide for further investigation using more advanced tools. - See Documentation/trace/events.txt. + Inter-event tracing of quantities such as latencies is also + supported using hist triggers under this option. + + See Documentation/trace/histogram.txt. If in doubt, say N. 
config MMIOTRACE_TEST diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 40207c2a4113..d88e96d4e12c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -13,6 +13,10 @@ #include <linux/filter.h> #include <linux/uaccess.h> #include <linux/ctype.h> +#include <linux/kprobes.h> +#include <linux/error-injection.h> + +#include "trace_probe.h" #include "trace.h" u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); @@ -76,6 +80,23 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) } EXPORT_SYMBOL_GPL(trace_call_bpf); +#ifdef CONFIG_BPF_KPROBE_OVERRIDE +BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) +{ + regs_set_return_value(regs, rc); + override_function_with_return(regs); + return 0; +} + +static const struct bpf_func_proto bpf_override_return_proto = { + .func = bpf_override_return, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; +#endif + BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) { int ret; @@ -224,7 +245,7 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, */ #define __BPF_TP_EMIT() __BPF_ARG3_TP() #define __BPF_TP(...) \ - __trace_printk(1 /* Fake ip will not be printed. */, \ + __trace_printk(0 /* Fake ip */, \ fmt, ##__VA_ARGS__) #define __BPF_ARG1_TP(...) 
\ @@ -503,7 +524,8 @@ static const struct bpf_func_proto bpf_probe_read_str_proto = { .arg3_type = ARG_ANYTHING, }; -static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -547,7 +569,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) } } -static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -556,13 +579,18 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_get_stackid_proto; case BPF_FUNC_perf_event_read_value: return &bpf_perf_event_read_value_proto; +#ifdef CONFIG_BPF_KPROBE_OVERRIDE + case BPF_FUNC_override_return: + return &bpf_override_return_proto; +#endif default: - return tracing_func_proto(func_id); + return tracing_func_proto(func_id, prog); } } /* bpf+kprobe programs can access fields of 'struct pt_regs' */ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off < 0 || off >= sizeof(struct pt_regs)) @@ -636,7 +664,43 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_perf_prog_read_value_tp, struct bpf_perf_event_data_kern *, ctx, +static const struct bpf_func_proto * +tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_perf_event_output_proto_tp; + case BPF_FUNC_get_stackid: + return &bpf_get_stackid_proto_tp; + default: + return tracing_func_proto(func_id, prog); + } +} + +static bool tp_prog_is_valid_access(int off, int 
size, enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) + return false; + if (type != BPF_READ) + return false; + if (off % size != 0) + return false; + + BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64)); + return true; +} + +const struct bpf_verifier_ops tracepoint_verifier_ops = { + .get_func_proto = tp_prog_func_proto, + .is_valid_access = tp_prog_is_valid_access, +}; + +const struct bpf_prog_ops tracepoint_prog_ops = { +}; + +BPF_CALL_3(bpf_perf_prog_read_value, struct bpf_perf_event_data_kern *, ctx, struct bpf_perf_event_value *, buf, u32, size) { int err = -EINVAL; @@ -653,8 +717,8 @@ clear: return err; } -static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = { - .func = bpf_perf_prog_read_value_tp, +static const struct bpf_func_proto bpf_perf_prog_read_value_proto = { + .func = bpf_perf_prog_read_value, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, @@ -662,7 +726,8 @@ static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = { .arg3_type = ARG_CONST_SIZE, }; -static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -670,39 +735,99 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_tp; case BPF_FUNC_perf_prog_read_value: - return &bpf_perf_prog_read_value_proto_tp; + return &bpf_perf_prog_read_value_proto; default: - return tracing_func_proto(func_id); + return tracing_func_proto(func_id, prog); } } -static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, - struct bpf_insn_access_aux *info) +/* + * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp + * to avoid potential recursive 
reuse issue when/if tracepoints are added + * inside bpf_*_event_output and/or bpf_get_stack_id + */ +static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs); +BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args, + struct bpf_map *, map, u64, flags, void *, data, u64, size) { - if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) + struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + + perf_fetch_caller_regs(regs); + return ____bpf_perf_event_output(regs, map, flags, data, size); +} + +static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { + .func = bpf_perf_event_output_raw_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + +BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, + struct bpf_map *, map, u64, flags) +{ + struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + + perf_fetch_caller_regs(regs); + /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */ + return bpf_get_stackid((unsigned long) regs, (unsigned long) map, + flags, 0, 0); +} + +static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { + .func = bpf_get_stackid_raw_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto * +raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_perf_event_output_proto_raw_tp; + case BPF_FUNC_get_stackid: + return &bpf_get_stackid_proto_raw_tp; + default: + return tracing_func_proto(func_id, prog); + } +} + +static bool raw_tp_prog_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + /* 
largest tracepoint in the kernel has 12 args */ + if (off < 0 || off >= sizeof(__u64) * 12) return false; if (type != BPF_READ) return false; if (off % size != 0) return false; - - BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64)); return true; } -const struct bpf_verifier_ops tracepoint_verifier_ops = { - .get_func_proto = tp_prog_func_proto, - .is_valid_access = tp_prog_is_valid_access, +const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { + .get_func_proto = raw_tp_prog_func_proto, + .is_valid_access = raw_tp_prog_is_valid_access, }; -const struct bpf_prog_ops tracepoint_prog_ops = { +const struct bpf_prog_ops raw_tracepoint_prog_ops = { }; static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { - const int size_sp = FIELD_SIZEOF(struct bpf_perf_event_data, - sample_period); + const int size_u64 = sizeof(u64); if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) return false; @@ -713,8 +838,13 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type switch (off) { case bpf_ctx_range(struct bpf_perf_event_data, sample_period): - bpf_ctx_record_field_size(info, size_sp); - if (!bpf_ctx_narrow_access_ok(off, size, size_sp)) + bpf_ctx_record_field_size(info, size_u64); + if (!bpf_ctx_narrow_access_ok(off, size, size_u64)) + return false; + break; + case bpf_ctx_range(struct bpf_perf_event_data, addr): + bpf_ctx_record_field_size(info, size_u64); + if (!bpf_ctx_narrow_access_ok(off, size, size_u64)) return false; break; default: @@ -741,6 +871,14 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, bpf_target_off(struct perf_sample_data, period, 8, target_size)); break; + case offsetof(struct bpf_perf_event_data, addr): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, + data), si->dst_reg, si->src_reg, + offsetof(struct bpf_perf_event_data_kern, data)); + *insn++ = BPF_LDX_MEM(BPF_DW, 
si->dst_reg, si->dst_reg, + bpf_target_off(struct perf_sample_data, addr, 8, + target_size)); + break; default: *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, regs), si->dst_reg, si->src_reg, @@ -754,7 +892,7 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, } const struct bpf_verifier_ops perf_event_verifier_ops = { - .get_func_proto = tp_prog_func_proto, + .get_func_proto = pe_prog_func_proto, .is_valid_access = pe_prog_is_valid_access, .convert_ctx_access = pe_prog_convert_ctx_access, }; @@ -773,6 +911,15 @@ int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog_array *new_array; int ret = -EEXIST; + /* + * Kprobe override only works if they are on the function entry, + * and only if they are on the opt-in list. + */ + if (prog->kprobe_override && + (!trace_kprobe_on_func_entry(event->tp_event) || + !trace_kprobe_error_injectable(event->tp_event))) + return -EINVAL; + mutex_lock(&bpf_event_mutex); if (event->prog) @@ -825,3 +972,131 @@ void perf_event_detach_bpf_prog(struct perf_event *event) unlock: mutex_unlock(&bpf_event_mutex); } + +int perf_event_query_prog_array(struct perf_event *event, void __user *info) +{ + struct perf_event_query_bpf __user *uquery = info; + struct perf_event_query_bpf query = {}; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -EINVAL; + if (copy_from_user(&query, uquery, sizeof(query))) + return -EFAULT; + if (query.ids_len > BPF_TRACE_MAX_PROGS) + return -E2BIG; + + mutex_lock(&bpf_event_mutex); + ret = bpf_prog_array_copy_info(event->tp_event->prog_array, + uquery->ids, + query.ids_len, + &uquery->prog_cnt); + mutex_unlock(&bpf_event_mutex); + + return ret; +} + +extern struct bpf_raw_event_map __start__bpf_raw_tp[]; +extern struct bpf_raw_event_map __stop__bpf_raw_tp[]; + +struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) +{ + struct bpf_raw_event_map *btp = __start__bpf_raw_tp; + + 
for (; btp < __stop__bpf_raw_tp; btp++) { + if (!strcmp(btp->tp->name, name)) + return btp; + } + return NULL; +} + +static __always_inline +void __bpf_trace_run(struct bpf_prog *prog, u64 *args) +{ + rcu_read_lock(); + preempt_disable(); + (void) BPF_PROG_RUN(prog, args); + preempt_enable(); + rcu_read_unlock(); +} + +#define UNPACK(...) __VA_ARGS__ +#define REPEAT_1(FN, DL, X, ...) FN(X) +#define REPEAT_2(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_1(FN, DL, __VA_ARGS__) +#define REPEAT_3(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_2(FN, DL, __VA_ARGS__) +#define REPEAT_4(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_3(FN, DL, __VA_ARGS__) +#define REPEAT_5(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_4(FN, DL, __VA_ARGS__) +#define REPEAT_6(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_5(FN, DL, __VA_ARGS__) +#define REPEAT_7(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_6(FN, DL, __VA_ARGS__) +#define REPEAT_8(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_7(FN, DL, __VA_ARGS__) +#define REPEAT_9(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_8(FN, DL, __VA_ARGS__) +#define REPEAT_10(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_9(FN, DL, __VA_ARGS__) +#define REPEAT_11(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_10(FN, DL, __VA_ARGS__) +#define REPEAT_12(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_11(FN, DL, __VA_ARGS__) +#define REPEAT(X, FN, DL, ...) 
REPEAT_##X(FN, DL, __VA_ARGS__) + +#define SARG(X) u64 arg##X +#define COPY(X) args[X] = arg##X + +#define __DL_COM (,) +#define __DL_SEM (;) + +#define __SEQ_0_11 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 + +#define BPF_TRACE_DEFN_x(x) \ + void bpf_trace_run##x(struct bpf_prog *prog, \ + REPEAT(x, SARG, __DL_COM, __SEQ_0_11)) \ + { \ + u64 args[x]; \ + REPEAT(x, COPY, __DL_SEM, __SEQ_0_11); \ + __bpf_trace_run(prog, args); \ + } \ + EXPORT_SYMBOL_GPL(bpf_trace_run##x) +BPF_TRACE_DEFN_x(1); +BPF_TRACE_DEFN_x(2); +BPF_TRACE_DEFN_x(3); +BPF_TRACE_DEFN_x(4); +BPF_TRACE_DEFN_x(5); +BPF_TRACE_DEFN_x(6); +BPF_TRACE_DEFN_x(7); +BPF_TRACE_DEFN_x(8); +BPF_TRACE_DEFN_x(9); +BPF_TRACE_DEFN_x(10); +BPF_TRACE_DEFN_x(11); +BPF_TRACE_DEFN_x(12); + +static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) +{ + struct tracepoint *tp = btp->tp; + + /* + * check that program doesn't access arguments beyond what's + * available in this tracepoint + */ + if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64)) + return -EINVAL; + + return tracepoint_probe_register(tp, (void *)btp->bpf_func, prog); +} + +int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) +{ + int err; + + mutex_lock(&bpf_event_mutex); + err = __bpf_probe_register(btp, prog); + mutex_unlock(&bpf_event_mutex); + return err; +} + +int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog) +{ + int err; + + mutex_lock(&bpf_event_mutex); + err = tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog); + mutex_unlock(&bpf_event_mutex); + return err; +} diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 554b517c61a0..16bbf062018f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3902,14 +3902,13 @@ static bool module_exists(const char *module) { /* All modules have the symbol __this_module */ const char this_mod[] = "__this_module"; - const int modname_size = MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 1; - char 
modname[modname_size + 1]; + char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2]; unsigned long val; int n; - n = snprintf(modname, modname_size + 1, "%s:%s", module, this_mod); + n = snprintf(modname, sizeof(modname), "%s:%s", module, this_mod); - if (n > modname_size) + if (n > sizeof(modname) - 1) return false; val = module_kallsyms_lookup_name(modname); @@ -4456,7 +4455,6 @@ unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, func_g.type = filter_parse_regex(glob, strlen(glob), &func_g.search, ¬); func_g.len = strlen(func_g.search); - func_g.search = glob; /* we do not support '!' for function probes */ if (WARN_ON(not)) @@ -5015,7 +5013,6 @@ int ftrace_regex_release(struct inode *inode, struct file *file) parser = &iter->parser; if (trace_parser_loaded(parser)) { - parser->buffer[parser->idx] = 0; ftrace_match_records(iter->hash, parser->buffer, parser->idx); } @@ -5329,7 +5326,6 @@ ftrace_graph_release(struct inode *inode, struct file *file) parser = &fgd->parser; if (trace_parser_loaded((parser))) { - parser->buffer[parser->idx] = 0; ret = ftrace_graph_set_hash(fgd->new_hash, parser->buffer); } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ca6930e0d25e..c9cb9767d49b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -22,6 +22,7 @@ #include <linux/hash.h> #include <linux/list.h> #include <linux/cpu.h> +#include <linux/oom.h> #include <asm/local.h> @@ -41,6 +42,8 @@ int ring_buffer_print_entry_header(struct trace_seq *s) RINGBUF_TYPE_PADDING); trace_seq_printf(s, "\ttime_extend : type == %d\n", RINGBUF_TYPE_TIME_EXTEND); + trace_seq_printf(s, "\ttime_stamp : type == %d\n", + RINGBUF_TYPE_TIME_STAMP); trace_seq_printf(s, "\tdata max type_len == %d\n", RINGBUF_TYPE_DATA_TYPE_LEN_MAX); @@ -140,12 +143,15 @@ int ring_buffer_print_entry_header(struct trace_seq *s) enum { RB_LEN_TIME_EXTEND = 8, - RB_LEN_TIME_STAMP = 16, + RB_LEN_TIME_STAMP = 8, }; #define skip_time_extend(event) \ 
((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND)) +#define extended_time(event) \ + (event->type_len >= RINGBUF_TYPE_TIME_EXTEND) + static inline int rb_null_event(struct ring_buffer_event *event) { return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; @@ -209,7 +215,7 @@ rb_event_ts_length(struct ring_buffer_event *event) { unsigned len = 0; - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { + if (extended_time(event)) { /* time extends include the data event after it */ len = RB_LEN_TIME_EXTEND; event = skip_time_extend(event); @@ -231,7 +237,7 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event) { unsigned length; - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + if (extended_time(event)) event = skip_time_extend(event); length = rb_event_length(event); @@ -248,7 +254,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length); static __always_inline void * rb_event_data(struct ring_buffer_event *event) { - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + if (extended_time(event)) event = skip_time_extend(event); BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); /* If length is in len field, then array[0] has the data */ @@ -275,6 +281,27 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define TS_MASK ((1ULL << TS_SHIFT) - 1) #define TS_DELTA_TEST (~TS_MASK) +/** + * ring_buffer_event_time_stamp - return the event's extended timestamp + * @event: the event to get the timestamp of + * + * Returns the extended timestamp associated with a data event. + * An extended time_stamp is a 64-bit timestamp represented + * internally in a special way that makes the best use of space + * contained within a ring buffer event. This function decodes + * it and maps it to a straight u64 value. 
+ */ +u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event) +{ + u64 ts; + + ts = event->array[0]; + ts <<= TS_SHIFT; + ts += event->time_delta; + + return ts; +} + /* Flag when events were overwritten */ #define RB_MISSED_EVENTS (1 << 31) /* Missed count stored at end */ @@ -451,6 +478,7 @@ struct ring_buffer_per_cpu { struct buffer_page *reader_page; unsigned long lost_events; unsigned long last_overrun; + unsigned long nest; local_t entries_bytes; local_t entries; local_t overrun; @@ -488,6 +516,7 @@ struct ring_buffer { u64 (*clock)(void); struct rb_irq_work irq_work; + bool time_stamp_abs; }; struct ring_buffer_iter { @@ -627,7 +656,7 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) * as data is added to any of the @buffer's cpu buffers. Otherwise * it will wait for data to be added to a specific cpu buffer. * - * Returns POLLIN | POLLRDNORM if data exists in the buffers, + * Returns EPOLLIN | EPOLLRDNORM if data exists in the buffers, * zero otherwise. */ __poll_t ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, @@ -665,7 +694,7 @@ __poll_t ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) - return POLLIN | POLLRDNORM; + return EPOLLIN | EPOLLRDNORM; return 0; } @@ -1134,30 +1163,60 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) { struct buffer_page *bpage, *tmp; + bool user_thread = current->mm != NULL; + gfp_t mflags; long i; + /* + * Check if the available memory is there first. + * Note, si_mem_available() only gives us a rough estimate of available + * memory. It may not be accurate. But we don't care, we just want + * to prevent doing any allocation when it is obvious that it is + * not going to succeed. 
+ */ + i = si_mem_available(); + if (i < nr_pages) + return -ENOMEM; + + /* + * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails + * gracefully without invoking oom-killer and the system is not + * destabilized. + */ + mflags = GFP_KERNEL | __GFP_RETRY_MAYFAIL; + + /* + * If a user thread allocates too much, and si_mem_available() + * reports there's enough memory, even though there is not. + * Make sure the OOM killer kills this thread. This can happen + * even with RETRY_MAYFAIL because another task may be doing + * an allocation after this task has taken all memory. + * This is the task the OOM killer needs to take out during this + * loop, even if it was triggered by an allocation somewhere else. + */ + if (user_thread) + set_current_oom_origin(); for (i = 0; i < nr_pages; i++) { struct page *page; - /* - * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails - * gracefully without invoking oom-killer and the system is not - * destabilized. - */ + bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), - GFP_KERNEL | __GFP_RETRY_MAYFAIL, - cpu_to_node(cpu)); + mflags, cpu_to_node(cpu)); if (!bpage) goto free_pages; list_add(&bpage->list, pages); - page = alloc_pages_node(cpu_to_node(cpu), - GFP_KERNEL | __GFP_RETRY_MAYFAIL, 0); + page = alloc_pages_node(cpu_to_node(cpu), mflags, 0); if (!page) goto free_pages; bpage->page = page_address(page); rb_init_page(bpage->page); + + if (user_thread && fatal_signal_pending(current)) + goto free_pages; } + if (user_thread) + clear_current_oom_origin(); return 0; @@ -1166,6 +1225,8 @@ free_pages: list_del_init(&bpage->list); free_buffer_page(bpage); } + if (user_thread) + clear_current_oom_origin(); return -ENOMEM; } @@ -1382,6 +1443,16 @@ void ring_buffer_set_clock(struct ring_buffer *buffer, buffer->clock = clock; } +void ring_buffer_set_time_stamp_abs(struct ring_buffer *buffer, bool abs) +{ + buffer->time_stamp_abs = abs; +} + +bool ring_buffer_time_stamp_abs(struct ring_buffer *buffer) +{ 
+ return buffer->time_stamp_abs; +} + static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); static inline unsigned long rb_page_entries(struct buffer_page *bpage) @@ -2206,12 +2277,15 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, /* Slow path, do not inline */ static noinline struct ring_buffer_event * -rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) +rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs) { - event->type_len = RINGBUF_TYPE_TIME_EXTEND; + if (abs) + event->type_len = RINGBUF_TYPE_TIME_STAMP; + else + event->type_len = RINGBUF_TYPE_TIME_EXTEND; - /* Not the first event on the page? */ - if (rb_event_index(event)) { + /* Not the first event on the page, or not delta? */ + if (abs || rb_event_index(event)) { event->time_delta = delta & TS_MASK; event->array[0] = delta >> TS_SHIFT; } else { @@ -2254,7 +2328,9 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, * add it to the start of the resevered space. */ if (unlikely(info->add_timestamp)) { - event = rb_add_time_stamp(event, delta); + bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer); + + event = rb_add_time_stamp(event, info->delta, abs); length -= RB_LEN_TIME_EXTEND; delta = 0; } @@ -2442,7 +2518,7 @@ static __always_inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer static inline void rb_event_discard(struct ring_buffer_event *event) { - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + if (extended_time(event)) event = skip_time_extend(event); /* array[0] holds the actual length for the discarded event */ @@ -2486,10 +2562,11 @@ rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, cpu_buffer->write_stamp = cpu_buffer->commit_page->page->time_stamp; else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { - delta = event->array[0]; - delta <<= TS_SHIFT; - delta += event->time_delta; + delta = ring_buffer_event_time_stamp(event); cpu_buffer->write_stamp += delta; + } else if (event->type_len == 
RINGBUF_TYPE_TIME_STAMP) { + delta = ring_buffer_event_time_stamp(event); + cpu_buffer->write_stamp = delta; } else cpu_buffer->write_stamp += event->time_delta; } @@ -2581,10 +2658,10 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) bit = pc & NMI_MASK ? RB_CTX_NMI : pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ; - if (unlikely(val & (1 << bit))) + if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) return 1; - val |= (1 << bit); + val |= (1 << (bit + cpu_buffer->nest)); cpu_buffer->current_context = val; return 0; @@ -2593,7 +2670,57 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) static __always_inline void trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) { - cpu_buffer->current_context &= cpu_buffer->current_context - 1; + cpu_buffer->current_context &= + cpu_buffer->current_context - (1 << cpu_buffer->nest); +} + +/* The recursive locking above uses 4 bits */ +#define NESTED_BITS 4 + +/** + * ring_buffer_nest_start - Allow to trace while nested + * @buffer: The ring buffer to modify + * + * The ring buffer has a safty mechanism to prevent recursion. + * But there may be a case where a trace needs to be done while + * tracing something else. In this case, calling this function + * will allow this function to nest within a currently active + * ring_buffer_lock_reserve(). + * + * Call this function before calling another ring_buffer_lock_reserve() and + * call ring_buffer_nest_end() after the nested ring_buffer_unlock_commit(). 
+ */ +void ring_buffer_nest_start(struct ring_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu; + + /* Enabled by ring_buffer_nest_end() */ + preempt_disable_notrace(); + cpu = raw_smp_processor_id(); + cpu_buffer = buffer->buffers[cpu]; + /* This is the shift value for the above recusive locking */ + cpu_buffer->nest += NESTED_BITS; +} + +/** + * ring_buffer_nest_end - Allow to trace while nested + * @buffer: The ring buffer to modify + * + * Must be called after ring_buffer_nest_start() and after the + * ring_buffer_unlock_commit(). + */ +void ring_buffer_nest_end(struct ring_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu; + + /* disabled by ring_buffer_nest_start() */ + cpu = raw_smp_processor_id(); + cpu_buffer = buffer->buffers[cpu]; + /* This is the shift value for the above recusive locking */ + cpu_buffer->nest -= NESTED_BITS; + preempt_enable_notrace(); } /** @@ -2637,7 +2764,8 @@ rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, sched_clock_stable() ? "" : "If you just came from a suspend/resume,\n" "please switch to the trace global clock:\n" - " echo global > /sys/kernel/debug/tracing/trace_clock\n"); + " echo global > /sys/kernel/debug/tracing/trace_clock\n" + "or add trace_clock=global to the kernel command line\n"); info->add_timestamp = 1; } @@ -2669,7 +2797,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * If this is the first commit on the page, then it has the same * timestamp as the page itself. */ - if (!tail) + if (!tail && !ring_buffer_time_stamp_abs(cpu_buffer->buffer)) info->delta = 0; /* See if we shot pass the end of this buffer page */ @@ -2746,8 +2874,11 @@ rb_reserve_next_event(struct ring_buffer *buffer, /* make sure this diff is calculated here */ barrier(); - /* Did the write stamp get updated already? 
*/ - if (likely(info.ts >= cpu_buffer->write_stamp)) { + if (ring_buffer_time_stamp_abs(buffer)) { + info.delta = info.ts; + rb_handle_timestamp(cpu_buffer, &info); + } else /* Did the write stamp get updated already? */ + if (likely(info.ts >= cpu_buffer->write_stamp)) { info.delta = diff; if (unlikely(test_time_stamp(info.delta))) rb_handle_timestamp(cpu_buffer, &info); @@ -3429,14 +3560,13 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, return; case RINGBUF_TYPE_TIME_EXTEND: - delta = event->array[0]; - delta <<= TS_SHIFT; - delta += event->time_delta; + delta = ring_buffer_event_time_stamp(event); cpu_buffer->read_stamp += delta; return; case RINGBUF_TYPE_TIME_STAMP: - /* FIXME: not implemented */ + delta = ring_buffer_event_time_stamp(event); + cpu_buffer->read_stamp = delta; return; case RINGBUF_TYPE_DATA: @@ -3460,14 +3590,13 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter, return; case RINGBUF_TYPE_TIME_EXTEND: - delta = event->array[0]; - delta <<= TS_SHIFT; - delta += event->time_delta; + delta = ring_buffer_event_time_stamp(event); iter->read_stamp += delta; return; case RINGBUF_TYPE_TIME_STAMP: - /* FIXME: not implemented */ + delta = ring_buffer_event_time_stamp(event); + iter->read_stamp = delta; return; case RINGBUF_TYPE_DATA: @@ -3691,6 +3820,8 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, struct buffer_page *reader; int nr_loops = 0; + if (ts) + *ts = 0; again: /* * We repeat when a time extend is encountered. 
@@ -3727,12 +3858,17 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, goto again; case RINGBUF_TYPE_TIME_STAMP: - /* FIXME: not implemented */ + if (ts) { + *ts = ring_buffer_event_time_stamp(event); + ring_buffer_normalize_time_stamp(cpu_buffer->buffer, + cpu_buffer->cpu, ts); + } + /* Internal data, OK to advance */ rb_advance_reader(cpu_buffer); goto again; case RINGBUF_TYPE_DATA: - if (ts) { + if (ts && !(*ts)) { *ts = cpu_buffer->read_stamp + event->time_delta; ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); @@ -3757,6 +3893,9 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_event *event; int nr_loops = 0; + if (ts) + *ts = 0; + cpu_buffer = iter->cpu_buffer; buffer = cpu_buffer->buffer; @@ -3809,12 +3948,17 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) goto again; case RINGBUF_TYPE_TIME_STAMP: - /* FIXME: not implemented */ + if (ts) { + *ts = ring_buffer_event_time_stamp(event); + ring_buffer_normalize_time_stamp(cpu_buffer->buffer, + cpu_buffer->cpu, ts); + } + /* Internal data, OK to advance */ rb_advance_iter(iter); goto again; case RINGBUF_TYPE_DATA: - if (ts) { + if (ts && !(*ts)) { *ts = iter->read_stamp + event->time_delta; ring_buffer_normalize_time_stamp(buffer, cpu_buffer->cpu, ts); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 32c069bbf41b..dfbcf9ee1447 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -41,6 +41,7 @@ #include <linux/nmi.h> #include <linux/fs.h> #include <linux/trace.h> +#include <linux/sched/clock.h> #include <linux/sched/rt.h> #include "trace.h" @@ -530,8 +531,6 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, ubuf += ret; cnt -= ret; - parser.buffer[parser.idx] = 0; - ret = -EINVAL; if (kstrtoul(parser.buffer, 0, &val)) break; @@ -1166,10 +1165,18 @@ static struct { { trace_clock, "perf", 1 }, { ktime_get_mono_fast_ns, "mono", 1 }, { ktime_get_raw_fast_ns, "mono_raw", 1 }, - { ktime_get_boot_fast_ns, 
"boot", 1 }, + { ktime_get_mono_fast_ns, "boot", 1 }, ARCH_TRACE_CLOCKS }; +bool trace_clock_in_ns(struct trace_array *tr) +{ + if (trace_clocks[tr->clock_id].in_ns) + return true; + + return false; +} + /* * trace_parser_get_init - gets the buffer for trace parser */ @@ -1236,18 +1243,18 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, cnt--; } + parser->idx = 0; + /* only spaces were written */ - if (isspace(ch)) { + if (isspace(ch) || !ch) { *ppos += read; ret = read; goto out; } - - parser->idx = 0; } /* read the non-space input */ - while (cnt && !isspace(ch)) { + while (cnt && !isspace(ch) && ch) { if (parser->idx < parser->size - 1) parser->buffer[parser->idx++] = ch; else { @@ -1262,12 +1269,14 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, } /* We either got finished input or we have to wait for another call. */ - if (isspace(ch)) { + if (isspace(ch) || !ch) { parser->buffer[parser->idx] = 0; parser->cont = false; } else if (parser->idx < parser->size - 1) { parser->cont = true; parser->buffer[parser->idx++] = ch; + /* Make sure the parsed string always terminates with '\0'. 
*/ + parser->buffer[parser->idx] = 0; } else { ret = -EINVAL; goto out; @@ -2269,7 +2278,7 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, *current_rb = trace_file->tr->trace_buffer.buffer; - if ((trace_file->flags & + if (!ring_buffer_time_stamp_abs(*current_rb) && (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) && (entry = this_cpu_read(trace_buffered_event))) { /* Try to use the per cpu buffer first */ @@ -2380,7 +2389,7 @@ EXPORT_SYMBOL_GPL(trace_event_buffer_commit); * trace_buffer_unlock_commit_regs() * trace_event_buffer_commit() * trace_event_raw_event_xxx() -*/ + */ # define STACK_SKIP 3 void trace_buffer_unlock_commit_regs(struct trace_array *tr, @@ -4515,6 +4524,9 @@ static const char readme_msg[] = #ifdef CONFIG_X86_64 " x86-tsc: TSC cycle counter\n" #endif + "\n timestamp_mode\t-view the mode used to timestamp events\n" + " delta: Delta difference against a buffer-wide timestamp\n" + " absolute: Absolute (standalone) timestamp\n" "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n" "\n trace_marker_raw\t\t- Writes into this file writes binary data into the kernel buffer\n" " tracing_cpumask\t- Limit which CPUs to trace\n" @@ -4691,8 +4703,9 @@ static const char readme_msg[] = "\t .sym display an address as a symbol\n" "\t .sym-offset display an address as a symbol and offset\n" "\t .execname display a common_pid as a program name\n" - "\t .syscall display a syscall id as a syscall name\n\n" - "\t .log2 display log2 value rather than raw number\n\n" + "\t .syscall display a syscall id as a syscall name\n" + "\t .log2 display log2 value rather than raw number\n" + "\t .usecs display a common_timestamp in microseconds\n\n" "\t The 'pause' parameter can be used to pause an existing hist\n" "\t trigger or to start a hist trigger but not log any events\n" "\t until told to do so. 
'continue' can be used to start or\n" @@ -5623,13 +5636,13 @@ trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_tabl /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) - return POLLIN | POLLRDNORM; + return EPOLLIN | EPOLLRDNORM; if (tr->trace_flags & TRACE_ITER_BLOCK) /* * Always select as readable when in blocking mode */ - return POLLIN | POLLRDNORM; + return EPOLLIN | EPOLLRDNORM; else return ring_buffer_poll_wait(iter->trace_buffer->buffer, iter->cpu_file, filp, poll_table); @@ -6202,7 +6215,7 @@ static int tracing_clock_show(struct seq_file *m, void *v) return 0; } -static int tracing_set_clock(struct trace_array *tr, const char *clockstr) +int tracing_set_clock(struct trace_array *tr, const char *clockstr) { int i; @@ -6282,6 +6295,71 @@ static int tracing_clock_open(struct inode *inode, struct file *file) return ret; } +static int tracing_time_stamp_mode_show(struct seq_file *m, void *v) +{ + struct trace_array *tr = m->private; + + mutex_lock(&trace_types_lock); + + if (ring_buffer_time_stamp_abs(tr->trace_buffer.buffer)) + seq_puts(m, "delta [absolute]\n"); + else + seq_puts(m, "[delta] absolute\n"); + + mutex_unlock(&trace_types_lock); + + return 0; +} + +static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + int ret; + + if (tracing_disabled) + return -ENODEV; + + if (trace_array_get(tr)) + return -ENODEV; + + ret = single_open(file, tracing_time_stamp_mode_show, inode->i_private); + if (ret < 0) + trace_array_put(tr); + + return ret; +} + +int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs) +{ + int ret = 0; + + mutex_lock(&trace_types_lock); + + if (abs && tr->time_stamp_abs_ref++) + goto out; + + if (!abs) { + if (WARN_ON_ONCE(!tr->time_stamp_abs_ref)) { + ret = -EINVAL; + goto out; + } + + if (--tr->time_stamp_abs_ref) + goto out; + } + + 
ring_buffer_set_time_stamp_abs(tr->trace_buffer.buffer, abs); + +#ifdef CONFIG_TRACER_MAX_TRACE + if (tr->max_buffer.buffer) + ring_buffer_set_time_stamp_abs(tr->max_buffer.buffer, abs); +#endif + out: + mutex_unlock(&trace_types_lock); + + return ret; +} + struct ftrace_buffer_info { struct trace_iterator iter; void *spare; @@ -6529,6 +6607,13 @@ static const struct file_operations trace_clock_fops = { .write = tracing_clock_write, }; +static const struct file_operations trace_time_stamp_mode_fops = { + .open = tracing_time_stamp_mode_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_single_release_tr, +}; + #ifdef CONFIG_TRACER_SNAPSHOT static const struct file_operations snapshot_fops = { .open = tracing_snapshot_open, @@ -7699,6 +7784,7 @@ static int instance_mkdir(const char *name) INIT_LIST_HEAD(&tr->systems); INIT_LIST_HEAD(&tr->events); + INIT_LIST_HEAD(&tr->hist_vars); if (allocate_trace_buffers(tr, trace_buf_size) < 0) goto out_free_tr; @@ -7851,6 +7937,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("tracing_on", 0644, d_tracer, tr, &rb_simple_fops); + trace_create_file("timestamp_mode", 0444, d_tracer, tr, + &trace_time_stamp_mode_fops); + create_trace_options_dir(tr); #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) @@ -8446,6 +8535,7 @@ __init static int tracer_alloc_buffers(void) INIT_LIST_HEAD(&global_trace.systems); INIT_LIST_HEAD(&global_trace.events); + INIT_LIST_HEAD(&global_trace.hist_vars); list_add(&global_trace.list, &ftrace_trace_arrays); apply_trace_boot_options(); @@ -8507,3 +8597,21 @@ __init static int clear_boot_tracer(void) fs_initcall(tracer_init_tracefs); late_initcall_sync(clear_boot_tracer); + +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +__init static int tracing_set_default_clock(void) +{ + /* sched_clock_stable() is determined in late_initcall */ + if (!trace_boot_clock && !sched_clock_stable()) { + printk(KERN_WARNING + "Unstable clock detected, 
switching default tracing clock to \"global\"\n" + "If you want to keep using the local clock, then add:\n" + " \"trace_clock=local\"\n" + "on the kernel command line\n"); + tracing_set_clock(&global_trace, "global"); + } + + return 0; +} +late_initcall_sync(tracing_set_default_clock); +#endif diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2a6d0325a761..6fb46a06c9dc 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -273,6 +273,8 @@ struct trace_array { /* function tracing enabled */ int function_enabled; #endif + int time_stamp_abs_ref; + struct list_head hist_vars; }; enum { @@ -286,6 +288,11 @@ extern struct mutex trace_types_lock; extern int trace_array_get(struct trace_array *tr); extern void trace_array_put(struct trace_array *tr); +extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs); +extern int tracing_set_clock(struct trace_array *tr, const char *clockstr); + +extern bool trace_clock_in_ns(struct trace_array *tr); + /* * The global tracer (top) should be the first trace array added, * but we check the flag anyway. 
@@ -1209,12 +1216,11 @@ struct ftrace_event_field { int is_signed; }; +struct prog_entry; + struct event_filter { - int n_preds; /* Number assigned */ - int a_preds; /* allocated */ - struct filter_pred __rcu *preds; - struct filter_pred __rcu *root; - char *filter_string; + struct prog_entry __rcu *prog; + char *filter_string; }; struct event_subsystem { @@ -1291,7 +1297,7 @@ __event_trigger_test_discard(struct trace_event_file *file, unsigned long eflags = file->flags; if (eflags & EVENT_FILE_FL_TRIGGER_COND) - *tt = event_triggers_call(file, entry); + *tt = event_triggers_call(file, entry, event); if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) || (unlikely(file->flags & EVENT_FILE_FL_FILTERED) && @@ -1328,7 +1334,7 @@ event_trigger_unlock_commit(struct trace_event_file *file, trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc); if (tt) - event_triggers_post_call(file, tt, entry); + event_triggers_post_call(file, tt, entry, event); } /** @@ -1361,7 +1367,7 @@ event_trigger_unlock_commit_regs(struct trace_event_file *file, irq_flags, pc, regs); if (tt) - event_triggers_post_call(file, tt, entry); + event_triggers_post_call(file, tt, entry, event); } #define FILTER_PRED_INVALID ((unsigned short)-1) @@ -1406,12 +1412,8 @@ struct filter_pred { unsigned short *ops; struct ftrace_event_field *field; int offset; - int not; + int not; int op; - unsigned short index; - unsigned short parent; - unsigned short left; - unsigned short right; }; static inline bool is_string_field(struct ftrace_event_field *field) @@ -1543,6 +1545,8 @@ extern void pause_named_trigger(struct event_trigger_data *data); extern void unpause_named_trigger(struct event_trigger_data *data); extern void set_named_trigger_data(struct event_trigger_data *data, struct event_trigger_data *named_data); +extern struct event_trigger_data * +get_named_trigger_data(struct event_trigger_data *data); extern int register_event_command(struct event_command *cmd); extern int 
unregister_event_command(struct event_command *cmd); extern int register_trigger_hist_enable_disable_cmds(void); @@ -1586,7 +1590,8 @@ extern int register_trigger_hist_enable_disable_cmds(void); */ struct event_trigger_ops { void (*func)(struct event_trigger_data *data, - void *rec); + void *rec, + struct ring_buffer_event *rbe); int (*init)(struct event_trigger_ops *ops, struct event_trigger_data *data); void (*free)(struct event_trigger_ops *ops, diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 5fdc779f411d..d8a188e0418a 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -96,7 +96,7 @@ u64 notrace trace_clock_global(void) int this_cpu; u64 now; - local_irq_save(flags); + raw_local_irq_save(flags); this_cpu = raw_smp_processor_id(); now = sched_clock_cpu(this_cpu); @@ -122,7 +122,7 @@ u64 notrace trace_clock_global(void) arch_spin_unlock(&trace_clock_struct.lock); out: - local_irq_restore(flags); + raw_local_irq_restore(flags); return now; } diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 55d6dff37daf..c79193e598f5 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -8,6 +8,7 @@ #include <linux/module.h> #include <linux/kprobes.h> #include "trace.h" +#include "trace_probe.h" static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS]; @@ -237,6 +238,111 @@ void perf_trace_destroy(struct perf_event *p_event) mutex_unlock(&event_mutex); } +#ifdef CONFIG_KPROBE_EVENTS +int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe) +{ + int ret; + char *func = NULL; + struct trace_event_call *tp_event; + + if (p_event->attr.kprobe_func) { + func = kzalloc(KSYM_NAME_LEN, GFP_KERNEL); + if (!func) + return -ENOMEM; + ret = strncpy_from_user( + func, u64_to_user_ptr(p_event->attr.kprobe_func), + KSYM_NAME_LEN); + if (ret == KSYM_NAME_LEN) + ret = -E2BIG; + if (ret < 0) + goto out; + + if (func[0] == '\0') { + kfree(func); + func = NULL; + } + } + + 
tp_event = create_local_trace_kprobe( + func, (void *)(unsigned long)(p_event->attr.kprobe_addr), + p_event->attr.probe_offset, is_retprobe); + if (IS_ERR(tp_event)) { + ret = PTR_ERR(tp_event); + goto out; + } + + ret = perf_trace_event_init(tp_event, p_event); + if (ret) + destroy_local_trace_kprobe(tp_event); +out: + kfree(func); + return ret; +} + +void perf_kprobe_destroy(struct perf_event *p_event) +{ + perf_trace_event_close(p_event); + perf_trace_event_unreg(p_event); + + destroy_local_trace_kprobe(p_event->tp_event); +} +#endif /* CONFIG_KPROBE_EVENTS */ + +#ifdef CONFIG_UPROBE_EVENTS +int perf_uprobe_init(struct perf_event *p_event, bool is_retprobe) +{ + int ret; + char *path = NULL; + struct trace_event_call *tp_event; + + if (!p_event->attr.uprobe_path) + return -EINVAL; + path = kzalloc(PATH_MAX, GFP_KERNEL); + if (!path) + return -ENOMEM; + ret = strncpy_from_user( + path, u64_to_user_ptr(p_event->attr.uprobe_path), PATH_MAX); + if (ret == PATH_MAX) + return -E2BIG; + if (ret < 0) + goto out; + if (path[0] == '\0') { + ret = -EINVAL; + goto out; + } + + tp_event = create_local_trace_uprobe( + path, p_event->attr.probe_offset, is_retprobe); + if (IS_ERR(tp_event)) { + ret = PTR_ERR(tp_event); + goto out; + } + + /* + * local trace_uprobe need to hold event_mutex to call + * uprobe_buffer_enable() and uprobe_buffer_disable(). + * event_mutex is not required for local trace_kprobes. 
+ */ + mutex_lock(&event_mutex); + ret = perf_trace_event_init(tp_event, p_event); + if (ret) + destroy_local_trace_uprobe(tp_event); + mutex_unlock(&event_mutex); +out: + kfree(path); + return ret; +} + +void perf_uprobe_destroy(struct perf_event *p_event) +{ + mutex_lock(&event_mutex); + perf_trace_event_close(p_event); + perf_trace_event_unreg(p_event); + mutex_unlock(&event_mutex); + destroy_local_trace_uprobe(p_event->tp_event); +} +#endif /* CONFIG_UPROBE_EVENTS */ + int perf_trace_add(struct perf_event *p_event, int flags) { struct trace_event_call *tp_event = p_event->tp_event; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 1b87157edbff..05c7172c6667 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -885,8 +885,6 @@ ftrace_event_write(struct file *file, const char __user *ubuf, if (*parser.buffer == '!') set = 0; - parser.buffer[parser.idx] = 0; - ret = ftrace_set_clr_event(tr, parser.buffer + !set, set); if (ret) goto out_put; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 61e7f0678d33..9b4716bb8bb0 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -33,163 +33,595 @@ "# Only events with the given fields will be affected.\n" \ "# If no events are modified, an error message will be displayed here" -enum filter_op_ids -{ - OP_OR, - OP_AND, - OP_GLOB, - OP_NE, - OP_EQ, - OP_LT, - OP_LE, - OP_GT, - OP_GE, - OP_BAND, - OP_NOT, - OP_NONE, - OP_OPEN_PAREN, -}; +/* Due to token parsing '<=' must be before '<' and '>=' must be before '>' */ +#define OPS \ + C( OP_GLOB, "~" ), \ + C( OP_NE, "!=" ), \ + C( OP_EQ, "==" ), \ + C( OP_LE, "<=" ), \ + C( OP_LT, "<" ), \ + C( OP_GE, ">=" ), \ + C( OP_GT, ">" ), \ + C( OP_BAND, "&" ), \ + C( OP_MAX, NULL ) -struct filter_op { - int id; - char *string; - int precedence; -}; +#undef C +#define C(a, b) a -/* Order must be the same as enum filter_op_ids above */ -static struct filter_op 
filter_ops[] = { - { OP_OR, "||", 1 }, - { OP_AND, "&&", 2 }, - { OP_GLOB, "~", 4 }, - { OP_NE, "!=", 4 }, - { OP_EQ, "==", 4 }, - { OP_LT, "<", 5 }, - { OP_LE, "<=", 5 }, - { OP_GT, ">", 5 }, - { OP_GE, ">=", 5 }, - { OP_BAND, "&", 6 }, - { OP_NOT, "!", 6 }, - { OP_NONE, "OP_NONE", 0 }, - { OP_OPEN_PAREN, "(", 0 }, -}; +enum filter_op_ids { OPS }; -enum { - FILT_ERR_NONE, - FILT_ERR_INVALID_OP, - FILT_ERR_UNBALANCED_PAREN, - FILT_ERR_TOO_MANY_OPERANDS, - FILT_ERR_OPERAND_TOO_LONG, - FILT_ERR_FIELD_NOT_FOUND, - FILT_ERR_ILLEGAL_FIELD_OP, - FILT_ERR_ILLEGAL_INTVAL, - FILT_ERR_BAD_SUBSYS_FILTER, - FILT_ERR_TOO_MANY_PREDS, - FILT_ERR_MISSING_FIELD, - FILT_ERR_INVALID_FILTER, - FILT_ERR_IP_FIELD_ONLY, - FILT_ERR_ILLEGAL_NOT_OP, -}; +#undef C +#define C(a, b) b -static char *err_text[] = { - "No error", - "Invalid operator", - "Unbalanced parens", - "Too many operands", - "Operand too long", - "Field not found", - "Illegal operation for field type", - "Illegal integer value", - "Couldn't find or set field in one of a subsystem's events", - "Too many terms in predicate expression", - "Missing field name and/or value", - "Meaningless filter expression", - "Only 'ip' field is supported for function trace", - "Illegal use of '!'", -}; +static const char * ops[] = { OPS }; -struct opstack_op { - enum filter_op_ids op; - struct list_head list; -}; +/* + * pred functions are OP_LE, OP_LT, OP_GE, OP_GT, and OP_BAND + * pred_funcs_##type below must match the order of them above. 
+ */ +#define PRED_FUNC_START OP_LE +#define PRED_FUNC_MAX (OP_BAND - PRED_FUNC_START) + +#define ERRORS \ + C(NONE, "No error"), \ + C(INVALID_OP, "Invalid operator"), \ + C(TOO_MANY_OPEN, "Too many '('"), \ + C(TOO_MANY_CLOSE, "Too few '('"), \ + C(MISSING_QUOTE, "Missing matching quote"), \ + C(OPERAND_TOO_LONG, "Operand too long"), \ + C(EXPECT_STRING, "Expecting string field"), \ + C(EXPECT_DIGIT, "Expecting numeric field"), \ + C(ILLEGAL_FIELD_OP, "Illegal operation for field type"), \ + C(FIELD_NOT_FOUND, "Field not found"), \ + C(ILLEGAL_INTVAL, "Illegal integer value"), \ + C(BAD_SUBSYS_FILTER, "Couldn't find or set field in one of a subsystem's events"), \ + C(TOO_MANY_PREDS, "Too many terms in predicate expression"), \ + C(INVALID_FILTER, "Meaningless filter expression"), \ + C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \ + C(INVALID_VALUE, "Invalid value (did you forget quotes)?"), + +#undef C +#define C(a, b) FILT_ERR_##a + +enum { ERRORS }; + +#undef C +#define C(a, b) b + +static char *err_text[] = { ERRORS }; + +/* Called after a '!' character but "!=" and "!~" are not "not"s */ +static bool is_not(const char *str) +{ + switch (str[1]) { + case '=': + case '~': + return false; + } + return true; +} -struct postfix_elt { - enum filter_op_ids op; - char *operand; - struct list_head list; +/** + * prog_entry - a single entry in the filter program + * @target: Index to jump to on a branch (actually one minus the index) + * @when_to_branch: The value of the result of the predicate to do a branch + * @pred: The predicate to execute.
+ */ +struct prog_entry { + int target; + int when_to_branch; + struct filter_pred *pred; }; -struct filter_parse_state { - struct filter_op *ops; - struct list_head opstack; - struct list_head postfix; +/** + * update_preds - assign a program entry a label target + * @prog: The program array + * @N: The index of the current entry in @prog + * @invert: What to assign a program entry for its branch condition + * + * The program entry at @N has a target that points to the index of a program + * entry that can have its target and when_to_branch fields updated. + * Update the current program entry denoted by index @N target field to be + * that of the updated entry. This will denote the entry to update if + * we are processing an "||" after an "&&" + */ +static void update_preds(struct prog_entry *prog, int N, int invert) +{ + int t, s; + + t = prog[N].target; + s = prog[t].target; + prog[t].when_to_branch = invert; + prog[t].target = N; + prog[N].target = s; +} + +struct filter_parse_error { int lasterr; int lasterr_pos; - - struct { - char *string; - unsigned int cnt; - unsigned int tail; - } infix; - - struct { - char string[MAX_FILTER_STR_VAL]; - int pos; - unsigned int tail; - } operand; }; -struct pred_stack { - struct filter_pred **preds; - int index; +static void parse_error(struct filter_parse_error *pe, int err, int pos) +{ + pe->lasterr = err; + pe->lasterr_pos = pos; +} + +typedef int (*parse_pred_fn)(const char *str, void *data, int pos, + struct filter_parse_error *pe, + struct filter_pred **pred); + +enum { + INVERT = 1, + PROCESS_AND = 2, + PROCESS_OR = 4, }; -/* If not of not match is equal to not of not, then it is a match */ +/* + * Without going into a formal proof, this explains the method that is used in + * parsing the logical expressions.
+ * + * For example, if we have: "a && !(!b || (c && g)) || d || e && !f" + * The first pass will convert it into the following program: + * + * n1: r=a; l1: if (!r) goto l4; + * n2: r=b; l2: if (!r) goto l4; + * n3: r=c; r=!r; l3: if (r) goto l4; + * n4: r=g; r=!r; l4: if (r) goto l5; + * n5: r=d; l5: if (r) goto T + * n6: r=e; l6: if (!r) goto l7; + * n7: r=f; r=!r; l7: if (!r) goto F + * T: return TRUE + * F: return FALSE + * + * To do this, we use a data structure to represent each of the above + * predicates and conditions that has: + * + * predicate, when_to_branch, invert, target + * + * The "predicate" will hold the function to determine the result "r". + * The "when_to_branch" denotes what "r" should be if a branch is to be taken + * "&&" would contain "!r" or (0) and "||" would contain "r" or (1). + * The "invert" holds whether the value should be reversed before testing. + * The "target" contains the label "l#" to jump to. + * + * A stack is created to hold values when parentheses are used. + * + * To simplify the logic, the labels will start at 0 and not 1. + * + * The possible invert values are 1 and 0. The number of "!"s that are in scope + * before the predicate determines the invert value, if the number is odd then + * the invert value is 1 and 0 otherwise. This means the invert value only + * needs to be toggled when a new "!" is introduced compared to what is stored + * on the stack, where parentheses were used. + * + * The top of the stack and "invert" are initialized to zero. + * + * ** FIRST PASS ** + * + * #1 A loop through all the tokens is done: + * + * #2 If the token is an "(", the stack is pushed, and the current stack value + * gets the current invert value, and the loop continues to the next token. + * The top of the stack saves the "invert" value to keep track of what + * the current inversion is. As "!(a && !b || c)" would require all + * predicates being affected separately by the "!" before the parentheses.
+ * And that would end up being equivalent to "(!a || b) && !c" + * + * #3 If the token is an "!", the current "invert" value gets inverted, and + * the loop continues. Note, if the next token is a predicate, then + * this "invert" value is only valid for the current program entry, + * and does not affect other predicates later on. + * + * The only other acceptable token is the predicate string. + * + * #4 A new entry into the program is added saving: the predicate and the + * current value of "invert". The target is currently assigned to the + * previous program index (this will not be its final value). + * + * #5 We now enter another loop and look at the next token. The only valid + * tokens are ")", "&&", "||" or end of the input string "\0". + * + * #6 The invert variable is reset to the current value saved on the top of + * the stack. + * + * #7 The top of the stack holds not only the current invert value, but also + * if a "&&" or "||" needs to be processed. Note, the "&&" takes higher + * precedence than "||". That is "a && b || c && d" is equivalent to + * "(a && b) || (c && d)". Thus the first thing to do is to see if "&&" needs + * to be processed. This is the case if an "&&" was the last token. If it was + * then we call update_preds(). This takes the program, the current index in + * the program, and the current value of "invert". More will be described + * below about this function. + * + * #8 If the next token is "&&" then we set a flag in the top of the stack + * that denotes that "&&" needs to be processed, break out of this loop + * and continue with the outer loop. + * + * #9 Otherwise, if a "||" needs to be processed then update_preds() is called. + * This is called with the program, the current index in the program, but + * this time with an inverted value of "invert" (that is !invert). This is + * because the value taken will become the "when_to_branch" value of the + * program. + * Note, this is called when the next token is not an "&&". 
As stated before, + * "&&" takes higher precedence, and "||" should not be processed yet if the + * next logical operation is "&&". + * + * #10 If the next token is "||" then we set a flag in the top of the stack + * that denotes that "||" needs to be processed, break out of this loop + * and continue with the outer loop. + * + * #11 If this is the end of the input string "\0" then we break out of both + * loops. + * + * #12 Otherwise, the next token is ")", where we pop the stack and continue + * this inner loop. + * + * Now to discuss the update_preds() function, as that is key to the setting up + * of the program. Remember the "target" of the program is initialized to the + * previous index and not the "l" label. The target holds the index into the + * program that gets affected by the operand. Thus if we have something like + * "a || b && c", when we process "a" the target will be "-1" (undefined). + * When we process "b", its target is "0", which is the index of "a", as that's + * the predicate that is affected by "||". But because the next token after "b" + * is "&&" we don't call update_preds(). Instead continue to "c". As the + * next token after "c" is not "&&" but the end of input, we first process the + * "&&" by calling update_preds() for the "&&" then we process the "||" by + * calling update_preds() with the values for processing "||". + * + * What does that mean? What update_preds() does is to first save the "target" + * of the program entry indexed by the current program entry's "target" + * (remember the "target" is initialized to previous program entry), and then + * sets that "target" to the current index which represents the label "l#". + * That entry's "when_to_branch" is set to the value passed in (the "invert" + * or "!invert"). Then it sets the current program entry's target to the saved + * "target" value (the old value of the program that had its "target" updated + * to the label).
+ * + * Looking back at "a || b && c", we have the following steps: + * "a" - prog[0] = { "a", X, -1 } // pred, when_to_branch, target + * "||" - flag that we need to process "||"; continue outer loop + * "b" - prog[1] = { "b", X, 0 } + * "&&" - flag that we need to process "&&"; continue outer loop + * (Notice we did not process "||") + * "c" - prog[2] = { "c", X, 1 } + * update_preds(prog, 2, 0); // invert = 0 as we are processing "&&" + * t = prog[2].target; // t = 1 + * s = prog[t].target; // s = 0 + * prog[t].target = 2; // Set target to "l2" + * prog[t].when_to_branch = 0; + * prog[2].target = s; + * update_preds(prog, 2, 1); // invert = 1 as we are now processing "||" + * t = prog[2].target; // t = 0 + * s = prog[t].target; // s = -1 + * prog[t].target = 2; // Set target to "l2" + * prog[t].when_to_branch = 1; + * prog[2].target = s; + * + * #13 Which brings us to the final step of the first pass, which is to set + * the last program entry's when_to_branch and target, which will be + * when_to_branch = 0; target = N; ( the label after the program entry after + * the last program entry processed above). + * + * If we denote "TRUE" to be the entry after the last program entry processed, + * and "FALSE" the program entry after that, we are now done with the first + * pass. + * + * Making the above "a || b && c" have a program of: + * prog[0] = { "a", 1, 2 } + * prog[1] = { "b", 0, 2 } + * prog[2] = { "c", 0, 3 } + * + * Which translates into: + * n0: r = a; l0: if (r) goto l2; + * n1: r = b; l1: if (!r) goto l2; + * n2: r = c; l2: if (!r) goto l3; // Which is the same as "goto F;" + * T: return TRUE; l3: + * F: return FALSE + * + * Although, after the first pass, the program is correct, it is + * inefficient. The simple sample of "a || b && c" could easily have been + * converted into: + * n0: r = a; if (r) goto T + * n1: r = b; if (!r) goto F + * n2: r = c; if (!r) goto F + * T: return TRUE; + * F: return FALSE; + * + * The First Pass is over the input string.
The next two passes are over + * the program itself. + * + * ** SECOND PASS ** + * + * Which brings us to the second pass. If a jump to a label has the + * same condition as that label, it can instead jump to its target. + * The original example of "a && !(!b || (c && g)) || d || e && !f" + * where the first pass gives us: + * + * n1: r=a; l1: if (!r) goto l4; + * n2: r=b; l2: if (!r) goto l4; + * n3: r=c; r=!r; l3: if (r) goto l4; + * n4: r=g; r=!r; l4: if (r) goto l5; + * n5: r=d; l5: if (r) goto T + * n6: r=e; l6: if (!r) goto l7; + * n7: r=f; r=!r; l7: if (!r) goto F; + * T: return TRUE; + * F: return FALSE + * + * We can see that "l3: if (r) goto l4;" and at l4, we have "if (r) goto l5;". + * And "l5: if (r) goto T", we could optimize this by converting l3 and l4 + * to go directly to T. To accomplish this, we start from the last + * entry in the program and work our way back. If the target of the entry + * has the same "when_to_branch" then we could use that entry's target. + * Doing this, the above would end up as: + * + * n1: r=a; l1: if (!r) goto l4; + * n2: r=b; l2: if (!r) goto l4; + * n3: r=c; r=!r; l3: if (r) goto T; + * n4: r=g; r=!r; l4: if (r) goto T; + * n5: r=d; l5: if (r) goto T; + * n6: r=e; l6: if (!r) goto F; + * n7: r=f; r=!r; l7: if (!r) goto F; + * T: return TRUE + * F: return FALSE + * + * In that same pass, if the "when_to_branch" doesn't match, we can simply + * go to the program entry after the label. That is, "l2: if (!r) goto l4;" + * where "l4: if (r) goto T;", then we can convert l2 to be: + * "l2: if (!r) goto n5;". + * + * This will have the second pass give us: + * n1: r=a; l1: if (!r) goto n5; + * n2: r=b; l2: if (!r) goto n5; + * n3: r=c; r=!r; l3: if (r) goto T; + * n4: r=g; r=!r; l4: if (r) goto T; + * n5: r=d; l5: if (r) goto T + * n6: r=e; l6: if (!r) goto F; + * n7: r=f; r=!r; l7: if (!r) goto F + * T: return TRUE + * F: return FALSE + * + * Notice, all the "l#" labels are no longer used, and they can now + * be discarded.
+ * + * ** THIRD PASS ** + * + * For the third pass we deal with the inverts. As they simply + * make the "when_to_branch" get inverted, a simple loop over the + * program that does: "when_to_branch ^= invert;" will do the + * job, leaving us with: + * n1: r=a; if (!r) goto n5; + * n2: r=b; if (!r) goto n5; + * n3: r=c; if (!r) goto T; + * n4: r=g; if (!r) goto T; + * n5: r=d; if (r) goto T + * n6: r=e; if (!r) goto F; + * n7: r=f; if (r) goto F + * T: return TRUE + * F: return FALSE + * + * As "r = a; if (!r) goto n5;" is obviously the same as + * "if (!a) goto n5;" without doing anything we can interpret the + * program as: + * n1: if (!a) goto n5; + * n2: if (!b) goto n5; + * n3: if (!c) goto T; + * n4: if (!g) goto T; + * n5: if (d) goto T + * n6: if (!e) goto F; + * n7: if (f) goto F + * T: return TRUE + * F: return FALSE + * + * Since the inverts are discarded at the end, there's no reason to store + * them in the program array (and waste memory). A separate array to hold + * the inverts is used and freed at the end. 
+ */ +static struct prog_entry * +predicate_parse(const char *str, int nr_parens, int nr_preds, + parse_pred_fn parse_pred, void *data, + struct filter_parse_error *pe) +{ + struct prog_entry *prog_stack; + struct prog_entry *prog; + const char *ptr = str; + char *inverts = NULL; + int *op_stack; + int *top; + int invert = 0; + int ret = -ENOMEM; + int len; + int N = 0; + int i; + + nr_preds += 2; /* For TRUE and FALSE */ + + op_stack = kmalloc(sizeof(*op_stack) * nr_parens, GFP_KERNEL); + if (!op_stack) + return ERR_PTR(-ENOMEM); + prog_stack = kmalloc(sizeof(*prog_stack) * nr_preds, GFP_KERNEL); + if (!prog_stack) { + parse_error(pe, -ENOMEM, 0); + goto out_free; + } + inverts = kmalloc(sizeof(*inverts) * nr_preds, GFP_KERNEL); + if (!inverts) { + parse_error(pe, -ENOMEM, 0); + goto out_free; + } + + top = op_stack; + prog = prog_stack; + *top = 0; + + /* First pass */ + while (*ptr) { /* #1 */ + const char *next = ptr++; + + if (isspace(*next)) + continue; + + switch (*next) { + case '(': /* #2 */ + if (top - op_stack > nr_parens) + return ERR_PTR(-EINVAL); + *(++top) = invert; + continue; + case '!': /* #3 */ + if (!is_not(next)) + break; + invert = !invert; + continue; + } + + if (N >= nr_preds) { + parse_error(pe, FILT_ERR_TOO_MANY_PREDS, next - str); + goto out_free; + } + + inverts[N] = invert; /* #4 */ + prog[N].target = N-1; + + len = parse_pred(next, data, ptr - str, pe, &prog[N].pred); + if (len < 0) { + ret = len; + goto out_free; + } + ptr = next + len; + + N++; + + ret = -1; + while (1) { /* #5 */ + next = ptr++; + if (isspace(*next)) + continue; + + switch (*next) { + case ')': + case '\0': + break; + case '&': + case '|': + if (next[1] == next[0]) { + ptr++; + break; + } + default: + parse_error(pe, FILT_ERR_TOO_MANY_PREDS, + next - str); + goto out_free; + } + + invert = *top & INVERT; + + if (*top & PROCESS_AND) { /* #7 */ + update_preds(prog, N - 1, invert); + *top &= ~PROCESS_AND; + } + if (*next == '&') { /* #8 */ + *top |= PROCESS_AND; + 
break; + } + if (*top & PROCESS_OR) { /* #9 */ + update_preds(prog, N - 1, !invert); + *top &= ~PROCESS_OR; + } + if (*next == '|') { /* #10 */ + *top |= PROCESS_OR; + break; + } + if (!*next) /* #11 */ + goto out; + + if (top == op_stack) { + ret = -1; + /* Too few '(' */ + parse_error(pe, FILT_ERR_TOO_MANY_CLOSE, ptr - str); + goto out_free; + } + top--; /* #12 */ + } + } + out: + if (top != op_stack) { + /* Too many '(' */ + parse_error(pe, FILT_ERR_TOO_MANY_OPEN, ptr - str); + goto out_free; + } + + prog[N].pred = NULL; /* #13 */ + prog[N].target = 1; /* TRUE */ + prog[N+1].pred = NULL; + prog[N+1].target = 0; /* FALSE */ + prog[N-1].target = N; + prog[N-1].when_to_branch = false; + + /* Second Pass */ + for (i = N-1 ; i--; ) { + int target = prog[i].target; + if (prog[i].when_to_branch == prog[target].when_to_branch) + prog[i].target = prog[target].target; + } + + /* Third Pass */ + for (i = 0; i < N; i++) { + invert = inverts[i] ^ prog[i].when_to_branch; + prog[i].when_to_branch = invert; + /* Make sure the program always moves forward */ + if (WARN_ON(prog[i].target <= i)) { + ret = -EINVAL; + goto out_free; + } + } + + return prog; +out_free: + kfree(op_stack); + kfree(prog_stack); + kfree(inverts); + return ERR_PTR(ret); +} + #define DEFINE_COMPARISON_PRED(type) \ static int filter_pred_LT_##type(struct filter_pred *pred, void *event) \ { \ type *addr = (type *)(event + pred->offset); \ type val = (type)pred->val; \ - int match = (*addr < val); \ - return !!match == !pred->not; \ + return *addr < val; \ } \ static int filter_pred_LE_##type(struct filter_pred *pred, void *event) \ { \ type *addr = (type *)(event + pred->offset); \ type val = (type)pred->val; \ - int match = (*addr <= val); \ - return !!match == !pred->not; \ + return *addr <= val; \ } \ static int filter_pred_GT_##type(struct filter_pred *pred, void *event) \ { \ type *addr = (type *)(event + pred->offset); \ type val = (type)pred->val; \ - int match = (*addr > val); \ - return !!match == 
!pred->not; \ + return *addr > val; \ } \ static int filter_pred_GE_##type(struct filter_pred *pred, void *event) \ { \ type *addr = (type *)(event + pred->offset); \ type val = (type)pred->val; \ - int match = (*addr >= val); \ - return !!match == !pred->not; \ + return *addr >= val; \ } \ static int filter_pred_BAND_##type(struct filter_pred *pred, void *event) \ { \ type *addr = (type *)(event + pred->offset); \ type val = (type)pred->val; \ - int match = !!(*addr & val); \ - return match == !pred->not; \ + return !!(*addr & val); \ } \ static const filter_pred_fn_t pred_funcs_##type[] = { \ - filter_pred_LT_##type, \ filter_pred_LE_##type, \ - filter_pred_GT_##type, \ + filter_pred_LT_##type, \ filter_pred_GE_##type, \ + filter_pred_GT_##type, \ filter_pred_BAND_##type, \ }; -#define PRED_FUNC_START OP_LT - #define DEFINE_EQUALITY_PRED(size) \ static int filter_pred_##size(struct filter_pred *pred, void *event) \ { \ @@ -272,44 +704,36 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event) static int filter_pred_cpu(struct filter_pred *pred, void *event) { int cpu, cmp; - int match = 0; cpu = raw_smp_processor_id(); cmp = pred->val; switch (pred->op) { case OP_EQ: - match = cpu == cmp; - break; + return cpu == cmp; + case OP_NE: + return cpu != cmp; case OP_LT: - match = cpu < cmp; - break; + return cpu < cmp; case OP_LE: - match = cpu <= cmp; - break; + return cpu <= cmp; case OP_GT: - match = cpu > cmp; - break; + return cpu > cmp; case OP_GE: - match = cpu >= cmp; - break; + return cpu >= cmp; default: - break; + return 0; } - - return !!match == !pred->not; } /* Filter predicate for COMM. 
*/ static int filter_pred_comm(struct filter_pred *pred, void *event) { - int cmp, match; + int cmp; cmp = pred->regex.match(current->comm, &pred->regex, - pred->regex.field_len); - match = cmp ^ pred->not; - - return match; + TASK_COMM_LEN); + return cmp ^ pred->not; } static int filter_pred_none(struct filter_pred *pred, void *event) @@ -366,6 +790,7 @@ static int regex_match_glob(char *str, struct regex *r, int len __maybe_unused) return 1; return 0; } + /** * filter_parse_regex - parse a basic regex * @buff: the raw regex @@ -400,7 +825,6 @@ enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not) for (i = 0; i < len; i++) { if (buff[i] == '*') { if (!i) { - *search = buff + 1; type = MATCH_END_ONLY; } else if (i == len - 1) { if (type == MATCH_END_ONLY) @@ -410,14 +834,14 @@ enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not) buff[i] = 0; break; } else { /* pattern continues, use full glob */ - type = MATCH_GLOB; - break; + return MATCH_GLOB; } } else if (strchr("[?\\", buff[i])) { - type = MATCH_GLOB; - break; + return MATCH_GLOB; } } + if (buff[0] == '*') + *search = buff + 1; return type; } @@ -427,10 +851,9 @@ static void filter_build_regex(struct filter_pred *pred) struct regex *r = &pred->regex; char *search; enum regex_type type = MATCH_FULL; - int not = 0; if (pred->op == OP_GLOB) { - type = filter_parse_regex(r->pattern, r->len, &search, ¬); + type = filter_parse_regex(r->pattern, r->len, &search, &pred->not); r->len = strlen(search); memmove(r->pattern, search, r->len+1); } @@ -452,210 +875,32 @@ static void filter_build_regex(struct filter_pred *pred) r->match = regex_match_glob; break; } - - pred->not ^= not; -} - -enum move_type { - MOVE_DOWN, - MOVE_UP_FROM_LEFT, - MOVE_UP_FROM_RIGHT -}; - -static struct filter_pred * -get_pred_parent(struct filter_pred *pred, struct filter_pred *preds, - int index, enum move_type *move) -{ - if (pred->parent & FILTER_PRED_IS_RIGHT) - *move = 
MOVE_UP_FROM_RIGHT; - else - *move = MOVE_UP_FROM_LEFT; - pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT]; - - return pred; -} - -enum walk_return { - WALK_PRED_ABORT, - WALK_PRED_PARENT, - WALK_PRED_DEFAULT, -}; - -typedef int (*filter_pred_walkcb_t) (enum move_type move, - struct filter_pred *pred, - int *err, void *data); - -static int walk_pred_tree(struct filter_pred *preds, - struct filter_pred *root, - filter_pred_walkcb_t cb, void *data) -{ - struct filter_pred *pred = root; - enum move_type move = MOVE_DOWN; - int done = 0; - - if (!preds) - return -EINVAL; - - do { - int err = 0, ret; - - ret = cb(move, pred, &err, data); - if (ret == WALK_PRED_ABORT) - return err; - if (ret == WALK_PRED_PARENT) - goto get_parent; - - switch (move) { - case MOVE_DOWN: - if (pred->left != FILTER_PRED_INVALID) { - pred = &preds[pred->left]; - continue; - } - goto get_parent; - case MOVE_UP_FROM_LEFT: - pred = &preds[pred->right]; - move = MOVE_DOWN; - continue; - case MOVE_UP_FROM_RIGHT: - get_parent: - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, - &move); - continue; - } - done = 1; - } while (!done); - - /* We are fine. */ - return 0; -} - -/* - * A series of AND or ORs where found together. Instead of - * climbing up and down the tree branches, an array of the - * ops were made in order of checks. We can just move across - * the array and short circuit if needed. - */ -static int process_ops(struct filter_pred *preds, - struct filter_pred *op, void *rec) -{ - struct filter_pred *pred; - int match = 0; - int type; - int i; - - /* - * Micro-optimization: We set type to true if op - * is an OR and false otherwise (AND). 
Then we - * just need to test if the match is equal to - * the type, and if it is, we can short circuit the - * rest of the checks: - * - * if ((match && op->op == OP_OR) || - * (!match && op->op == OP_AND)) - * return match; - */ - type = op->op == OP_OR; - - for (i = 0; i < op->val; i++) { - pred = &preds[op->ops[i]]; - if (!WARN_ON_ONCE(!pred->fn)) - match = pred->fn(pred, rec); - if (!!match == type) - break; - } - /* If not of not match is equal to not of not, then it is a match */ - return !!match == !op->not; -} - -struct filter_match_preds_data { - struct filter_pred *preds; - int match; - void *rec; -}; - -static int filter_match_preds_cb(enum move_type move, struct filter_pred *pred, - int *err, void *data) -{ - struct filter_match_preds_data *d = data; - - *err = 0; - switch (move) { - case MOVE_DOWN: - /* only AND and OR have children */ - if (pred->left != FILTER_PRED_INVALID) { - /* If ops is set, then it was folded. */ - if (!pred->ops) - return WALK_PRED_DEFAULT; - /* We can treat folded ops as a leaf node */ - d->match = process_ops(d->preds, pred, d->rec); - } else { - if (!WARN_ON_ONCE(!pred->fn)) - d->match = pred->fn(pred, d->rec); - } - - return WALK_PRED_PARENT; - case MOVE_UP_FROM_LEFT: - /* - * Check for short circuits. 
- * - * Optimization: !!match == (pred->op == OP_OR) - * is the same as: - * if ((match && pred->op == OP_OR) || - * (!match && pred->op == OP_AND)) - */ - if (!!d->match == (pred->op == OP_OR)) - return WALK_PRED_PARENT; - break; - case MOVE_UP_FROM_RIGHT: - break; - } - - return WALK_PRED_DEFAULT; } /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct event_filter *filter, void *rec) { - struct filter_pred *preds; - struct filter_pred *root; - struct filter_match_preds_data data = { - /* match is currently meaningless */ - .match = -1, - .rec = rec, - }; - int n_preds, ret; + struct prog_entry *prog; + int i; /* no filter is considered a match */ if (!filter) return 1; - n_preds = filter->n_preds; - if (!n_preds) + prog = rcu_dereference_sched(filter->prog); + if (!prog) return 1; - /* - * n_preds, root and filter->preds are protect with preemption disabled. - */ - root = rcu_dereference_sched(filter->root); - if (!root) - return 1; - - data.preds = preds = rcu_dereference_sched(filter->preds); - ret = walk_pred_tree(preds, root, filter_match_preds_cb, &data); - WARN_ON(ret); - return data.match; + for (i = 0; prog[i].pred; i++) { + struct filter_pred *pred = prog[i].pred; + int match = pred->fn(pred, rec); + if (match == prog[i].when_to_branch) + i = prog[i].target; + } + return prog[i].target; } EXPORT_SYMBOL_GPL(filter_match_preds); -static void parse_error(struct filter_parse_state *ps, int err, int pos) -{ - ps->lasterr = err; - ps->lasterr_pos = pos; -} - static void remove_filter_string(struct event_filter *filter) { if (!filter) @@ -665,57 +910,44 @@ static void remove_filter_string(struct event_filter *filter) filter->filter_string = NULL; } -static int replace_filter_string(struct event_filter *filter, - char *filter_string) -{ - kfree(filter->filter_string); - filter->filter_string = kstrdup(filter_string, GFP_KERNEL); - if (!filter->filter_string) - return -ENOMEM; - - return 0; -} - -static int 
append_filter_string(struct event_filter *filter, - char *string) -{ - int newlen; - char *new_filter_string; - - BUG_ON(!filter->filter_string); - newlen = strlen(filter->filter_string) + strlen(string) + 1; - new_filter_string = kmalloc(newlen, GFP_KERNEL); - if (!new_filter_string) - return -ENOMEM; - - strcpy(new_filter_string, filter->filter_string); - strcat(new_filter_string, string); - kfree(filter->filter_string); - filter->filter_string = new_filter_string; - - return 0; -} - -static void append_filter_err(struct filter_parse_state *ps, +static void append_filter_err(struct filter_parse_error *pe, struct event_filter *filter) { - int pos = ps->lasterr_pos; - char *buf, *pbuf; + struct trace_seq *s; + int pos = pe->lasterr_pos; + char *buf; + int len; + + if (WARN_ON(!filter->filter_string)) + return; - buf = (char *)__get_free_page(GFP_KERNEL); - if (!buf) + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) return; + trace_seq_init(s); - append_filter_string(filter, "\n"); - memset(buf, ' ', PAGE_SIZE); - if (pos > PAGE_SIZE - 128) - pos = 0; - buf[pos] = '^'; - pbuf = &buf[pos] + 1; + len = strlen(filter->filter_string); + if (pos > len) + pos = len; - sprintf(pbuf, "\nparse_error: %s\n", err_text[ps->lasterr]); - append_filter_string(filter, buf); - free_page((unsigned long) buf); + /* indexing is off by one */ + if (pos) + pos++; + + trace_seq_puts(s, filter->filter_string); + if (pe->lasterr > 0) { + trace_seq_printf(s, "\n%*s", pos, "^"); + trace_seq_printf(s, "\nparse_error: %s\n", err_text[pe->lasterr]); + } else { + trace_seq_printf(s, "\nError: (%d)\n", pe->lasterr); + } + trace_seq_putc(s, 0); + buf = kmemdup_nul(s->buffer, s->seq.len, GFP_KERNEL); + if (buf) { + kfree(filter->filter_string); + filter->filter_string = buf; + } + kfree(s); } static inline struct event_filter *event_filter(struct trace_event_file *file) @@ -748,108 +980,18 @@ void print_subsystem_event_filter(struct event_subsystem *system, mutex_unlock(&event_mutex); } -static int 
__alloc_pred_stack(struct pred_stack *stack, int n_preds) -{ - stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL); - if (!stack->preds) - return -ENOMEM; - stack->index = n_preds; - return 0; -} - -static void __free_pred_stack(struct pred_stack *stack) -{ - kfree(stack->preds); - stack->index = 0; -} - -static int __push_pred_stack(struct pred_stack *stack, - struct filter_pred *pred) -{ - int index = stack->index; - - if (WARN_ON(index == 0)) - return -ENOSPC; - - stack->preds[--index] = pred; - stack->index = index; - return 0; -} - -static struct filter_pred * -__pop_pred_stack(struct pred_stack *stack) -{ - struct filter_pred *pred; - int index = stack->index; - - pred = stack->preds[index++]; - if (!pred) - return NULL; - - stack->index = index; - return pred; -} - -static int filter_set_pred(struct event_filter *filter, - int idx, - struct pred_stack *stack, - struct filter_pred *src) -{ - struct filter_pred *dest = &filter->preds[idx]; - struct filter_pred *left; - struct filter_pred *right; - - *dest = *src; - dest->index = idx; - - if (dest->op == OP_OR || dest->op == OP_AND) { - right = __pop_pred_stack(stack); - left = __pop_pred_stack(stack); - if (!left || !right) - return -EINVAL; - /* - * If both children can be folded - * and they are the same op as this op or a leaf, - * then this op can be folded. - */ - if (left->index & FILTER_PRED_FOLD && - ((left->op == dest->op && !left->not) || - left->left == FILTER_PRED_INVALID) && - right->index & FILTER_PRED_FOLD && - ((right->op == dest->op && !right->not) || - right->left == FILTER_PRED_INVALID)) - dest->index |= FILTER_PRED_FOLD; - - dest->left = left->index & ~FILTER_PRED_FOLD; - dest->right = right->index & ~FILTER_PRED_FOLD; - left->parent = dest->index & ~FILTER_PRED_FOLD; - right->parent = dest->index | FILTER_PRED_IS_RIGHT; - } else { - /* - * Make dest->left invalid to be used as a quick - * way to know this is a leaf node. 
- */ - dest->left = FILTER_PRED_INVALID; - - /* All leafs allow folding the parent ops. */ - dest->index |= FILTER_PRED_FOLD; - } - - return __push_pred_stack(stack, dest); -} - -static void __free_preds(struct event_filter *filter) +static void free_prog(struct event_filter *filter) { + struct prog_entry *prog; int i; - if (filter->preds) { - for (i = 0; i < filter->n_preds; i++) - kfree(filter->preds[i].ops); - kfree(filter->preds); - filter->preds = NULL; - } - filter->a_preds = 0; - filter->n_preds = 0; + prog = rcu_access_pointer(filter->prog); + if (!prog) + return; + + for (i = 0; prog[i].pred; i++) + kfree(prog[i].pred); + kfree(prog); } static void filter_disable(struct trace_event_file *file) @@ -867,7 +1009,7 @@ static void __free_filter(struct event_filter *filter) if (!filter) return; - __free_preds(filter); + free_prog(filter); kfree(filter->filter_string); kfree(filter); } @@ -877,38 +1019,6 @@ void free_event_filter(struct event_filter *filter) __free_filter(filter); } -static struct event_filter *__alloc_filter(void) -{ - struct event_filter *filter; - - filter = kzalloc(sizeof(*filter), GFP_KERNEL); - return filter; -} - -static int __alloc_preds(struct event_filter *filter, int n_preds) -{ - struct filter_pred *pred; - int i; - - if (filter->preds) - __free_preds(filter); - - filter->preds = kcalloc(n_preds, sizeof(*filter->preds), GFP_KERNEL); - - if (!filter->preds) - return -ENOMEM; - - filter->a_preds = n_preds; - filter->n_preds = 0; - - for (i = 0; i < n_preds; i++) { - pred = &filter->preds[i]; - pred->fn = filter_pred_none; - } - - return 0; -} - static inline void __remove_filter(struct trace_event_file *file) { filter_disable(file); @@ -945,27 +1055,6 @@ static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir, } } -static int filter_add_pred(struct filter_parse_state *ps, - struct event_filter *filter, - struct filter_pred *pred, - struct pred_stack *stack) -{ - int err; - - if (WARN_ON(filter->n_preds == 
filter->a_preds)) { - parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); - return -ENOSPC; - } - - err = filter_set_pred(filter, filter->n_preds, stack, pred); - if (err) - return err; - - filter->n_preds++; - - return 0; -} - int filter_assign_type(const char *type) { if (strstr(type, "__data_loc") && strstr(type, "char")) @@ -977,761 +1066,449 @@ int filter_assign_type(const char *type) return FILTER_OTHER; } -static bool is_legal_op(struct ftrace_event_field *field, enum filter_op_ids op) -{ - if (is_string_field(field) && - (op != OP_EQ && op != OP_NE && op != OP_GLOB)) - return false; - if (!is_string_field(field) && op == OP_GLOB) - return false; - - return true; -} - static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op, int field_size, int field_is_signed) { filter_pred_fn_t fn = NULL; + int pred_func_index = -1; + + switch (op) { + case OP_EQ: + case OP_NE: + break; + default: + if (WARN_ON_ONCE(op < PRED_FUNC_START)) + return NULL; + pred_func_index = op - PRED_FUNC_START; + if (WARN_ON_ONCE(pred_func_index > PRED_FUNC_MAX)) + return NULL; + } switch (field_size) { case 8: - if (op == OP_EQ || op == OP_NE) + if (pred_func_index < 0) fn = filter_pred_64; else if (field_is_signed) - fn = pred_funcs_s64[op - PRED_FUNC_START]; + fn = pred_funcs_s64[pred_func_index]; else - fn = pred_funcs_u64[op - PRED_FUNC_START]; + fn = pred_funcs_u64[pred_func_index]; break; case 4: - if (op == OP_EQ || op == OP_NE) + if (pred_func_index < 0) fn = filter_pred_32; else if (field_is_signed) - fn = pred_funcs_s32[op - PRED_FUNC_START]; + fn = pred_funcs_s32[pred_func_index]; else - fn = pred_funcs_u32[op - PRED_FUNC_START]; + fn = pred_funcs_u32[pred_func_index]; break; case 2: - if (op == OP_EQ || op == OP_NE) + if (pred_func_index < 0) fn = filter_pred_16; else if (field_is_signed) - fn = pred_funcs_s16[op - PRED_FUNC_START]; + fn = pred_funcs_s16[pred_func_index]; else - fn = pred_funcs_u16[op - PRED_FUNC_START]; + fn = pred_funcs_u16[pred_func_index]; break; case 
1: - if (op == OP_EQ || op == OP_NE) + if (pred_func_index < 0) fn = filter_pred_8; else if (field_is_signed) - fn = pred_funcs_s8[op - PRED_FUNC_START]; + fn = pred_funcs_s8[pred_func_index]; else - fn = pred_funcs_u8[op - PRED_FUNC_START]; + fn = pred_funcs_u8[pred_func_index]; break; } return fn; } -static int init_pred(struct filter_parse_state *ps, - struct ftrace_event_field *field, - struct filter_pred *pred) - +/* Called when a predicate is encountered by predicate_parse() */ +static int parse_pred(const char *str, void *data, + int pos, struct filter_parse_error *pe, + struct filter_pred **pred_ptr) { - filter_pred_fn_t fn = filter_pred_none; - unsigned long long val; + struct trace_event_call *call = data; + struct ftrace_event_field *field; + struct filter_pred *pred = NULL; + char num_buf[24]; /* Big enough to hold an address */ + char *field_name; + char q; + u64 val; + int len; int ret; + int op; + int s; + int i = 0; - pred->offset = field->offset; - - if (!is_legal_op(field, pred->op)) { - parse_error(ps, FILT_ERR_ILLEGAL_FIELD_OP, 0); - return -EINVAL; - } - - if (field->filter_type == FILTER_COMM) { - filter_build_regex(pred); - fn = filter_pred_comm; - pred->regex.field_len = TASK_COMM_LEN; - } else if (is_string_field(field)) { - filter_build_regex(pred); - - if (field->filter_type == FILTER_STATIC_STRING) { - fn = filter_pred_string; - pred->regex.field_len = field->size; - } else if (field->filter_type == FILTER_DYN_STRING) - fn = filter_pred_strloc; - else - fn = filter_pred_pchar; - } else if (is_function_field(field)) { - if (strcmp(field->name, "ip")) { - parse_error(ps, FILT_ERR_IP_FIELD_ONLY, 0); - return -EINVAL; - } - } else { - if (field->is_signed) - ret = kstrtoll(pred->regex.pattern, 0, &val); - else - ret = kstrtoull(pred->regex.pattern, 0, &val); - if (ret) { - parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); - return -EINVAL; - } - pred->val = val; - - if (field->filter_type == FILTER_CPU) - fn = filter_pred_cpu; - else - fn = 
select_comparison_fn(pred->op, field->size, - field->is_signed); - if (!fn) { - parse_error(ps, FILT_ERR_INVALID_OP, 0); - return -EINVAL; - } - } - - if (pred->op == OP_NE) - pred->not ^= 1; - - pred->fn = fn; - return 0; -} - -static void parse_init(struct filter_parse_state *ps, - struct filter_op *ops, - char *infix_string) -{ - memset(ps, '\0', sizeof(*ps)); - - ps->infix.string = infix_string; - ps->infix.cnt = strlen(infix_string); - ps->ops = ops; + /* First find the field to associate to */ + while (isspace(str[i])) + i++; + s = i; - INIT_LIST_HEAD(&ps->opstack); - INIT_LIST_HEAD(&ps->postfix); -} - -static char infix_next(struct filter_parse_state *ps) -{ - if (!ps->infix.cnt) - return 0; - - ps->infix.cnt--; - - return ps->infix.string[ps->infix.tail++]; -} - -static char infix_peek(struct filter_parse_state *ps) -{ - if (ps->infix.tail == strlen(ps->infix.string)) - return 0; - - return ps->infix.string[ps->infix.tail]; -} + while (isalnum(str[i]) || str[i] == '_') + i++; -static void infix_advance(struct filter_parse_state *ps) -{ - if (!ps->infix.cnt) - return; + len = i - s; - ps->infix.cnt--; - ps->infix.tail++; -} + if (!len) + return -1; -static inline int is_precedence_lower(struct filter_parse_state *ps, - int a, int b) -{ - return ps->ops[a].precedence < ps->ops[b].precedence; -} + field_name = kmemdup_nul(str + s, len, GFP_KERNEL); + if (!field_name) + return -ENOMEM; -static inline int is_op_char(struct filter_parse_state *ps, char c) -{ - int i; + /* Make sure that the field exists */ - for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { - if (ps->ops[i].string[0] == c) - return 1; + field = trace_find_event_field(call, field_name); + kfree(field_name); + if (!field) { + parse_error(pe, FILT_ERR_FIELD_NOT_FOUND, pos + i); + return -EINVAL; } - return 0; -} - -static int infix_get_op(struct filter_parse_state *ps, char firstc) -{ - char nextc = infix_peek(ps); - char opstr[3]; - int i; - - opstr[0] = firstc; - opstr[1] = nextc; - 
opstr[2] = '\0'; + while (isspace(str[i])) + i++; - for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { - if (!strcmp(opstr, ps->ops[i].string)) { - infix_advance(ps); - return ps->ops[i].id; - } + /* Make sure this op is supported */ + for (op = 0; ops[op]; op++) { + /* This is why '<=' must come before '<' in ops[] */ + if (strncmp(str + i, ops[op], strlen(ops[op])) == 0) + break; } - opstr[1] = '\0'; - - for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { - if (!strcmp(opstr, ps->ops[i].string)) - return ps->ops[i].id; + if (!ops[op]) { + parse_error(pe, FILT_ERR_INVALID_OP, pos + i); + goto err_free; } - return OP_NONE; -} - -static inline void clear_operand_string(struct filter_parse_state *ps) -{ - memset(ps->operand.string, '\0', MAX_FILTER_STR_VAL); - ps->operand.tail = 0; -} + i += strlen(ops[op]); -static inline int append_operand_char(struct filter_parse_state *ps, char c) -{ - if (ps->operand.tail == MAX_FILTER_STR_VAL - 1) - return -EINVAL; + while (isspace(str[i])) + i++; - ps->operand.string[ps->operand.tail++] = c; + s = i; - return 0; -} - -static int filter_opstack_push(struct filter_parse_state *ps, - enum filter_op_ids op) -{ - struct opstack_op *opstack_op; - - opstack_op = kmalloc(sizeof(*opstack_op), GFP_KERNEL); - if (!opstack_op) + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) return -ENOMEM; - opstack_op->op = op; - list_add(&opstack_op->list, &ps->opstack); - - return 0; -} + pred->field = field; + pred->offset = field->offset; + pred->op = op; -static int filter_opstack_empty(struct filter_parse_state *ps) -{ - return list_empty(&ps->opstack); -} + if (ftrace_event_is_function(call)) { + /* + * Perf does things different with function events. + * It only allows an "ip" field, and expects a string. + * But the string does not need to be surrounded by quotes. + * If it is a string, the assigned function as a nop, + * (perf doesn't use it) and grab everything. 
+ */ + if (strcmp(field->name, "ip") != 0) { + parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i); + goto err_free; + } + pred->fn = filter_pred_none; + + /* + * Quotes are not required, but if they exist then we need + * to read them till we hit a matching one. + */ + if (str[i] == '\'' || str[i] == '"') + q = str[i]; + else + q = 0; + + for (i++; str[i]; i++) { + if (q && str[i] == q) + break; + if (!q && (str[i] == ')' || str[i] == '&' || + str[i] == '|')) + break; + } + /* Skip quotes */ + if (q) + s++; + len = i - s; + if (len >= MAX_FILTER_STR_VAL) { + parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i); + goto err_free; + } -static int filter_opstack_top(struct filter_parse_state *ps) -{ - struct opstack_op *opstack_op; + pred->regex.len = len; + strncpy(pred->regex.pattern, str + s, len); + pred->regex.pattern[len] = 0; + + /* This is either a string, or an integer */ + } else if (str[i] == '\'' || str[i] == '"') { + char q = str[i]; + + /* Make sure the op is OK for strings */ + switch (op) { + case OP_NE: + pred->not = 1; + /* Fall through */ + case OP_GLOB: + case OP_EQ: + break; + default: + parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i); + goto err_free; + } - if (filter_opstack_empty(ps)) - return OP_NONE; + /* Make sure the field is OK for strings */ + if (!is_string_field(field)) { + parse_error(pe, FILT_ERR_EXPECT_DIGIT, pos + i); + goto err_free; + } - opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list); + for (i++; str[i]; i++) { + if (str[i] == q) + break; + } + if (!str[i]) { + parse_error(pe, FILT_ERR_MISSING_QUOTE, pos + i); + goto err_free; + } - return opstack_op->op; -} + /* Skip quotes */ + s++; + len = i - s; + if (len >= MAX_FILTER_STR_VAL) { + parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i); + goto err_free; + } -static int filter_opstack_pop(struct filter_parse_state *ps) -{ - struct opstack_op *opstack_op; - enum filter_op_ids op; + pred->regex.len = len; + strncpy(pred->regex.pattern, str + s, len); + 
pred->regex.pattern[len] = 0; - if (filter_opstack_empty(ps)) - return OP_NONE; + filter_build_regex(pred); - opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list); - op = opstack_op->op; - list_del(&opstack_op->list); + if (field->filter_type == FILTER_COMM) { + pred->fn = filter_pred_comm; - kfree(opstack_op); + } else if (field->filter_type == FILTER_STATIC_STRING) { + pred->fn = filter_pred_string; + pred->regex.field_len = field->size; - return op; -} + } else if (field->filter_type == FILTER_DYN_STRING) + pred->fn = filter_pred_strloc; + else + pred->fn = filter_pred_pchar; + /* go past the last quote */ + i++; -static void filter_opstack_clear(struct filter_parse_state *ps) -{ - while (!filter_opstack_empty(ps)) - filter_opstack_pop(ps); -} + } else if (isdigit(str[i])) { -static char *curr_operand(struct filter_parse_state *ps) -{ - return ps->operand.string; -} + /* Make sure the field is not a string */ + if (is_string_field(field)) { + parse_error(pe, FILT_ERR_EXPECT_STRING, pos + i); + goto err_free; + } -static int postfix_append_operand(struct filter_parse_state *ps, char *operand) -{ - struct postfix_elt *elt; + if (op == OP_GLOB) { + parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i); + goto err_free; + } - elt = kmalloc(sizeof(*elt), GFP_KERNEL); - if (!elt) - return -ENOMEM; + /* We allow 0xDEADBEEF */ + while (isalnum(str[i])) + i++; - elt->op = OP_NONE; - elt->operand = kstrdup(operand, GFP_KERNEL); - if (!elt->operand) { - kfree(elt); - return -ENOMEM; - } + len = i - s; + /* 0xfeedfacedeadbeef is 18 chars max */ + if (len >= sizeof(num_buf)) { + parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i); + goto err_free; + } - list_add_tail(&elt->list, &ps->postfix); + strncpy(num_buf, str + s, len); + num_buf[len] = 0; - return 0; -} + /* Make sure it is a value */ + if (field->is_signed) + ret = kstrtoll(num_buf, 0, &val); + else + ret = kstrtoull(num_buf, 0, &val); + if (ret) { + parse_error(pe, FILT_ERR_ILLEGAL_INTVAL, pos + s); + 
goto err_free; + } -static int postfix_append_op(struct filter_parse_state *ps, enum filter_op_ids op) -{ - struct postfix_elt *elt; + pred->val = val; - elt = kmalloc(sizeof(*elt), GFP_KERNEL); - if (!elt) - return -ENOMEM; + if (field->filter_type == FILTER_CPU) + pred->fn = filter_pred_cpu; + else { + pred->fn = select_comparison_fn(pred->op, field->size, + field->is_signed); + if (pred->op == OP_NE) + pred->not = 1; + } - elt->op = op; - elt->operand = NULL; + } else { + parse_error(pe, FILT_ERR_INVALID_VALUE, pos + i); + goto err_free; + } - list_add_tail(&elt->list, &ps->postfix); + *pred_ptr = pred; + return i; - return 0; +err_free: + kfree(pred); + return -EINVAL; } -static void postfix_clear(struct filter_parse_state *ps) -{ - struct postfix_elt *elt; +enum { + TOO_MANY_CLOSE = -1, + TOO_MANY_OPEN = -2, + MISSING_QUOTE = -3, +}; - while (!list_empty(&ps->postfix)) { - elt = list_first_entry(&ps->postfix, struct postfix_elt, list); - list_del(&elt->list); - kfree(elt->operand); - kfree(elt); - } -} +/* + * Read the filter string once to calculate the number of predicates + * as well as how deep the parentheses go. 
+ * + * Returns: + * 0 - everything is fine (err is undefined) + * -1 - too many ')' + * -2 - too many '(' + * -3 - No matching quote + */ +static int calc_stack(const char *str, int *parens, int *preds, int *err) +{ + bool is_pred = false; + int nr_preds = 0; + int open = 1; /* Count the expression as "(E)" */ + int last_quote = 0; + int max_open = 1; + int quote = 0; + int i; -static int filter_parse(struct filter_parse_state *ps) -{ - enum filter_op_ids op, top_op; - int in_string = 0; - char ch; + *err = 0; - while ((ch = infix_next(ps))) { - if (ch == '"') { - in_string ^= 1; + for (i = 0; str[i]; i++) { + if (isspace(str[i])) continue; - } - - if (in_string) - goto parse_operand; - - if (isspace(ch)) + if (quote) { + if (str[i] == quote) + quote = 0; continue; + } - if (is_op_char(ps, ch)) { - op = infix_get_op(ps, ch); - if (op == OP_NONE) { - parse_error(ps, FILT_ERR_INVALID_OP, 0); - return -EINVAL; - } - - if (strlen(curr_operand(ps))) { - postfix_append_operand(ps, curr_operand(ps)); - clear_operand_string(ps); - } - - while (!filter_opstack_empty(ps)) { - top_op = filter_opstack_top(ps); - if (!is_precedence_lower(ps, top_op, op)) { - top_op = filter_opstack_pop(ps); - postfix_append_op(ps, top_op); - continue; - } + switch (str[i]) { + case '\'': + case '"': + quote = str[i]; + last_quote = i; + break; + case '|': + case '&': + if (str[i+1] != str[i]) break; - } - - filter_opstack_push(ps, op); + is_pred = false; continue; - } - - if (ch == '(') { - filter_opstack_push(ps, OP_OPEN_PAREN); + case '(': + is_pred = false; + open++; + if (open > max_open) + max_open = open; continue; - } - - if (ch == ')') { - if (strlen(curr_operand(ps))) { - postfix_append_operand(ps, curr_operand(ps)); - clear_operand_string(ps); - } - - top_op = filter_opstack_pop(ps); - while (top_op != OP_NONE) { - if (top_op == OP_OPEN_PAREN) - break; - postfix_append_op(ps, top_op); - top_op = filter_opstack_pop(ps); - } - if (top_op == OP_NONE) { - parse_error(ps, 
FILT_ERR_UNBALANCED_PAREN, 0); - return -EINVAL; + case ')': + is_pred = false; + if (open == 1) { + *err = i; + return TOO_MANY_CLOSE; } + open--; continue; } -parse_operand: - if (append_operand_char(ps, ch)) { - parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0); - return -EINVAL; + if (!is_pred) { + nr_preds++; + is_pred = true; } } - if (strlen(curr_operand(ps))) - postfix_append_operand(ps, curr_operand(ps)); - - while (!filter_opstack_empty(ps)) { - top_op = filter_opstack_pop(ps); - if (top_op == OP_NONE) - break; - if (top_op == OP_OPEN_PAREN) { - parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0); - return -EINVAL; - } - postfix_append_op(ps, top_op); + if (quote) { + *err = last_quote; + return MISSING_QUOTE; } - return 0; -} + if (open != 1) { + int level = open; -static struct filter_pred *create_pred(struct filter_parse_state *ps, - struct trace_event_call *call, - enum filter_op_ids op, - char *operand1, char *operand2) -{ - struct ftrace_event_field *field; - static struct filter_pred pred; - - memset(&pred, 0, sizeof(pred)); - pred.op = op; - - if (op == OP_AND || op == OP_OR) - return &pred; - - if (!operand1 || !operand2) { - parse_error(ps, FILT_ERR_MISSING_FIELD, 0); - return NULL; - } - - field = trace_find_event_field(call, operand1); - if (!field) { - parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); - return NULL; - } - - strcpy(pred.regex.pattern, operand2); - pred.regex.len = strlen(pred.regex.pattern); - pred.field = field; - return init_pred(ps, field, &pred) ? 
NULL : &pred; -} - -static int check_preds(struct filter_parse_state *ps) -{ - int n_normal_preds = 0, n_logical_preds = 0; - struct postfix_elt *elt; - int cnt = 0; - - list_for_each_entry(elt, &ps->postfix, list) { - if (elt->op == OP_NONE) { - cnt++; - continue; - } - - if (elt->op == OP_AND || elt->op == OP_OR) { - n_logical_preds++; - cnt--; - continue; + /* find the bad open */ + for (i--; i; i--) { + if (quote) { + if (str[i] == quote) + quote = 0; + continue; + } + switch (str[i]) { + case '(': + if (level == open) { + *err = i; + return TOO_MANY_OPEN; + } + level--; + break; + case ')': + level++; + break; + case '\'': + case '"': + quote = str[i]; + break; + } } - if (elt->op != OP_NOT) - cnt--; - n_normal_preds++; - /* all ops should have operands */ - if (cnt < 0) - break; - } - - if (cnt != 1 || !n_normal_preds || n_logical_preds >= n_normal_preds) { - parse_error(ps, FILT_ERR_INVALID_FILTER, 0); - return -EINVAL; + /* First character is the '(' with missing ')' */ + *err = 0; + return TOO_MANY_OPEN; } + /* Set the size of the required stacks */ + *parens = max_open; + *preds = nr_preds; return 0; } -static int count_preds(struct filter_parse_state *ps) -{ - struct postfix_elt *elt; - int n_preds = 0; - - list_for_each_entry(elt, &ps->postfix, list) { - if (elt->op == OP_NONE) - continue; - n_preds++; - } - - return n_preds; -} - -struct check_pred_data { - int count; - int max; -}; - -static int check_pred_tree_cb(enum move_type move, struct filter_pred *pred, - int *err, void *data) -{ - struct check_pred_data *d = data; - - if (WARN_ON(d->count++ > d->max)) { - *err = -EINVAL; - return WALK_PRED_ABORT; - } - return WALK_PRED_DEFAULT; -} - -/* - * The tree is walked at filtering of an event. If the tree is not correctly - * built, it may cause an infinite loop. Check here that the tree does - * indeed terminate. 
- */ -static int check_pred_tree(struct event_filter *filter, - struct filter_pred *root) -{ - struct check_pred_data data = { - /* - * The max that we can hit a node is three times. - * Once going down, once coming up from left, and - * once coming up from right. This is more than enough - * since leafs are only hit a single time. - */ - .max = 3 * filter->n_preds, - .count = 0, - }; - - return walk_pred_tree(filter->preds, root, - check_pred_tree_cb, &data); -} - -static int count_leafs_cb(enum move_type move, struct filter_pred *pred, - int *err, void *data) -{ - int *count = data; - - if ((move == MOVE_DOWN) && - (pred->left == FILTER_PRED_INVALID)) - (*count)++; - - return WALK_PRED_DEFAULT; -} - -static int count_leafs(struct filter_pred *preds, struct filter_pred *root) -{ - int count = 0, ret; - - ret = walk_pred_tree(preds, root, count_leafs_cb, &count); - WARN_ON(ret); - return count; -} - -struct fold_pred_data { - struct filter_pred *root; - int count; - int children; -}; - -static int fold_pred_cb(enum move_type move, struct filter_pred *pred, - int *err, void *data) -{ - struct fold_pred_data *d = data; - struct filter_pred *root = d->root; - - if (move != MOVE_DOWN) - return WALK_PRED_DEFAULT; - if (pred->left != FILTER_PRED_INVALID) - return WALK_PRED_DEFAULT; - - if (WARN_ON(d->count == d->children)) { - *err = -EINVAL; - return WALK_PRED_ABORT; - } - - pred->index &= ~FILTER_PRED_FOLD; - root->ops[d->count++] = pred->index; - return WALK_PRED_DEFAULT; -} - -static int fold_pred(struct filter_pred *preds, struct filter_pred *root) -{ - struct fold_pred_data data = { - .root = root, - .count = 0, - }; - int children; - - /* No need to keep the fold flag */ - root->index &= ~FILTER_PRED_FOLD; - - /* If the root is a leaf then do nothing */ - if (root->left == FILTER_PRED_INVALID) - return 0; - - /* count the children */ - children = count_leafs(preds, &preds[root->left]); - children += count_leafs(preds, &preds[root->right]); - - root->ops = 
kcalloc(children, sizeof(*root->ops), GFP_KERNEL); - if (!root->ops) - return -ENOMEM; - - root->val = children; - data.children = children; - return walk_pred_tree(preds, root, fold_pred_cb, &data); -} - -static int fold_pred_tree_cb(enum move_type move, struct filter_pred *pred, - int *err, void *data) -{ - struct filter_pred *preds = data; - - if (move != MOVE_DOWN) - return WALK_PRED_DEFAULT; - if (!(pred->index & FILTER_PRED_FOLD)) - return WALK_PRED_DEFAULT; - - *err = fold_pred(preds, pred); - if (*err) - return WALK_PRED_ABORT; - - /* eveyrhing below is folded, continue with parent */ - return WALK_PRED_PARENT; -} - -/* - * To optimize the processing of the ops, if we have several "ors" or - * "ands" together, we can put them in an array and process them all - * together speeding up the filter logic. - */ -static int fold_pred_tree(struct event_filter *filter, - struct filter_pred *root) -{ - return walk_pred_tree(filter->preds, root, fold_pred_tree_cb, - filter->preds); -} - -static int replace_preds(struct trace_event_call *call, +static int process_preds(struct trace_event_call *call, + const char *filter_string, struct event_filter *filter, - struct filter_parse_state *ps, - bool dry_run) + struct filter_parse_error *pe) { - char *operand1 = NULL, *operand2 = NULL; - struct filter_pred *pred; - struct filter_pred *root; - struct postfix_elt *elt; - struct pred_stack stack = { }; /* init to NULL */ - int err; - int n_preds = 0; - - n_preds = count_preds(ps); - if (n_preds >= MAX_FILTER_PRED) { - parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); - return -ENOSPC; - } - - err = check_preds(ps); - if (err) - return err; - - if (!dry_run) { - err = __alloc_pred_stack(&stack, n_preds); - if (err) - return err; - err = __alloc_preds(filter, n_preds); - if (err) - goto fail; - } - - n_preds = 0; - list_for_each_entry(elt, &ps->postfix, list) { - if (elt->op == OP_NONE) { - if (!operand1) - operand1 = elt->operand; - else if (!operand2) - operand2 = elt->operand; - 
else { - parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); - err = -EINVAL; - goto fail; - } - continue; - } - - if (elt->op == OP_NOT) { - if (!n_preds || operand1 || operand2) { - parse_error(ps, FILT_ERR_ILLEGAL_NOT_OP, 0); - err = -EINVAL; - goto fail; - } - if (!dry_run) - filter->preds[n_preds - 1].not ^= 1; - continue; - } - - if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { - parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); - err = -ENOSPC; - goto fail; - } - - pred = create_pred(ps, call, elt->op, operand1, operand2); - if (!pred) { - err = -EINVAL; - goto fail; - } + struct prog_entry *prog; + int nr_parens; + int nr_preds; + int index; + int ret; - if (!dry_run) { - err = filter_add_pred(ps, filter, pred, &stack); - if (err) - goto fail; + ret = calc_stack(filter_string, &nr_parens, &nr_preds, &index); + if (ret < 0) { + switch (ret) { + case MISSING_QUOTE: + parse_error(pe, FILT_ERR_MISSING_QUOTE, index); + break; + case TOO_MANY_OPEN: + parse_error(pe, FILT_ERR_TOO_MANY_OPEN, index); + break; + default: + parse_error(pe, FILT_ERR_TOO_MANY_CLOSE, index); } - - operand1 = operand2 = NULL; + return ret; } - if (!dry_run) { - /* We should have one item left on the stack */ - pred = __pop_pred_stack(&stack); - if (!pred) - return -EINVAL; - /* This item is where we start from in matching */ - root = pred; - /* Make sure the stack is empty */ - pred = __pop_pred_stack(&stack); - if (WARN_ON(pred)) { - err = -EINVAL; - filter->root = NULL; - goto fail; - } - err = check_pred_tree(filter, root); - if (err) - goto fail; - - /* Optimize the tree */ - err = fold_pred_tree(filter, root); - if (err) - goto fail; - - /* We don't set root until we know it works */ - barrier(); - filter->root = root; + if (!nr_preds) { + prog = NULL; + } else { + prog = predicate_parse(filter_string, nr_parens, nr_preds, + parse_pred, call, pe); + if (IS_ERR(prog)) + return PTR_ERR(prog); } - - err = 0; -fail: - __free_pred_stack(&stack); - return err; + rcu_assign_pointer(filter->prog, prog); 
+ return 0; } static inline void event_set_filtered_flag(struct trace_event_file *file) @@ -1781,72 +1558,53 @@ struct filter_list { struct event_filter *filter; }; -static int replace_system_preds(struct trace_subsystem_dir *dir, +static int process_system_preds(struct trace_subsystem_dir *dir, struct trace_array *tr, - struct filter_parse_state *ps, + struct filter_parse_error *pe, char *filter_string) { struct trace_event_file *file; struct filter_list *filter_item; + struct event_filter *filter = NULL; struct filter_list *tmp; LIST_HEAD(filter_list); bool fail = true; int err; list_for_each_entry(file, &tr->events, list) { - if (file->system != dir) - continue; - - /* - * Try to see if the filter can be applied - * (filter arg is ignored on dry_run) - */ - err = replace_preds(file->event_call, NULL, ps, true); - if (err) - event_set_no_set_filter_flag(file); - else - event_clear_no_set_filter_flag(file); - } - - list_for_each_entry(file, &tr->events, list) { - struct event_filter *filter; if (file->system != dir) continue; - if (event_no_set_filter_flag(file)) - continue; - - filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); - if (!filter_item) - goto fail_mem; - - list_add_tail(&filter_item->list, &filter_list); - - filter_item->filter = __alloc_filter(); - if (!filter_item->filter) + filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (!filter) goto fail_mem; - filter = filter_item->filter; - /* Can only fail on no memory */ - err = replace_filter_string(filter, filter_string); - if (err) + filter->filter_string = kstrdup(filter_string, GFP_KERNEL); + if (!filter->filter_string) goto fail_mem; - err = replace_preds(file->event_call, filter, ps, false); + err = process_preds(file->event_call, filter_string, filter, pe); if (err) { filter_disable(file); - parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); - append_filter_err(ps, filter); + parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0); + append_filter_err(pe, filter); } else event_set_filtered_flag(file); 
+ + + filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); + if (!filter_item) + goto fail_mem; + + list_add_tail(&filter_item->list, &filter_list); /* * Regardless of if this returned an error, we still * replace the filter for the call. */ - filter = event_filter(file); - event_set_filter(file, filter_item->filter); - filter_item->filter = filter; + filter_item->filter = event_filter(file); + event_set_filter(file, filter); + filter = NULL; fail = false; } @@ -1872,9 +1630,10 @@ static int replace_system_preds(struct trace_subsystem_dir *dir, list_del(&filter_item->list); kfree(filter_item); } - parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0); return -EINVAL; fail_mem: + kfree(filter); /* If any call succeeded, we still need to sync */ if (!fail) synchronize_sched(); @@ -1886,47 +1645,42 @@ static int replace_system_preds(struct trace_subsystem_dir *dir, return -ENOMEM; } -static int create_filter_start(char *filter_str, bool set_str, - struct filter_parse_state **psp, +static int create_filter_start(char *filter_string, bool set_str, + struct filter_parse_error **pse, struct event_filter **filterp) { struct event_filter *filter; - struct filter_parse_state *ps = NULL; + struct filter_parse_error *pe = NULL; int err = 0; - WARN_ON_ONCE(*psp || *filterp); + if (WARN_ON_ONCE(*pse || *filterp)) + return -EINVAL; - /* allocate everything, and if any fails, free all and fail */ - filter = __alloc_filter(); - if (filter && set_str) - err = replace_filter_string(filter, filter_str); + filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (filter && set_str) { + filter->filter_string = kstrdup(filter_string, GFP_KERNEL); + if (!filter->filter_string) + err = -ENOMEM; + } - ps = kzalloc(sizeof(*ps), GFP_KERNEL); + pe = kzalloc(sizeof(*pe), GFP_KERNEL); - if (!filter || !ps || err) { - kfree(ps); + if (!filter || !pe || err) { + kfree(pe); __free_filter(filter); return -ENOMEM; } /* we're committed to creating a new filter 
*/ *filterp = filter; - *psp = ps; + *pse = pe; - parse_init(ps, filter_ops, filter_str); - err = filter_parse(ps); - if (err && set_str) - append_filter_err(ps, filter); - return err; + return 0; } -static void create_filter_finish(struct filter_parse_state *ps) +static void create_filter_finish(struct filter_parse_error *pe) { - if (ps) { - filter_opstack_clear(ps); - postfix_clear(ps); - kfree(ps); - } + kfree(pe); } /** @@ -1946,26 +1700,20 @@ static void create_filter_finish(struct filter_parse_state *ps) * freeing it. */ static int create_filter(struct trace_event_call *call, - char *filter_str, bool set_str, + char *filter_string, bool set_str, struct event_filter **filterp) { - struct event_filter *filter = NULL; - struct filter_parse_state *ps = NULL; + struct filter_parse_error *pe = NULL; int err; - err = create_filter_start(filter_str, set_str, &ps, &filter); - if (!err) { - err = replace_preds(call, filter, ps, false); - if (err && set_str) - append_filter_err(ps, filter); - } - if (err && !set_str) { - free_event_filter(filter); - filter = NULL; - } - create_filter_finish(ps); + err = create_filter_start(filter_string, set_str, &pe, filterp); + if (err) + return err; + + err = process_preds(call, filter_string, *filterp, pe); + if (err && set_str) + append_filter_err(pe, *filterp); - *filterp = filter; return err; } @@ -1989,24 +1737,22 @@ static int create_system_filter(struct trace_subsystem_dir *dir, struct trace_array *tr, char *filter_str, struct event_filter **filterp) { - struct event_filter *filter = NULL; - struct filter_parse_state *ps = NULL; + struct filter_parse_error *pe = NULL; int err; - err = create_filter_start(filter_str, true, &ps, &filter); + err = create_filter_start(filter_str, true, &pe, filterp); if (!err) { - err = replace_system_preds(dir, tr, ps, filter_str); + err = process_system_preds(dir, tr, pe, filter_str); if (!err) { /* System filters just show a default message */ - kfree(filter->filter_string); - 
filter->filter_string = NULL; + kfree((*filterp)->filter_string); + (*filterp)->filter_string = NULL; } else { - append_filter_err(ps, filter); + append_filter_err(pe, *filterp); } } - create_filter_finish(ps); + create_filter_finish(pe); - *filterp = filter; return err; } @@ -2014,7 +1760,7 @@ static int create_system_filter(struct trace_subsystem_dir *dir, int apply_event_filter(struct trace_event_file *file, char *filter_string) { struct trace_event_call *call = file->event_call; - struct event_filter *filter; + struct event_filter *filter = NULL; int err; if (!strcmp(strstrip(filter_string), "0")) { @@ -2067,7 +1813,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, { struct event_subsystem *system = dir->subsystem; struct trace_array *tr = dir->tr; - struct event_filter *filter; + struct event_filter *filter = NULL; int err = 0; mutex_lock(&event_mutex); @@ -2187,66 +1933,80 @@ static int __ftrace_function_set_filter(int filter, char *buf, int len, return ret; } -static int ftrace_function_check_pred(struct filter_pred *pred, int leaf) +static int ftrace_function_check_pred(struct filter_pred *pred) { struct ftrace_event_field *field = pred->field; - if (leaf) { - /* - * Check the leaf predicate for function trace, verify: - * - only '==' and '!=' is used - * - the 'ip' field is used - */ - if ((pred->op != OP_EQ) && (pred->op != OP_NE)) - return -EINVAL; + /* + * Check the predicate for function trace, verify: + * - only '==' and '!=' is used + * - the 'ip' field is used + */ + if ((pred->op != OP_EQ) && (pred->op != OP_NE)) + return -EINVAL; - if (strcmp(field->name, "ip")) - return -EINVAL; - } else { - /* - * Check the non leaf predicate for function trace, verify: - * - only '||' is used - */ - if (pred->op != OP_OR) - return -EINVAL; - } + if (strcmp(field->name, "ip")) + return -EINVAL; return 0; } -static int ftrace_function_set_filter_cb(enum move_type move, - struct filter_pred *pred, - int *err, void *data) +static int 
ftrace_function_set_filter_pred(struct filter_pred *pred, + struct function_filter_data *data) { + int ret; + /* Checking the node is valid for function trace. */ - if ((move != MOVE_DOWN) || - (pred->left != FILTER_PRED_INVALID)) { - *err = ftrace_function_check_pred(pred, 0); - } else { - *err = ftrace_function_check_pred(pred, 1); - if (*err) - return WALK_PRED_ABORT; - - *err = __ftrace_function_set_filter(pred->op == OP_EQ, - pred->regex.pattern, - pred->regex.len, - data); - } + ret = ftrace_function_check_pred(pred); + if (ret) + return ret; - return (*err) ? WALK_PRED_ABORT : WALK_PRED_DEFAULT; + return __ftrace_function_set_filter(pred->op == OP_EQ, + pred->regex.pattern, + pred->regex.len, + data); +} + +static bool is_or(struct prog_entry *prog, int i) +{ + int target; + + /* + * Only "||" is allowed for function events, thus, + * all true branches should jump to true, and any + * false branch should jump to false. + */ + target = prog[i].target + 1; + /* True and false have NULL preds (all prog entries should jump to one */ + if (prog[target].pred) + return false; + + /* prog[target].target is 1 for TRUE, 0 for FALSE */ + return prog[i].when_to_branch == prog[target].target; } static int ftrace_function_set_filter(struct perf_event *event, struct event_filter *filter) { + struct prog_entry *prog = rcu_dereference_protected(filter->prog, + lockdep_is_held(&event_mutex)); struct function_filter_data data = { .first_filter = 1, .first_notrace = 1, .ops = &event->ftrace_ops, }; + int i; + + for (i = 0; prog[i].pred; i++) { + struct filter_pred *pred = prog[i].pred; - return walk_pred_tree(filter->preds, filter->root, - ftrace_function_set_filter_cb, &data); + if (!is_or(prog, i)) + return -EINVAL; + + if (ftrace_function_set_filter_pred(pred, &data) < 0) + return -EINVAL; + } + return 0; } #else static int ftrace_function_set_filter(struct perf_event *event, @@ -2260,7 +2020,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, char 
*filter_str) { int err; - struct event_filter *filter; + struct event_filter *filter = NULL; struct trace_event_call *call; mutex_lock(&event_mutex); @@ -2376,7 +2136,7 @@ static struct test_filter_data_t { #undef YES #undef NO -#define DATA_CNT (sizeof(test_filter_data)/sizeof(struct test_filter_data_t)) +#define DATA_CNT ARRAY_SIZE(test_filter_data) static int test_pred_visited; @@ -2389,26 +2149,28 @@ static int test_pred_visited_fn(struct filter_pred *pred, void *event) return 1; } -static int test_walk_pred_cb(enum move_type move, struct filter_pred *pred, - int *err, void *data) +static void update_pred_fn(struct event_filter *filter, char *fields) { - char *fields = data; + struct prog_entry *prog = rcu_dereference_protected(filter->prog, + lockdep_is_held(&event_mutex)); + int i; - if ((move == MOVE_DOWN) && - (pred->left == FILTER_PRED_INVALID)) { + for (i = 0; prog[i].pred; i++) { + struct filter_pred *pred = prog[i].pred; struct ftrace_event_field *field = pred->field; + WARN_ON_ONCE(!pred->fn); + if (!field) { - WARN(1, "all leafs should have field defined"); - return WALK_PRED_DEFAULT; + WARN_ONCE(1, "all leafs should have field defined %d", i); + continue; } + if (!strchr(fields, *field->name)) - return WALK_PRED_DEFAULT; + continue; - WARN_ON(!pred->fn); pred->fn = test_pred_visited_fn; } - return WALK_PRED_DEFAULT; } static __init int ftrace_test_event_filter(void) @@ -2432,20 +2194,22 @@ static __init int ftrace_test_event_filter(void) break; } + /* Needed to dereference filter->prog */ + mutex_lock(&event_mutex); /* * The preemption disabling is not really needed for self * tests, but the rcu dereference will complain without it. 
*/ preempt_disable(); if (*d->not_visited) - walk_pred_tree(filter->preds, filter->root, - test_walk_pred_cb, - d->not_visited); + update_pred_fn(filter, d->not_visited); test_pred_visited = 0; err = filter_match_preds(filter, &d->rec); preempt_enable(); + mutex_unlock(&event_mutex); + __free_filter(filter); if (test_pred_visited) { diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1e1558c99d56..0d7b3ffbecc2 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -20,15 +20,39 @@ #include <linux/slab.h> #include <linux/stacktrace.h> #include <linux/rculist.h> +#include <linux/tracefs.h> #include "tracing_map.h" #include "trace.h" +#define SYNTH_SYSTEM "synthetic" +#define SYNTH_FIELDS_MAX 16 + +#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */ + struct hist_field; -typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event); +typedef u64 (*hist_field_fn_t) (struct hist_field *field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event); #define HIST_FIELD_OPERANDS_MAX 2 +#define HIST_FIELDS_MAX (TRACING_MAP_FIELDS_MAX + TRACING_MAP_VARS_MAX) +#define HIST_ACTIONS_MAX 8 + +enum field_op_id { + FIELD_OP_NONE, + FIELD_OP_PLUS, + FIELD_OP_MINUS, + FIELD_OP_UNARY_MINUS, +}; + +struct hist_var { + char *name; + struct hist_trigger_data *hist_data; + unsigned int idx; +}; struct hist_field { struct ftrace_event_field *field; @@ -37,27 +61,49 @@ struct hist_field { unsigned int size; unsigned int offset; unsigned int is_signed; + const char *type; struct hist_field *operands[HIST_FIELD_OPERANDS_MAX]; + struct hist_trigger_data *hist_data; + struct hist_var var; + enum field_op_id operator; + char *system; + char *event_name; + char *name; + unsigned int var_idx; + unsigned int var_ref_idx; + bool read_once; }; -static u64 hist_field_none(struct hist_field *field, void *event) +static u64 hist_field_none(struct hist_field *field, + struct tracing_map_elt 
*elt, + struct ring_buffer_event *rbe, + void *event) { return 0; } -static u64 hist_field_counter(struct hist_field *field, void *event) +static u64 hist_field_counter(struct hist_field *field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) { return 1; } -static u64 hist_field_string(struct hist_field *hist_field, void *event) +static u64 hist_field_string(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) { char *addr = (char *)(event + hist_field->field->offset); return (u64)(unsigned long)addr; } -static u64 hist_field_dynstring(struct hist_field *hist_field, void *event) +static u64 hist_field_dynstring(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) { u32 str_item = *(u32 *)(event + hist_field->field->offset); int str_loc = str_item & 0xffff; @@ -66,24 +112,74 @@ static u64 hist_field_dynstring(struct hist_field *hist_field, void *event) return (u64)(unsigned long)addr; } -static u64 hist_field_pstring(struct hist_field *hist_field, void *event) +static u64 hist_field_pstring(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) { char **addr = (char **)(event + hist_field->field->offset); return (u64)(unsigned long)*addr; } -static u64 hist_field_log2(struct hist_field *hist_field, void *event) +static u64 hist_field_log2(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) { struct hist_field *operand = hist_field->operands[0]; - u64 val = operand->fn(operand, event); + u64 val = operand->fn(operand, elt, rbe, event); return (u64) ilog2(roundup_pow_of_two(val)); } +static u64 hist_field_plus(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_field *operand1 = hist_field->operands[0]; + struct hist_field *operand2 = 
hist_field->operands[1]; + + u64 val1 = operand1->fn(operand1, elt, rbe, event); + u64 val2 = operand2->fn(operand2, elt, rbe, event); + + return val1 + val2; +} + +static u64 hist_field_minus(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_field *operand1 = hist_field->operands[0]; + struct hist_field *operand2 = hist_field->operands[1]; + + u64 val1 = operand1->fn(operand1, elt, rbe, event); + u64 val2 = operand2->fn(operand2, elt, rbe, event); + + return val1 - val2; +} + +static u64 hist_field_unary_minus(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_field *operand = hist_field->operands[0]; + + s64 sval = (s64)operand->fn(operand, elt, rbe, event); + u64 val = (u64)-sval; + + return val; +} + #define DEFINE_HIST_FIELD_FN(type) \ -static u64 hist_field_##type(struct hist_field *hist_field, void *event)\ + static u64 hist_field_##type(struct hist_field *hist_field, \ + struct tracing_map_elt *elt, \ + struct ring_buffer_event *rbe, \ + void *event) \ { \ type *addr = (type *)(event + hist_field->field->offset); \ \ @@ -126,6 +222,19 @@ enum hist_field_flags { HIST_FIELD_FL_SYSCALL = 1 << 7, HIST_FIELD_FL_STACKTRACE = 1 << 8, HIST_FIELD_FL_LOG2 = 1 << 9, + HIST_FIELD_FL_TIMESTAMP = 1 << 10, + HIST_FIELD_FL_TIMESTAMP_USECS = 1 << 11, + HIST_FIELD_FL_VAR = 1 << 12, + HIST_FIELD_FL_EXPR = 1 << 13, + HIST_FIELD_FL_VAR_REF = 1 << 14, + HIST_FIELD_FL_CPU = 1 << 15, + HIST_FIELD_FL_ALIAS = 1 << 16, +}; + +struct var_defs { + unsigned int n_vars; + char *name[TRACING_MAP_VARS_MAX]; + char *expr[TRACING_MAP_VARS_MAX]; }; struct hist_trigger_attrs { @@ -133,25 +242,1437 @@ struct hist_trigger_attrs { char *vals_str; char *sort_key_str; char *name; + char *clock; bool pause; bool cont; bool clear; + bool ts_in_usecs; unsigned int map_bits; + + char *assignment_str[TRACING_MAP_VARS_MAX]; + unsigned int n_assignments; 
+ + char *action_str[HIST_ACTIONS_MAX]; + unsigned int n_actions; + + struct var_defs var_defs; +}; + +struct field_var { + struct hist_field *var; + struct hist_field *val; +}; + +struct field_var_hist { + struct hist_trigger_data *hist_data; + char *cmd; }; struct hist_trigger_data { - struct hist_field *fields[TRACING_MAP_FIELDS_MAX]; + struct hist_field *fields[HIST_FIELDS_MAX]; unsigned int n_vals; unsigned int n_keys; unsigned int n_fields; + unsigned int n_vars; unsigned int key_size; struct tracing_map_sort_key sort_keys[TRACING_MAP_SORT_KEYS_MAX]; unsigned int n_sort_keys; struct trace_event_file *event_file; struct hist_trigger_attrs *attrs; struct tracing_map *map; + bool enable_timestamps; + bool remove; + struct hist_field *var_refs[TRACING_MAP_VARS_MAX]; + unsigned int n_var_refs; + + struct action_data *actions[HIST_ACTIONS_MAX]; + unsigned int n_actions; + + struct hist_field *synth_var_refs[SYNTH_FIELDS_MAX]; + unsigned int n_synth_var_refs; + struct field_var *field_vars[SYNTH_FIELDS_MAX]; + unsigned int n_field_vars; + unsigned int n_field_var_str; + struct field_var_hist *field_var_hists[SYNTH_FIELDS_MAX]; + unsigned int n_field_var_hists; + + struct field_var *max_vars[SYNTH_FIELDS_MAX]; + unsigned int n_max_vars; + unsigned int n_max_var_str; +}; + +struct synth_field { + char *type; + char *name; + size_t size; + bool is_signed; + bool is_string; +}; + +struct synth_event { + struct list_head list; + int ref; + char *name; + struct synth_field **fields; + unsigned int n_fields; + unsigned int n_u64; + struct trace_event_class class; + struct trace_event_call call; + struct tracepoint *tp; +}; + +struct action_data; + +typedef void (*action_fn_t) (struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, + struct action_data *data, u64 *var_ref_vals); + +struct action_data { + action_fn_t fn; + unsigned int n_params; + char *params[SYNTH_FIELDS_MAX]; + + union { + struct { + unsigned int 
var_ref_idx; + char *match_event; + char *match_event_system; + char *synth_event_name; + struct synth_event *synth_event; + } onmatch; + + struct { + char *var_str; + char *fn_name; + unsigned int max_var_ref_idx; + struct hist_field *max_var; + struct hist_field *var; + } onmax; + }; +}; + + +static char last_hist_cmd[MAX_FILTER_STR_VAL]; +static char hist_err_str[MAX_FILTER_STR_VAL]; + +static void last_cmd_set(char *str) +{ + if (!str) + return; + + strncpy(last_hist_cmd, str, MAX_FILTER_STR_VAL - 1); +} + +static void hist_err(char *str, char *var) +{ + int maxlen = MAX_FILTER_STR_VAL - 1; + + if (!str) + return; + + if (strlen(hist_err_str)) + return; + + if (!var) + var = ""; + + if (strlen(hist_err_str) + strlen(str) + strlen(var) > maxlen) + return; + + strcat(hist_err_str, str); + strcat(hist_err_str, var); +} + +static void hist_err_event(char *str, char *system, char *event, char *var) +{ + char err[MAX_FILTER_STR_VAL]; + + if (system && var) + snprintf(err, MAX_FILTER_STR_VAL, "%s.%s.%s", system, event, var); + else if (system) + snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event); + else + strncpy(err, var, MAX_FILTER_STR_VAL); + + hist_err(str, err); +} + +static void hist_err_clear(void) +{ + hist_err_str[0] = '\0'; +} + +static bool have_hist_err(void) +{ + if (strlen(hist_err_str)) + return true; + + return false; +} + +static LIST_HEAD(synth_event_list); +static DEFINE_MUTEX(synth_event_mutex); + +struct synth_trace_event { + struct trace_entry ent; + u64 fields[]; +}; + +static int synth_event_define_fields(struct trace_event_call *call) +{ + struct synth_trace_event trace; + int offset = offsetof(typeof(trace), fields); + struct synth_event *event = call->data; + unsigned int i, size, n_u64; + char *name, *type; + bool is_signed; + int ret = 0; + + for (i = 0, n_u64 = 0; i < event->n_fields; i++) { + size = event->fields[i]->size; + is_signed = event->fields[i]->is_signed; + type = event->fields[i]->type; + name = event->fields[i]->name; 
+ ret = trace_define_field(call, type, name, offset, size, + is_signed, FILTER_OTHER); + if (ret) + break; + + if (event->fields[i]->is_string) { + offset += STR_VAR_LEN_MAX; + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } else { + offset += sizeof(u64); + n_u64++; + } + } + + event->n_u64 = n_u64; + + return ret; +} + +static bool synth_field_signed(char *type) +{ + if (strncmp(type, "u", 1) == 0) + return false; + + return true; +} + +static int synth_field_is_string(char *type) +{ + if (strstr(type, "char[") != NULL) + return true; + + return false; +} + +static int synth_field_string_size(char *type) +{ + char buf[4], *end, *start; + unsigned int len; + int size, err; + + start = strstr(type, "char["); + if (start == NULL) + return -EINVAL; + start += strlen("char["); + + end = strchr(type, ']'); + if (!end || end < start) + return -EINVAL; + + len = end - start; + if (len > 3) + return -EINVAL; + + strncpy(buf, start, len); + buf[len] = '\0'; + + err = kstrtouint(buf, 0, &size); + if (err) + return err; + + if (size > STR_VAR_LEN_MAX) + return -EINVAL; + + return size; +} + +static int synth_field_size(char *type) +{ + int size = 0; + + if (strcmp(type, "s64") == 0) + size = sizeof(s64); + else if (strcmp(type, "u64") == 0) + size = sizeof(u64); + else if (strcmp(type, "s32") == 0) + size = sizeof(s32); + else if (strcmp(type, "u32") == 0) + size = sizeof(u32); + else if (strcmp(type, "s16") == 0) + size = sizeof(s16); + else if (strcmp(type, "u16") == 0) + size = sizeof(u16); + else if (strcmp(type, "s8") == 0) + size = sizeof(s8); + else if (strcmp(type, "u8") == 0) + size = sizeof(u8); + else if (strcmp(type, "char") == 0) + size = sizeof(char); + else if (strcmp(type, "unsigned char") == 0) + size = sizeof(unsigned char); + else if (strcmp(type, "int") == 0) + size = sizeof(int); + else if (strcmp(type, "unsigned int") == 0) + size = sizeof(unsigned int); + else if (strcmp(type, "long") == 0) + size = sizeof(long); + else if (strcmp(type, "unsigned long") == 
0) + size = sizeof(unsigned long); + else if (strcmp(type, "pid_t") == 0) + size = sizeof(pid_t); + else if (synth_field_is_string(type)) + size = synth_field_string_size(type); + + return size; +} + +static const char *synth_field_fmt(char *type) +{ + const char *fmt = "%llu"; + + if (strcmp(type, "s64") == 0) + fmt = "%lld"; + else if (strcmp(type, "u64") == 0) + fmt = "%llu"; + else if (strcmp(type, "s32") == 0) + fmt = "%d"; + else if (strcmp(type, "u32") == 0) + fmt = "%u"; + else if (strcmp(type, "s16") == 0) + fmt = "%d"; + else if (strcmp(type, "u16") == 0) + fmt = "%u"; + else if (strcmp(type, "s8") == 0) + fmt = "%d"; + else if (strcmp(type, "u8") == 0) + fmt = "%u"; + else if (strcmp(type, "char") == 0) + fmt = "%d"; + else if (strcmp(type, "unsigned char") == 0) + fmt = "%u"; + else if (strcmp(type, "int") == 0) + fmt = "%d"; + else if (strcmp(type, "unsigned int") == 0) + fmt = "%u"; + else if (strcmp(type, "long") == 0) + fmt = "%ld"; + else if (strcmp(type, "unsigned long") == 0) + fmt = "%lu"; + else if (strcmp(type, "pid_t") == 0) + fmt = "%d"; + else if (synth_field_is_string(type)) + fmt = "%s"; + + return fmt; +} + +static enum print_line_t print_synth_event(struct trace_iterator *iter, + int flags, + struct trace_event *event) +{ + struct trace_array *tr = iter->tr; + struct trace_seq *s = &iter->seq; + struct synth_trace_event *entry; + struct synth_event *se; + unsigned int i, n_u64; + char print_fmt[32]; + const char *fmt; + + entry = (struct synth_trace_event *)iter->ent; + se = container_of(event, struct synth_event, call.event); + + trace_seq_printf(s, "%s: ", se->name); + + for (i = 0, n_u64 = 0; i < se->n_fields; i++) { + if (trace_seq_has_overflowed(s)) + goto end; + + fmt = synth_field_fmt(se->fields[i]->type); + + /* parameter types */ + if (tr->trace_flags & TRACE_ITER_VERBOSE) + trace_seq_printf(s, "%s ", fmt); + + snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt); + + /* parameter values */ + if (se->fields[i]->is_string) { 
+ trace_seq_printf(s, print_fmt, se->fields[i]->name, + (char *)&entry->fields[n_u64], + i == se->n_fields - 1 ? "" : " "); + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } else { + trace_seq_printf(s, print_fmt, se->fields[i]->name, + entry->fields[n_u64], + i == se->n_fields - 1 ? "" : " "); + n_u64++; + } + } +end: + trace_seq_putc(s, '\n'); + + return trace_handle_return(s); +} + +static struct trace_event_functions synth_event_funcs = { + .trace = print_synth_event +}; + +static notrace void trace_event_raw_event_synth(void *__data, + u64 *var_ref_vals, + unsigned int var_ref_idx) +{ + struct trace_event_file *trace_file = __data; + struct synth_trace_event *entry; + struct trace_event_buffer fbuffer; + struct ring_buffer *buffer; + struct synth_event *event; + unsigned int i, n_u64; + int fields_size = 0; + + event = trace_file->event_call->data; + + if (trace_trigger_soft_disabled(trace_file)) + return; + + fields_size = event->n_u64 * sizeof(u64); + + /* + * Avoid ring buffer recursion detection, as this event + * is being performed within another event. 
+ */ + buffer = trace_file->tr->trace_buffer.buffer; + ring_buffer_nest_start(buffer); + + entry = trace_event_buffer_reserve(&fbuffer, trace_file, + sizeof(*entry) + fields_size); + if (!entry) + goto out; + + for (i = 0, n_u64 = 0; i < event->n_fields; i++) { + if (event->fields[i]->is_string) { + char *str_val = (char *)(long)var_ref_vals[var_ref_idx + i]; + char *str_field = (char *)&entry->fields[n_u64]; + + strscpy(str_field, str_val, STR_VAR_LEN_MAX); + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } else { + entry->fields[n_u64] = var_ref_vals[var_ref_idx + i]; + n_u64++; + } + } + + trace_event_buffer_commit(&fbuffer); +out: + ring_buffer_nest_end(buffer); +} + +static void free_synth_event_print_fmt(struct trace_event_call *call) +{ + if (call) { + kfree(call->print_fmt); + call->print_fmt = NULL; + } +} + +static int __set_synth_event_print_fmt(struct synth_event *event, + char *buf, int len) +{ + const char *fmt; + int pos = 0; + int i; + + /* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? len - pos : 0) + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + for (i = 0; i < event->n_fields; i++) { + fmt = synth_field_fmt(event->fields[i]->type); + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s", + event->fields[i]->name, fmt, + i == event->n_fields - 1 ? 
"" : ", "); + } + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + + for (i = 0; i < event->n_fields; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", REC->%s", event->fields[i]->name); + } + +#undef LEN_OR_ZERO + + /* return the length of print_fmt */ + return pos; +} + +static int set_synth_event_print_fmt(struct trace_event_call *call) +{ + struct synth_event *event = call->data; + char *print_fmt; + int len; + + /* First: called with 0 length to calculate the needed length */ + len = __set_synth_event_print_fmt(event, NULL, 0); + + print_fmt = kmalloc(len + 1, GFP_KERNEL); + if (!print_fmt) + return -ENOMEM; + + /* Second: actually write the @print_fmt */ + __set_synth_event_print_fmt(event, print_fmt, len + 1); + call->print_fmt = print_fmt; + + return 0; +} + +static void free_synth_field(struct synth_field *field) +{ + kfree(field->type); + kfree(field->name); + kfree(field); +} + +static struct synth_field *parse_synth_field(char *field_type, + char *field_name) +{ + struct synth_field *field; + int len, ret = 0; + char *array; + + if (field_type[0] == ';') + field_type++; + + len = strlen(field_name); + if (field_name[len - 1] == ';') + field_name[len - 1] = '\0'; + + field = kzalloc(sizeof(*field), GFP_KERNEL); + if (!field) + return ERR_PTR(-ENOMEM); + + len = strlen(field_type) + 1; + array = strchr(field_name, '['); + if (array) + len += strlen(array); + field->type = kzalloc(len, GFP_KERNEL); + if (!field->type) { + ret = -ENOMEM; + goto free; + } + strcat(field->type, field_type); + if (array) { + strcat(field->type, array); + *array = '\0'; + } + + field->size = synth_field_size(field->type); + if (!field->size) { + ret = -EINVAL; + goto free; + } + + if (synth_field_is_string(field->type)) + field->is_string = true; + + field->is_signed = synth_field_signed(field->type); + + field->name = kstrdup(field_name, GFP_KERNEL); + if (!field->name) { + ret = -ENOMEM; + goto free; + } + out: + return field; + free: + free_synth_field(field); + field = 
ERR_PTR(ret); + goto out; +} + +static void free_synth_tracepoint(struct tracepoint *tp) +{ + if (!tp) + return; + + kfree(tp->name); + kfree(tp); +} + +static struct tracepoint *alloc_synth_tracepoint(char *name) +{ + struct tracepoint *tp; + + tp = kzalloc(sizeof(*tp), GFP_KERNEL); + if (!tp) + return ERR_PTR(-ENOMEM); + + tp->name = kstrdup(name, GFP_KERNEL); + if (!tp->name) { + kfree(tp); + return ERR_PTR(-ENOMEM); + } + + return tp; +} + +typedef void (*synth_probe_func_t) (void *__data, u64 *var_ref_vals, + unsigned int var_ref_idx); + +static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals, + unsigned int var_ref_idx) +{ + struct tracepoint *tp = event->tp; + + if (unlikely(atomic_read(&tp->key.enabled) > 0)) { + struct tracepoint_func *probe_func_ptr; + synth_probe_func_t probe_func; + void *__data; + + if (!(cpu_online(raw_smp_processor_id()))) + return; + + probe_func_ptr = rcu_dereference_sched((tp)->funcs); + if (probe_func_ptr) { + do { + probe_func = probe_func_ptr->func; + __data = probe_func_ptr->data; + probe_func(__data, var_ref_vals, var_ref_idx); + } while ((++probe_func_ptr)->func); + } + } +} + +static struct synth_event *find_synth_event(const char *name) +{ + struct synth_event *event; + + list_for_each_entry(event, &synth_event_list, list) { + if (strcmp(event->name, name) == 0) + return event; + } + + return NULL; +} + +static int register_synth_event(struct synth_event *event) +{ + struct trace_event_call *call = &event->call; + int ret = 0; + + event->call.class = &event->class; + event->class.system = kstrdup(SYNTH_SYSTEM, GFP_KERNEL); + if (!event->class.system) { + ret = -ENOMEM; + goto out; + } + + event->tp = alloc_synth_tracepoint(event->name); + if (IS_ERR(event->tp)) { + ret = PTR_ERR(event->tp); + event->tp = NULL; + goto out; + } + + INIT_LIST_HEAD(&call->class->fields); + call->event.funcs = &synth_event_funcs; + call->class->define_fields = synth_event_define_fields; + + ret = 
register_trace_event(&call->event); + if (!ret) { + ret = -ENODEV; + goto out; + } + call->flags = TRACE_EVENT_FL_TRACEPOINT; + call->class->reg = trace_event_reg; + call->class->probe = trace_event_raw_event_synth; + call->data = event; + call->tp = event->tp; + + ret = trace_add_event_call(call); + if (ret) { + pr_warn("Failed to register synthetic event: %s\n", + trace_event_name(call)); + goto err; + } + + ret = set_synth_event_print_fmt(call); + if (ret < 0) { + trace_remove_event_call(call); + goto err; + } + out: + return ret; + err: + unregister_trace_event(&call->event); + goto out; +} + +static int unregister_synth_event(struct synth_event *event) +{ + struct trace_event_call *call = &event->call; + int ret; + + ret = trace_remove_event_call(call); + + return ret; +} + +static void free_synth_event(struct synth_event *event) +{ + unsigned int i; + + if (!event) + return; + + for (i = 0; i < event->n_fields; i++) + free_synth_field(event->fields[i]); + + kfree(event->fields); + kfree(event->name); + kfree(event->class.system); + free_synth_tracepoint(event->tp); + free_synth_event_print_fmt(&event->call); + kfree(event); +} + +static struct synth_event *alloc_synth_event(char *event_name, int n_fields, + struct synth_field **fields) +{ + struct synth_event *event; + unsigned int i; + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) { + event = ERR_PTR(-ENOMEM); + goto out; + } + + event->name = kstrdup(event_name, GFP_KERNEL); + if (!event->name) { + kfree(event); + event = ERR_PTR(-ENOMEM); + goto out; + } + + event->fields = kcalloc(n_fields, sizeof(*event->fields), GFP_KERNEL); + if (!event->fields) { + free_synth_event(event); + event = ERR_PTR(-ENOMEM); + goto out; + } + + for (i = 0; i < n_fields; i++) + event->fields[i] = fields[i]; + + event->n_fields = n_fields; + out: + return event; +} + +static void action_trace(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, + struct 
action_data *data, u64 *var_ref_vals) +{ + struct synth_event *event = data->onmatch.synth_event; + + trace_synth(event, var_ref_vals, data->onmatch.var_ref_idx); +} + +struct hist_var_data { + struct list_head list; + struct hist_trigger_data *hist_data; +}; + +static void add_or_delete_synth_event(struct synth_event *event, int delete) +{ + if (delete) + free_synth_event(event); + else { + mutex_lock(&synth_event_mutex); + if (!find_synth_event(event->name)) + list_add(&event->list, &synth_event_list); + else + free_synth_event(event); + mutex_unlock(&synth_event_mutex); + } +} + +static int create_synth_event(int argc, char **argv) +{ + struct synth_field *field, *fields[SYNTH_FIELDS_MAX]; + struct synth_event *event = NULL; + bool delete_event = false; + int i, n_fields = 0, ret = 0; + char *name; + + mutex_lock(&synth_event_mutex); + + /* + * Argument syntax: + * - Add synthetic event: <event_name> field[;field] ... + * - Remove synthetic event: !<event_name> field[;field] ... + * where 'field' = type field_name + */ + if (argc < 1) { + ret = -EINVAL; + goto out; + } + + name = argv[0]; + if (name[0] == '!') { + delete_event = true; + name++; + } + + event = find_synth_event(name); + if (event) { + if (delete_event) { + if (event->ref) { + event = NULL; + ret = -EBUSY; + goto out; + } + list_del(&event->list); + goto out; + } + event = NULL; + ret = -EEXIST; + goto out; + } else if (delete_event) + goto out; + + if (argc < 2) { + ret = -EINVAL; + goto out; + } + + for (i = 1; i < argc - 1; i++) { + if (strcmp(argv[i], ";") == 0) + continue; + if (n_fields == SYNTH_FIELDS_MAX) { + ret = -EINVAL; + goto err; + } + + field = parse_synth_field(argv[i], argv[i + 1]); + if (IS_ERR(field)) { + ret = PTR_ERR(field); + goto err; + } + fields[n_fields] = field; + i++; n_fields++; + } + + if (i < argc) { + ret = -EINVAL; + goto err; + } + + event = alloc_synth_event(name, n_fields, fields); + if (IS_ERR(event)) { + ret = PTR_ERR(event); + event = NULL; + goto err; + } + 
out: + mutex_unlock(&synth_event_mutex); + + if (event) { + if (delete_event) { + ret = unregister_synth_event(event); + add_or_delete_synth_event(event, !ret); + } else { + ret = register_synth_event(event); + add_or_delete_synth_event(event, ret); + } + } + + return ret; + err: + mutex_unlock(&synth_event_mutex); + + for (i = 0; i < n_fields; i++) + free_synth_field(fields[i]); + free_synth_event(event); + + return ret; +} + +static int release_all_synth_events(void) +{ + struct list_head release_events; + struct synth_event *event, *e; + int ret = 0; + + INIT_LIST_HEAD(&release_events); + + mutex_lock(&synth_event_mutex); + + list_for_each_entry(event, &synth_event_list, list) { + if (event->ref) { + mutex_unlock(&synth_event_mutex); + return -EBUSY; + } + } + + list_splice_init(&event->list, &release_events); + + mutex_unlock(&synth_event_mutex); + + list_for_each_entry_safe(event, e, &release_events, list) { + list_del(&event->list); + + ret = unregister_synth_event(event); + add_or_delete_synth_event(event, !ret); + } + + return ret; +} + + +static void *synth_events_seq_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&synth_event_mutex); + + return seq_list_start(&synth_event_list, *pos); +} + +static void *synth_events_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &synth_event_list, pos); +} + +static void synth_events_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&synth_event_mutex); +} + +static int synth_events_seq_show(struct seq_file *m, void *v) +{ + struct synth_field *field; + struct synth_event *event = v; + unsigned int i; + + seq_printf(m, "%s\t", event->name); + + for (i = 0; i < event->n_fields; i++) { + field = event->fields[i]; + + /* parameter values */ + seq_printf(m, "%s %s%s", field->type, field->name, + i == event->n_fields - 1 ? 
"" : "; "); + } + + seq_putc(m, '\n'); + + return 0; +} + +static const struct seq_operations synth_events_seq_op = { + .start = synth_events_seq_start, + .next = synth_events_seq_next, + .stop = synth_events_seq_stop, + .show = synth_events_seq_show +}; + +static int synth_events_open(struct inode *inode, struct file *file) +{ + int ret; + + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + ret = release_all_synth_events(); + if (ret < 0) + return ret; + } + + return seq_open(file, &synth_events_seq_op); +} + +static ssize_t synth_events_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *ppos) +{ + return trace_parse_run_command(file, buffer, count, ppos, + create_synth_event); +} + +static const struct file_operations synth_events_fops = { + .open = synth_events_open, + .write = synth_events_write, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static u64 hist_field_timestamp(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_trigger_data *hist_data = hist_field->hist_data; + struct trace_array *tr = hist_data->event_file->tr; + + u64 ts = ring_buffer_event_time_stamp(rbe); + + if (hist_data->attrs->ts_in_usecs && trace_clock_in_ns(tr)) + ts = ns2usecs(ts); + + return ts; +} + +static u64 hist_field_cpu(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) +{ + int cpu = smp_processor_id(); + + return cpu; +} + +static struct hist_field * +check_field_for_var_ref(struct hist_field *hist_field, + struct hist_trigger_data *var_data, + unsigned int var_idx) +{ + struct hist_field *found = NULL; + + if (hist_field && hist_field->flags & HIST_FIELD_FL_VAR_REF) { + if (hist_field->var.idx == var_idx && + hist_field->var.hist_data == var_data) { + found = hist_field; + } + } + + return found; +} + +static struct hist_field * +check_field_for_var_refs(struct 
hist_trigger_data *hist_data, + struct hist_field *hist_field, + struct hist_trigger_data *var_data, + unsigned int var_idx, + unsigned int level) +{ + struct hist_field *found = NULL; + unsigned int i; + + if (level > 3) + return found; + + if (!hist_field) + return found; + + found = check_field_for_var_ref(hist_field, var_data, var_idx); + if (found) + return found; + + for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) { + struct hist_field *operand; + + operand = hist_field->operands[i]; + found = check_field_for_var_refs(hist_data, operand, var_data, + var_idx, level + 1); + if (found) + return found; + } + + return found; +} + +static struct hist_field *find_var_ref(struct hist_trigger_data *hist_data, + struct hist_trigger_data *var_data, + unsigned int var_idx) +{ + struct hist_field *hist_field, *found = NULL; + unsigned int i; + + for_each_hist_field(i, hist_data) { + hist_field = hist_data->fields[i]; + found = check_field_for_var_refs(hist_data, hist_field, + var_data, var_idx, 0); + if (found) + return found; + } + + for (i = 0; i < hist_data->n_synth_var_refs; i++) { + hist_field = hist_data->synth_var_refs[i]; + found = check_field_for_var_refs(hist_data, hist_field, + var_data, var_idx, 0); + if (found) + return found; + } + + return found; +} + +static struct hist_field *find_any_var_ref(struct hist_trigger_data *hist_data, + unsigned int var_idx) +{ + struct trace_array *tr = hist_data->event_file->tr; + struct hist_field *found = NULL; + struct hist_var_data *var_data; + + list_for_each_entry(var_data, &tr->hist_vars, list) { + if (var_data->hist_data == hist_data) + continue; + found = find_var_ref(var_data->hist_data, hist_data, var_idx); + if (found) + break; + } + + return found; +} + +static bool check_var_refs(struct hist_trigger_data *hist_data) +{ + struct hist_field *field; + bool found = false; + int i; + + for_each_hist_field(i, hist_data) { + field = hist_data->fields[i]; + if (field && field->flags & HIST_FIELD_FL_VAR) { + if 
(find_any_var_ref(hist_data, field->var.idx)) { + found = true; + break; + } + } + } + + return found; +} + +static struct hist_var_data *find_hist_vars(struct hist_trigger_data *hist_data) +{ + struct trace_array *tr = hist_data->event_file->tr; + struct hist_var_data *var_data, *found = NULL; + + list_for_each_entry(var_data, &tr->hist_vars, list) { + if (var_data->hist_data == hist_data) { + found = var_data; + break; + } + } + + return found; +} + +static bool field_has_hist_vars(struct hist_field *hist_field, + unsigned int level) +{ + int i; + + if (level > 3) + return false; + + if (!hist_field) + return false; + + if (hist_field->flags & HIST_FIELD_FL_VAR || + hist_field->flags & HIST_FIELD_FL_VAR_REF) + return true; + + for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) { + struct hist_field *operand; + + operand = hist_field->operands[i]; + if (field_has_hist_vars(operand, level + 1)) + return true; + } + + return false; +} + +static bool has_hist_vars(struct hist_trigger_data *hist_data) +{ + struct hist_field *hist_field; + int i; + + for_each_hist_field(i, hist_data) { + hist_field = hist_data->fields[i]; + if (field_has_hist_vars(hist_field, 0)) + return true; + } + + return false; +} + +static int save_hist_vars(struct hist_trigger_data *hist_data) +{ + struct trace_array *tr = hist_data->event_file->tr; + struct hist_var_data *var_data; + + var_data = find_hist_vars(hist_data); + if (var_data) + return 0; + + if (trace_array_get(tr) < 0) + return -ENODEV; + + var_data = kzalloc(sizeof(*var_data), GFP_KERNEL); + if (!var_data) { + trace_array_put(tr); + return -ENOMEM; + } + + var_data->hist_data = hist_data; + list_add(&var_data->list, &tr->hist_vars); + + return 0; +} + +static void remove_hist_vars(struct hist_trigger_data *hist_data) +{ + struct trace_array *tr = hist_data->event_file->tr; + struct hist_var_data *var_data; + + var_data = find_hist_vars(hist_data); + if (!var_data) + return; + + if (WARN_ON(check_var_refs(hist_data))) + return; + + 
list_del(&var_data->list); + + kfree(var_data); + + trace_array_put(tr); +} + +static struct hist_field *find_var_field(struct hist_trigger_data *hist_data, + const char *var_name) +{ + struct hist_field *hist_field, *found = NULL; + int i; + + for_each_hist_field(i, hist_data) { + hist_field = hist_data->fields[i]; + if (hist_field && hist_field->flags & HIST_FIELD_FL_VAR && + strcmp(hist_field->var.name, var_name) == 0) { + found = hist_field; + break; + } + } + + return found; +} + +static struct hist_field *find_var(struct hist_trigger_data *hist_data, + struct trace_event_file *file, + const char *var_name) +{ + struct hist_trigger_data *test_data; + struct event_trigger_data *test; + struct hist_field *hist_field; + + hist_field = find_var_field(hist_data, var_name); + if (hist_field) + return hist_field; + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + test_data = test->private_data; + hist_field = find_var_field(test_data, var_name); + if (hist_field) + return hist_field; + } + } + + return NULL; +} + +static struct trace_event_file *find_var_file(struct trace_array *tr, + char *system, + char *event_name, + char *var_name) +{ + struct hist_trigger_data *var_hist_data; + struct hist_var_data *var_data; + struct trace_event_file *file, *found = NULL; + + if (system) + return find_event_file(tr, system, event_name); + + list_for_each_entry(var_data, &tr->hist_vars, list) { + var_hist_data = var_data->hist_data; + file = var_hist_data->event_file; + if (file == found) + continue; + + if (find_var_field(var_hist_data, var_name)) { + if (found) { + hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name); + return NULL; + } + + found = file; + } + } + + return found; +} + +static struct hist_field *find_file_var(struct trace_event_file *file, + const char *var_name) +{ + struct hist_trigger_data *test_data; + 
struct event_trigger_data *test; + struct hist_field *hist_field; + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + test_data = test->private_data; + hist_field = find_var_field(test_data, var_name); + if (hist_field) + return hist_field; + } + } + + return NULL; +} + +static struct hist_field * +find_match_var(struct hist_trigger_data *hist_data, char *var_name) +{ + struct trace_array *tr = hist_data->event_file->tr; + struct hist_field *hist_field, *found = NULL; + struct trace_event_file *file; + unsigned int i; + + for (i = 0; i < hist_data->n_actions; i++) { + struct action_data *data = hist_data->actions[i]; + + if (data->fn == action_trace) { + char *system = data->onmatch.match_event_system; + char *event_name = data->onmatch.match_event; + + file = find_var_file(tr, system, event_name, var_name); + if (!file) + continue; + hist_field = find_file_var(file, var_name); + if (hist_field) { + if (found) { + hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name); + return ERR_PTR(-EINVAL); + } + + found = hist_field; + } + } + } + return found; +} + +static struct hist_field *find_event_var(struct hist_trigger_data *hist_data, + char *system, + char *event_name, + char *var_name) +{ + struct trace_array *tr = hist_data->event_file->tr; + struct hist_field *hist_field = NULL; + struct trace_event_file *file; + + if (!system || !event_name) { + hist_field = find_match_var(hist_data, var_name); + if (IS_ERR(hist_field)) + return NULL; + if (hist_field) + return hist_field; + } + + file = find_var_file(tr, system, event_name, var_name); + if (!file) + return NULL; + + hist_field = find_file_var(file, var_name); + + return hist_field; +} + +struct hist_elt_data { + char *comm; + u64 *var_ref_vals; + char *field_var_str[SYNTH_FIELDS_MAX]; }; +static u64 hist_field_var_ref(struct hist_field *hist_field, + struct 
tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_elt_data *elt_data; + u64 var_val = 0; + + elt_data = elt->private_data; + var_val = elt_data->var_ref_vals[hist_field->var_ref_idx]; + + return var_val; +} + +static bool resolve_var_refs(struct hist_trigger_data *hist_data, void *key, + u64 *var_ref_vals, bool self) +{ + struct hist_trigger_data *var_data; + struct tracing_map_elt *var_elt; + struct hist_field *hist_field; + unsigned int i, var_idx; + bool resolved = true; + u64 var_val = 0; + + for (i = 0; i < hist_data->n_var_refs; i++) { + hist_field = hist_data->var_refs[i]; + var_idx = hist_field->var.idx; + var_data = hist_field->var.hist_data; + + if (var_data == NULL) { + resolved = false; + break; + } + + if ((self && var_data != hist_data) || + (!self && var_data == hist_data)) + continue; + + var_elt = tracing_map_lookup(var_data->map, key); + if (!var_elt) { + resolved = false; + break; + } + + if (!tracing_map_var_set(var_elt, var_idx)) { + resolved = false; + break; + } + + if (self || !hist_field->read_once) + var_val = tracing_map_read_var(var_elt, var_idx); + else + var_val = tracing_map_read_var_once(var_elt, var_idx); + + var_ref_vals[i] = var_val; + } + + return resolved; +} + static const char *hist_field_name(struct hist_field *field, unsigned int level) { @@ -162,8 +1683,26 @@ static const char *hist_field_name(struct hist_field *field, if (field->field) field_name = field->field->name; - else if (field->flags & HIST_FIELD_FL_LOG2) + else if (field->flags & HIST_FIELD_FL_LOG2 || + field->flags & HIST_FIELD_FL_ALIAS) field_name = hist_field_name(field->operands[0], ++level); + else if (field->flags & HIST_FIELD_FL_CPU) + field_name = "cpu"; + else if (field->flags & HIST_FIELD_FL_EXPR || + field->flags & HIST_FIELD_FL_VAR_REF) { + if (field->system) { + static char full_name[MAX_FILTER_STR_VAL]; + + strcat(full_name, field->system); + strcat(full_name, "."); + strcat(full_name, field->event_name); + 
strcat(full_name, "."); + strcat(full_name, field->name); + field_name = full_name; + } else + field_name = field->name; + } else if (field->flags & HIST_FIELD_FL_TIMESTAMP) + field_name = "common_timestamp"; if (field_name == NULL) field_name = ""; @@ -232,16 +1771,119 @@ static int parse_map_size(char *str) static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs) { + unsigned int i; + if (!attrs) return; + for (i = 0; i < attrs->n_assignments; i++) + kfree(attrs->assignment_str[i]); + + for (i = 0; i < attrs->n_actions; i++) + kfree(attrs->action_str[i]); + kfree(attrs->name); kfree(attrs->sort_key_str); kfree(attrs->keys_str); kfree(attrs->vals_str); + kfree(attrs->clock); kfree(attrs); } +static int parse_action(char *str, struct hist_trigger_attrs *attrs) +{ + int ret = -EINVAL; + + if (attrs->n_actions >= HIST_ACTIONS_MAX) + return ret; + + if ((strncmp(str, "onmatch(", strlen("onmatch(")) == 0) || + (strncmp(str, "onmax(", strlen("onmax(")) == 0)) { + attrs->action_str[attrs->n_actions] = kstrdup(str, GFP_KERNEL); + if (!attrs->action_str[attrs->n_actions]) { + ret = -ENOMEM; + return ret; + } + attrs->n_actions++; + ret = 0; + } + + return ret; +} + +static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) +{ + int ret = 0; + + if ((strncmp(str, "key=", strlen("key=")) == 0) || + (strncmp(str, "keys=", strlen("keys=")) == 0)) { + attrs->keys_str = kstrdup(str, GFP_KERNEL); + if (!attrs->keys_str) { + ret = -ENOMEM; + goto out; + } + } else if ((strncmp(str, "val=", strlen("val=")) == 0) || + (strncmp(str, "vals=", strlen("vals=")) == 0) || + (strncmp(str, "values=", strlen("values=")) == 0)) { + attrs->vals_str = kstrdup(str, GFP_KERNEL); + if (!attrs->vals_str) { + ret = -ENOMEM; + goto out; + } + } else if (strncmp(str, "sort=", strlen("sort=")) == 0) { + attrs->sort_key_str = kstrdup(str, GFP_KERNEL); + if (!attrs->sort_key_str) { + ret = -ENOMEM; + goto out; + } + } else if (strncmp(str, "name=", strlen("name=")) == 0) { 
+ attrs->name = kstrdup(str, GFP_KERNEL); + if (!attrs->name) { + ret = -ENOMEM; + goto out; + } + } else if (strncmp(str, "clock=", strlen("clock=")) == 0) { + strsep(&str, "="); + if (!str) { + ret = -EINVAL; + goto out; + } + + str = strstrip(str); + attrs->clock = kstrdup(str, GFP_KERNEL); + if (!attrs->clock) { + ret = -ENOMEM; + goto out; + } + } else if (strncmp(str, "size=", strlen("size=")) == 0) { + int map_bits = parse_map_size(str); + + if (map_bits < 0) { + ret = map_bits; + goto out; + } + attrs->map_bits = map_bits; + } else { + char *assignment; + + if (attrs->n_assignments == TRACING_MAP_VARS_MAX) { + hist_err("Too many variables defined: ", str); + ret = -EINVAL; + goto out; + } + + assignment = kstrdup(str, GFP_KERNEL); + if (!assignment) { + ret = -ENOMEM; + goto out; + } + + attrs->assignment_str[attrs->n_assignments++] = assignment; + } + out: + return ret; +} + static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) { struct hist_trigger_attrs *attrs; @@ -254,35 +1896,21 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) while (trigger_str) { char *str = strsep(&trigger_str, ":"); - if ((strncmp(str, "key=", strlen("key=")) == 0) || - (strncmp(str, "keys=", strlen("keys=")) == 0)) - attrs->keys_str = kstrdup(str, GFP_KERNEL); - else if ((strncmp(str, "val=", strlen("val=")) == 0) || - (strncmp(str, "vals=", strlen("vals=")) == 0) || - (strncmp(str, "values=", strlen("values=")) == 0)) - attrs->vals_str = kstrdup(str, GFP_KERNEL); - else if (strncmp(str, "sort=", strlen("sort=")) == 0) - attrs->sort_key_str = kstrdup(str, GFP_KERNEL); - else if (strncmp(str, "name=", strlen("name=")) == 0) - attrs->name = kstrdup(str, GFP_KERNEL); - else if (strcmp(str, "pause") == 0) + if (strchr(str, '=')) { + ret = parse_assignment(str, attrs); + if (ret) + goto free; + } else if (strcmp(str, "pause") == 0) attrs->pause = true; else if ((strcmp(str, "cont") == 0) || (strcmp(str, "continue") == 0)) 
attrs->cont = true; else if (strcmp(str, "clear") == 0) attrs->clear = true; - else if (strncmp(str, "size=", strlen("size=")) == 0) { - int map_bits = parse_map_size(str); - - if (map_bits < 0) { - ret = map_bits; + else { + ret = parse_action(str, attrs); + if (ret) goto free; - } - attrs->map_bits = map_bits; - } else { - ret = -EINVAL; - goto free; } } @@ -291,6 +1919,14 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) goto free; } + if (!attrs->clock) { + attrs->clock = kstrdup("global", GFP_KERNEL); + if (!attrs->clock) { + ret = -ENOMEM; + goto free; + } + } + return attrs; free: destroy_hist_trigger_attrs(attrs); @@ -313,64 +1949,203 @@ static inline void save_comm(char *comm, struct task_struct *task) memcpy(comm, task->comm, TASK_COMM_LEN); } -static void hist_trigger_elt_comm_free(struct tracing_map_elt *elt) +static void hist_elt_data_free(struct hist_elt_data *elt_data) { - kfree((char *)elt->private_data); + unsigned int i; + + for (i = 0; i < SYNTH_FIELDS_MAX; i++) + kfree(elt_data->field_var_str[i]); + + kfree(elt_data->comm); + kfree(elt_data); } -static int hist_trigger_elt_comm_alloc(struct tracing_map_elt *elt) +static void hist_trigger_elt_data_free(struct tracing_map_elt *elt) +{ + struct hist_elt_data *elt_data = elt->private_data; + + hist_elt_data_free(elt_data); +} + +static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt) { struct hist_trigger_data *hist_data = elt->map->private_data; + unsigned int size = TASK_COMM_LEN; + struct hist_elt_data *elt_data; struct hist_field *key_field; - unsigned int i; + unsigned int i, n_str; + + elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL); + if (!elt_data) + return -ENOMEM; for_each_hist_key_field(i, hist_data) { key_field = hist_data->fields[i]; if (key_field->flags & HIST_FIELD_FL_EXECNAME) { - unsigned int size = TASK_COMM_LEN + 1; - - elt->private_data = kzalloc(size, GFP_KERNEL); - if (!elt->private_data) + elt_data->comm = kzalloc(size, GFP_KERNEL); 
+ if (!elt_data->comm) { + kfree(elt_data); return -ENOMEM; + } break; } } + n_str = hist_data->n_field_var_str + hist_data->n_max_var_str; + + size = STR_VAR_LEN_MAX; + + for (i = 0; i < n_str; i++) { + elt_data->field_var_str[i] = kzalloc(size, GFP_KERNEL); + if (!elt_data->field_var_str[i]) { + hist_elt_data_free(elt_data); + return -ENOMEM; + } + } + + elt->private_data = elt_data; + return 0; } -static void hist_trigger_elt_comm_copy(struct tracing_map_elt *to, - struct tracing_map_elt *from) +static void hist_trigger_elt_data_init(struct tracing_map_elt *elt) +{ + struct hist_elt_data *elt_data = elt->private_data; + + if (elt_data->comm) + save_comm(elt_data->comm, current); +} + +static const struct tracing_map_ops hist_trigger_elt_data_ops = { + .elt_alloc = hist_trigger_elt_data_alloc, + .elt_free = hist_trigger_elt_data_free, + .elt_init = hist_trigger_elt_data_init, +}; + +static const char *get_hist_field_flags(struct hist_field *hist_field) +{ + const char *flags_str = NULL; + + if (hist_field->flags & HIST_FIELD_FL_HEX) + flags_str = "hex"; + else if (hist_field->flags & HIST_FIELD_FL_SYM) + flags_str = "sym"; + else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET) + flags_str = "sym-offset"; + else if (hist_field->flags & HIST_FIELD_FL_EXECNAME) + flags_str = "execname"; + else if (hist_field->flags & HIST_FIELD_FL_SYSCALL) + flags_str = "syscall"; + else if (hist_field->flags & HIST_FIELD_FL_LOG2) + flags_str = "log2"; + else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP_USECS) + flags_str = "usecs"; + + return flags_str; +} + +static void expr_field_str(struct hist_field *field, char *expr) { - char *comm_from = from->private_data; - char *comm_to = to->private_data; + if (field->flags & HIST_FIELD_FL_VAR_REF) + strcat(expr, "$"); + + strcat(expr, hist_field_name(field, 0)); - if (comm_from) - memcpy(comm_to, comm_from, TASK_COMM_LEN + 1); + if (field->flags && !(field->flags & HIST_FIELD_FL_VAR_REF)) { + const char *flags_str = 
get_hist_field_flags(field); + + if (flags_str) { + strcat(expr, "."); + strcat(expr, flags_str); + } + } } -static void hist_trigger_elt_comm_init(struct tracing_map_elt *elt) +static char *expr_str(struct hist_field *field, unsigned int level) { - char *comm = elt->private_data; + char *expr; + + if (level > 1) + return NULL; + + expr = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL); + if (!expr) + return NULL; + + if (!field->operands[0]) { + expr_field_str(field, expr); + return expr; + } + + if (field->operator == FIELD_OP_UNARY_MINUS) { + char *subexpr; - if (comm) - save_comm(comm, current); + strcat(expr, "-("); + subexpr = expr_str(field->operands[0], ++level); + if (!subexpr) { + kfree(expr); + return NULL; + } + strcat(expr, subexpr); + strcat(expr, ")"); + + kfree(subexpr); + + return expr; + } + + expr_field_str(field->operands[0], expr); + + switch (field->operator) { + case FIELD_OP_MINUS: + strcat(expr, "-"); + break; + case FIELD_OP_PLUS: + strcat(expr, "+"); + break; + default: + kfree(expr); + return NULL; + } + + expr_field_str(field->operands[1], expr); + + return expr; } -static const struct tracing_map_ops hist_trigger_elt_comm_ops = { - .elt_alloc = hist_trigger_elt_comm_alloc, - .elt_copy = hist_trigger_elt_comm_copy, - .elt_free = hist_trigger_elt_comm_free, - .elt_init = hist_trigger_elt_comm_init, -}; +static int contains_operator(char *str) +{ + enum field_op_id field_op = FIELD_OP_NONE; + char *op; + + op = strpbrk(str, "+-"); + if (!op) + return FIELD_OP_NONE; + + switch (*op) { + case '-': + if (*str == '-') + field_op = FIELD_OP_UNARY_MINUS; + else + field_op = FIELD_OP_MINUS; + break; + case '+': + field_op = FIELD_OP_PLUS; + break; + default: + break; + } + + return field_op; +} static void destroy_hist_field(struct hist_field *hist_field, unsigned int level) { unsigned int i; - if (level > 2) + if (level > 3) return; if (!hist_field) @@ -379,11 +2154,17 @@ static void destroy_hist_field(struct hist_field *hist_field, for (i = 0; i < 
HIST_FIELD_OPERANDS_MAX; i++) destroy_hist_field(hist_field->operands[i], level + 1); + kfree(hist_field->var.name); + kfree(hist_field->name); + kfree(hist_field->type); + kfree(hist_field); } -static struct hist_field *create_hist_field(struct ftrace_event_field *field, - unsigned long flags) +static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, + struct ftrace_event_field *field, + unsigned long flags, + char *var_name) { struct hist_field *hist_field; @@ -394,8 +2175,22 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field, if (!hist_field) return NULL; + hist_field->hist_data = hist_data; + + if (flags & HIST_FIELD_FL_EXPR || flags & HIST_FIELD_FL_ALIAS) + goto out; /* caller will populate */ + + if (flags & HIST_FIELD_FL_VAR_REF) { + hist_field->fn = hist_field_var_ref; + goto out; + } + if (flags & HIST_FIELD_FL_HITCOUNT) { hist_field->fn = hist_field_counter; + hist_field->size = sizeof(u64); + hist_field->type = kstrdup("u64", GFP_KERNEL); + if (!hist_field->type) + goto free; goto out; } @@ -407,8 +2202,29 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field, if (flags & HIST_FIELD_FL_LOG2) { unsigned long fl = flags & ~HIST_FIELD_FL_LOG2; hist_field->fn = hist_field_log2; - hist_field->operands[0] = create_hist_field(field, fl); + hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL); hist_field->size = hist_field->operands[0]->size; + hist_field->type = kstrdup(hist_field->operands[0]->type, GFP_KERNEL); + if (!hist_field->type) + goto free; + goto out; + } + + if (flags & HIST_FIELD_FL_TIMESTAMP) { + hist_field->fn = hist_field_timestamp; + hist_field->size = sizeof(u64); + hist_field->type = kstrdup("u64", GFP_KERNEL); + if (!hist_field->type) + goto free; + goto out; + } + + if (flags & HIST_FIELD_FL_CPU) { + hist_field->fn = hist_field_cpu; + hist_field->size = sizeof(int); + hist_field->type = kstrdup("unsigned int", GFP_KERNEL); + if 
(!hist_field->type) + goto free; goto out; } @@ -418,6 +2234,11 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field, if (is_string_field(field)) { flags |= HIST_FIELD_FL_STRING; + hist_field->size = MAX_FILTER_STR_VAL; + hist_field->type = kstrdup(field->type, GFP_KERNEL); + if (!hist_field->type) + goto free; + if (field->filter_type == FILTER_STATIC_STRING) hist_field->fn = hist_field_string; else if (field->filter_type == FILTER_DYN_STRING) @@ -425,6 +2246,12 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field, else hist_field->fn = hist_field_pstring; } else { + hist_field->size = field->size; + hist_field->is_signed = field->is_signed; + hist_field->type = kstrdup(field->type, GFP_KERNEL); + if (!hist_field->type) + goto free; + hist_field->fn = select_value_fn(field->size, field->is_signed); if (!hist_field->fn) { @@ -436,14 +2263,23 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field, hist_field->field = field; hist_field->flags = flags; + if (var_name) { + hist_field->var.name = kstrdup(var_name, GFP_KERNEL); + if (!hist_field->var.name) + goto free; + } + return hist_field; + free: + destroy_hist_field(hist_field, 0); + return NULL; } static void destroy_hist_fields(struct hist_trigger_data *hist_data) { unsigned int i; - for (i = 0; i < TRACING_MAP_FIELDS_MAX; i++) { + for (i = 0; i < HIST_FIELDS_MAX; i++) { if (hist_data->fields[i]) { destroy_hist_field(hist_data->fields[i], 0); hist_data->fields[i] = NULL; @@ -451,69 +2287,1610 @@ static void destroy_hist_fields(struct hist_trigger_data *hist_data) } } -static int create_hitcount_val(struct hist_trigger_data *hist_data) +static int init_var_ref(struct hist_field *ref_field, + struct hist_field *var_field, + char *system, char *event_name) { - hist_data->fields[HITCOUNT_IDX] = - create_hist_field(NULL, HIST_FIELD_FL_HITCOUNT); - if (!hist_data->fields[HITCOUNT_IDX]) - return -ENOMEM; + int err = 0; + + ref_field->var.idx 
= var_field->var.idx; + ref_field->var.hist_data = var_field->hist_data; + ref_field->size = var_field->size; + ref_field->is_signed = var_field->is_signed; + ref_field->flags |= var_field->flags & + (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS); + + if (system) { + ref_field->system = kstrdup(system, GFP_KERNEL); + if (!ref_field->system) + return -ENOMEM; + } - hist_data->n_vals++; + if (event_name) { + ref_field->event_name = kstrdup(event_name, GFP_KERNEL); + if (!ref_field->event_name) { + err = -ENOMEM; + goto free; + } + } - if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX)) + if (var_field->var.name) { + ref_field->name = kstrdup(var_field->var.name, GFP_KERNEL); + if (!ref_field->name) { + err = -ENOMEM; + goto free; + } + } else if (var_field->name) { + ref_field->name = kstrdup(var_field->name, GFP_KERNEL); + if (!ref_field->name) { + err = -ENOMEM; + goto free; + } + } + + ref_field->type = kstrdup(var_field->type, GFP_KERNEL); + if (!ref_field->type) { + err = -ENOMEM; + goto free; + } + out: + return err; + free: + kfree(ref_field->system); + kfree(ref_field->event_name); + kfree(ref_field->name); + + goto out; +} + +static struct hist_field *create_var_ref(struct hist_field *var_field, + char *system, char *event_name) +{ + unsigned long flags = HIST_FIELD_FL_VAR_REF; + struct hist_field *ref_field; + + ref_field = create_hist_field(var_field->hist_data, NULL, flags, NULL); + if (ref_field) { + if (init_var_ref(ref_field, var_field, system, event_name)) { + destroy_hist_field(ref_field, 0); + return NULL; + } + } + + return ref_field; +} + +static bool is_var_ref(char *var_name) +{ + if (!var_name || strlen(var_name) < 2 || var_name[0] != '$') + return false; + + return true; +} + +static char *field_name_from_var(struct hist_trigger_data *hist_data, + char *var_name) +{ + char *name, *field; + unsigned int i; + + for (i = 0; i < hist_data->attrs->var_defs.n_vars; i++) { + name = hist_data->attrs->var_defs.name[i]; + + if 
(strcmp(var_name, name) == 0) { + field = hist_data->attrs->var_defs.expr[i]; + if (contains_operator(field) || is_var_ref(field)) + continue; + return field; + } + } + + return NULL; +} + +static char *local_field_var_ref(struct hist_trigger_data *hist_data, + char *system, char *event_name, + char *var_name) +{ + struct trace_event_call *call; + + if (system && event_name) { + call = hist_data->event_file->event_call; + + if (strcmp(system, call->class->system) != 0) + return NULL; + + if (strcmp(event_name, trace_event_name(call)) != 0) + return NULL; + } + + if (!!system != !!event_name) + return NULL; + + if (!is_var_ref(var_name)) + return NULL; + + var_name++; + + return field_name_from_var(hist_data, var_name); +} + +static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data, + char *system, char *event_name, + char *var_name) +{ + struct hist_field *var_field = NULL, *ref_field = NULL; + + if (!is_var_ref(var_name)) + return NULL; + + var_name++; + + var_field = find_event_var(hist_data, system, event_name, var_name); + if (var_field) + ref_field = create_var_ref(var_field, system, event_name); + + if (!ref_field) + hist_err_event("Couldn't find variable: $", + system, event_name, var_name); + + return ref_field; +} + +static struct ftrace_event_field * +parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, + char *field_str, unsigned long *flags) +{ + struct ftrace_event_field *field = NULL; + char *field_name, *modifier, *str; + + modifier = str = kstrdup(field_str, GFP_KERNEL); + if (!modifier) + return ERR_PTR(-ENOMEM); + + field_name = strsep(&modifier, "."); + if (modifier) { + if (strcmp(modifier, "hex") == 0) + *flags |= HIST_FIELD_FL_HEX; + else if (strcmp(modifier, "sym") == 0) + *flags |= HIST_FIELD_FL_SYM; + else if (strcmp(modifier, "sym-offset") == 0) + *flags |= HIST_FIELD_FL_SYM_OFFSET; + else if ((strcmp(modifier, "execname") == 0) && + (strcmp(field_name, "common_pid") == 0)) + *flags |= 
HIST_FIELD_FL_EXECNAME; + else if (strcmp(modifier, "syscall") == 0) + *flags |= HIST_FIELD_FL_SYSCALL; + else if (strcmp(modifier, "log2") == 0) + *flags |= HIST_FIELD_FL_LOG2; + else if (strcmp(modifier, "usecs") == 0) + *flags |= HIST_FIELD_FL_TIMESTAMP_USECS; + else { + field = ERR_PTR(-EINVAL); + goto out; + } + } + + if (strcmp(field_name, "common_timestamp") == 0) { + *flags |= HIST_FIELD_FL_TIMESTAMP; + hist_data->enable_timestamps = true; + if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS) + hist_data->attrs->ts_in_usecs = true; + } else if (strcmp(field_name, "cpu") == 0) + *flags |= HIST_FIELD_FL_CPU; + else { + field = trace_find_event_field(file->event_call, field_name); + if (!field || !field->size) { + field = ERR_PTR(-EINVAL); + goto out; + } + } + out: + kfree(str); + + return field; +} + +static struct hist_field *create_alias(struct hist_trigger_data *hist_data, + struct hist_field *var_ref, + char *var_name) +{ + struct hist_field *alias = NULL; + unsigned long flags = HIST_FIELD_FL_ALIAS | HIST_FIELD_FL_VAR; + + alias = create_hist_field(hist_data, NULL, flags, var_name); + if (!alias) + return NULL; + + alias->fn = var_ref->fn; + alias->operands[0] = var_ref; + + if (init_var_ref(alias, var_ref, var_ref->system, var_ref->event_name)) { + destroy_hist_field(alias, 0); + return NULL; + } + + return alias; +} + +static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, + struct trace_event_file *file, char *str, + unsigned long *flags, char *var_name) +{ + char *s, *ref_system = NULL, *ref_event = NULL, *ref_var = str; + struct ftrace_event_field *field = NULL; + struct hist_field *hist_field = NULL; + int ret = 0; + + s = strchr(str, '.'); + if (s) { + s = strchr(++s, '.'); + if (s) { + ref_system = strsep(&str, "."); + if (!str) { + ret = -EINVAL; + goto out; + } + ref_event = strsep(&str, "."); + if (!str) { + ret = -EINVAL; + goto out; + } + ref_var = str; + } + } + + s = local_field_var_ref(hist_data, ref_system, ref_event, 
ref_var); + if (!s) { + hist_field = parse_var_ref(hist_data, ref_system, ref_event, ref_var); + if (hist_field) { + hist_data->var_refs[hist_data->n_var_refs] = hist_field; + hist_field->var_ref_idx = hist_data->n_var_refs++; + if (var_name) { + hist_field = create_alias(hist_data, hist_field, var_name); + if (!hist_field) { + ret = -ENOMEM; + goto out; + } + } + return hist_field; + } + } else + str = s; + + field = parse_field(hist_data, file, str, flags); + if (IS_ERR(field)) { + ret = PTR_ERR(field); + goto out; + } + + hist_field = create_hist_field(hist_data, field, *flags, var_name); + if (!hist_field) { + ret = -ENOMEM; + goto out; + } + + return hist_field; + out: + return ERR_PTR(ret); +} + +static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, + struct trace_event_file *file, + char *str, unsigned long flags, + char *var_name, unsigned int level); + +static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, + struct trace_event_file *file, + char *str, unsigned long flags, + char *var_name, unsigned int level) +{ + struct hist_field *operand1, *expr = NULL; + unsigned long operand_flags; + int ret = 0; + char *s; + + /* we support only -(xxx) i.e. 
explicit parens required */ + + if (level > 3) { + hist_err("Too many subexpressions (3 max): ", str); + ret = -EINVAL; + goto free; + } + + str++; /* skip leading '-' */ + + s = strchr(str, '('); + if (s) + str++; + else { + ret = -EINVAL; + goto free; + } + + s = strrchr(str, ')'); + if (s) + *s = '\0'; + else { + ret = -EINVAL; /* no closing ')' */ + goto free; + } + + flags |= HIST_FIELD_FL_EXPR; + expr = create_hist_field(hist_data, NULL, flags, var_name); + if (!expr) { + ret = -ENOMEM; + goto free; + } + + operand_flags = 0; + operand1 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level); + if (IS_ERR(operand1)) { + ret = PTR_ERR(operand1); + goto free; + } + + expr->flags |= operand1->flags & + (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS); + expr->fn = hist_field_unary_minus; + expr->operands[0] = operand1; + expr->operator = FIELD_OP_UNARY_MINUS; + expr->name = expr_str(expr, 0); + expr->type = kstrdup(operand1->type, GFP_KERNEL); + if (!expr->type) { + ret = -ENOMEM; + goto free; + } + + return expr; + free: + destroy_hist_field(expr, 0); + return ERR_PTR(ret); +} + +static int check_expr_operands(struct hist_field *operand1, + struct hist_field *operand2) +{ + unsigned long operand1_flags = operand1->flags; + unsigned long operand2_flags = operand2->flags; + + if ((operand1_flags & HIST_FIELD_FL_VAR_REF) || + (operand1_flags & HIST_FIELD_FL_ALIAS)) { + struct hist_field *var; + + var = find_var_field(operand1->var.hist_data, operand1->name); + if (!var) + return -EINVAL; + operand1_flags = var->flags; + } + + if ((operand2_flags & HIST_FIELD_FL_VAR_REF) || + (operand2_flags & HIST_FIELD_FL_ALIAS)) { + struct hist_field *var; + + var = find_var_field(operand2->var.hist_data, operand2->name); + if (!var) + return -EINVAL; + operand2_flags = var->flags; + } + + if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) != + (operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) { + hist_err("Timestamp units in expression don't match", NULL); 
return -EINVAL; + } return 0; } -static int create_val_field(struct hist_trigger_data *hist_data, - unsigned int val_idx, - struct trace_event_file *file, - char *field_str) +static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, + struct trace_event_file *file, + char *str, unsigned long flags, + char *var_name, unsigned int level) { - struct ftrace_event_field *field = NULL; - unsigned long flags = 0; - char *field_name; + struct hist_field *operand1 = NULL, *operand2 = NULL, *expr = NULL; + unsigned long operand_flags; + int field_op, ret = -EINVAL; + char *sep, *operand1_str; + + if (level > 3) { + hist_err("Too many subexpressions (3 max): ", str); + return ERR_PTR(-EINVAL); + } + + field_op = contains_operator(str); + + if (field_op == FIELD_OP_NONE) + return parse_atom(hist_data, file, str, &flags, var_name); + + if (field_op == FIELD_OP_UNARY_MINUS) + return parse_unary(hist_data, file, str, flags, var_name, ++level); + + switch (field_op) { + case FIELD_OP_MINUS: + sep = "-"; + break; + case FIELD_OP_PLUS: + sep = "+"; + break; + default: + goto free; + } + + operand1_str = strsep(&str, sep); + if (!operand1_str || !str) + goto free; + + operand_flags = 0; + operand1 = parse_atom(hist_data, file, operand1_str, + &operand_flags, NULL); + if (IS_ERR(operand1)) { + ret = PTR_ERR(operand1); + operand1 = NULL; + goto free; + } + + /* rest of string could be another expression e.g. 
b+c in a+b+c */ + operand_flags = 0; + operand2 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level); + if (IS_ERR(operand2)) { + ret = PTR_ERR(operand2); + operand2 = NULL; + goto free; + } + + ret = check_expr_operands(operand1, operand2); + if (ret) + goto free; + + flags |= HIST_FIELD_FL_EXPR; + + flags |= operand1->flags & + (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS); + + expr = create_hist_field(hist_data, NULL, flags, var_name); + if (!expr) { + ret = -ENOMEM; + goto free; + } + + operand1->read_once = true; + operand2->read_once = true; + + expr->operands[0] = operand1; + expr->operands[1] = operand2; + expr->operator = field_op; + expr->name = expr_str(expr, 0); + expr->type = kstrdup(operand1->type, GFP_KERNEL); + if (!expr->type) { + ret = -ENOMEM; + goto free; + } + + switch (field_op) { + case FIELD_OP_MINUS: + expr->fn = hist_field_minus; + break; + case FIELD_OP_PLUS: + expr->fn = hist_field_plus; + break; + default: + ret = -EINVAL; + goto free; + } + + return expr; + free: + destroy_hist_field(operand1, 0); + destroy_hist_field(operand2, 0); + destroy_hist_field(expr, 0); + + return ERR_PTR(ret); +} + +static char *find_trigger_filter(struct hist_trigger_data *hist_data, + struct trace_event_file *file) +{ + struct event_trigger_data *test; + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (test->private_data == hist_data) + return test->filter_str; + } + } + + return NULL; +} + +static struct event_command trigger_hist_cmd; +static int event_hist_trigger_func(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param); + +static bool compatible_keys(struct hist_trigger_data *target_hist_data, + struct hist_trigger_data *hist_data, + unsigned int n_keys) +{ + struct hist_field *target_hist_field, *hist_field; + unsigned int n, i, j; + + if (hist_data->n_fields - hist_data->n_vals != n_keys) + return false; + + i 
= hist_data->n_vals; + j = target_hist_data->n_vals; + + for (n = 0; n < n_keys; n++) { + hist_field = hist_data->fields[i + n]; + target_hist_field = target_hist_data->fields[j + n]; + + if (strcmp(hist_field->type, target_hist_field->type) != 0) + return false; + if (hist_field->size != target_hist_field->size) + return false; + if (hist_field->is_signed != target_hist_field->is_signed) + return false; + } + + return true; +} + +static struct hist_trigger_data * +find_compatible_hist(struct hist_trigger_data *target_hist_data, + struct trace_event_file *file) +{ + struct hist_trigger_data *hist_data; + struct event_trigger_data *test; + unsigned int n_keys; + + n_keys = target_hist_data->n_fields - target_hist_data->n_vals; + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + hist_data = test->private_data; + + if (compatible_keys(target_hist_data, hist_data, n_keys)) + return hist_data; + } + } + + return NULL; +} + +static struct trace_event_file *event_file(struct trace_array *tr, + char *system, char *event_name) +{ + struct trace_event_file *file; + + file = find_event_file(tr, system, event_name); + if (!file) + return ERR_PTR(-EINVAL); + + return file; +} + +static struct hist_field * +find_synthetic_field_var(struct hist_trigger_data *target_hist_data, + char *system, char *event_name, char *field_name) +{ + struct hist_field *event_var; + char *synthetic_name; + + synthetic_name = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL); + if (!synthetic_name) + return ERR_PTR(-ENOMEM); + + strcpy(synthetic_name, "synthetic_"); + strcat(synthetic_name, field_name); + + event_var = find_event_var(target_hist_data, system, event_name, synthetic_name); + + kfree(synthetic_name); + + return event_var; +} + +/** + * create_field_var_hist - Automatically create a histogram and var for a field + * @target_hist_data: The target hist trigger + * @subsys_name: Optional subsystem name + * @event_name: Optional event 
name + * @field_name: The name of the field (and the resulting variable) + * + * Hist trigger actions fetch data from variables, not directly from + * events. However, for convenience, users are allowed to directly + * specify an event field in an action, which will be automatically + * converted into a variable on their behalf. + + * If a user specifies a field on an event that isn't the event the + * histogram currently being defined (the target event histogram), the + * only way that can be accomplished is if a new hist trigger is + * created and the field variable defined on that. + * + * This function creates a new histogram compatible with the target + * event (meaning a histogram with the same key as the target + * histogram), and creates a variable for the specified field, but + * with 'synthetic_' prepended to the variable name in order to avoid + * collision with normal field variables. + * + * Return: The variable created for the field. + */ +static struct hist_field * +create_field_var_hist(struct hist_trigger_data *target_hist_data, + char *subsys_name, char *event_name, char *field_name) +{ + struct trace_array *tr = target_hist_data->event_file->tr; + struct hist_field *event_var = ERR_PTR(-EINVAL); + struct hist_trigger_data *hist_data; + unsigned int i, n, first = true; + struct field_var_hist *var_hist; + struct trace_event_file *file; + struct hist_field *key_field; + char *saved_filter; + char *cmd; + int ret; + + if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) { + hist_err_event("onmatch: Too many field variables defined: ", + subsys_name, event_name, field_name); + return ERR_PTR(-EINVAL); + } + + file = event_file(tr, subsys_name, event_name); + + if (IS_ERR(file)) { + hist_err_event("onmatch: Event file not found: ", + subsys_name, event_name, field_name); + ret = PTR_ERR(file); + return ERR_PTR(ret); + } + + /* + * Look for a histogram compatible with target. 
We'll use the + * found histogram specification to create a new matching + * histogram with our variable on it. target_hist_data is not + * yet a registered histogram so we can't use that. + */ + hist_data = find_compatible_hist(target_hist_data, file); + if (!hist_data) { + hist_err_event("onmatch: Matching event histogram not found: ", + subsys_name, event_name, field_name); + return ERR_PTR(-EINVAL); + } + + /* See if a synthetic field variable has already been created */ + event_var = find_synthetic_field_var(target_hist_data, subsys_name, + event_name, field_name); + if (!IS_ERR_OR_NULL(event_var)) + return event_var; + + var_hist = kzalloc(sizeof(*var_hist), GFP_KERNEL); + if (!var_hist) + return ERR_PTR(-ENOMEM); + + cmd = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL); + if (!cmd) { + kfree(var_hist); + return ERR_PTR(-ENOMEM); + } + + /* Use the same keys as the compatible histogram */ + strcat(cmd, "keys="); + + for_each_hist_key_field(i, hist_data) { + key_field = hist_data->fields[i]; + if (!first) + strcat(cmd, ","); + strcat(cmd, key_field->field->name); + first = false; + } + + /* Create the synthetic field variable specification */ + strcat(cmd, ":synthetic_"); + strcat(cmd, field_name); + strcat(cmd, "="); + strcat(cmd, field_name); + + /* Use the same filter as the compatible histogram */ + saved_filter = find_trigger_filter(hist_data, file); + if (saved_filter) { + strcat(cmd, " if "); + strcat(cmd, saved_filter); + } + + var_hist->cmd = kstrdup(cmd, GFP_KERNEL); + if (!var_hist->cmd) { + kfree(cmd); + kfree(var_hist); + return ERR_PTR(-ENOMEM); + } + + /* Save the compatible histogram information */ + var_hist->hist_data = hist_data; + + /* Create the new histogram with our variable */ + ret = event_hist_trigger_func(&trigger_hist_cmd, file, + "", "hist", cmd); + if (ret) { + kfree(cmd); + kfree(var_hist->cmd); + kfree(var_hist); + hist_err_event("onmatch: Couldn't create histogram for field: ", + subsys_name, event_name, field_name); + return 
ERR_PTR(ret); + } + + kfree(cmd); + + /* If we can't find the variable, something went wrong */ + event_var = find_synthetic_field_var(target_hist_data, subsys_name, + event_name, field_name); + if (IS_ERR_OR_NULL(event_var)) { + kfree(var_hist->cmd); + kfree(var_hist); + hist_err_event("onmatch: Couldn't find synthetic variable: ", + subsys_name, event_name, field_name); + return ERR_PTR(-EINVAL); + } + + n = target_hist_data->n_field_var_hists; + target_hist_data->field_var_hists[n] = var_hist; + target_hist_data->n_field_var_hists++; + + return event_var; +} + +static struct hist_field * +find_target_event_var(struct hist_trigger_data *hist_data, + char *subsys_name, char *event_name, char *var_name) +{ + struct trace_event_file *file = hist_data->event_file; + struct hist_field *hist_field = NULL; + + if (subsys_name) { + struct trace_event_call *call; + + if (!event_name) + return NULL; + + call = file->event_call; + + if (strcmp(subsys_name, call->class->system) != 0) + return NULL; + + if (strcmp(event_name, trace_event_name(call)) != 0) + return NULL; + } + + hist_field = find_var_field(hist_data, var_name); + + return hist_field; +} + +static inline void __update_field_vars(struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *rec, + struct field_var **field_vars, + unsigned int n_field_vars, + unsigned int field_var_str_start) +{ + struct hist_elt_data *elt_data = elt->private_data; + unsigned int i, j, var_idx; + u64 var_val; + + for (i = 0, j = field_var_str_start; i < n_field_vars; i++) { + struct field_var *field_var = field_vars[i]; + struct hist_field *var = field_var->var; + struct hist_field *val = field_var->val; + + var_val = val->fn(val, elt, rbe, rec); + var_idx = var->var.idx; + + if (val->flags & HIST_FIELD_FL_STRING) { + char *str = elt_data->field_var_str[j++]; + char *val_str = (char *)(uintptr_t)var_val; + + strscpy(str, val_str, STR_VAR_LEN_MAX); + var_val = (u64)(uintptr_t)str; + } + tracing_map_set_var(elt, var_idx, 
var_val); + } +} + +static void update_field_vars(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *rec) +{ + __update_field_vars(elt, rbe, rec, hist_data->field_vars, + hist_data->n_field_vars, 0); +} + +static void update_max_vars(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *rec) +{ + __update_field_vars(elt, rbe, rec, hist_data->max_vars, + hist_data->n_max_vars, hist_data->n_field_var_str); +} + +static struct hist_field *create_var(struct hist_trigger_data *hist_data, + struct trace_event_file *file, + char *name, int size, const char *type) +{ + struct hist_field *var; + int idx; + + if (find_var(hist_data, file, name) && !hist_data->remove) { + var = ERR_PTR(-EINVAL); + goto out; + } + + var = kzalloc(sizeof(struct hist_field), GFP_KERNEL); + if (!var) { + var = ERR_PTR(-ENOMEM); + goto out; + } + + idx = tracing_map_add_var(hist_data->map); + if (idx < 0) { + kfree(var); + var = ERR_PTR(-EINVAL); + goto out; + } + + var->flags = HIST_FIELD_FL_VAR; + var->var.idx = idx; + var->var.hist_data = var->hist_data = hist_data; + var->size = size; + var->var.name = kstrdup(name, GFP_KERNEL); + var->type = kstrdup(type, GFP_KERNEL); + if (!var->var.name || !var->type) { + kfree(var->var.name); + kfree(var->type); + kfree(var); + var = ERR_PTR(-ENOMEM); + } + out: + return var; +} + +static struct field_var *create_field_var(struct hist_trigger_data *hist_data, + struct trace_event_file *file, + char *field_name) +{ + struct hist_field *val = NULL, *var = NULL; + unsigned long flags = HIST_FIELD_FL_VAR; + struct field_var *field_var; int ret = 0; - if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX)) + if (hist_data->n_field_vars >= SYNTH_FIELDS_MAX) { + hist_err("Too many field variables defined: ", field_name); + ret = -EINVAL; + goto err; + } + + val = parse_atom(hist_data, file, field_name, &flags, NULL); + if (IS_ERR(val)) { + hist_err("Couldn't 
parse field variable: ", field_name); + ret = PTR_ERR(val); + goto err; + } + + var = create_var(hist_data, file, field_name, val->size, val->type); + if (IS_ERR(var)) { + hist_err("Couldn't create or find variable: ", field_name); + kfree(val); + ret = PTR_ERR(var); + goto err; + } + + field_var = kzalloc(sizeof(struct field_var), GFP_KERNEL); + if (!field_var) { + kfree(val); + kfree(var); + ret = -ENOMEM; + goto err; + } + + field_var->var = var; + field_var->val = val; + out: + return field_var; + err: + field_var = ERR_PTR(ret); + goto out; +} + +/** + * create_target_field_var - Automatically create a variable for a field + * @target_hist_data: The target hist trigger + * @subsys_name: Optional subsystem name + * @event_name: Optional event name + * @var_name: The name of the field (and the resulting variable) + * + * Hist trigger actions fetch data from variables, not directly from + * events. However, for convenience, users are allowed to directly + * specify an event field in an action, which will be automatically + * converted into a variable on their behalf. + + * This function creates a field variable with the name var_name on + * the hist trigger currently being defined on the target event. If + * subsys_name and event_name are specified, this function simply + * verifies that they do in fact match the target event subsystem and + * event name. + * + * Return: The variable created for the field. 
+ */ +static struct field_var * +create_target_field_var(struct hist_trigger_data *target_hist_data, + char *subsys_name, char *event_name, char *var_name) +{ + struct trace_event_file *file = target_hist_data->event_file; + + if (subsys_name) { + struct trace_event_call *call; + + if (!event_name) + return NULL; + + call = file->event_call; + + if (strcmp(subsys_name, call->class->system) != 0) + return NULL; + + if (strcmp(event_name, trace_event_name(call)) != 0) + return NULL; + } + + return create_field_var(target_hist_data, file, var_name); +} + +static void onmax_print(struct seq_file *m, + struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct action_data *data) +{ + unsigned int i, save_var_idx, max_idx = data->onmax.max_var->var.idx; + + seq_printf(m, "\n\tmax: %10llu", tracing_map_read_var(elt, max_idx)); + + for (i = 0; i < hist_data->n_max_vars; i++) { + struct hist_field *save_val = hist_data->max_vars[i]->val; + struct hist_field *save_var = hist_data->max_vars[i]->var; + u64 val; + + save_var_idx = save_var->var.idx; + + val = tracing_map_read_var(elt, save_var_idx); + + if (save_val->flags & HIST_FIELD_FL_STRING) { + seq_printf(m, " %s: %-32s", save_var->var.name, + (char *)(uintptr_t)(val)); + } else + seq_printf(m, " %s: %10llu", save_var->var.name, val); + } +} + +static void onmax_save(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, + struct action_data *data, u64 *var_ref_vals) +{ + unsigned int max_idx = data->onmax.max_var->var.idx; + unsigned int max_var_ref_idx = data->onmax.max_var_ref_idx; + + u64 var_val, max_val; + + var_val = var_ref_vals[max_var_ref_idx]; + max_val = tracing_map_read_var(elt, max_idx); + + if (var_val <= max_val) + return; + + tracing_map_set_var(elt, max_idx, var_val); + + update_max_vars(hist_data, elt, rbe, rec); +} + +static void onmax_destroy(struct action_data *data) +{ + unsigned int i; + + 
destroy_hist_field(data->onmax.max_var, 0); + destroy_hist_field(data->onmax.var, 0); + + kfree(data->onmax.var_str); + kfree(data->onmax.fn_name); + + for (i = 0; i < data->n_params; i++) + kfree(data->params[i]); + + kfree(data); +} + +static int onmax_create(struct hist_trigger_data *hist_data, + struct action_data *data) +{ + struct trace_event_file *file = hist_data->event_file; + struct hist_field *var_field, *ref_field, *max_var; + unsigned int var_ref_idx = hist_data->n_var_refs; + struct field_var *field_var; + char *onmax_var_str, *param; + unsigned long flags; + unsigned int i; + int ret = 0; + + onmax_var_str = data->onmax.var_str; + if (onmax_var_str[0] != '$') { + hist_err("onmax: For onmax(x), x must be a variable: ", onmax_var_str); return -EINVAL; + } + onmax_var_str++; - field_name = strsep(&field_str, "."); - if (field_str) { - if (strcmp(field_str, "hex") == 0) - flags |= HIST_FIELD_FL_HEX; - else { + var_field = find_target_event_var(hist_data, NULL, NULL, onmax_var_str); + if (!var_field) { + hist_err("onmax: Couldn't find onmax variable: ", onmax_var_str); + return -EINVAL; + } + + flags = HIST_FIELD_FL_VAR_REF; + ref_field = create_hist_field(hist_data, NULL, flags, NULL); + if (!ref_field) + return -ENOMEM; + + if (init_var_ref(ref_field, var_field, NULL, NULL)) { + destroy_hist_field(ref_field, 0); + ret = -ENOMEM; + goto out; + } + hist_data->var_refs[hist_data->n_var_refs] = ref_field; + ref_field->var_ref_idx = hist_data->n_var_refs++; + data->onmax.var = ref_field; + + data->fn = onmax_save; + data->onmax.max_var_ref_idx = var_ref_idx; + max_var = create_var(hist_data, file, "max", sizeof(u64), "u64"); + if (IS_ERR(max_var)) { + hist_err("onmax: Couldn't create onmax variable: ", "max"); + ret = PTR_ERR(max_var); + goto out; + } + data->onmax.max_var = max_var; + + for (i = 0; i < data->n_params; i++) { + param = kstrdup(data->params[i], GFP_KERNEL); + if (!param) { + ret = -ENOMEM; + goto out; + } + + field_var = 
create_target_field_var(hist_data, NULL, NULL, param); + if (IS_ERR(field_var)) { + hist_err("onmax: Couldn't create field variable: ", param); + ret = PTR_ERR(field_var); + kfree(param); + goto out; + } + + hist_data->max_vars[hist_data->n_max_vars++] = field_var; + if (field_var->val->flags & HIST_FIELD_FL_STRING) + hist_data->n_max_var_str++; + + kfree(param); + } + out: + return ret; +} + +static int parse_action_params(char *params, struct action_data *data) +{ + char *param, *saved_param; + int ret = 0; + + while (params) { + if (data->n_params >= SYNTH_FIELDS_MAX) + goto out; + + param = strsep(¶ms, ","); + if (!param) { ret = -EINVAL; goto out; } + + param = strstrip(param); + if (strlen(param) < 2) { + hist_err("Invalid action param: ", param); + ret = -EINVAL; + goto out; + } + + saved_param = kstrdup(param, GFP_KERNEL); + if (!saved_param) { + ret = -ENOMEM; + goto out; + } + + data->params[data->n_params++] = saved_param; } + out: + return ret; +} - field = trace_find_event_field(file->event_call, field_name); - if (!field || !field->size) { +static struct action_data *onmax_parse(char *str) +{ + char *onmax_fn_name, *onmax_var_str; + struct action_data *data; + int ret = -EINVAL; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return ERR_PTR(-ENOMEM); + + onmax_var_str = strsep(&str, ")"); + if (!onmax_var_str || !str) { ret = -EINVAL; - goto out; + goto free; + } + + data->onmax.var_str = kstrdup(onmax_var_str, GFP_KERNEL); + if (!data->onmax.var_str) { + ret = -ENOMEM; + goto free; + } + + strsep(&str, "."); + if (!str) + goto free; + + onmax_fn_name = strsep(&str, "("); + if (!onmax_fn_name || !str) + goto free; + + if (strncmp(onmax_fn_name, "save", strlen("save")) == 0) { + char *params = strsep(&str, ")"); + + if (!params) { + ret = -EINVAL; + goto free; + } + + ret = parse_action_params(params, data); + if (ret) + goto free; + } else + goto free; + + data->onmax.fn_name = kstrdup(onmax_fn_name, GFP_KERNEL); + if 
(!data->onmax.fn_name) { + ret = -ENOMEM; + goto free; + } + out: + return data; + free: + onmax_destroy(data); + data = ERR_PTR(ret); + goto out; +} + +static void onmatch_destroy(struct action_data *data) +{ + unsigned int i; + + mutex_lock(&synth_event_mutex); + + kfree(data->onmatch.match_event); + kfree(data->onmatch.match_event_system); + kfree(data->onmatch.synth_event_name); + + for (i = 0; i < data->n_params; i++) + kfree(data->params[i]); + + if (data->onmatch.synth_event) + data->onmatch.synth_event->ref--; + + kfree(data); + + mutex_unlock(&synth_event_mutex); +} + +static void destroy_field_var(struct field_var *field_var) +{ + if (!field_var) + return; + + destroy_hist_field(field_var->var, 0); + destroy_hist_field(field_var->val, 0); + + kfree(field_var); +} + +static void destroy_field_vars(struct hist_trigger_data *hist_data) +{ + unsigned int i; + + for (i = 0; i < hist_data->n_field_vars; i++) + destroy_field_var(hist_data->field_vars[i]); +} + +static void save_field_var(struct hist_trigger_data *hist_data, + struct field_var *field_var) +{ + hist_data->field_vars[hist_data->n_field_vars++] = field_var; + + if (field_var->val->flags & HIST_FIELD_FL_STRING) + hist_data->n_field_var_str++; +} + + +static void destroy_synth_var_refs(struct hist_trigger_data *hist_data) +{ + unsigned int i; + + for (i = 0; i < hist_data->n_synth_var_refs; i++) + destroy_hist_field(hist_data->synth_var_refs[i], 0); +} + +static void save_synth_var_ref(struct hist_trigger_data *hist_data, + struct hist_field *var_ref) +{ + hist_data->synth_var_refs[hist_data->n_synth_var_refs++] = var_ref; + + hist_data->var_refs[hist_data->n_var_refs] = var_ref; + var_ref->var_ref_idx = hist_data->n_var_refs++; +} + +static int check_synth_field(struct synth_event *event, + struct hist_field *hist_field, + unsigned int field_pos) +{ + struct synth_field *field; + + if (field_pos >= event->n_fields) + return -EINVAL; + + field = event->fields[field_pos]; + + if (strcmp(field->type, 
hist_field->type) != 0) + return -EINVAL; + + return 0; +} + +static struct hist_field * +onmatch_find_var(struct hist_trigger_data *hist_data, struct action_data *data, + char *system, char *event, char *var) +{ + struct hist_field *hist_field; + + var++; /* skip '$' */ + + hist_field = find_target_event_var(hist_data, system, event, var); + if (!hist_field) { + if (!system) { + system = data->onmatch.match_event_system; + event = data->onmatch.match_event; + } + + hist_field = find_event_var(hist_data, system, event, var); + } + + if (!hist_field) + hist_err_event("onmatch: Couldn't find onmatch param: $", system, event, var); + + return hist_field; +} + +static struct hist_field * +onmatch_create_field_var(struct hist_trigger_data *hist_data, + struct action_data *data, char *system, + char *event, char *var) +{ + struct hist_field *hist_field = NULL; + struct field_var *field_var; + + /* + * First try to create a field var on the target event (the + * currently being defined). This will create a variable for + * unqualified fields on the target event, or if qualified, + * target fields that have qualified names matching the target. + */ + field_var = create_target_field_var(hist_data, system, event, var); + + if (field_var && !IS_ERR(field_var)) { + save_field_var(hist_data, field_var); + hist_field = field_var->var; + } else { + field_var = NULL; + /* + * If no explicit system.event is specfied, default to + * looking for fields on the onmatch(system.event.xxx) + * event. + */ + if (!system) { + system = data->onmatch.match_event_system; + event = data->onmatch.match_event; + } + + /* + * At this point, we're looking at a field on another + * event. Because we can't modify a hist trigger on + * another event to add a variable for a field, we need + * to create a new trigger on that event and create the + * variable at the same time. 
+ */ + hist_field = create_field_var_hist(hist_data, system, event, var); + if (IS_ERR(hist_field)) + goto free; + } + out: + return hist_field; + free: + destroy_field_var(field_var); + hist_field = NULL; + goto out; +} + +static int onmatch_create(struct hist_trigger_data *hist_data, + struct trace_event_file *file, + struct action_data *data) +{ + char *event_name, *param, *system = NULL; + struct hist_field *hist_field, *var_ref; + unsigned int i, var_ref_idx; + unsigned int field_pos = 0; + struct synth_event *event; + int ret = 0; + + mutex_lock(&synth_event_mutex); + event = find_synth_event(data->onmatch.synth_event_name); + if (!event) { + hist_err("onmatch: Couldn't find synthetic event: ", data->onmatch.synth_event_name); + mutex_unlock(&synth_event_mutex); + return -EINVAL; + } + event->ref++; + mutex_unlock(&synth_event_mutex); + + var_ref_idx = hist_data->n_var_refs; + + for (i = 0; i < data->n_params; i++) { + char *p; + + p = param = kstrdup(data->params[i], GFP_KERNEL); + if (!param) { + ret = -ENOMEM; + goto err; + } + + system = strsep(¶m, "."); + if (!param) { + param = (char *)system; + system = event_name = NULL; + } else { + event_name = strsep(¶m, "."); + if (!param) { + kfree(p); + ret = -EINVAL; + goto err; + } + } + + if (param[0] == '$') + hist_field = onmatch_find_var(hist_data, data, system, + event_name, param); + else + hist_field = onmatch_create_field_var(hist_data, data, + system, + event_name, + param); + + if (!hist_field) { + kfree(p); + ret = -EINVAL; + goto err; + } + + if (check_synth_field(event, hist_field, field_pos) == 0) { + var_ref = create_var_ref(hist_field, system, event_name); + if (!var_ref) { + kfree(p); + ret = -ENOMEM; + goto err; + } + + save_synth_var_ref(hist_data, var_ref); + field_pos++; + kfree(p); + continue; + } + + hist_err_event("onmatch: Param type doesn't match synthetic event field type: ", + system, event_name, param); + kfree(p); + ret = -EINVAL; + goto err; + } + + if (field_pos != 
event->n_fields) { + hist_err("onmatch: Param count doesn't match synthetic event field count: ", event->name); + ret = -EINVAL; + goto err; + } + + data->fn = action_trace; + data->onmatch.synth_event = event; + data->onmatch.var_ref_idx = var_ref_idx; + out: + return ret; + err: + mutex_lock(&synth_event_mutex); + event->ref--; + mutex_unlock(&synth_event_mutex); + + goto out; +} + +static struct action_data *onmatch_parse(struct trace_array *tr, char *str) +{ + char *match_event, *match_event_system; + char *synth_event_name, *params; + struct action_data *data; + int ret = -EINVAL; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return ERR_PTR(-ENOMEM); + + match_event = strsep(&str, ")"); + if (!match_event || !str) { + hist_err("onmatch: Missing closing paren: ", match_event); + goto free; + } + + match_event_system = strsep(&match_event, "."); + if (!match_event) { + hist_err("onmatch: Missing subsystem for match event: ", match_event_system); + goto free; + } + + if (IS_ERR(event_file(tr, match_event_system, match_event))) { + hist_err_event("onmatch: Invalid subsystem or event name: ", + match_event_system, match_event, NULL); + goto free; + } + + data->onmatch.match_event = kstrdup(match_event, GFP_KERNEL); + if (!data->onmatch.match_event) { + ret = -ENOMEM; + goto free; + } + + data->onmatch.match_event_system = kstrdup(match_event_system, GFP_KERNEL); + if (!data->onmatch.match_event_system) { + ret = -ENOMEM; + goto free; + } + + strsep(&str, "."); + if (!str) { + hist_err("onmatch: Missing . 
after onmatch(): ", str); + goto free; + } + + synth_event_name = strsep(&str, "("); + if (!synth_event_name || !str) { + hist_err("onmatch: Missing opening paramlist paren: ", synth_event_name); + goto free; } - hist_data->fields[val_idx] = create_hist_field(field, flags); - if (!hist_data->fields[val_idx]) { + data->onmatch.synth_event_name = kstrdup(synth_event_name, GFP_KERNEL); + if (!data->onmatch.synth_event_name) { ret = -ENOMEM; + goto free; + } + + params = strsep(&str, ")"); + if (!params || !str || (str && strlen(str))) { + hist_err("onmatch: Missing closing paramlist paren: ", params); + goto free; + } + + ret = parse_action_params(params, data); + if (ret) + goto free; + out: + return data; + free: + onmatch_destroy(data); + data = ERR_PTR(ret); + goto out; +} + +static int create_hitcount_val(struct hist_trigger_data *hist_data) +{ + hist_data->fields[HITCOUNT_IDX] = + create_hist_field(hist_data, NULL, HIST_FIELD_FL_HITCOUNT, NULL); + if (!hist_data->fields[HITCOUNT_IDX]) + return -ENOMEM; + + hist_data->n_vals++; + hist_data->n_fields++; + + if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX)) + return -EINVAL; + + return 0; +} + +static int __create_val_field(struct hist_trigger_data *hist_data, + unsigned int val_idx, + struct trace_event_file *file, + char *var_name, char *field_str, + unsigned long flags) +{ + struct hist_field *hist_field; + int ret = 0; + + hist_field = parse_expr(hist_data, file, field_str, flags, var_name, 0); + if (IS_ERR(hist_field)) { + ret = PTR_ERR(hist_field); goto out; } + hist_data->fields[val_idx] = hist_field; + ++hist_data->n_vals; + ++hist_data->n_fields; - if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX)) + if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX)) ret = -EINVAL; out: return ret; } +static int create_val_field(struct hist_trigger_data *hist_data, + unsigned int val_idx, + struct trace_event_file *file, + char *field_str) +{ + if (WARN_ON(val_idx >= 
TRACING_MAP_VALS_MAX)) + return -EINVAL; + + return __create_val_field(hist_data, val_idx, file, NULL, field_str, 0); +} + +static int create_var_field(struct hist_trigger_data *hist_data, + unsigned int val_idx, + struct trace_event_file *file, + char *var_name, char *expr_str) +{ + unsigned long flags = 0; + + if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX)) + return -EINVAL; + + if (find_var(hist_data, file, var_name) && !hist_data->remove) { + hist_err("Variable already defined: ", var_name); + return -EINVAL; + } + + flags |= HIST_FIELD_FL_VAR; + hist_data->n_vars++; + if (WARN_ON(hist_data->n_vars > TRACING_MAP_VARS_MAX)) + return -EINVAL; + + return __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags); +} + static int create_val_fields(struct hist_trigger_data *hist_data, struct trace_event_file *file) { char *fields_str, *field_str; - unsigned int i, j; + unsigned int i, j = 1; int ret; ret = create_hitcount_val(hist_data); @@ -533,12 +3910,15 @@ static int create_val_fields(struct hist_trigger_data *hist_data, field_str = strsep(&fields_str, ","); if (!field_str) break; + if (strcmp(field_str, "hitcount") == 0) continue; + ret = create_val_field(hist_data, j++, file, field_str); if (ret) goto out; } + if (fields_str && (strcmp(fields_str, "hitcount") != 0)) ret = -EINVAL; out: @@ -551,12 +3931,13 @@ static int create_key_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, char *field_str) { - struct ftrace_event_field *field = NULL; + struct hist_field *hist_field = NULL; + unsigned long flags = 0; unsigned int key_size; int ret = 0; - if (WARN_ON(key_idx >= TRACING_MAP_FIELDS_MAX)) + if (WARN_ON(key_idx >= HIST_FIELDS_MAX)) return -EINVAL; flags |= HIST_FIELD_FL_KEY; @@ -564,57 +3945,40 @@ static int create_key_field(struct hist_trigger_data *hist_data, if (strcmp(field_str, "stacktrace") == 0) { flags |= HIST_FIELD_FL_STACKTRACE; key_size = sizeof(unsigned long) * HIST_STACKTRACE_DEPTH; + 
hist_field = create_hist_field(hist_data, NULL, flags, NULL); } else { - char *field_name = strsep(&field_str, "."); - - if (field_str) { - if (strcmp(field_str, "hex") == 0) - flags |= HIST_FIELD_FL_HEX; - else if (strcmp(field_str, "sym") == 0) - flags |= HIST_FIELD_FL_SYM; - else if (strcmp(field_str, "sym-offset") == 0) - flags |= HIST_FIELD_FL_SYM_OFFSET; - else if ((strcmp(field_str, "execname") == 0) && - (strcmp(field_name, "common_pid") == 0)) - flags |= HIST_FIELD_FL_EXECNAME; - else if (strcmp(field_str, "syscall") == 0) - flags |= HIST_FIELD_FL_SYSCALL; - else if (strcmp(field_str, "log2") == 0) - flags |= HIST_FIELD_FL_LOG2; - else { - ret = -EINVAL; - goto out; - } + hist_field = parse_expr(hist_data, file, field_str, flags, + NULL, 0); + if (IS_ERR(hist_field)) { + ret = PTR_ERR(hist_field); + goto out; } - field = trace_find_event_field(file->event_call, field_name); - if (!field || !field->size) { + if (hist_field->flags & HIST_FIELD_FL_VAR_REF) { + hist_err("Using variable references as keys not supported: ", field_str); + destroy_hist_field(hist_field, 0); ret = -EINVAL; goto out; } - if (is_string_field(field)) - key_size = MAX_FILTER_STR_VAL; - else - key_size = field->size; + key_size = hist_field->size; } - hist_data->fields[key_idx] = create_hist_field(field, flags); - if (!hist_data->fields[key_idx]) { - ret = -ENOMEM; - goto out; - } + hist_data->fields[key_idx] = hist_field; key_size = ALIGN(key_size, sizeof(u64)); hist_data->fields[key_idx]->size = key_size; hist_data->fields[key_idx]->offset = key_offset; + hist_data->key_size += key_size; + if (hist_data->key_size > HIST_KEY_SIZE_MAX) { ret = -EINVAL; goto out; } hist_data->n_keys++; + hist_data->n_fields++; if (WARN_ON(hist_data->n_keys > TRACING_MAP_KEYS_MAX)) return -EINVAL; @@ -658,21 +4022,113 @@ static int create_key_fields(struct hist_trigger_data *hist_data, return ret; } +static int create_var_fields(struct hist_trigger_data *hist_data, + struct trace_event_file *file) +{ + 
unsigned int i, j = hist_data->n_vals; + int ret = 0; + + unsigned int n_vars = hist_data->attrs->var_defs.n_vars; + + for (i = 0; i < n_vars; i++) { + char *var_name = hist_data->attrs->var_defs.name[i]; + char *expr = hist_data->attrs->var_defs.expr[i]; + + ret = create_var_field(hist_data, j++, file, var_name, expr); + if (ret) + goto out; + } + out: + return ret; +} + +static void free_var_defs(struct hist_trigger_data *hist_data) +{ + unsigned int i; + + for (i = 0; i < hist_data->attrs->var_defs.n_vars; i++) { + kfree(hist_data->attrs->var_defs.name[i]); + kfree(hist_data->attrs->var_defs.expr[i]); + } + + hist_data->attrs->var_defs.n_vars = 0; +} + +static int parse_var_defs(struct hist_trigger_data *hist_data) +{ + char *s, *str, *var_name, *field_str; + unsigned int i, j, n_vars = 0; + int ret = 0; + + for (i = 0; i < hist_data->attrs->n_assignments; i++) { + str = hist_data->attrs->assignment_str[i]; + for (j = 0; j < TRACING_MAP_VARS_MAX; j++) { + field_str = strsep(&str, ","); + if (!field_str) + break; + + var_name = strsep(&field_str, "="); + if (!var_name || !field_str) { + hist_err("Malformed assignment: ", var_name); + ret = -EINVAL; + goto free; + } + + if (n_vars == TRACING_MAP_VARS_MAX) { + hist_err("Too many variables defined: ", var_name); + ret = -EINVAL; + goto free; + } + + s = kstrdup(var_name, GFP_KERNEL); + if (!s) { + ret = -ENOMEM; + goto free; + } + hist_data->attrs->var_defs.name[n_vars] = s; + + s = kstrdup(field_str, GFP_KERNEL); + if (!s) { + kfree(hist_data->attrs->var_defs.name[n_vars]); + ret = -ENOMEM; + goto free; + } + hist_data->attrs->var_defs.expr[n_vars++] = s; + + hist_data->attrs->var_defs.n_vars = n_vars; + } + } + + return ret; + free: + free_var_defs(hist_data); + + return ret; +} + static int create_hist_fields(struct hist_trigger_data *hist_data, struct trace_event_file *file) { int ret; + ret = parse_var_defs(hist_data); + if (ret) + goto out; + ret = create_val_fields(hist_data, file); if (ret) goto out; - ret = 
create_key_fields(hist_data, file); + ret = create_var_fields(hist_data, file); if (ret) goto out; - hist_data->n_fields = hist_data->n_vals + hist_data->n_keys; + ret = create_key_fields(hist_data, file); + if (ret) + goto out; out: + free_var_defs(hist_data); + return ret; } @@ -695,7 +4151,7 @@ static int create_sort_keys(struct hist_trigger_data *hist_data) char *fields_str = hist_data->attrs->sort_key_str; struct tracing_map_sort_key *sort_key; int descending, ret = 0; - unsigned int i, j; + unsigned int i, j, k; hist_data->n_sort_keys = 1; /* we always have at least one, hitcount */ @@ -743,12 +4199,19 @@ static int create_sort_keys(struct hist_trigger_data *hist_data) continue; } - for (j = 1; j < hist_data->n_fields; j++) { + for (j = 1, k = 1; j < hist_data->n_fields; j++) { + unsigned int idx; + hist_field = hist_data->fields[j]; + if (hist_field->flags & HIST_FIELD_FL_VAR) + continue; + + idx = k++; + test_name = hist_field_name(hist_field, 0); if (strcmp(field_name, test_name) == 0) { - sort_key->field_idx = j; + sort_key->field_idx = idx; descending = is_descending(field_str); if (descending < 0) { ret = descending; @@ -763,16 +4226,230 @@ static int create_sort_keys(struct hist_trigger_data *hist_data) break; } } + hist_data->n_sort_keys = i; out: return ret; } +static void destroy_actions(struct hist_trigger_data *hist_data) +{ + unsigned int i; + + for (i = 0; i < hist_data->n_actions; i++) { + struct action_data *data = hist_data->actions[i]; + + if (data->fn == action_trace) + onmatch_destroy(data); + else if (data->fn == onmax_save) + onmax_destroy(data); + else + kfree(data); + } +} + +static int parse_actions(struct hist_trigger_data *hist_data) +{ + struct trace_array *tr = hist_data->event_file->tr; + struct action_data *data; + unsigned int i; + int ret = 0; + char *str; + + for (i = 0; i < hist_data->attrs->n_actions; i++) { + str = hist_data->attrs->action_str[i]; + + if (strncmp(str, "onmatch(", strlen("onmatch(")) == 0) { + char 
*action_str = str + strlen("onmatch("); + + data = onmatch_parse(tr, action_str); + if (IS_ERR(data)) { + ret = PTR_ERR(data); + break; + } + data->fn = action_trace; + } else if (strncmp(str, "onmax(", strlen("onmax(")) == 0) { + char *action_str = str + strlen("onmax("); + + data = onmax_parse(action_str); + if (IS_ERR(data)) { + ret = PTR_ERR(data); + break; + } + data->fn = onmax_save; + } else { + ret = -EINVAL; + break; + } + + hist_data->actions[hist_data->n_actions++] = data; + } + + return ret; +} + +static int create_actions(struct hist_trigger_data *hist_data, + struct trace_event_file *file) +{ + struct action_data *data; + unsigned int i; + int ret = 0; + + for (i = 0; i < hist_data->attrs->n_actions; i++) { + data = hist_data->actions[i]; + + if (data->fn == action_trace) { + ret = onmatch_create(hist_data, file, data); + if (ret) + return ret; + } else if (data->fn == onmax_save) { + ret = onmax_create(hist_data, data); + if (ret) + return ret; + } + } + + return ret; +} + +static void print_actions(struct seq_file *m, + struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt) +{ + unsigned int i; + + for (i = 0; i < hist_data->n_actions; i++) { + struct action_data *data = hist_data->actions[i]; + + if (data->fn == onmax_save) + onmax_print(m, hist_data, elt, data); + } +} + +static void print_onmax_spec(struct seq_file *m, + struct hist_trigger_data *hist_data, + struct action_data *data) +{ + unsigned int i; + + seq_puts(m, ":onmax("); + seq_printf(m, "%s", data->onmax.var_str); + seq_printf(m, ").%s(", data->onmax.fn_name); + + for (i = 0; i < hist_data->n_max_vars; i++) { + seq_printf(m, "%s", hist_data->max_vars[i]->var->var.name); + if (i < hist_data->n_max_vars - 1) + seq_puts(m, ","); + } + seq_puts(m, ")"); +} + +static void print_onmatch_spec(struct seq_file *m, + struct hist_trigger_data *hist_data, + struct action_data *data) +{ + unsigned int i; + + seq_printf(m, ":onmatch(%s.%s).", data->onmatch.match_event_system, + 
data->onmatch.match_event); + + seq_printf(m, "%s(", data->onmatch.synth_event->name); + + for (i = 0; i < data->n_params; i++) { + if (i) + seq_puts(m, ","); + seq_printf(m, "%s", data->params[i]); + } + + seq_puts(m, ")"); +} + +static bool actions_match(struct hist_trigger_data *hist_data, + struct hist_trigger_data *hist_data_test) +{ + unsigned int i, j; + + if (hist_data->n_actions != hist_data_test->n_actions) + return false; + + for (i = 0; i < hist_data->n_actions; i++) { + struct action_data *data = hist_data->actions[i]; + struct action_data *data_test = hist_data_test->actions[i]; + + if (data->fn != data_test->fn) + return false; + + if (data->n_params != data_test->n_params) + return false; + + for (j = 0; j < data->n_params; j++) { + if (strcmp(data->params[j], data_test->params[j]) != 0) + return false; + } + + if (data->fn == action_trace) { + if (strcmp(data->onmatch.synth_event_name, + data_test->onmatch.synth_event_name) != 0) + return false; + if (strcmp(data->onmatch.match_event_system, + data_test->onmatch.match_event_system) != 0) + return false; + if (strcmp(data->onmatch.match_event, + data_test->onmatch.match_event) != 0) + return false; + } else if (data->fn == onmax_save) { + if (strcmp(data->onmax.var_str, + data_test->onmax.var_str) != 0) + return false; + if (strcmp(data->onmax.fn_name, + data_test->onmax.fn_name) != 0) + return false; + } + } + + return true; +} + + +static void print_actions_spec(struct seq_file *m, + struct hist_trigger_data *hist_data) +{ + unsigned int i; + + for (i = 0; i < hist_data->n_actions; i++) { + struct action_data *data = hist_data->actions[i]; + + if (data->fn == action_trace) + print_onmatch_spec(m, hist_data, data); + else if (data->fn == onmax_save) + print_onmax_spec(m, hist_data, data); + } +} + +static void destroy_field_var_hists(struct hist_trigger_data *hist_data) +{ + unsigned int i; + + for (i = 0; i < hist_data->n_field_var_hists; i++) { + kfree(hist_data->field_var_hists[i]->cmd); + 
kfree(hist_data->field_var_hists[i]); + } +} + static void destroy_hist_data(struct hist_trigger_data *hist_data) { + if (!hist_data) + return; + destroy_hist_trigger_attrs(hist_data->attrs); destroy_hist_fields(hist_data); tracing_map_destroy(hist_data->map); + + destroy_actions(hist_data); + destroy_field_vars(hist_data); + destroy_field_var_hists(hist_data); + destroy_synth_var_refs(hist_data); + kfree(hist_data); } @@ -781,7 +4458,7 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data) struct tracing_map *map = hist_data->map; struct ftrace_event_field *field; struct hist_field *hist_field; - int i, idx; + int i, idx = 0; for_each_hist_field(i, hist_data) { hist_field = hist_data->fields[i]; @@ -792,6 +4469,9 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data) if (hist_field->flags & HIST_FIELD_FL_STACKTRACE) cmp_fn = tracing_map_cmp_none; + else if (!field) + cmp_fn = tracing_map_cmp_num(hist_field->size, + hist_field->is_signed); else if (is_string_field(field)) cmp_fn = tracing_map_cmp_string; else @@ -800,36 +4480,29 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data) idx = tracing_map_add_key_field(map, hist_field->offset, cmp_fn); - - } else + } else if (!(hist_field->flags & HIST_FIELD_FL_VAR)) idx = tracing_map_add_sum_field(map); if (idx < 0) return idx; - } - - return 0; -} - -static bool need_tracing_map_ops(struct hist_trigger_data *hist_data) -{ - struct hist_field *key_field; - unsigned int i; - - for_each_hist_key_field(i, hist_data) { - key_field = hist_data->fields[i]; - if (key_field->flags & HIST_FIELD_FL_EXECNAME) - return true; + if (hist_field->flags & HIST_FIELD_FL_VAR) { + idx = tracing_map_add_var(map); + if (idx < 0) + return idx; + hist_field->var.idx = idx; + hist_field->var.hist_data = hist_data; + } } - return false; + return 0; } static struct hist_trigger_data * create_hist_data(unsigned int map_bits, struct hist_trigger_attrs *attrs, - struct 
trace_event_file *file) + struct trace_event_file *file, + bool remove) { const struct tracing_map_ops *map_ops = NULL; struct hist_trigger_data *hist_data; @@ -840,6 +4513,12 @@ create_hist_data(unsigned int map_bits, return ERR_PTR(-ENOMEM); hist_data->attrs = attrs; + hist_data->remove = remove; + hist_data->event_file = file; + + ret = parse_actions(hist_data); + if (ret) + goto free; ret = create_hist_fields(hist_data, file); if (ret) @@ -849,8 +4528,7 @@ create_hist_data(unsigned int map_bits, if (ret) goto free; - if (need_tracing_map_ops(hist_data)) - map_ops = &hist_trigger_elt_comm_ops; + map_ops = &hist_trigger_elt_data_ops; hist_data->map = tracing_map_create(map_bits, hist_data->key_size, map_ops, hist_data); @@ -863,12 +4541,6 @@ create_hist_data(unsigned int map_bits, ret = create_tracing_map_fields(hist_data); if (ret) goto free; - - ret = tracing_map_init(hist_data->map); - if (ret) - goto free; - - hist_data->event_file = file; out: return hist_data; free: @@ -882,18 +4554,39 @@ create_hist_data(unsigned int map_bits, } static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, - void *rec) + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, + u64 *var_ref_vals) { + struct hist_elt_data *elt_data; struct hist_field *hist_field; - unsigned int i; + unsigned int i, var_idx; u64 hist_val; + elt_data = elt->private_data; + elt_data->var_ref_vals = var_ref_vals; + for_each_hist_val_field(i, hist_data) { hist_field = hist_data->fields[i]; - hist_val = hist_field->fn(hist_field, rec); + hist_val = hist_field->fn(hist_field, elt, rbe, rec); + if (hist_field->flags & HIST_FIELD_FL_VAR) { + var_idx = hist_field->var.idx; + tracing_map_set_var(elt, var_idx, hist_val); + continue; + } tracing_map_update_sum(elt, i, hist_val); } + + for_each_hist_key_field(i, hist_data) { + hist_field = hist_data->fields[i]; + if (hist_field->flags & HIST_FIELD_FL_VAR) { + hist_val = hist_field->fn(hist_field, 
elt, rbe, rec); + var_idx = hist_field->var.idx; + tracing_map_set_var(elt, var_idx, hist_val); + } + } + + update_field_vars(hist_data, elt, rbe, rec); } static inline void add_to_key(char *compound_key, void *key, @@ -920,15 +4613,31 @@ static inline void add_to_key(char *compound_key, void *key, memcpy(compound_key + key_field->offset, key, size); } -static void event_hist_trigger(struct event_trigger_data *data, void *rec) +static void +hist_trigger_actions(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, u64 *var_ref_vals) +{ + struct action_data *data; + unsigned int i; + + for (i = 0; i < hist_data->n_actions; i++) { + data = hist_data->actions[i]; + data->fn(hist_data, elt, rec, rbe, data, var_ref_vals); + } +} + +static void event_hist_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *rbe) { struct hist_trigger_data *hist_data = data->private_data; bool use_compound_key = (hist_data->n_keys > 1); unsigned long entries[HIST_STACKTRACE_DEPTH]; + u64 var_ref_vals[TRACING_MAP_VARS_MAX]; char compound_key[HIST_KEY_SIZE_MAX]; + struct tracing_map_elt *elt = NULL; struct stack_trace stacktrace; struct hist_field *key_field; - struct tracing_map_elt *elt; u64 field_contents; void *key = NULL; unsigned int i; @@ -949,7 +4658,7 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec) key = entries; } else { - field_contents = key_field->fn(key_field, rec); + field_contents = key_field->fn(key_field, elt, rbe, rec); if (key_field->flags & HIST_FIELD_FL_STRING) { key = (void *)(unsigned long)field_contents; use_compound_key = true; @@ -964,9 +4673,18 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec) if (use_compound_key) key = compound_key; + if (hist_data->n_var_refs && + !resolve_var_refs(hist_data, key, var_ref_vals, false)) + return; + elt = tracing_map_insert(hist_data->map, key); - if (elt) - 
hist_trigger_elt_update(hist_data, elt, rec); + if (!elt) + return; + + hist_trigger_elt_update(hist_data, elt, rec, rbe, var_ref_vals); + + if (resolve_var_refs(hist_data, key, var_ref_vals, true)) + hist_trigger_actions(hist_data, elt, rec, rbe, var_ref_vals); } static void hist_trigger_stacktrace_print(struct seq_file *m, @@ -1023,7 +4741,13 @@ hist_trigger_entry_print(struct seq_file *m, seq_printf(m, "%s: [%llx] %-55s", field_name, uval, str); } else if (key_field->flags & HIST_FIELD_FL_EXECNAME) { - char *comm = elt->private_data; + struct hist_elt_data *elt_data = elt->private_data; + char *comm; + + if (WARN_ON_ONCE(!elt_data)) + return; + + comm = elt_data->comm; uval = *(u64 *)(key + key_field->offset); seq_printf(m, "%s: %-16s[%10llu]", field_name, @@ -1067,6 +4791,10 @@ hist_trigger_entry_print(struct seq_file *m, for (i = 1; i < hist_data->n_vals; i++) { field_name = hist_field_name(hist_data->fields[i], 0); + if (hist_data->fields[i]->flags & HIST_FIELD_FL_VAR || + hist_data->fields[i]->flags & HIST_FIELD_FL_EXPR) + continue; + if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) { seq_printf(m, " %s: %10llx", field_name, tracing_map_read_sum(elt, i)); @@ -1076,6 +4804,8 @@ hist_trigger_entry_print(struct seq_file *m, } } + print_actions(m, hist_data, elt); + seq_puts(m, "\n"); } @@ -1144,6 +4874,11 @@ static int hist_show(struct seq_file *m, void *v) hist_trigger_show(m, data, n++); } + if (have_hist_err()) { + seq_printf(m, "\nERROR: %s\n", hist_err_str); + seq_printf(m, " Last command: %s\n", last_hist_cmd); + } + out_unlock: mutex_unlock(&event_mutex); @@ -1162,37 +4897,22 @@ const struct file_operations event_hist_fops = { .release = single_release, }; -static const char *get_hist_field_flags(struct hist_field *hist_field) -{ - const char *flags_str = NULL; - - if (hist_field->flags & HIST_FIELD_FL_HEX) - flags_str = "hex"; - else if (hist_field->flags & HIST_FIELD_FL_SYM) - flags_str = "sym"; - else if (hist_field->flags & 
HIST_FIELD_FL_SYM_OFFSET) - flags_str = "sym-offset"; - else if (hist_field->flags & HIST_FIELD_FL_EXECNAME) - flags_str = "execname"; - else if (hist_field->flags & HIST_FIELD_FL_SYSCALL) - flags_str = "syscall"; - else if (hist_field->flags & HIST_FIELD_FL_LOG2) - flags_str = "log2"; - - return flags_str; -} - static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) { const char *field_name = hist_field_name(hist_field, 0); - seq_printf(m, "%s", field_name); - if (hist_field->flags) { - const char *flags_str = get_hist_field_flags(hist_field); - - if (flags_str) - seq_printf(m, ".%s", flags_str); - } + if (hist_field->var.name) + seq_printf(m, "%s=", hist_field->var.name); + + if (hist_field->flags & HIST_FIELD_FL_CPU) + seq_puts(m, "cpu"); + else if (field_name) { + if (hist_field->flags & HIST_FIELD_FL_VAR_REF || + hist_field->flags & HIST_FIELD_FL_ALIAS) + seq_putc(m, '$'); + seq_printf(m, "%s", field_name); + } else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP) + seq_puts(m, "common_timestamp"); } static int event_hist_trigger_print(struct seq_file *m, @@ -1200,7 +4920,8 @@ static int event_hist_trigger_print(struct seq_file *m, struct event_trigger_data *data) { struct hist_trigger_data *hist_data = data->private_data; - struct hist_field *key_field; + struct hist_field *field; + bool have_var = false; unsigned int i; seq_puts(m, "hist:"); @@ -1211,25 +4932,47 @@ static int event_hist_trigger_print(struct seq_file *m, seq_puts(m, "keys="); for_each_hist_key_field(i, hist_data) { - key_field = hist_data->fields[i]; + field = hist_data->fields[i]; if (i > hist_data->n_vals) seq_puts(m, ","); - if (key_field->flags & HIST_FIELD_FL_STACKTRACE) + if (field->flags & HIST_FIELD_FL_STACKTRACE) seq_puts(m, "stacktrace"); else - hist_field_print(m, key_field); + hist_field_print(m, field); } seq_puts(m, ":vals="); for_each_hist_val_field(i, hist_data) { + field = hist_data->fields[i]; + if (field->flags & HIST_FIELD_FL_VAR) { + have_var = 
true; + continue; + } + if (i == HITCOUNT_IDX) seq_puts(m, "hitcount"); else { seq_puts(m, ","); - hist_field_print(m, hist_data->fields[i]); + hist_field_print(m, field); + } + } + + if (have_var) { + unsigned int n = 0; + + seq_puts(m, ":"); + + for_each_hist_val_field(i, hist_data) { + field = hist_data->fields[i]; + + if (field->flags & HIST_FIELD_FL_VAR) { + if (n++) + seq_puts(m, ","); + hist_field_print(m, field); + } } } @@ -1237,28 +4980,36 @@ static int event_hist_trigger_print(struct seq_file *m, for (i = 0; i < hist_data->n_sort_keys; i++) { struct tracing_map_sort_key *sort_key; + unsigned int idx, first_key_idx; + + /* skip VAR vals */ + first_key_idx = hist_data->n_vals - hist_data->n_vars; sort_key = &hist_data->sort_keys[i]; + idx = sort_key->field_idx; + + if (WARN_ON(idx >= HIST_FIELDS_MAX)) + return -EINVAL; if (i > 0) seq_puts(m, ","); - if (sort_key->field_idx == HITCOUNT_IDX) + if (idx == HITCOUNT_IDX) seq_puts(m, "hitcount"); else { - unsigned int idx = sort_key->field_idx; - - if (WARN_ON(idx >= TRACING_MAP_FIELDS_MAX)) - return -EINVAL; - + if (idx >= first_key_idx) + idx += hist_data->n_vars; hist_field_print(m, hist_data->fields[idx]); } if (sort_key->descending) seq_puts(m, ".descending"); } - seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits)); + if (hist_data->enable_timestamps) + seq_printf(m, ":clock=%s", hist_data->attrs->clock); + + print_actions_spec(m, hist_data); if (data->filter_str) seq_printf(m, " if %s", data->filter_str); @@ -1286,6 +5037,21 @@ static int event_hist_trigger_init(struct event_trigger_ops *ops, return 0; } +static void unregister_field_var_hists(struct hist_trigger_data *hist_data) +{ + struct trace_event_file *file; + unsigned int i; + char *cmd; + int ret; + + for (i = 0; i < hist_data->n_field_var_hists; i++) { + file = hist_data->field_var_hists[i]->hist_data->event_file; + cmd = hist_data->field_var_hists[i]->cmd; + ret = event_hist_trigger_func(&trigger_hist_cmd, file, + "!hist", "hist", cmd); + 
} +} + static void event_hist_trigger_free(struct event_trigger_ops *ops, struct event_trigger_data *data) { @@ -1298,7 +5064,13 @@ static void event_hist_trigger_free(struct event_trigger_ops *ops, if (!data->ref) { if (data->name) del_named_trigger(data); + trigger_data_free(data); + + remove_hist_vars(hist_data); + + unregister_field_var_hists(hist_data); + destroy_hist_data(hist_data); } } @@ -1425,6 +5197,15 @@ static bool hist_trigger_match(struct event_trigger_data *data, return false; if (key_field->offset != key_field_test->offset) return false; + if (key_field->size != key_field_test->size) + return false; + if (key_field->is_signed != key_field_test->is_signed) + return false; + if (!!key_field->var.name != !!key_field_test->var.name) + return false; + if (key_field->var.name && + strcmp(key_field->var.name, key_field_test->var.name) != 0) + return false; } for (i = 0; i < hist_data->n_sort_keys; i++) { @@ -1440,6 +5221,9 @@ static bool hist_trigger_match(struct event_trigger_data *data, (strcmp(data->filter_str, data_test->filter_str) != 0)) return false; + if (!actions_match(hist_data, hist_data_test)) + return false; + return true; } @@ -1456,6 +5240,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, if (named_data) { if (!hist_trigger_match(data, named_data, named_data, true)) { + hist_err("Named hist trigger doesn't match existing named trigger (includes variables): ", hist_data->attrs->name); ret = -EINVAL; goto out; } @@ -1475,13 +5260,16 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, test->paused = false; else if (hist_data->attrs->clear) hist_clear(test); - else + else { + hist_err("Hist trigger already exists", NULL); ret = -EEXIST; + } goto out; } } new: if (hist_data->attrs->cont || hist_data->attrs->clear) { + hist_err("Can't clear or continue a nonexistent hist trigger", NULL); ret = -ENOENT; goto out; } @@ -1490,7 +5278,6 @@ static int hist_register_trigger(char *glob, struct 
event_trigger_ops *ops, data->paused = true; if (named_data) { - destroy_hist_data(data->private_data); data->private_data = named_data->private_data; set_named_trigger_data(data, named_data); data->ops = &event_hist_trigger_named_ops; @@ -1502,8 +5289,32 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, goto out; } - list_add_rcu(&data->list, &file->triggers); + if (hist_data->enable_timestamps) { + char *clock = hist_data->attrs->clock; + + ret = tracing_set_clock(file->tr, hist_data->attrs->clock); + if (ret) { + hist_err("Couldn't set trace_clock: ", clock); + goto out; + } + + tracing_set_time_stamp_abs(file->tr, true); + } + + if (named_data) + destroy_hist_data(hist_data); + ret++; + out: + return ret; +} + +static int hist_trigger_enable(struct event_trigger_data *data, + struct trace_event_file *file) +{ + int ret = 0; + + list_add_tail_rcu(&data->list, &file->triggers); update_cond_flag(file); @@ -1512,10 +5323,55 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, update_cond_flag(file); ret--; } - out: + return ret; } +static bool have_hist_trigger_match(struct event_trigger_data *data, + struct trace_event_file *file) +{ + struct hist_trigger_data *hist_data = data->private_data; + struct event_trigger_data *test, *named_data = NULL; + bool match = false; + + if (hist_data->attrs->name) + named_data = find_named_trigger(hist_data->attrs->name); + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (hist_trigger_match(data, test, named_data, false)) { + match = true; + break; + } + } + } + + return match; +} + +static bool hist_trigger_check_refs(struct event_trigger_data *data, + struct trace_event_file *file) +{ + struct hist_trigger_data *hist_data = data->private_data; + struct event_trigger_data *test, *named_data = NULL; + + if (hist_data->attrs->name) + named_data = find_named_trigger(hist_data->attrs->name); + + 
list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (!hist_trigger_match(data, test, named_data, false)) + continue; + hist_data = test->private_data; + if (check_var_refs(hist_data)) + return true; + break; + } + } + + return false; +} + static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file) @@ -1541,17 +5397,55 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops, if (unregistered && test->ops->free) test->ops->free(test->ops, test); + + if (hist_data->enable_timestamps) { + if (!hist_data->remove || unregistered) + tracing_set_time_stamp_abs(file->tr, false); + } +} + +static bool hist_file_check_refs(struct trace_event_file *file) +{ + struct hist_trigger_data *hist_data; + struct event_trigger_data *test; + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + hist_data = test->private_data; + if (check_var_refs(hist_data)) + return true; + } + } + + return false; } static void hist_unreg_all(struct trace_event_file *file) { struct event_trigger_data *test, *n; + struct hist_trigger_data *hist_data; + struct synth_event *se; + const char *se_name; + + if (hist_file_check_refs(file)) + return; list_for_each_entry_safe(test, n, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + hist_data = test->private_data; list_del_rcu(&test->list); trace_event_trigger_enable_disable(file, 0); + + mutex_lock(&synth_event_mutex); + se_name = trace_event_name(file->event_call); + se = find_synth_event(se_name); + if (se) + se->ref--; + mutex_unlock(&synth_event_mutex); + update_cond_flag(file); + if (hist_data->enable_timestamps) + tracing_set_time_stamp_abs(file->tr, false); if (test->ops->free) test->ops->free(test->ops, test); } @@ -1567,16 +5461,54 @@ static int event_hist_trigger_func(struct event_command 
*cmd_ops, struct hist_trigger_attrs *attrs; struct event_trigger_ops *trigger_ops; struct hist_trigger_data *hist_data; - char *trigger; + struct synth_event *se; + const char *se_name; + bool remove = false; + char *trigger, *p; int ret = 0; + if (glob && strlen(glob)) { + last_cmd_set(param); + hist_err_clear(); + } + if (!param) return -EINVAL; - /* separate the trigger from the filter (k:v [if filter]) */ - trigger = strsep(¶m, " \t"); - if (!trigger) - return -EINVAL; + if (glob[0] == '!') + remove = true; + + /* + * separate the trigger from the filter (k:v [if filter]) + * allowing for whitespace in the trigger + */ + p = trigger = param; + do { + p = strstr(p, "if"); + if (!p) + break; + if (p == param) + return -EINVAL; + if (*(p - 1) != ' ' && *(p - 1) != '\t') { + p++; + continue; + } + if (p >= param + strlen(param) - strlen("if") - 1) + return -EINVAL; + if (*(p + strlen("if")) != ' ' && *(p + strlen("if")) != '\t') { + p++; + continue; + } + break; + } while (p); + + if (!p) + param = NULL; + else { + *(p - 1) = '\0'; + param = strstrip(p); + trigger = strstrip(trigger); + } attrs = parse_hist_trigger_attrs(trigger); if (IS_ERR(attrs)) @@ -1585,7 +5517,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, if (attrs->map_bits) hist_trigger_bits = attrs->map_bits; - hist_data = create_hist_data(hist_trigger_bits, attrs, file); + hist_data = create_hist_data(hist_trigger_bits, attrs, file, remove); if (IS_ERR(hist_data)) { destroy_hist_trigger_attrs(attrs); return PTR_ERR(hist_data); @@ -1593,10 +5525,11 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); - ret = -ENOMEM; trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); - if (!trigger_data) + if (!trigger_data) { + ret = -ENOMEM; goto out_free; + } trigger_data->count = -1; trigger_data->ops = trigger_ops; @@ -1614,8 +5547,24 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, goto out_free; 
} - if (glob[0] == '!') { + if (remove) { + if (!have_hist_trigger_match(trigger_data, file)) + goto out_free; + + if (hist_trigger_check_refs(trigger_data, file)) { + ret = -EBUSY; + goto out_free; + } + cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + + mutex_lock(&synth_event_mutex); + se_name = trace_event_name(file->event_call); + se = find_synth_event(se_name); + if (se) + se->ref--; + mutex_unlock(&synth_event_mutex); + ret = 0; goto out_free; } @@ -1632,14 +5581,47 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, goto out_free; } else if (ret < 0) goto out_free; + + if (get_named_trigger_data(trigger_data)) + goto enable; + + if (has_hist_vars(hist_data)) + save_hist_vars(hist_data); + + ret = create_actions(hist_data, file); + if (ret) + goto out_unreg; + + ret = tracing_map_init(hist_data->map); + if (ret) + goto out_unreg; +enable: + ret = hist_trigger_enable(trigger_data, file); + if (ret) + goto out_unreg; + + mutex_lock(&synth_event_mutex); + se_name = trace_event_name(file->event_call); + se = find_synth_event(se_name); + if (se) + se->ref++; + mutex_unlock(&synth_event_mutex); + /* Just return zero, not the number of registered triggers */ ret = 0; out: + if (ret == 0) + hist_err_clear(); + return ret; + out_unreg: + cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); out_free: if (cmd_ops->set_filter) cmd_ops->set_filter(NULL, trigger_data, NULL); + remove_hist_vars(hist_data); + kfree(trigger_data); destroy_hist_data(hist_data); @@ -1669,7 +5651,8 @@ __init int register_trigger_hist_cmd(void) } static void -hist_enable_trigger(struct event_trigger_data *data, void *rec) +hist_enable_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { struct enable_trigger_data *enable_data = data->private_data; struct event_trigger_data *test; @@ -1685,7 +5668,8 @@ hist_enable_trigger(struct event_trigger_data *data, void *rec) } static void -hist_enable_count_trigger(struct event_trigger_data 
*data, void *rec) +hist_enable_count_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { if (!data->count) return; @@ -1693,7 +5677,7 @@ hist_enable_count_trigger(struct event_trigger_data *data, void *rec) if (data->count != -1) (data->count)--; - hist_enable_trigger(data, rec); + hist_enable_trigger(data, rec, event); } static struct event_trigger_ops hist_enable_trigger_ops = { @@ -1798,3 +5782,31 @@ __init int register_trigger_hist_enable_disable_cmds(void) return ret; } + +static __init int trace_events_hist_init(void) +{ + struct dentry *entry = NULL; + struct dentry *d_tracer; + int err = 0; + + d_tracer = tracing_init_dentry(); + if (IS_ERR(d_tracer)) { + err = PTR_ERR(d_tracer); + goto err; + } + + entry = tracefs_create_file("synthetic_events", 0644, d_tracer, + NULL, &synth_events_fops); + if (!entry) { + err = -ENODEV; + goto err; + } + + return err; + err: + pr_warn("Could not create tracefs 'synthetic_events' entry\n"); + + return err; +} + +fs_initcall(trace_events_hist_init); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 87411482a46f..d251cabcf69a 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -63,7 +63,8 @@ void trigger_data_free(struct event_trigger_data *data) * any trigger that should be deferred, ETT_NONE if nothing to defer. 
*/ enum event_trigger_type -event_triggers_call(struct trace_event_file *file, void *rec) +event_triggers_call(struct trace_event_file *file, void *rec, + struct ring_buffer_event *event) { struct event_trigger_data *data; enum event_trigger_type tt = ETT_NONE; @@ -76,7 +77,7 @@ event_triggers_call(struct trace_event_file *file, void *rec) if (data->paused) continue; if (!rec) { - data->ops->func(data, rec); + data->ops->func(data, rec, event); continue; } filter = rcu_dereference_sched(data->filter); @@ -86,7 +87,7 @@ event_triggers_call(struct trace_event_file *file, void *rec) tt |= data->cmd_ops->trigger_type; continue; } - data->ops->func(data, rec); + data->ops->func(data, rec, event); } return tt; } @@ -108,7 +109,7 @@ EXPORT_SYMBOL_GPL(event_triggers_call); void event_triggers_post_call(struct trace_event_file *file, enum event_trigger_type tt, - void *rec) + void *rec, struct ring_buffer_event *event) { struct event_trigger_data *data; @@ -116,7 +117,7 @@ event_triggers_post_call(struct trace_event_file *file, if (data->paused) continue; if (data->cmd_ops->trigger_type & tt) - data->ops->func(data, rec); + data->ops->func(data, rec, event); } } EXPORT_SYMBOL_GPL(event_triggers_post_call); @@ -908,8 +909,15 @@ void set_named_trigger_data(struct event_trigger_data *data, data->named_data = named_data; } +struct event_trigger_data * +get_named_trigger_data(struct event_trigger_data *data) +{ + return data->named_data; +} + static void -traceon_trigger(struct event_trigger_data *data, void *rec) +traceon_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { if (tracing_is_on()) return; @@ -918,7 +926,8 @@ traceon_trigger(struct event_trigger_data *data, void *rec) } static void -traceon_count_trigger(struct event_trigger_data *data, void *rec) +traceon_count_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { if (tracing_is_on()) return; @@ -933,7 +942,8 @@ traceon_count_trigger(struct 
event_trigger_data *data, void *rec) } static void -traceoff_trigger(struct event_trigger_data *data, void *rec) +traceoff_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { if (!tracing_is_on()) return; @@ -942,7 +952,8 @@ traceoff_trigger(struct event_trigger_data *data, void *rec) } static void -traceoff_count_trigger(struct event_trigger_data *data, void *rec) +traceoff_count_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { if (!tracing_is_on()) return; @@ -1039,13 +1050,15 @@ static struct event_command trigger_traceoff_cmd = { #ifdef CONFIG_TRACER_SNAPSHOT static void -snapshot_trigger(struct event_trigger_data *data, void *rec) +snapshot_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { tracing_snapshot(); } static void -snapshot_count_trigger(struct event_trigger_data *data, void *rec) +snapshot_count_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { if (!data->count) return; @@ -1053,7 +1066,7 @@ snapshot_count_trigger(struct event_trigger_data *data, void *rec) if (data->count != -1) (data->count)--; - snapshot_trigger(data, rec); + snapshot_trigger(data, rec, event); } static int @@ -1141,13 +1154,15 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; } #endif static void -stacktrace_trigger(struct event_trigger_data *data, void *rec) +stacktrace_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { trace_dump_stack(STACK_SKIP); } static void -stacktrace_count_trigger(struct event_trigger_data *data, void *rec) +stacktrace_count_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { if (!data->count) return; @@ -1155,7 +1170,7 @@ stacktrace_count_trigger(struct event_trigger_data *data, void *rec) if (data->count != -1) (data->count)--; - stacktrace_trigger(data, rec); + stacktrace_trigger(data, rec, event); } static 
int @@ -1217,7 +1232,8 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void) } static void -event_enable_trigger(struct event_trigger_data *data, void *rec) +event_enable_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { struct enable_trigger_data *enable_data = data->private_data; @@ -1228,7 +1244,8 @@ event_enable_trigger(struct event_trigger_data *data, void *rec) } static void -event_enable_count_trigger(struct event_trigger_data *data, void *rec) +event_enable_count_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { struct enable_trigger_data *enable_data = data->private_data; @@ -1242,7 +1259,7 @@ event_enable_count_trigger(struct event_trigger_data *data, void *rec) if (data->count != -1) (data->count)--; - event_enable_trigger(data, rec); + event_enable_trigger(data, rec, event); } int event_enable_trigger_print(struct seq_file *m, diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 492700c5fb4d..1cd3fb4d70f8 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -21,6 +21,7 @@ #include <linux/module.h> #include <linux/uaccess.h> #include <linux/rculist.h> +#include <linux/error-injection.h> #include "trace_probe.h" @@ -42,7 +43,6 @@ struct trace_kprobe { (offsetof(struct trace_kprobe, tp.args) + \ (sizeof(struct probe_arg) * (n))) - static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) { return tk->rp.handler != NULL; @@ -87,6 +87,30 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) return nhit; } +bool trace_kprobe_on_func_entry(struct trace_event_call *call) +{ + struct trace_kprobe *tk = (struct trace_kprobe *)call->data; + + return kprobe_on_func_entry(tk->rp.kp.addr, + tk->rp.kp.addr ? NULL : tk->rp.kp.symbol_name, + tk->rp.kp.addr ? 
0 : tk->rp.kp.offset); +} + +bool trace_kprobe_error_injectable(struct trace_event_call *call) +{ + struct trace_kprobe *tk = (struct trace_kprobe *)call->data; + unsigned long addr; + + if (tk->symbol) { + addr = (unsigned long) + kallsyms_lookup_name(trace_kprobe_symbol(tk)); + addr += tk->rp.kp.offset; + } else { + addr = (unsigned long)tk->rp.kp.addr; + } + return within_error_injection_list(addr); +} + static int register_kprobe_event(struct trace_kprobe *tk); static int unregister_kprobe_event(struct trace_kprobe *tk); @@ -438,6 +462,14 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) disable_kprobe(&tk->rp.kp); wait = 1; } + + /* + * if tk is not added to any list, it must be a local trace_kprobe + * created with perf_event_open. We don't need to wait for these + * trace_kprobes + */ + if (list_empty(&tk->list)) + wait = 0; out: if (wait) { /* @@ -635,7 +667,7 @@ static int create_trace_kprobe(int argc, char **argv) char *symbol = NULL, *event = NULL, *group = NULL; int maxactive = 0; char *arg; - unsigned long offset = 0; + long offset = 0; void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; @@ -723,7 +755,7 @@ static int create_trace_kprobe(int argc, char **argv) symbol = argv[1]; /* TODO: support .init module functions */ ret = traceprobe_split_symbol_offset(symbol, &offset); - if (ret) { + if (ret || offset < 0 || offset > UINT_MAX) { pr_info("Failed to parse either an address or a symbol.\n"); return ret; } @@ -1170,7 +1202,7 @@ static int kretprobe_event_define_fields(struct trace_event_call *event_call) #ifdef CONFIG_PERF_EVENTS /* Kprobe profile handler */ -static void +static int kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct trace_event_call *call = &tk->tp.call; @@ -1179,12 +1211,31 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) int size, __size, dsize; int rctx; - if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) - return; + if (bpf_prog_array_valid(call)) { + 
unsigned long orig_ip = instruction_pointer(regs); + int ret; + + ret = trace_call_bpf(call, regs); + + /* + * We need to check and see if we modified the pc of the + * pt_regs, and if so clear the kprobe and return 1 so that we + * don't do the single stepping. + * The ftrace kprobe handler leaves it up to us to re-enable + * preemption here before returning if we've modified the ip. + */ + if (orig_ip != instruction_pointer(regs)) { + reset_current_kprobe(); + preempt_enable_no_resched(); + return 1; + } + if (!ret) + return 0; + } head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) - return; + return 0; dsize = __get_data_size(&tk->tp, regs); __size = sizeof(*entry) + tk->tp.size + dsize; @@ -1193,13 +1244,14 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) entry = perf_trace_buf_alloc(size, NULL, &rctx); if (!entry) - return; + return 0; entry->ip = (unsigned long)tk->rp.kp.addr; memset(&entry[1], 0, dsize); store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); + return 0; } NOKPROBE_SYMBOL(kprobe_perf_func); @@ -1275,6 +1327,7 @@ static int kprobe_register(struct trace_event_call *event, static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); + int ret = 0; raw_cpu_inc(*tk->nhit); @@ -1282,9 +1335,9 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) kprobe_trace_func(tk, regs); #ifdef CONFIG_PERF_EVENTS if (tk->tp.flags & TP_FLAG_PROFILE) - kprobe_perf_func(tk, regs); + ret = kprobe_perf_func(tk, regs); #endif - return 0; /* We don't tweek kernel, so just return 0 */ + return ret; } NOKPROBE_SYMBOL(kprobe_dispatcher); @@ -1313,12 +1366,9 @@ static struct trace_event_functions kprobe_funcs = { .trace = print_kprobe_event }; -static int register_kprobe_event(struct trace_kprobe *tk) +static inline void 
init_trace_event_call(struct trace_kprobe *tk, + struct trace_event_call *call) { - struct trace_event_call *call = &tk->tp.call; - int ret; - - /* Initialize trace_event_call */ INIT_LIST_HEAD(&call->class->fields); if (trace_kprobe_is_return(tk)) { call->event.funcs = &kretprobe_funcs; @@ -1327,6 +1377,19 @@ static int register_kprobe_event(struct trace_kprobe *tk) call->event.funcs = &kprobe_funcs; call->class->define_fields = kprobe_event_define_fields; } + + call->flags = TRACE_EVENT_FL_KPROBE; + call->class->reg = kprobe_register; + call->data = tk; +} + +static int register_kprobe_event(struct trace_kprobe *tk) +{ + struct trace_event_call *call = &tk->tp.call; + int ret = 0; + + init_trace_event_call(tk, call); + if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) return -ENOMEM; ret = register_trace_event(&call->event); @@ -1334,9 +1397,6 @@ static int register_kprobe_event(struct trace_kprobe *tk) kfree(call->print_fmt); return -ENODEV; } - call->flags = TRACE_EVENT_FL_KPROBE; - call->class->reg = kprobe_register; - call->data = tk; ret = trace_add_event_call(call); if (ret) { pr_info("Failed to register kprobe event: %s\n", @@ -1358,6 +1418,66 @@ static int unregister_kprobe_event(struct trace_kprobe *tk) return ret; } +#ifdef CONFIG_PERF_EVENTS +/* create a trace_kprobe, but don't add it to global lists */ +struct trace_event_call * +create_local_trace_kprobe(char *func, void *addr, unsigned long offs, + bool is_return) +{ + struct trace_kprobe *tk; + int ret; + char *event; + + /* + * local trace_kprobes are not added to probe_list, so they are never + * searched in find_trace_kprobe(). Therefore, there is no concern of + * duplicated name here. + */ + event = func ? 
func : "DUMMY_EVENT"; + + tk = alloc_trace_kprobe(KPROBE_EVENT_SYSTEM, event, (void *)addr, func, + offs, 0 /* maxactive */, 0 /* nargs */, + is_return); + + if (IS_ERR(tk)) { + pr_info("Failed to allocate trace_probe.(%d)\n", + (int)PTR_ERR(tk)); + return ERR_CAST(tk); + } + + init_trace_event_call(tk, &tk->tp.call); + + if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) { + ret = -ENOMEM; + goto error; + } + + ret = __register_trace_kprobe(tk); + if (ret < 0) + goto error; + + return &tk->tp.call; +error: + free_trace_kprobe(tk); + return ERR_PTR(ret); +} + +void destroy_local_trace_kprobe(struct trace_event_call *event_call) +{ + struct trace_kprobe *tk; + + tk = container_of(event_call, struct trace_kprobe, tp.call); + + if (trace_probe_is_enabled(&tk->tp)) { + WARN_ON(1); + return; + } + + __unregister_trace_kprobe(tk); + free_trace_kprobe(tk); +} +#endif /* CONFIG_PERF_EVENTS */ + /* Make a tracefs interface for controlling probe points */ static __init int init_kprobe_trace(void) { diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index ad1d6164e946..50f44b7b2b32 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -196,7 +196,7 @@ struct notifier_block module_trace_bprintk_format_nb = { }; int __trace_bprintk(unsigned long ip, const char *fmt, ...) - { +{ int ret; va_list ap; @@ -214,7 +214,7 @@ int __trace_bprintk(unsigned long ip, const char *fmt, ...) EXPORT_SYMBOL_GPL(__trace_bprintk); int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap) - { +{ if (unlikely(!fmt)) return 0; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index d59357308677..daf54bda4dc8 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -320,7 +320,7 @@ static fetch_func_t get_fetch_size_function(const struct fetch_type *type, } /* Split symbol and offset. 
*/ -int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset) +int traceprobe_split_symbol_offset(char *symbol, long *offset) { char *tmp; int ret; @@ -328,13 +328,11 @@ int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset) if (!offset) return -EINVAL; - tmp = strchr(symbol, '+'); + tmp = strpbrk(symbol, "+-"); if (tmp) { - /* skip sign because kstrtoul doesn't accept '+' */ - ret = kstrtoul(tmp + 1, 0, offset); + ret = kstrtol(tmp, 0, offset); if (ret) return ret; - *tmp = '\0'; } else *offset = 0; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index fb66e3eaa192..75daff22ccea 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -252,6 +252,8 @@ struct symbol_cache; unsigned long update_symbol_cache(struct symbol_cache *sc); void free_symbol_cache(struct symbol_cache *sc); struct symbol_cache *alloc_symbol_cache(const char *sym, long offset); +bool trace_kprobe_on_func_entry(struct trace_event_call *call); +bool trace_kprobe_error_injectable(struct trace_event_call *call); #else /* uprobes do not support symbol fetch methods */ #define fetch_symbol_u8 NULL @@ -277,6 +279,16 @@ alloc_symbol_cache(const char *sym, long offset) { return NULL; } + +static inline bool trace_kprobe_on_func_entry(struct trace_event_call *call) +{ + return false; +} + +static inline bool trace_kprobe_error_injectable(struct trace_event_call *call) +{ + return false; +} #endif /* CONFIG_KPROBE_EVENTS */ struct probe_arg { @@ -353,7 +365,7 @@ extern int traceprobe_conflict_field_name(const char *name, extern void traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); -extern int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset); +extern int traceprobe_split_symbol_offset(char *symbol, long *offset); /* Sum up total data length for dynamic arraies (strings) */ static nokprobe_inline int @@ -404,3 +416,14 @@ store_trace_args(int ent_size, struct 
trace_probe *tp, struct pt_regs *regs, } extern int set_print_fmt(struct trace_probe *tp, bool is_return); + +#ifdef CONFIG_PERF_EVENTS +extern struct trace_event_call * +create_local_trace_kprobe(char *func, void *addr, unsigned long offs, + bool is_return); +extern void destroy_local_trace_kprobe(struct trace_event_call *event_call); + +extern struct trace_event_call * +create_local_trace_uprobe(char *name, unsigned long offs, bool is_return); +extern void destroy_local_trace_uprobe(struct trace_event_call *event_call); +#endif diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c index 8cda06a10d66..c364cf777e1a 100644 --- a/kernel/trace/trace_selftest_dynamic.c +++ b/kernel/trace/trace_selftest_dynamic.c @@ -1,13 +1,14 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/compiler.h> #include "trace.h" -int DYN_FTRACE_TEST_NAME(void) +noinline __noclone int DYN_FTRACE_TEST_NAME(void) { /* used to call mcount */ return 0; } -int DYN_FTRACE_TEST_NAME2(void) +noinline __noclone int DYN_FTRACE_TEST_NAME2(void) { /* used to call mcount */ return 0; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 40592e7b3568..34fd0e0ec51d 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -151,6 +151,8 @@ static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, return; ret = strncpy_from_user(dst, src, maxlen); + if (ret == maxlen) + dst[--ret] = '\0'; if (ret < 0) { /* Failed to fetch string */ ((u8 *)get_rloc_data(dest))[0] = '\0'; @@ -446,7 +448,7 @@ static int create_trace_uprobe(int argc, char **argv) if (ret) goto fail_address_parse; - inode = igrab(d_inode(path.dentry)); + inode = igrab(d_real_inode(path.dentry)); path_put(&path); if (!inode || !S_ISREG(inode->i_mode)) { @@ -602,24 +604,9 @@ static int probes_seq_show(struct seq_file *m, void *v) char c = is_ret_probe(tu) ? 
'r' : 'p'; int i; - seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, - trace_event_name(&tu->tp.call)); - seq_printf(m, " %s:", tu->filename); - - /* Don't print "0x (null)" when offset is 0 */ - if (tu->offset) { - seq_printf(m, "0x%p", (void *)tu->offset); - } else { - switch (sizeof(void *)) { - case 4: - seq_printf(m, "0x00000000"); - break; - case 8: - default: - seq_printf(m, "0x0000000000000000"); - break; - } - } + seq_printf(m, "%c:%s/%s %s:0x%0*lx", c, tu->tp.call.class->system, + trace_event_name(&tu->tp.call), tu->filename, + (int)(sizeof(void *) * 2), tu->offset); for (i = 0; i < tu->tp.nr_args; i++) seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); @@ -1292,16 +1279,25 @@ static struct trace_event_functions uprobe_funcs = { .trace = print_uprobe_event }; -static int register_uprobe_event(struct trace_uprobe *tu) +static inline void init_trace_event_call(struct trace_uprobe *tu, + struct trace_event_call *call) { - struct trace_event_call *call = &tu->tp.call; - int ret; - - /* Initialize trace_event_call */ INIT_LIST_HEAD(&call->class->fields); call->event.funcs = &uprobe_funcs; call->class->define_fields = uprobe_event_define_fields; + call->flags = TRACE_EVENT_FL_UPROBE; + call->class->reg = trace_uprobe_register; + call->data = tu; +} + +static int register_uprobe_event(struct trace_uprobe *tu) +{ + struct trace_event_call *call = &tu->tp.call; + int ret = 0; + + init_trace_event_call(tu, call); + if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) return -ENOMEM; @@ -1311,9 +1307,6 @@ static int register_uprobe_event(struct trace_uprobe *tu) return -ENODEV; } - call->flags = TRACE_EVENT_FL_UPROBE; - call->class->reg = trace_uprobe_register; - call->data = tu; ret = trace_add_event_call(call); if (ret) { @@ -1339,6 +1332,70 @@ static int unregister_uprobe_event(struct trace_uprobe *tu) return 0; } +#ifdef CONFIG_PERF_EVENTS +struct trace_event_call * +create_local_trace_uprobe(char *name, unsigned long offs, bool is_return) +{ 
+ struct trace_uprobe *tu; + struct inode *inode; + struct path path; + int ret; + + ret = kern_path(name, LOOKUP_FOLLOW, &path); + if (ret) + return ERR_PTR(ret); + + inode = igrab(d_inode(path.dentry)); + path_put(&path); + + if (!inode || !S_ISREG(inode->i_mode)) { + iput(inode); + return ERR_PTR(-EINVAL); + } + + /* + * local trace_kprobes are not added to probe_list, so they are never + * searched in find_trace_kprobe(). Therefore, there is no concern of + * duplicated name "DUMMY_EVENT" here. + */ + tu = alloc_trace_uprobe(UPROBE_EVENT_SYSTEM, "DUMMY_EVENT", 0, + is_return); + + if (IS_ERR(tu)) { + pr_info("Failed to allocate trace_uprobe.(%d)\n", + (int)PTR_ERR(tu)); + return ERR_CAST(tu); + } + + tu->offset = offs; + tu->inode = inode; + tu->filename = kstrdup(name, GFP_KERNEL); + init_trace_event_call(tu, &tu->tp.call); + + if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) { + ret = -ENOMEM; + goto error; + } + + return &tu->tp.call; +error: + free_trace_uprobe(tu); + return ERR_PTR(ret); +} + +void destroy_local_trace_uprobe(struct trace_event_call *event_call) +{ + struct trace_uprobe *tu; + + tu = container_of(event_call, struct trace_uprobe, tp.call); + + kfree(tu->tp.call.print_fmt); + tu->tp.call.print_fmt = NULL; + + free_trace_uprobe(tu); +} +#endif /* CONFIG_PERF_EVENTS */ + /* Make a trace interface for controling probe points */ static __init int init_uprobe_trace(void) { diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 07e75344725b..5cadb1b8b5fe 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -66,6 +66,73 @@ u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i) return (u64)atomic64_read(&elt->fields[i].sum); } +/** + * tracing_map_set_var - Assign a tracing_map_elt's variable field + * @elt: The tracing_map_elt + * @i: The index of the given variable associated with the tracing_map_elt + * @n: The value to assign + * + * Assign n to variable i associated with the specified 
tracing_map_elt + * instance. The index i is the index returned by the call to + * tracing_map_add_var() when the tracing map was set up. + */ +void tracing_map_set_var(struct tracing_map_elt *elt, unsigned int i, u64 n) +{ + atomic64_set(&elt->vars[i], n); + elt->var_set[i] = true; +} + +/** + * tracing_map_var_set - Return whether or not a variable has been set + * @elt: The tracing_map_elt + * @i: The index of the given variable associated with the tracing_map_elt + * + * Return true if the variable has been set, false otherwise. The + * index i is the index returned by the call to tracing_map_add_var() + * when the tracing map was set up. + */ +bool tracing_map_var_set(struct tracing_map_elt *elt, unsigned int i) +{ + return elt->var_set[i]; +} + +/** + * tracing_map_read_var - Return the value of a tracing_map_elt's variable field + * @elt: The tracing_map_elt + * @i: The index of the given variable associated with the tracing_map_elt + * + * Retrieve the value of the variable i associated with the specified + * tracing_map_elt instance. The index i is the index returned by the + * call to tracing_map_add_var() when the tracing map was set + * up. + * + * Return: The variable value associated with field i for elt. + */ +u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i) +{ + return (u64)atomic64_read(&elt->vars[i]); +} + +/** + * tracing_map_read_var_once - Return and reset a tracing_map_elt's variable field + * @elt: The tracing_map_elt + * @i: The index of the given variable associated with the tracing_map_elt + * + * Retrieve the value of the variable i associated with the specified + * tracing_map_elt instance, and reset the variable to the 'not set' + * state. The index i is the index returned by the call to + * tracing_map_add_var() when the tracing map was set up. The reset + * essentially makes the variable a read-once variable if it's only + * accessed using this function. 
+ * + * Return: The variable value associated with field i for elt. + */ +u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i) +{ + elt->var_set[i] = false; + return (u64)atomic64_read(&elt->vars[i]); +} + int tracing_map_cmp_string(void *val_a, void *val_b) { char *a = val_a; @@ -171,6 +238,28 @@ int tracing_map_add_sum_field(struct tracing_map *map) } /** + * tracing_map_add_var - Add a field describing a tracing_map var + * @map: The tracing_map + * + * Add a var to the map and return the index identifying it in the map + * and associated tracing_map_elts. This is the index used for + * instance to update a var for a particular tracing_map_elt using + * tracing_map_update_var() or reading it via tracing_map_read_var(). + * + * Return: The index identifying the var in the map and associated + * tracing_map_elts, or -EINVAL on error. + */ +int tracing_map_add_var(struct tracing_map *map) +{ + int ret = -EINVAL; + + if (map->n_vars < TRACING_MAP_VARS_MAX) + ret = map->n_vars++; + + return ret; +} + +/** * tracing_map_add_key_field - Add a field describing a tracing_map key * @map: The tracing_map * @offset: The offset within the key @@ -280,6 +369,11 @@ static void tracing_map_elt_clear(struct tracing_map_elt *elt) if (elt->fields[i].cmp_fn == tracing_map_cmp_atomic64) atomic64_set(&elt->fields[i].sum, 0); + for (i = 0; i < elt->map->n_vars; i++) { + atomic64_set(&elt->vars[i], 0); + elt->var_set[i] = false; + } + if (elt->map->ops && elt->map->ops->elt_clear) elt->map->ops->elt_clear(elt); } @@ -306,6 +400,8 @@ static void tracing_map_elt_free(struct tracing_map_elt *elt) if (elt->map->ops && elt->map->ops->elt_free) elt->map->ops->elt_free(elt); kfree(elt->fields); + kfree(elt->vars); + kfree(elt->var_set); kfree(elt->key); kfree(elt); } @@ -333,6 +429,18 @@ static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map) goto free; } + elt->vars = kcalloc(map->n_vars, sizeof(*elt->vars), GFP_KERNEL); + if (!elt->vars) { + err = 
-ENOMEM; + goto free; + } + + elt->var_set = kcalloc(map->n_vars, sizeof(*elt->var_set), GFP_KERNEL); + if (!elt->var_set) { + err = -ENOMEM; + goto free; + } + tracing_map_elt_init_fields(elt); if (map->ops && map->ops->elt_alloc) { @@ -414,7 +522,9 @@ static inline struct tracing_map_elt * __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only) { u32 idx, key_hash, test_key; + int dup_try = 0; struct tracing_map_entry *entry; + struct tracing_map_elt *val; key_hash = jhash(key, map->key_size, 0); if (key_hash == 0) @@ -426,11 +536,33 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only) entry = TRACING_MAP_ENTRY(map->map, idx); test_key = entry->key; - if (test_key && test_key == key_hash && entry->val && - keys_match(key, entry->val->key, map->key_size)) { - if (!lookup_only) - atomic64_inc(&map->hits); - return entry->val; + if (test_key && test_key == key_hash) { + val = READ_ONCE(entry->val); + if (val && + keys_match(key, val->key, map->key_size)) { + if (!lookup_only) + atomic64_inc(&map->hits); + return val; + } else if (unlikely(!val)) { + /* + * The key is present. But, val (pointer to elt + * struct) is still NULL. which means some other + * thread is in the process of inserting an + * element. + * + * On top of that, it's key_hash is same as the + * one being inserted right now. So, it's + * possible that the element has the same + * key as well. + */ + + dup_try++; + if (dup_try > map->map_size) { + atomic64_inc(&map->drops); + break; + } + continue; + } } if (!test_key) { @@ -452,6 +584,13 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only) atomic64_inc(&map->hits); return entry->val; + } else { + /* + * cmpxchg() failed. Loop around once + * more to check what key was inserted. 
+ */ + dup_try++; + continue; } } @@ -816,67 +955,15 @@ create_sort_entry(void *key, struct tracing_map_elt *elt) return sort_entry; } -static struct tracing_map_elt *copy_elt(struct tracing_map_elt *elt) -{ - struct tracing_map_elt *dup_elt; - unsigned int i; - - dup_elt = tracing_map_elt_alloc(elt->map); - if (IS_ERR(dup_elt)) - return NULL; - - if (elt->map->ops && elt->map->ops->elt_copy) - elt->map->ops->elt_copy(dup_elt, elt); - - dup_elt->private_data = elt->private_data; - memcpy(dup_elt->key, elt->key, elt->map->key_size); - - for (i = 0; i < elt->map->n_fields; i++) { - atomic64_set(&dup_elt->fields[i].sum, - atomic64_read(&elt->fields[i].sum)); - dup_elt->fields[i].cmp_fn = elt->fields[i].cmp_fn; - } - - return dup_elt; -} - -static int merge_dup(struct tracing_map_sort_entry **sort_entries, - unsigned int target, unsigned int dup) -{ - struct tracing_map_elt *target_elt, *elt; - bool first_dup = (target - dup) == 1; - int i; - - if (first_dup) { - elt = sort_entries[target]->elt; - target_elt = copy_elt(elt); - if (!target_elt) - return -ENOMEM; - sort_entries[target]->elt = target_elt; - sort_entries[target]->elt_copied = true; - } else - target_elt = sort_entries[target]->elt; - - elt = sort_entries[dup]->elt; - - for (i = 0; i < elt->map->n_fields; i++) - atomic64_add(atomic64_read(&elt->fields[i].sum), - &target_elt->fields[i].sum); - - sort_entries[dup]->dup = true; - - return 0; -} - -static int merge_dups(struct tracing_map_sort_entry **sort_entries, +static void detect_dups(struct tracing_map_sort_entry **sort_entries, int n_entries, unsigned int key_size) { unsigned int dups = 0, total_dups = 0; - int err, i, j; + int i; void *key; if (n_entries < 2) - return total_dups; + return; sort(sort_entries, n_entries, sizeof(struct tracing_map_sort_entry *), (int (*)(const void *, const void *))cmp_entries_dup, NULL); @@ -885,30 +972,14 @@ static int merge_dups(struct tracing_map_sort_entry **sort_entries, for (i = 1; i < n_entries; i++) { if 
(!memcmp(sort_entries[i]->key, key, key_size)) { dups++; total_dups++; - err = merge_dup(sort_entries, i - dups, i); - if (err) - return err; continue; } key = sort_entries[i]->key; dups = 0; } - if (!total_dups) - return total_dups; - - for (i = 0, j = 0; i < n_entries; i++) { - if (!sort_entries[i]->dup) { - sort_entries[j] = sort_entries[i]; - if (j++ != i) - sort_entries[i] = NULL; - } else { - destroy_sort_entry(sort_entries[i]); - sort_entries[i] = NULL; - } - } - - return total_dups; + WARN_ONCE(total_dups > 0, + "Duplicates detected: %d\n", total_dups); } static bool is_key(struct tracing_map *map, unsigned int field_idx) @@ -1034,10 +1105,7 @@ int tracing_map_sort_entries(struct tracing_map *map, return 1; } - ret = merge_dups(entries, n_entries, map->key_size); - if (ret < 0) - goto free; - n_entries -= ret; + detect_dups(entries, n_entries, map->key_size); if (is_key(map, sort_keys[0].field_idx)) cmp_entries_fn = cmp_entries_key; diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h index 5b5bbf8ae550..053eb92b2d31 100644 --- a/kernel/trace/tracing_map.h +++ b/kernel/trace/tracing_map.h @@ -10,6 +10,7 @@ #define TRACING_MAP_VALS_MAX 3 #define TRACING_MAP_FIELDS_MAX (TRACING_MAP_KEYS_MAX + \ TRACING_MAP_VALS_MAX) +#define TRACING_MAP_VARS_MAX 16 #define TRACING_MAP_SORT_KEYS_MAX 2 typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b); @@ -137,6 +138,8 @@ struct tracing_map_field { struct tracing_map_elt { struct tracing_map *map; struct tracing_map_field *fields; + atomic64_t *vars; + bool *var_set; void *key; void *private_data; }; @@ -192,6 +195,7 @@ struct tracing_map { int key_idx[TRACING_MAP_KEYS_MAX]; unsigned int n_keys; struct tracing_map_sort_key sort_key; + unsigned int n_vars; atomic64_t hits; atomic64_t drops; }; @@ -215,11 +219,6 @@ struct tracing_map { * Element allocation occurs before tracing begins, when the * tracing_map_init() call is made by client code. 
* - * @elt_copy: At certain points in the lifetime of an element, it may - * need to be copied. The copy should include a copy of the - * client-allocated data, which can be copied into the 'to' - * element from the 'from' element. - * * @elt_free: When a tracing_map_elt is freed, this function is called * and allows client-allocated per-element data to be freed. * @@ -233,8 +232,6 @@ struct tracing_map { */ struct tracing_map_ops { int (*elt_alloc)(struct tracing_map_elt *elt); - void (*elt_copy)(struct tracing_map_elt *to, - struct tracing_map_elt *from); void (*elt_free)(struct tracing_map_elt *elt); void (*elt_clear)(struct tracing_map_elt *elt); void (*elt_init)(struct tracing_map_elt *elt); @@ -248,6 +245,7 @@ tracing_map_create(unsigned int map_bits, extern int tracing_map_init(struct tracing_map *map); extern int tracing_map_add_sum_field(struct tracing_map *map); +extern int tracing_map_add_var(struct tracing_map *map); extern int tracing_map_add_key_field(struct tracing_map *map, unsigned int offset, tracing_map_cmp_fn_t cmp_fn); @@ -267,7 +265,13 @@ extern int tracing_map_cmp_none(void *val_a, void *val_b); extern void tracing_map_update_sum(struct tracing_map_elt *elt, unsigned int i, u64 n); +extern void tracing_map_set_var(struct tracing_map_elt *elt, + unsigned int i, u64 n); +extern bool tracing_map_var_set(struct tracing_map_elt *elt, unsigned int i); extern u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i); +extern u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i); +extern u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i); + extern void tracing_map_set_field_descr(struct tracing_map *map, unsigned int i, unsigned int key_offset, diff --git a/kernel/ucount.c b/kernel/ucount.c index b4eeee03934f..f48d1b6376a4 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -10,6 +10,7 @@ #include <linux/slab.h> #include <linux/cred.h> #include <linux/hash.h> +#include <linux/kmemleak.h> #include 
<linux/user_namespace.h> #define UCOUNTS_HASHTABLE_BITS 10 diff --git a/kernel/uid16.c b/kernel/uid16.c index ef1da2a5f9bd..af6925d8599b 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -18,44 +18,46 @@ #include <linux/uaccess.h> +#include "uid16.h" + SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) { - return sys_chown(filename, low2highuid(user), low2highgid(group)); + return ksys_chown(filename, low2highuid(user), low2highgid(group)); } SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) { - return sys_lchown(filename, low2highuid(user), low2highgid(group)); + return ksys_lchown(filename, low2highuid(user), low2highgid(group)); } SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group) { - return sys_fchown(fd, low2highuid(user), low2highgid(group)); + return ksys_fchown(fd, low2highuid(user), low2highgid(group)); } SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid) { - return sys_setregid(low2highgid(rgid), low2highgid(egid)); + return __sys_setregid(low2highgid(rgid), low2highgid(egid)); } SYSCALL_DEFINE1(setgid16, old_gid_t, gid) { - return sys_setgid(low2highgid(gid)); + return __sys_setgid(low2highgid(gid)); } SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid) { - return sys_setreuid(low2highuid(ruid), low2highuid(euid)); + return __sys_setreuid(low2highuid(ruid), low2highuid(euid)); } SYSCALL_DEFINE1(setuid16, old_uid_t, uid) { - return sys_setuid(low2highuid(uid)); + return __sys_setuid(low2highuid(uid)); } SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid) { - return sys_setresuid(low2highuid(ruid), low2highuid(euid), + return __sys_setresuid(low2highuid(ruid), low2highuid(euid), low2highuid(suid)); } @@ -78,11 +80,10 @@ SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euid SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid) { - return 
sys_setresgid(low2highgid(rgid), low2highgid(egid), + return __sys_setresgid(low2highgid(rgid), low2highgid(egid), low2highgid(sgid)); } - SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egidp, old_gid_t __user *, sgidp) { const struct cred *cred = current_cred(); @@ -102,12 +103,12 @@ SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egid SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid) { - return sys_setfsuid(low2highuid(uid)); + return __sys_setfsuid(low2highuid(uid)); } SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid) { - return sys_setfsgid(low2highgid(gid)); + return __sys_setfsgid(low2highgid(gid)); } static int groups16_to_user(old_gid_t __user *grouplist, diff --git a/kernel/uid16.h b/kernel/uid16.h new file mode 100644 index 000000000000..cdca040f7602 --- /dev/null +++ b/kernel/uid16.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_UID16_H +#define LINUX_UID16_H + +long __sys_setuid(uid_t uid); +long __sys_setgid(gid_t gid); +long __sys_setreuid(uid_t ruid, uid_t euid); +long __sys_setregid(gid_t rgid, gid_t egid); +long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid); +long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid); +long __sys_setfsuid(uid_t uid); +long __sys_setfsgid(gid_t gid); + +#endif /* LINUX_UID16_H */ diff --git a/kernel/umh.c b/kernel/umh.c index 18e5fa4b0e71..f76b3ff876cf 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -118,7 +118,7 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) { pid_t pid; - /* If SIGCLD is ignored sys_wait4 won't populate the status. */ + /* If SIGCLD is ignored kernel_wait4 won't populate the status. */ kernel_sigaction(SIGCHLD, SIG_DFL); pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); if (pid < 0) { @@ -135,7 +135,7 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) * * Thus the __user pointer cast is valid here. 
*/ - sys_wait4(pid, (int __user *)&ret, 0, NULL); + kernel_wait4(pid, (int __user *)&ret, 0, NULL); /* * If ret is 0, either call_usermodehelper_exec_async failed and diff --git a/kernel/user.c b/kernel/user.c index 9a20acce460d..36288d840675 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -101,6 +101,7 @@ struct user_struct root_user = { .sigpending = ATOMIC_INIT(0), .locked_shm = 0, .uid = GLOBAL_ROOT_UID, + .ratelimit = RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0), }; /* @@ -191,6 +192,8 @@ struct user_struct *alloc_uid(kuid_t uid) new->uid = uid; atomic_set(&new->__count, 1); + ratelimit_state_init(&new->ratelimit, HZ, 100); + ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE); /* * Before adding this, check whether we raced diff --git a/kernel/utsname.c b/kernel/utsname.c index 913fe4336d2b..dcd6be1996fe 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -19,6 +19,8 @@ #include <linux/proc_ns.h> #include <linux/sched/task.h> +static struct kmem_cache *uts_ns_cache __ro_after_init; + static struct ucounts *inc_uts_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_UTS_NAMESPACES); @@ -33,7 +35,7 @@ static struct uts_namespace *create_uts_ns(void) { struct uts_namespace *uts_ns; - uts_ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL); + uts_ns = kmem_cache_alloc(uts_ns_cache, GFP_KERNEL); if (uts_ns) kref_init(&uts_ns->kref); return uts_ns; @@ -42,7 +44,7 @@ static struct uts_namespace *create_uts_ns(void) /* * Clone a new ns copying an original utsname, setting refcount to 1 * @old_ns: namespace to clone - * Return ERR_PTR(-ENOMEM) on error (failure to kmalloc), new ns otherwise + * Return ERR_PTR(-ENOMEM) on error (failure to allocate), new ns otherwise */ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, struct uts_namespace *old_ns) @@ -75,7 +77,7 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, return ns; fail_free: - kfree(ns); + 
kmem_cache_free(uts_ns_cache, ns); fail_dec: dec_uts_namespaces(ucounts); fail: @@ -113,7 +115,7 @@ void free_uts_ns(struct kref *kref) dec_uts_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); - kfree(ns); + kmem_cache_free(uts_ns_cache, ns); } static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) @@ -169,3 +171,13 @@ const struct proc_ns_operations utsns_operations = { .install = utsns_install, .owner = utsns_owner, }; + +void __init uts_ns_init(void) +{ + uts_ns_cache = kmem_cache_create_usercopy( + "uts_namespace", sizeof(struct uts_namespace), 0, + SLAB_PANIC|SLAB_ACCOUNT, + offsetof(struct uts_namespace, name), + sizeof_field(struct uts_namespace, name), + NULL); +} diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8dd2e66e8383..ca7959be8aaa 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -153,10 +153,9 @@ struct worker_pool { unsigned long watchdog_ts; /* L: watchdog timestamp */ struct list_head worklist; /* L: list of pending works */ - int nr_workers; /* L: total number of workers */ - /* nr_idle includes the ones off idle_list for rebinding */ - int nr_idle; /* L: currently idle ones */ + int nr_workers; /* L: total number of workers */ + int nr_idle; /* L: currently idle workers */ struct list_head idle_list; /* X: list of idle workers */ struct timer_list idle_timer; /* L: worker idle timeout */ @@ -166,7 +165,6 @@ struct worker_pool { DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER); /* L: hash of busy workers */ - /* see manage_workers() for details on the two manager mutexes */ struct worker *manager; /* L: purely informational */ struct mutex attach_mutex; /* attach/detach exclusion */ struct list_head workers; /* A: attached workers */ @@ -1604,6 +1602,40 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(mod_delayed_work_on); +static void rcu_work_rcufn(struct rcu_head *rcu) +{ + struct rcu_work *rwork = container_of(rcu, struct rcu_work, rcu); + + 
/* read the comment in __queue_work() */ + local_irq_disable(); + __queue_work(WORK_CPU_UNBOUND, rwork->wq, &rwork->work); + local_irq_enable(); +} + +/** + * queue_rcu_work - queue work after a RCU grace period + * @wq: workqueue to use + * @rwork: work to queue + * + * Return: %false if @rwork was already pending, %true otherwise. Note + * that a full RCU grace period is guaranteed only after a %true return. + * While @rwork is guarnateed to be executed after a %false return, the + * execution may happen before a full RCU grace period has passed. + */ +bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork) +{ + struct work_struct *work = &rwork->work; + + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + rwork->wq = wq; + call_rcu(&rwork->rcu, rcu_work_rcufn); + return true; + } + + return false; +} +EXPORT_SYMBOL(queue_rcu_work); + /** * worker_enter_idle - enter idle state * @worker: worker which is entering idle state @@ -3001,6 +3033,26 @@ bool flush_delayed_work(struct delayed_work *dwork) } EXPORT_SYMBOL(flush_delayed_work); +/** + * flush_rcu_work - wait for a rwork to finish executing the last queueing + * @rwork: the rcu work to flush + * + * Return: + * %true if flush_rcu_work() waited for the work to finish execution, + * %false if it was already idle. 
+ */ +bool flush_rcu_work(struct rcu_work *rwork) +{ + if (test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&rwork->work))) { + rcu_barrier(); + flush_work(&rwork->work); + return true; + } else { + return flush_work(&rwork->work); + } +} +EXPORT_SYMBOL(flush_rcu_work); + static bool __cancel_work(struct work_struct *work, bool is_dwork) { unsigned long flags; @@ -3018,14 +3070,6 @@ static bool __cancel_work(struct work_struct *work, bool is_dwork) return ret; } -/* - * See cancel_delayed_work() - */ -bool cancel_work(struct work_struct *work) -{ - return __cancel_work(work, false); -} - /** * cancel_delayed_work - cancel a delayed work * @dwork: delayed_work to cancel @@ -3807,6 +3851,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, return ret; } +EXPORT_SYMBOL_GPL(apply_workqueue_attrs); /** * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug @@ -4179,6 +4224,22 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) EXPORT_SYMBOL_GPL(workqueue_set_max_active); /** + * current_work - retrieve %current task's work struct + * + * Determine if %current task is a workqueue worker and what it's working on. + * Useful to find out the context that the %current task is running in. + * + * Return: work struct if %current task is a workqueue worker, %NULL otherwise. + */ +struct work_struct *current_work(void) +{ + struct worker *worker = current_wq_worker(); + + return worker ? worker->current_work : NULL; +} +EXPORT_SYMBOL(current_work); + +/** * current_is_workqueue_rescuer - is %current workqueue rescuer? * * Determine whether %current is a workqueue rescuer. 
Can be used from @@ -5320,7 +5381,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq) ret = device_register(&wq_dev->dev); if (ret) { - kfree(wq_dev); + put_device(&wq_dev->dev); wq->wq_dev = NULL; return ret; } @@ -5564,12 +5625,13 @@ static void __init wq_numa_init(void) int __init workqueue_init_early(void) { int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; + int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ; int i, cpu; WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); - cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN)); + cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags)); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); |