diff options
Diffstat (limited to 'kernel/bpf/verifier.c')
-rw-r--r-- | kernel/bpf/verifier.c | 1096 |
1 files changed, 887 insertions, 209 deletions
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b2116ca78d9a..d6db6de3e9ea 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -24,6 +24,7 @@ #include <linux/bpf_lsm.h> #include <linux/btf_ids.h> #include <linux/poison.h> +#include <linux/module.h> #include "disasm.h" @@ -302,6 +303,10 @@ struct bpf_kfunc_call_arg_meta { enum bpf_dynptr_type type; u32 id; } initialized_dynptr; + struct { + u8 spi; + u8 frameno; + } iter; u64 mem_size; }; @@ -330,61 +335,6 @@ find_linfo(const struct bpf_verifier_env *env, u32 insn_off) return &linfo[i - 1]; } -void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, - va_list args) -{ - unsigned int n; - - n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args); - - WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1, - "verifier log line truncated - local buffer too short\n"); - - if (log->level == BPF_LOG_KERNEL) { - bool newline = n > 0 && log->kbuf[n - 1] == '\n'; - - pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n"); - return; - } - - n = min(log->len_total - log->len_used - 1, n); - log->kbuf[n] = '\0'; - if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1)) - log->len_used += n; - else - log->ubuf = NULL; -} - -static void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos) -{ - char zero = 0; - - if (!bpf_verifier_log_needed(log)) - return; - - log->len_used = new_pos; - if (put_user(zero, log->ubuf + new_pos)) - log->ubuf = NULL; -} - -/* log_level controls verbosity level of eBPF verifier. - * bpf_verifier_log_write() is used to dump the verification trace to the log, - * so the user can figure out what's wrong with the program - */ -__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, - const char *fmt, ...) -{ - va_list args; - - if (!bpf_verifier_log_needed(&env->log)) - return; - - va_start(args, fmt); - bpf_verifier_vlog(&env->log, fmt, args); - va_end(args); -} -EXPORT_SYMBOL_GPL(bpf_verifier_log_write); - __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...) { struct bpf_verifier_env *env = private_data; @@ -398,20 +348,6 @@ __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...) va_end(args); } -__printf(2, 3) void bpf_log(struct bpf_verifier_log *log, - const char *fmt, ...) -{ - va_list args; - - if (!bpf_verifier_log_needed(log)) - return; - - va_start(args, fmt); - bpf_verifier_vlog(log, fmt, args); - va_end(args); -} -EXPORT_SYMBOL_GPL(bpf_log); - static const char *ltrim(const char *s) { while (isspace(*s)) @@ -481,8 +417,17 @@ static bool type_is_sk_pointer(enum bpf_reg_type type) type == PTR_TO_XDP_SOCK; } +static bool type_may_be_null(u32 type) +{ + return type & PTR_MAYBE_NULL; +} + static bool reg_type_not_null(enum bpf_reg_type type) { + if (type_may_be_null(type)) + return false; + + type = base_type(type); return type == PTR_TO_SOCKET || type == PTR_TO_TCP_SOCK || type == PTR_TO_MAP_VALUE || @@ -526,11 +471,6 @@ static bool type_is_rdonly_mem(u32 type) return type & MEM_RDONLY; } -static bool type_may_be_null(u32 type) -{ - return type & PTR_MAYBE_NULL; -} - static bool is_acquire_function(enum bpf_func_id func_id, const struct bpf_map *map) { @@ -668,6 +608,7 @@ static char slot_type_char[] = { [STACK_MISC] = 'm', [STACK_ZERO] = '0', [STACK_DYNPTR] = 'd', + [STACK_ITER] = 'i', }; static void print_liveness(struct bpf_verifier_env *env, @@ -742,7 +683,12 @@ static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *re return stack_slot_obj_get_spi(env, reg, "dynptr", BPF_DYNPTR_NR_SLOTS); } -static const char *kernel_type_name(const struct btf* btf, u32 id) +static int iter_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int nr_slots) +{ + return stack_slot_obj_get_spi(env, reg, "iter", nr_slots); +} + +static const char *btf_type_name(const struct btf *btf, u32 id) { return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); } @@ -766,6 +712,30 @@ static const char *dynptr_type_str(enum bpf_dynptr_type type) } } +static const char *iter_type_str(const struct btf *btf, u32 btf_id) +{ + if (!btf || btf_id == 0) + return "<invalid>"; + + /* we already validated that type is valid and has conforming name */ + return btf_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1; +} + +static const char *iter_state_str(enum bpf_iter_state state) +{ + switch (state) { + case BPF_ITER_STATE_ACTIVE: + return "active"; + case BPF_ITER_STATE_DRAINED: + return "drained"; + case BPF_ITER_STATE_INVALID: + return "<invalid>"; + default: + WARN_ONCE(1, "unknown iter state %d\n", state); + return "<unknown>"; + } +} + static void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno) { env->scratched_regs |= 1U << regno; @@ -1118,6 +1088,157 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg } } +static void __mark_reg_known_zero(struct bpf_reg_state *reg); + +static int mark_stack_slots_iter(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, int insn_idx, + struct btf *btf, u32 btf_id, int nr_slots) +{ + struct bpf_func_state *state = func(env, reg); + int spi, i, j, id; + + spi = iter_get_spi(env, reg, nr_slots); + if (spi < 0) + return spi; + + id = acquire_reference_state(env, insn_idx); + if (id < 0) + return id; + + for (i = 0; i < nr_slots; i++) { + struct bpf_stack_state *slot = &state->stack[spi - i]; + struct bpf_reg_state *st = &slot->spilled_ptr; + + __mark_reg_known_zero(st); + st->type = PTR_TO_STACK; /* we don't have dedicated reg type */ + st->live |= REG_LIVE_WRITTEN; + st->ref_obj_id = i == 0 ? id : 0; + st->iter.btf = btf; + st->iter.btf_id = btf_id; + st->iter.state = BPF_ITER_STATE_ACTIVE; + st->iter.depth = 0; + + for (j = 0; j < BPF_REG_SIZE; j++) + slot->slot_type[j] = STACK_ITER; + + mark_stack_slot_scratched(env, spi - i); + } + + return 0; +} + +static int unmark_stack_slots_iter(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, int nr_slots) +{ + struct bpf_func_state *state = func(env, reg); + int spi, i, j; + + spi = iter_get_spi(env, reg, nr_slots); + if (spi < 0) + return spi; + + for (i = 0; i < nr_slots; i++) { + struct bpf_stack_state *slot = &state->stack[spi - i]; + struct bpf_reg_state *st = &slot->spilled_ptr; + + if (i == 0) + WARN_ON_ONCE(release_reference(env, st->ref_obj_id)); + + __mark_reg_not_init(env, st); + + /* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */ + st->live |= REG_LIVE_WRITTEN; + + for (j = 0; j < BPF_REG_SIZE; j++) + slot->slot_type[j] = STACK_INVALID; + + mark_stack_slot_scratched(env, spi - i); + } + + return 0; +} + +static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, int nr_slots) +{ + struct bpf_func_state *state = func(env, reg); + int spi, i, j; + + /* For -ERANGE (i.e. spi not falling into allocated stack slots), we + * will do check_mem_access to check and update stack bounds later, so + * return true for that case. + */ + spi = iter_get_spi(env, reg, nr_slots); + if (spi == -ERANGE) + return true; + if (spi < 0) + return false; + + for (i = 0; i < nr_slots; i++) { + struct bpf_stack_state *slot = &state->stack[spi - i]; + + for (j = 0; j < BPF_REG_SIZE; j++) + if (slot->slot_type[j] == STACK_ITER) + return false; + } + + return true; +} + +static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + struct btf *btf, u32 btf_id, int nr_slots) +{ + struct bpf_func_state *state = func(env, reg); + int spi, i, j; + + spi = iter_get_spi(env, reg, nr_slots); + if (spi < 0) + return false; + + for (i = 0; i < nr_slots; i++) { + struct bpf_stack_state *slot = &state->stack[spi - i]; + struct bpf_reg_state *st = &slot->spilled_ptr; + + /* only main (first) slot has ref_obj_id set */ + if (i == 0 && !st->ref_obj_id) + return false; + if (i != 0 && st->ref_obj_id) + return false; + if (st->iter.btf != btf || st->iter.btf_id != btf_id) + return false; + + for (j = 0; j < BPF_REG_SIZE; j++) + if (slot->slot_type[j] != STACK_ITER) + return false; + } + + return true; +} + +/* Check if given stack slot is "special": + * - spilled register state (STACK_SPILL); + * - dynptr state (STACK_DYNPTR); + * - iter state (STACK_ITER). + */ +static bool is_stack_slot_special(const struct bpf_stack_state *stack) +{ + enum bpf_stack_slot_type type = stack->slot_type[BPF_REG_SIZE - 1]; + + switch (type) { + case STACK_SPILL: + case STACK_DYNPTR: + case STACK_ITER: + return true; + case STACK_INVALID: + case STACK_MISC: + case STACK_ZERO: + return false; + default: + WARN_ONCE(1, "unknown stack slot type %d\n", type); + return true; + } +} + /* The reg state of a pointer or a bounded scalar was saved when * it was spilled to the stack. */ @@ -1164,7 +1285,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, verbose(env, "%s", reg_type_str(env, t)); if (base_type(t) == PTR_TO_BTF_ID) - verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id)); + verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id)); verbose(env, "("); /* * _a stands for append, was shortened to avoid multiline statements below. @@ -1267,6 +1388,19 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (reg->ref_obj_id) verbose(env, "(ref_id=%d)", reg->ref_obj_id); break; + case STACK_ITER: + /* only main slot has ref_obj_id set; skip others */ + reg = &state->stack[i].spilled_ptr; + if (!reg->ref_obj_id) + continue; + + verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); + print_liveness(env, reg->live); + verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)", + iter_type_str(reg->iter.btf, reg->iter.btf_id), + reg->ref_obj_id, iter_state_str(reg->iter.state), + reg->iter.depth); + break; case STACK_MISC: case STACK_ZERO: default: @@ -1305,10 +1439,10 @@ static inline u32 vlog_alignment(u32 pos) static void print_insn_state(struct bpf_verifier_env *env, const struct bpf_func_state *state) { - if (env->prev_log_len && env->prev_log_len == env->log.len_used) { + if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) { /* remove new line character */ - bpf_vlog_reset(&env->log, env->prev_log_len - 1); - verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_len), ' '); + bpf_vlog_reset(&env->log, env->prev_log_pos - 1); + verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_pos), ' '); } else { verbose(env, "%d:", env->insn_idx); } @@ -1616,7 +1750,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; elem->next = env->head; - elem->log_pos = env->log.len_used; + elem->log_pos = env->log.end_pos; env->head = elem; env->stack_size++; err = copy_verifier_state(&elem->st, cur); @@ -1946,9 +2080,9 @@ static void __reg_bound_offset(struct bpf_reg_state *reg) struct tnum var64_off = tnum_intersect(reg->var_off, tnum_range(reg->umin_value, reg->umax_value)); - struct tnum var32_off = tnum_intersect(tnum_subreg(reg->var_off), - tnum_range(reg->u32_min_value, - reg->u32_max_value)); + struct tnum var32_off = tnum_intersect(tnum_subreg(var64_off), + tnum_range(reg->u32_min_value, + reg->u32_max_value)); reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off); } @@ -2152,7 +2286,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; elem->next = env->head; - elem->log_pos = env->log.len_used; + elem->log_pos = env->log.end_pos; env->head = elem; env->stack_size++; if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) { @@ -2710,6 +2844,25 @@ static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state * state->stack[spi - 1].spilled_ptr.parent, REG_LIVE_READ64); } +static int mark_iter_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + int spi, int nr_slots) +{ + struct bpf_func_state *state = func(env, reg); + int err, i; + + for (i = 0; i < nr_slots; i++) { + struct bpf_reg_state *st = &state->stack[spi - i].spilled_ptr; + + err = mark_reg_read(env, st, st->parent, REG_LIVE_READ64); + if (err) + return err; + + mark_stack_slot_scratched(env, spi - i); + } + + return 0; +} + /* This function is supposed to be used by the following 32-bit optimization * code only. It returns TRUE if the source or destination register operates * on 64-bit, otherwise return FALSE. @@ -3691,8 +3844,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, /* regular write of data into stack destroys any spilled ptr */ state->stack[spi].spilled_ptr.type = NOT_INIT; - /* Mark slots as STACK_MISC if they belonged to spilled ptr. */ - if (is_spilled_reg(&state->stack[spi])) + /* Mark slots as STACK_MISC if they belonged to spilled ptr/dynptr/iter. */ + if (is_stack_slot_special(&state->stack[spi])) for (i = 0; i < BPF_REG_SIZE; i++) scrub_spilled_slot(&state->stack[spi].slot_type[i]); @@ -4085,17 +4238,13 @@ static int check_stack_read(struct bpf_verifier_env *env, } /* Variable offset is prohibited for unprivileged mode for simplicity * since it requires corresponding support in Spectre masking for stack - * ALU. See also retrieve_ptr_limit(). + * ALU. See also retrieve_ptr_limit(). The check in + * check_stack_access_for_ptr_arithmetic() called by + * adjust_ptr_min_max_vals() prevents users from creating stack pointers + * with variable offsets, therefore no check is required here. Further, + * just checking it here would be insufficient as speculative stack + * writes could still lead to unsafe speculative behaviour. */ - if (!env->bypass_spec_v1 && var_off) { - char tn_buf[48]; - - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n", - ptr_regno, tn_buf); - return -EACCES; - } - if (!var_off) { off += reg->var_off.value; err = check_stack_read_fixed_off(env, state, off, size, @@ -4301,7 +4450,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, struct btf_field *kptr_field, struct bpf_reg_state *reg, u32 regno) { - const char *targ_name = kernel_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id); + const char *targ_name = btf_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id); int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU; const char *reg_name = ""; @@ -4317,7 +4466,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, return -EINVAL; } /* We need to verify reg->type and reg->btf, before accessing reg->btf */ - reg_name = kernel_type_name(reg->btf, reg->btf_id); + reg_name = btf_type_name(reg->btf, reg->btf_id); /* For ref_ptr case, release function check should ensure we get one * referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the @@ -4381,6 +4530,8 @@ static bool in_rcu_cs(struct bpf_verifier_env *env) BTF_SET_START(rcu_protected_types) BTF_ID(struct, prog_test_ref_kfunc) BTF_ID(struct, cgroup) +BTF_ID(struct, bpf_cpumask) +BTF_ID(struct, task_struct) BTF_SET_END(rcu_protected_types) static bool rcu_protected_object(const struct btf *btf, u32 btf_id) @@ -4754,6 +4905,11 @@ static bool is_rcu_reg(const struct bpf_reg_state *reg) return reg->type & MEM_RCU; } +static void clear_trusted_flags(enum bpf_type_flag *flag) +{ + *flag &= ~(BPF_REG_TRUSTED_MODIFIERS | MEM_RCU); +} + static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int off, int size, bool strict) @@ -5158,6 +5314,7 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val) } #define BTF_TYPE_SAFE_RCU(__type) __PASTE(__type, __safe_rcu) +#define BTF_TYPE_SAFE_RCU_OR_NULL(__type) __PASTE(__type, __safe_rcu_or_null) #define BTF_TYPE_SAFE_TRUSTED(__type) __PASTE(__type, __safe_trusted) /* @@ -5174,18 +5331,39 @@ BTF_TYPE_SAFE_RCU(struct task_struct) { struct task_struct *group_leader; }; +BTF_TYPE_SAFE_RCU(struct cgroup) { + /* cgrp->kn is always accessible as documented in kernel/cgroup/cgroup.c */ + struct kernfs_node *kn; +}; + BTF_TYPE_SAFE_RCU(struct css_set) { struct cgroup *dfl_cgrp; }; +/* RCU trusted: these fields are trusted in RCU CS and can be NULL */ +BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct) { + struct file __rcu *exe_file; +}; + +/* skb->sk, req->sk are not RCU protected, but we mark them as such + * because bpf prog accessible sockets are SOCK_RCU_FREE. + */ +BTF_TYPE_SAFE_RCU_OR_NULL(struct sk_buff) { + struct sock *sk; +}; + +BTF_TYPE_SAFE_RCU_OR_NULL(struct request_sock) { + struct sock *sk; +}; + /* full trusted: these fields are trusted even outside of RCU CS and never NULL */ BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta) { - __bpf_md_ptr(struct seq_file *, seq); + struct seq_file *seq; }; BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task) { - __bpf_md_ptr(struct bpf_iter_meta *, meta); - __bpf_md_ptr(struct task_struct *, task); + struct bpf_iter_meta *meta; + struct task_struct *task; }; BTF_TYPE_SAFE_TRUSTED(struct linux_binprm) { @@ -5207,17 +5385,29 @@ BTF_TYPE_SAFE_TRUSTED(struct socket) { static bool type_is_rcu(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - int off) + const char *field_name, u32 btf_id) { BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct task_struct)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct cgroup)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct css_set)); - return btf_nested_type_is_trusted(&env->log, reg, off, "__safe_rcu"); + return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu"); +} + +static bool type_is_rcu_or_null(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + const char *field_name, u32 btf_id) +{ + BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct sk_buff)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct request_sock)); + + return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu_or_null"); } static bool type_is_trusted(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - int off) + const char *field_name, u32 btf_id) { BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task)); @@ -5226,7 +5416,7 @@ static bool type_is_trusted(struct bpf_verifier_env *env, BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct socket)); - return btf_nested_type_is_trusted(&env->log, reg, off, "__safe_trusted"); + return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted"); } static int check_ptr_to_btf_access(struct bpf_verifier_env *env, @@ -5238,8 +5428,9 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg = regs + regno; const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id); const char *tname = btf_name_by_offset(reg->btf, t->name_off); + const char *field_name = NULL; enum bpf_type_flag flag = 0; - u32 btf_id; + u32 btf_id = 0; int ret; if (!env->allow_ptr_leaks) { @@ -5284,12 +5475,12 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, return -EACCES; } - if (env->ops->btf_struct_access && !type_is_alloc(reg->type)) { + if (env->ops->btf_struct_access && !type_is_alloc(reg->type) && atype == BPF_WRITE) { if (!btf_is_kernel(reg->btf)) { verbose(env, "verifier internal error: reg->btf must be kernel btf\n"); return -EFAULT; } - ret = env->ops->btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag); + ret = env->ops->btf_struct_access(&env->log, reg, off, size); } else { /* Writes are permitted with default btf_struct_access for * program allocated objects (which always have ref_obj_id > 0), @@ -5306,7 +5497,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, return -EFAULT; } - ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag); + ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag, &field_name); } if (ret < 0) @@ -5334,20 +5525,21 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, * A regular RCU-protected pointer with __rcu tag can also be deemed * trusted if we are in an RCU CS. Such pointer can be NULL. */ - if (type_is_trusted(env, reg, off)) { + if (type_is_trusted(env, reg, field_name, btf_id)) { flag |= PTR_TRUSTED; } else if (in_rcu_cs(env) && !type_may_be_null(reg->type)) { - if (type_is_rcu(env, reg, off)) { + if (type_is_rcu(env, reg, field_name, btf_id)) { /* ignore __rcu tag and mark it MEM_RCU */ flag |= MEM_RCU; - } else if (flag & MEM_RCU) { + } else if (flag & MEM_RCU || + type_is_rcu_or_null(env, reg, field_name, btf_id)) { /* __rcu tagged pointers can be NULL */ - flag |= PTR_MAYBE_NULL; + flag |= MEM_RCU | PTR_MAYBE_NULL; } else if (flag & (MEM_PERCPU | MEM_USER)) { /* keep as-is */ } else { - /* walking unknown pointers yields untrusted pointer */ - flag = PTR_UNTRUSTED; + /* walking unknown pointers yields old deprecated PTR_TO_BTF_ID */ + clear_trusted_flags(&flag); } } else { /* @@ -5361,7 +5553,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, } } else { /* Old compat. Deprecated */ - flag &= ~PTR_TRUSTED; + clear_trusted_flags(&flag); } if (atype == BPF_READ && value_regno >= 0) @@ -5420,7 +5612,7 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, /* Simulate access to a PTR_TO_BTF_ID */ memset(&map_reg, 0, sizeof(map_reg)); mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID, btf_vmlinux, *map->ops->map_btf_id, 0); - ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag); + ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag, NULL); if (ret < 0) return ret; @@ -6086,6 +6278,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, env, regno, reg->off, access_size, zero_size_allowed, ACCESS_HELPER, meta); + case PTR_TO_BTF_ID: + return check_ptr_to_btf_access(env, regs, regno, reg->off, + access_size, BPF_READ, -1); case PTR_TO_CTX: /* in case the function doesn't know how to access the context, * (because we are in a program of type SYSCALL for example), we @@ -6506,6 +6701,203 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn return err; } +static u32 iter_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi) +{ + struct bpf_func_state *state = func(env, reg); + + return state->stack[spi].spilled_ptr.ref_obj_id; +} + +static bool is_iter_kfunc(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY); +} + +static bool is_iter_new_kfunc(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_ITER_NEW; +} + +static bool is_iter_next_kfunc(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_ITER_NEXT; +} + +static bool is_iter_destroy_kfunc(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_ITER_DESTROY; +} + +static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg) +{ + /* btf_check_iter_kfuncs() guarantees that first argument of any iter + * kfunc is iter state pointer + */ + return arg == 0 && is_iter_kfunc(meta); +} + +static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx, + struct bpf_kfunc_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + const struct btf_type *t; + const struct btf_param *arg; + int spi, err, i, nr_slots; + u32 btf_id; + + /* btf_check_iter_kfuncs() ensures we don't need to validate anything here */ + arg = &btf_params(meta->func_proto)[0]; + t = btf_type_skip_modifiers(meta->btf, arg->type, NULL); /* PTR */ + t = btf_type_skip_modifiers(meta->btf, t->type, &btf_id); /* STRUCT */ + nr_slots = t->size / BPF_REG_SIZE; + + if (is_iter_new_kfunc(meta)) { + /* bpf_iter_<type>_new() expects pointer to uninit iter state */ + if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) { + verbose(env, "expected uninitialized iter_%s as arg #%d\n", + iter_type_str(meta->btf, btf_id), regno); + return -EINVAL; + } + + for (i = 0; i < nr_slots * 8; i += BPF_REG_SIZE) { + err = check_mem_access(env, insn_idx, regno, + i, BPF_DW, BPF_WRITE, -1, false); + if (err) + return err; + } + + err = mark_stack_slots_iter(env, reg, insn_idx, meta->btf, btf_id, nr_slots); + if (err) + return err; + } else { + /* iter_next() or iter_destroy() expect initialized iter state*/ + if (!is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots)) { + verbose(env, "expected an initialized iter_%s as arg #%d\n", + iter_type_str(meta->btf, btf_id), regno); + return -EINVAL; + } + + spi = iter_get_spi(env, reg, nr_slots); + if (spi < 0) + return spi; + + err = mark_iter_read(env, reg, spi, nr_slots); + if (err) + return err; + + /* remember meta->iter info for process_iter_next_call() */ + meta->iter.spi = spi; + meta->iter.frameno = reg->frameno; + meta->ref_obj_id = iter_ref_obj_id(env, reg, spi); + + if (is_iter_destroy_kfunc(meta)) { + err = unmark_stack_slots_iter(env, reg, nr_slots); + if (err) + return err; + } + } + + return 0; +} + +/* process_iter_next_call() is called when verifier gets to iterator's next + * "method" (e.g., bpf_iter_num_next() for numbers iterator) call. We'll refer + * to it as just "iter_next()" in comments below. + * + * BPF verifier relies on a crucial contract for any iter_next() + * implementation: it should *eventually* return NULL, and once that happens + * it should keep returning NULL. That is, once iterator exhausts elements to + * iterate, it should never reset or spuriously return new elements. + * + * With the assumption of such contract, process_iter_next_call() simulates + * a fork in the verifier state to validate loop logic correctness and safety + * without having to simulate infinite amount of iterations. + * + * In current state, we first assume that iter_next() returned NULL and + * iterator state is set to DRAINED (BPF_ITER_STATE_DRAINED). In such + * conditions we should not form an infinite loop and should eventually reach + * exit. + * + * Besides that, we also fork current state and enqueue it for later + * verification. In a forked state we keep iterator state as ACTIVE + * (BPF_ITER_STATE_ACTIVE) and assume non-NULL return from iter_next(). We + * also bump iteration depth to prevent erroneous infinite loop detection + * later on (see iter_active_depths_differ() comment for details). In this + * state we assume that we'll eventually loop back to another iter_next() + * calls (it could be in exactly same location or in some other instruction, + * it doesn't matter, we don't make any unnecessary assumptions about this, + * everything revolves around iterator state in a stack slot, not which + * instruction is calling iter_next()). When that happens, we either will come + * to iter_next() with equivalent state and can conclude that next iteration + * will proceed in exactly the same way as we just verified, so it's safe to + * assume that loop converges. If not, we'll go on another iteration + * simulation with a different input state, until all possible starting states + * are validated or we reach maximum number of instructions limit. + * + * This way, we will either exhaustively discover all possible input states + * that iterator loop can start with and eventually will converge, or we'll + * effectively regress into bounded loop simulation logic and either reach + * maximum number of instructions if loop is not provably convergent, or there + * is some statically known limit on number of iterations (e.g., if there is + * an explicit `if n > 100 then break;` statement somewhere in the loop). + * + * One very subtle but very important aspect is that we *always* simulate NULL + * condition first (as the current state) before we simulate non-NULL case. + * This has to do with intricacies of scalar precision tracking. By simulating + * "exit condition" of iter_next() returning NULL first, we make sure all the + * relevant precision marks *that will be set **after** we exit iterator loop* + * are propagated backwards to common parent state of NULL and non-NULL + * branches. Thanks to that, state equivalence checks done later in forked + * state, when reaching iter_next() for ACTIVE iterator, can assume that + * precision marks are finalized and won't change. Because simulating another + * ACTIVE iterator iteration won't change them (because given same input + * states we'll end up with exactly same output states which we are currently + * comparing; and verification after the loop already propagated back what + * needs to be **additionally** tracked as precise). It's subtle, grok + * precision tracking for more intuitive understanding. + */ +static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, + struct bpf_kfunc_call_arg_meta *meta) +{ + struct bpf_verifier_state *cur_st = env->cur_state, *queued_st; + struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr; + struct bpf_reg_state *cur_iter, *queued_iter; + int iter_frameno = meta->iter.frameno; + int iter_spi = meta->iter.spi; + + BTF_TYPE_EMIT(struct bpf_iter); + + cur_iter = &env->cur_state->frame[iter_frameno]->stack[iter_spi].spilled_ptr; + + if (cur_iter->iter.state != BPF_ITER_STATE_ACTIVE && + cur_iter->iter.state != BPF_ITER_STATE_DRAINED) { + verbose(env, "verifier internal error: unexpected iterator state %d (%s)\n", + cur_iter->iter.state, iter_state_str(cur_iter->iter.state)); + return -EFAULT; + } + + if (cur_iter->iter.state == BPF_ITER_STATE_ACTIVE) { + /* branch out active iter state */ + queued_st = push_stack(env, insn_idx + 1, insn_idx, false); + if (!queued_st) + return -ENOMEM; + + queued_iter = &queued_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr; + queued_iter->iter.state = BPF_ITER_STATE_ACTIVE; + queued_iter->iter.depth++; + + queued_fr = queued_st->frame[queued_st->curframe]; + mark_ptr_not_null_reg(&queued_fr->regs[BPF_REG_0]); + } + + /* switch to DRAINED state, but keep the depth unchanged */ + /* mark current iter state as drained and assume returned NULL */ + cur_iter->iter.state = BPF_ITER_STATE_DRAINED; + __mark_reg_const_zero(&cur_fr->regs[BPF_REG_0]); + + return 0; +} + static bool arg_type_is_mem_size(enum bpf_arg_type type) { return type == ARG_CONST_SIZE || @@ -6600,6 +6992,7 @@ static const struct bpf_reg_types mem_types = { PTR_TO_MEM, PTR_TO_MEM | MEM_RINGBUF, PTR_TO_BUF, + PTR_TO_BTF_ID | PTR_TRUSTED, }, }; @@ -6709,6 +7102,9 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, if (arg_type & PTR_MAYBE_NULL) type &= ~PTR_MAYBE_NULL; + if (meta->func_id == BPF_FUNC_kptr_xchg && type & MEM_ALLOC) + type &= ~MEM_ALLOC; + for (i = 0; i < ARRAY_SIZE(compatible->types); i++) { expected = compatible->types[i]; if (expected == NOT_INIT) @@ -6728,10 +7124,23 @@ found: if (base_type(reg->type) != PTR_TO_BTF_ID) return 0; + if (compatible == &mem_types) { + if (!(arg_type & MEM_RDONLY)) { + verbose(env, + "%s() may write into memory pointed by R%d type=%s\n", + func_id_name(meta->func_id), + regno, reg_type_str(env, reg->type)); + return -EACCES; + } + return 0; + } + switch ((int)reg->type) { case PTR_TO_BTF_ID: case PTR_TO_BTF_ID | PTR_TRUSTED: case PTR_TO_BTF_ID | MEM_RCU: + case PTR_TO_BTF_ID | PTR_MAYBE_NULL: + case PTR_TO_BTF_ID | PTR_MAYBE_NULL | MEM_RCU: { /* For bpf_sk_release, it needs to match against first member * 'struct sock_common', hence make an exception for it. This @@ -6740,6 +7149,12 @@ found: bool strict_type_match = arg_type_is_release(arg_type) && meta->func_id != BPF_FUNC_sk_release; + if (type_may_be_null(reg->type) && + (!type_may_be_null(arg_type) || arg_type_is_release(arg_type))) { + verbose(env, "Possibly NULL pointer passed to helper arg%d\n", regno); + return -EACCES; + } + if (!arg_btf_id) { if (!compatible->btf_id) { verbose(env, "verifier internal error: missing arg compatible BTF ID\n"); @@ -6763,15 +7178,16 @@ found: btf_vmlinux, *arg_btf_id, strict_type_match)) { verbose(env, "R%d is of type %s but %s is expected\n", - regno, kernel_type_name(reg->btf, reg->btf_id), - kernel_type_name(btf_vmlinux, *arg_btf_id)); + regno, btf_type_name(reg->btf, reg->btf_id), + btf_type_name(btf_vmlinux, *arg_btf_id)); return -EACCES; } } break; } case PTR_TO_BTF_ID | MEM_ALLOC: - if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock) { + if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock && + meta->func_id != BPF_FUNC_kptr_xchg) { verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n"); return -EFAULT; } @@ -6834,7 +7250,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, verbose(env, "R%d must have zero offset when passed to release func\n", regno); verbose(env, "No graph node or root found at R%d type:%s off:%d\n", regno, - kernel_type_name(reg->btf, reg->btf_id), reg->off); + btf_type_name(reg->btf, reg->btf_id), reg->off); return -EINVAL; } @@ -8737,6 +9153,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (func_id == BPF_FUNC_kptr_xchg) { ret_btf = meta.kptr_field->kptr.btf; ret_btf_id = meta.kptr_field->kptr.btf_id; + if (!btf_is_kernel(ret_btf)) + regs[BPF_REG_0].type |= MEM_ALLOC; } else { if (fn->ret_btf_id == BPF_PTR_POISON) { verbose(env, "verifier internal error:"); @@ -8870,7 +9288,7 @@ static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta) static bool is_kfunc_trusted_args(struct bpf_kfunc_call_arg_meta *meta) { - return meta->kfunc_flags & KF_TRUSTED_ARGS; + return (meta->kfunc_flags & KF_TRUSTED_ARGS) || is_kfunc_release(meta); } static bool is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta) @@ -9099,6 +9517,7 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_ALLOC_BTF_ID, /* Allocated object */ KF_ARG_PTR_TO_KPTR, /* PTR_TO_KPTR but type specific */ KF_ARG_PTR_TO_DYNPTR, + KF_ARG_PTR_TO_ITER, KF_ARG_PTR_TO_LIST_HEAD, KF_ARG_PTR_TO_LIST_NODE, KF_ARG_PTR_TO_BTF_ID, /* Also covers reg2btf_ids conversions */ @@ -9220,6 +9639,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_dynptr(meta->btf, &args[argno])) return KF_ARG_PTR_TO_DYNPTR; + if (is_kfunc_arg_iter(meta, argno)) + return KF_ARG_PTR_TO_ITER; + if (is_kfunc_arg_list_head(meta->btf, &args[argno])) return KF_ARG_PTR_TO_LIST_HEAD; @@ -9848,6 +10270,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ break; case KF_ARG_PTR_TO_KPTR: case KF_ARG_PTR_TO_DYNPTR: + case KF_ARG_PTR_TO_ITER: case KF_ARG_PTR_TO_LIST_HEAD: case KF_ARG_PTR_TO_LIST_NODE: case KF_ARG_PTR_TO_RB_ROOT: @@ -9944,6 +10367,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ break; } + case KF_ARG_PTR_TO_ITER: + ret = process_iter_arg(env, regno, insn_idx, meta); + if (ret < 0) + return ret; + break; case KF_ARG_PTR_TO_LIST_HEAD: if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { @@ -10079,24 +10507,21 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return 0; } -static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, - int *insn_idx_p) +static int fetch_kfunc_meta(struct bpf_verifier_env *env, + struct bpf_insn *insn, + struct bpf_kfunc_call_arg_meta *meta, + const char **kfunc_name) { - const struct btf_type *t, *func, *func_proto, *ptr_type; - u32 i, nargs, func_id, ptr_type_id, release_ref_obj_id; - struct bpf_reg_state *regs = cur_regs(env); - const char *func_name, *ptr_type_name; - bool sleepable, rcu_lock, rcu_unlock; - struct bpf_kfunc_call_arg_meta meta; - int err, insn_idx = *insn_idx_p; - const struct btf_param *args; - const struct btf_type *ret_t; + const struct btf_type *func, *func_proto; + u32 func_id, *kfunc_flags; + const char *func_name; struct btf *desc_btf; - u32 *kfunc_flags; - /* skip for now, but return error when we find this in fixup_kfunc_call */ + if (kfunc_name) + *kfunc_name = NULL; + if (!insn->imm) - return 0; + return -EINVAL; desc_btf = find_kfunc_desc_btf(env, insn->off); if (IS_ERR(desc_btf)) @@ -10105,22 +10530,53 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, func_id = insn->imm; func = btf_type_by_id(desc_btf, func_id); func_name = btf_name_by_offset(desc_btf, func->name_off); + if (kfunc_name) + *kfunc_name = func_name; func_proto = btf_type_by_id(desc_btf, func->type); kfunc_flags = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), func_id); if (!kfunc_flags) { - verbose(env, "calling kernel function %s is not allowed\n", - func_name); return -EACCES; } - /* Prepare kfunc call metadata */ - memset(&meta, 0, sizeof(meta)); - meta.btf = desc_btf; - meta.func_id = func_id; - meta.kfunc_flags = *kfunc_flags; - meta.func_proto = func_proto; - meta.func_name = func_name; + memset(meta, 0, sizeof(*meta)); + meta->btf = desc_btf; + meta->func_id = func_id; + meta->kfunc_flags = *kfunc_flags; + meta->func_proto = func_proto; + meta->func_name = func_name; + + return 0; +} + +static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx_p) +{ + const struct btf_type *t, *ptr_type; + u32 i, nargs, ptr_type_id, release_ref_obj_id; + struct bpf_reg_state *regs = cur_regs(env); + const char *func_name, *ptr_type_name; + bool sleepable, rcu_lock, rcu_unlock; + struct bpf_kfunc_call_arg_meta meta; + struct bpf_insn_aux_data *insn_aux; + int err, insn_idx = *insn_idx_p; + const struct btf_param *args; + const struct btf_type *ret_t; + struct btf *desc_btf; + + /* skip for now, but return error when we find this in fixup_kfunc_call */ + if (!insn->imm) + return 0; + + err = fetch_kfunc_meta(env, insn, &meta, &func_name); + if (err == -EACCES && func_name) + verbose(env, "calling kernel function %s is not allowed\n", func_name); + if (err) + return err; + desc_btf = meta.btf; + insn_aux = &env->insn_aux_data[insn_idx]; + + insn_aux->is_iter_next = is_iter_next_kfunc(&meta); if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) { verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n"); @@ -10173,7 +10629,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, err = release_reference(env, regs[meta.release_regno].ref_obj_id); if (err) { verbose(env, "kfunc %s#%d reference has not been acquired before\n", - func_name, func_id); + func_name, meta.func_id); return err; } } @@ -10185,14 +10641,14 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, err = ref_convert_owning_non_owning(env, release_ref_obj_id); if (err) { verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n", - func_name, func_id); + func_name, meta.func_id); return err; } err = release_reference(env, release_ref_obj_id); if (err) { verbose(env, "kfunc %s#%d reference has not been acquired before\n", - func_name, func_id); + func_name, meta.func_id); return err; } } @@ -10202,7 +10658,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, set_rbtree_add_callback_state); if (err) { verbose(env, "kfunc %s#%d failed callback verification\n", - func_name, func_id); + func_name, meta.func_id); return err; } } @@ -10211,7 +10667,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, mark_reg_not_init(env, regs, caller_saved[i]); /* Check return type */ - t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL); + t = btf_type_skip_modifiers(desc_btf, meta.func_proto->type, NULL); if (is_kfunc_acquire(&meta) && !btf_type_is_struct_ptr(meta.btf, t)) { /* Only exception is bpf_obj_new_impl */ @@ -10260,13 +10716,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, regs[BPF_REG_0].btf = ret_btf; regs[BPF_REG_0].btf_id = ret_btf_id; - env->insn_aux_data[insn_idx].obj_new_size = ret_t->size; - env->insn_aux_data[insn_idx].kptr_struct_meta = + insn_aux->obj_new_size = ret_t->size; + insn_aux->kptr_struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id); - } else if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) { - env->insn_aux_data[insn_idx].kptr_struct_meta = - btf_find_struct_meta(meta.arg_obj_drop.btf, - meta.arg_obj_drop.btf_id); } else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] || meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) { struct btf_field *field = meta.arg_list_head.field; @@ -10395,10 +10847,18 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (reg_may_point_to_spin_lock(®s[BPF_REG_0]) && !regs[BPF_REG_0].id) regs[BPF_REG_0].id = ++env->id_gen; - } /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */ + } else if (btf_type_is_void(t)) { + if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) { + if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) { + insn_aux->kptr_struct_meta = + btf_find_struct_meta(meta.arg_obj_drop.btf, + meta.arg_obj_drop.btf_id); + } + } + } - nargs = btf_type_vlen(func_proto); - args = (const struct btf_param *)(func_proto + 1); + nargs = btf_type_vlen(meta.func_proto); + args = (const struct btf_param *)(meta.func_proto + 1); for (i = 0; i < nargs; i++) { u32 regno = i + 1; @@ -10410,6 +10870,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, mark_btf_func_reg_size(env, regno, t->size); } + if (is_iter_next_kfunc(&meta)) { + err = process_iter_next_call(env, insn_idx, &meta); + if (err) + return err; + } + return 0; } @@ -12116,10 +12582,14 @@ static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode) case BPF_JEQ: if (tnum_is_const(subreg)) return !!tnum_equals_const(subreg, val); + else if (val < reg->u32_min_value || val > reg->u32_max_value) + return 0; break; case BPF_JNE: if (tnum_is_const(subreg)) return !tnum_equals_const(subreg, val); + else if (val < reg->u32_min_value || val > reg->u32_max_value) + return 1; break; case BPF_JSET: if ((~subreg.mask & subreg.value) & val) @@ -12189,10 +12659,14 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) case BPF_JEQ: if (tnum_is_const(reg->var_off)) return !!tnum_equals_const(reg->var_off, val); + else if (val < reg->umin_value || val > reg->umax_value) + return 0; break; case BPF_JNE: if (tnum_is_const(reg->var_off)) return !tnum_equals_const(reg->var_off, val); + else if (val < reg->umin_value || val > reg->umax_value) + return 1; break; case BPF_JSET: if ((~reg->var_off.mask & reg->var_off.value) & val) @@ -12813,6 +13287,18 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, src_reg->var_off.value, opcode, is_jmp32); + } else if (dst_reg->type == SCALAR_VALUE && + is_jmp32 && tnum_is_const(tnum_subreg(dst_reg->var_off))) { + pred = is_branch_taken(src_reg, + tnum_subreg(dst_reg->var_off).value, + flip_opcode(opcode), + is_jmp32); + } else if (dst_reg->type == SCALAR_VALUE && + !is_jmp32 && tnum_is_const(dst_reg->var_off)) { + pred = is_branch_taken(src_reg, + dst_reg->var_off.value, + flip_opcode(opcode), + is_jmp32); } else if (reg_is_pkt_pointer_any(dst_reg) && reg_is_pkt_pointer_any(src_reg) && !is_jmp32) { @@ -13407,6 +13893,17 @@ static bool is_prune_point(struct bpf_verifier_env *env, int insn_idx) return env->insn_aux_data[insn_idx].prune_point; } +static void mark_force_checkpoint(struct bpf_verifier_env *env, int idx) +{ + env->insn_aux_data[idx].force_checkpoint = true; +} + +static bool is_force_checkpoint(struct bpf_verifier_env *env, int insn_idx) +{ + return env->insn_aux_data[insn_idx].force_checkpoint; +} + + enum { DONE_EXPLORING = 0, KEEP_EXPLORING = 1, @@ -13522,6 +14019,26 @@ static int visit_insn(int t, struct bpf_verifier_env *env) * async state will be pushed for further exploration. */ mark_prune_point(env, t); + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { + struct bpf_kfunc_call_arg_meta meta; + + ret = fetch_kfunc_meta(env, insn, &meta, NULL); + if (ret == 0 && is_iter_next_kfunc(&meta)) { + mark_prune_point(env, t); + /* Checking and saving state checkpoints at iter_next() call + * is crucial for fast convergence of open-coded iterator loop + * logic, so we need to force it. If we don't do that, + * is_state_visited() might skip saving a checkpoint, causing + * unnecessarily long sequence of not checkpointed + * instructions and jumps, leading to exhaustion of jump + * history buffer, and potentially other undesired outcomes. + * It is expected that with correct open-coded iterators + * convergence will happen quickly, so we don't run a risk of + * exhausting memory. + */ + mark_force_checkpoint(env, t); + } + } return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL); case BPF_JA: @@ -14275,6 +14792,8 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, * didn't use them */ for (i = 0; i < old->allocated_stack; i++) { + struct bpf_reg_state *old_reg, *cur_reg; + spi = i / BPF_REG_SIZE; if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) { @@ -14331,9 +14850,6 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, return false; break; case STACK_DYNPTR: - { - const struct bpf_reg_state *old_reg, *cur_reg; - old_reg = &old->stack[spi].spilled_ptr; cur_reg = &cur->stack[spi].spilled_ptr; if (old_reg->dynptr.type != cur_reg->dynptr.type || @@ -14341,7 +14857,22 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) return false; break; - } + case STACK_ITER: + old_reg = &old->stack[spi].spilled_ptr; + cur_reg = &cur->stack[spi].spilled_ptr; + /* iter.depth is not compared between states as it + * doesn't matter for correctness and would otherwise + * prevent convergence; we maintain it only to prevent + * infinite loop check triggering, see + * iter_active_depths_differ() + */ + if (old_reg->iter.btf != cur_reg->iter.btf || + old_reg->iter.btf_id != cur_reg->iter.btf_id || + old_reg->iter.state != cur_reg->iter.state || + /* ignore {old_reg,cur_reg}->iter.depth, see above */ + !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) + return false; + break; case STACK_MISC: case STACK_ZERO: case STACK_INVALID: @@ -14555,10 +15086,11 @@ static int propagate_precision(struct bpf_verifier_env *env, state_reg = state->regs; for (i = 0; i < BPF_REG_FP; i++, state_reg++) { if (state_reg->type != SCALAR_VALUE || - !state_reg->precise) + !state_reg->precise || + !(state_reg->live & REG_LIVE_READ)) continue; if (env->log.level & BPF_LOG_LEVEL2) - verbose(env, "frame %d: propagating r%d\n", i, fr); + verbose(env, "frame %d: propagating r%d\n", fr, i); err = mark_chain_precision_frame(env, fr, i); if (err < 0) return err; @@ -14569,11 +15101,12 @@ static int propagate_precision(struct bpf_verifier_env *env, continue; state_reg = &state->stack[i].spilled_ptr; if (state_reg->type != SCALAR_VALUE || - !state_reg->precise) + !state_reg->precise || + !(state_reg->live & REG_LIVE_READ)) continue; if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "frame %d: propagating fp%d\n", - (-i - 1) * BPF_REG_SIZE, fr); + fr, (-i - 1) * BPF_REG_SIZE); err = mark_chain_precision_stack_frame(env, fr, i); if (err < 0) return err; @@ -14600,6 +15133,92 @@ static bool states_maybe_looping(struct bpf_verifier_state *old, return true; } +static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx) +{ + return env->insn_aux_data[insn_idx].is_iter_next; +} + +/* is_state_visited() handles iter_next() (see process_iter_next_call() for + * terminology) calls specially: as opposed to bounded BPF loops, it *expects* + * states to match, which otherwise would look like an infinite loop. So while + * iter_next() calls are taken care of, we still need to be careful and + * prevent erroneous and too eager declaration of "ininite loop", when + * iterators are involved. + * + * Here's a situation in pseudo-BPF assembly form: + * + * 0: again: ; set up iter_next() call args + * 1: r1 = &it ; <CHECKPOINT HERE> + * 2: call bpf_iter_num_next ; this is iter_next() call + * 3: if r0 == 0 goto done + * 4: ... something useful here ... + * 5: goto again ; another iteration + * 6: done: + * 7: r1 = &it + * 8: call bpf_iter_num_destroy ; clean up iter state + * 9: exit + * + * This is a typical loop. Let's assume that we have a prune point at 1:, + * before we get to `call bpf_iter_num_next` (e.g., because of that `goto + * again`, assuming other heuristics don't get in a way). + * + * When we first time come to 1:, let's say we have some state X. We proceed + * to 2:, fork states, enqueue ACTIVE, validate NULL case successfully, exit. + * Now we come back to validate that forked ACTIVE state. We proceed through + * 3-5, come to goto, jump to 1:. Let's assume our state didn't change, so we + * are converging. But the problem is that we don't know that yet, as this + * convergence has to happen at iter_next() call site only. So if nothing is + * done, at 1: verifier will use bounded loop logic and declare infinite + * looping (and would be *technically* correct, if not for iterator's + * "eventual sticky NULL" contract, see process_iter_next_call()). But we + * don't want that. So what we do in process_iter_next_call() when we go on + * another ACTIVE iteration, we bump slot->iter.depth, to mark that it's + * a different iteration. So when we suspect an infinite loop, we additionally + * check if any of the *ACTIVE* iterator states depths differ. If yes, we + * pretend we are not looping and wait for next iter_next() call. + * + * This only applies to ACTIVE state. In DRAINED state we don't expect to + * loop, because that would actually mean infinite loop, as DRAINED state is + * "sticky", and so we'll keep returning into the same instruction with the + * same state (at least in one of possible code paths). + * + * This approach allows to keep infinite loop heuristic even in the face of + * active iterator. E.g., C snippet below is and will be detected as + * inifintely looping: + * + * struct bpf_iter_num it; + * int *p, x; + * + * bpf_iter_num_new(&it, 0, 10); + * while ((p = bpf_iter_num_next(&t))) { + * x = p; + * while (x--) {} // <<-- infinite loop here + * } + * + */ +static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf_verifier_state *cur) +{ + struct bpf_reg_state *slot, *cur_slot; + struct bpf_func_state *state; + int i, fr; + + for (fr = old->curframe; fr >= 0; fr--) { + state = old->frame[fr]; + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_ITER) + continue; + + slot = &state->stack[i].spilled_ptr; + if (slot->iter.state != BPF_ITER_STATE_ACTIVE) + continue; + + cur_slot = &cur->frame[fr]->stack[i].spilled_ptr; + if (cur_slot->iter.depth != slot->iter.depth) + return true; + } + } + return false; +} static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { @@ -14607,7 +15226,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state_list *sl, **pprev; struct bpf_verifier_state *cur = env->cur_state, *new; int i, j, err, states_cnt = 0; - bool add_new_state = env->test_state_freq ? true : false; + bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx); + bool add_new_state = force_new_state; /* bpf progs typically have pruning point every 4 instructions * http://vger.kernel.org/bpfconf2019.html#session-1 @@ -14647,8 +15267,46 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * Since the verifier still needs to catch infinite loops * inside async callbacks. */ - } else if (states_maybe_looping(&sl->state, cur) && - states_equal(env, &sl->state, cur)) { + goto skip_inf_loop_check; + } + /* BPF open-coded iterators loop detection is special. + * states_maybe_looping() logic is too simplistic in detecting + * states that *might* be equivalent, because it doesn't know + * about ID remapping, so don't even perform it. + * See process_iter_next_call() and iter_active_depths_differ() + * for overview of the logic. When current and one of parent + * states are detected as equivalent, it's a good thing: we prove + * convergence and can stop simulating further iterations. + * It's safe to assume that iterator loop will finish, taking into + * account iter_next() contract of eventually returning + * sticky NULL result. + */ + if (is_iter_next_insn(env, insn_idx)) { + if (states_equal(env, &sl->state, cur)) { + struct bpf_func_state *cur_frame; + struct bpf_reg_state *iter_state, *iter_reg; + int spi; + + cur_frame = cur->frame[cur->curframe]; + /* btf_check_iter_kfuncs() enforces that + * iter state pointer is always the first arg + */ + iter_reg = &cur_frame->regs[BPF_REG_1]; + /* current state is valid due to states_equal(), + * so we can assume valid iter and reg state, + * no need for extra (re-)validations + */ + spi = __get_spi(iter_reg->off + iter_reg->var_off.value); + iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr; + if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) + goto hit; + } + goto skip_inf_loop_check; + } + /* attempt to detect infinite loop to avoid unnecessary doomed work */ + if (states_maybe_looping(&sl->state, cur) && + states_equal(env, &sl->state, cur) && + !iter_active_depths_differ(&sl->state, cur)) { verbose_linfo(env, insn_idx, "; "); verbose(env, "infinite loop detected at insn %d\n", insn_idx); return -EINVAL; @@ -14665,13 +15323,15 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * This threshold shouldn't be too high either, since states * at the end of the loop are likely to be useful in pruning. */ - if (!env->test_state_freq && +skip_inf_loop_check: + if (!force_new_state && env->jmps_processed - env->prev_jmps_processed < 20 && env->insn_processed - env->prev_insn_processed < 100) add_new_state = false; goto miss; } if (states_equal(env, &sl->state, cur)) { +hit: sl->hit_cnt++; /* reached equivalent register/stack state, * prune the search. @@ -14978,11 +15638,11 @@ static int do_check(struct bpf_verifier_env *env) print_insn_state(env, state->frame[state->curframe]); verbose_linfo(env, env->insn_idx, "; "); - env->prev_log_len = env->log.len_used; + env->prev_log_pos = env->log.end_pos; verbose(env, "%d: ", env->insn_idx); print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); - env->prev_insn_print_len = env->log.len_used - env->prev_log_len; - env->prev_log_len = env->log.len_used; + env->prev_insn_print_pos = env->log.end_pos - env->prev_log_pos; + env->prev_log_pos = env->log.end_pos; } if (bpf_prog_is_offloaded(env->prog->aux)) { @@ -15301,8 +15961,8 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, goto err_put; } - if (!btf_type_is_var(t)) { - verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", id); + if (!btf_type_is_var(t) && !btf_type_is_func(t)) { + verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR or KIND_FUNC\n", id); err = -EINVAL; goto err_put; } @@ -15315,6 +15975,14 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, err = -ENOENT; goto err_put; } + insn[0].imm = (u32)addr; + insn[1].imm = addr >> 32; + + if (btf_type_is_func(t)) { + aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY; + aux->btf_var.mem_size = 0; + goto check_btf; + } datasec_id = find_btf_percpu_datasec(btf); if (datasec_id > 0) { @@ -15327,9 +15995,6 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, } } - insn[0].imm = (u32)addr; - insn[1].imm = addr >> 32; - type = t->type; t = btf_type_skip_modifiers(btf, type, NULL); if (percpu) { @@ -15357,7 +16022,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, aux->btf_var.btf = btf; aux->btf_var.btf_id = type; } - +check_btf: /* check whether we recorded this BTF (and maybe module) already */ for (i = 0; i < env->used_btf_cnt; i++) { if (env->used_btfs[i].btf == btf) { @@ -17032,21 +17697,21 @@ static int do_misc_fixups(struct bpf_verifier_env *env) BUILD_BUG_ON(!__same_type(ops->map_lookup_elem, (void *(*)(struct bpf_map *map, void *key))NULL)); BUILD_BUG_ON(!__same_type(ops->map_delete_elem, - (int (*)(struct bpf_map *map, void *key))NULL)); + (long (*)(struct bpf_map *map, void *key))NULL)); BUILD_BUG_ON(!__same_type(ops->map_update_elem, - (int (*)(struct bpf_map *map, void *key, void *value, + (long (*)(struct bpf_map *map, void *key, void *value, u64 flags))NULL)); BUILD_BUG_ON(!__same_type(ops->map_push_elem, - (int (*)(struct bpf_map *map, void *value, + (long (*)(struct bpf_map *map, void *value, u64 flags))NULL)); BUILD_BUG_ON(!__same_type(ops->map_pop_elem, - (int (*)(struct bpf_map *map, void *value))NULL)); + (long (*)(struct bpf_map *map, void *value))NULL)); BUILD_BUG_ON(!__same_type(ops->map_peek_elem, - (int (*)(struct bpf_map *map, void *value))NULL)); + (long (*)(struct bpf_map *map, void *value))NULL)); BUILD_BUG_ON(!__same_type(ops->map_redirect, - (int (*)(struct bpf_map *map, u64 index, u64 flags))NULL)); + (long (*)(struct bpf_map *map, u64 index, u64 flags))NULL)); BUILD_BUG_ON(!__same_type(ops->map_for_each_callback, - (int (*)(struct bpf_map *map, + (long (*)(struct bpf_map *map, bpf_callback_t callback_fn, void *callback_ctx, u64 flags))NULL)); @@ -17654,6 +18319,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, const char *tname; struct btf *btf; long addr = 0; + struct module *mod = NULL; if (!btf_id) { bpf_log(log, "Tracing programs must provide btf_id\n"); @@ -17827,8 +18493,17 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, else addr = (long) tgt_prog->aux->func[subprog]->bpf_func; } else { - addr = kallsyms_lookup_name(tname); + if (btf_is_module(btf)) { + mod = btf_try_get_module(btf); + if (mod) + addr = find_kallsyms_symbol_value(mod, tname); + else + addr = 0; + } else { + addr = kallsyms_lookup_name(tname); + } if (!addr) { + module_put(mod); bpf_log(log, "The address of function %s cannot be found\n", tname); @@ -17868,11 +18543,13 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, break; } if (ret) { + module_put(mod); bpf_log(log, "%s is not sleepable\n", tname); return ret; } } else if (prog->expected_attach_type == BPF_MODIFY_RETURN) { if (tgt_prog) { + module_put(mod); bpf_log(log, "can't modify return codes of BPF programs\n"); return -EINVAL; } @@ -17881,6 +18558,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, !check_attach_modify_return(addr, tname)) ret = 0; if (ret) { + module_put(mod); bpf_log(log, "%s() is not modifiable\n", tname); return ret; } @@ -17891,6 +18569,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, tgt_info->tgt_addr = addr; tgt_info->tgt_name = tname; tgt_info->tgt_type = t; + tgt_info->tgt_mod = mod; return 0; } @@ -17970,6 +18649,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) /* store info about the attachment target that will be used later */ prog->aux->attach_func_proto = tgt_info.tgt_type; prog->aux->attach_func_name = tgt_info.tgt_name; + prog->aux->mod = tgt_info.tgt_mod; if (tgt_prog) { prog->aux->saved_dst_prog_type = tgt_prog->type; @@ -18014,12 +18694,12 @@ struct btf *bpf_get_btf_vmlinux(void) return btf_vmlinux; } -int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr) +int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) { u64 start_time = ktime_get_ns(); struct bpf_verifier_env *env; - struct bpf_verifier_log *log; - int i, len, ret = -EINVAL; + int i, len, ret = -EINVAL, err; + u32 log_true_size; bool is_priv; /* no program is valid */ @@ -18032,7 +18712,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr) env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); if (!env) return -ENOMEM; - log = &env->log; len = (*prog)->len; env->insn_aux_data = @@ -18053,20 +18732,14 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr) if (!is_priv) mutex_lock(&bpf_verifier_lock); - if (attr->log_level || attr->log_buf || attr->log_size) { - /* user requested verbose verifier output - * and supplied buffer to store the verification trace - */ - log->level = attr->log_level; - log->ubuf = (char __user *) (unsigned long) attr->log_buf; - log->len_total = attr->log_size; - - /* log attributes have to be sane */ - if (!bpf_verifier_log_attr_valid(log)) { - ret = -EINVAL; - goto err_unlock; - } - } + /* user could have requested verbose verifier output + * and supplied buffer to store the verification trace + */ + ret = bpf_vlog_init(&env->log, attr->log_level, + (char __user *) (unsigned long) attr->log_buf, + attr->log_size); + if (ret) + goto err_unlock; mark_verifier_state_clean(env); @@ -18180,9 +18853,14 @@ skip_full_check: print_verification_stats(env); env->prog->aux->verified_insns = env->insn_processed; - if (log->level && bpf_verifier_log_full(log)) - ret = -ENOSPC; - if (log->level && !log->ubuf) { + /* preserve original error even if log finalization is successful */ + err = bpf_vlog_finalize(&env->log, &log_true_size); + if (err) + ret = err; + + if (uattr_size >= offsetofend(union bpf_attr, log_true_size) && + copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, log_true_size), + &log_true_size, sizeof(log_true_size))) { ret = -EFAULT; goto err_release_maps; } |