diff options
author | Jakub Kicinski <kuba@kernel.org> | 2023-03-06 20:36:39 -0800 |
---|---|---|
committer | Jakub Kicinski <kuba@kernel.org> | 2023-03-06 20:36:39 -0800 |
commit | 36e5e391a25af28dc1f4586f95d577b38ff4ed72 (patch) | |
tree | 3b58175e4a148f54338d22c926d7dd2a6283d317 /kernel | |
parent | 5ca26d6039a6b42341f7f5cc8d10d30ca1561a7b (diff) | |
parent | 8f4c92f0024ff2a30f002e85f87e531d49dc023c (diff) |
Merge tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Daniel Borkmann says:
====================
pull-request: bpf-next 2023-03-06
We've added 85 non-merge commits during the last 13 day(s) which contain
a total of 131 files changed, 7102 insertions(+), 1792 deletions(-).
The main changes are:
1) Add skb and XDP typed dynptrs which allow BPF programs for more
ergonomic and less brittle iteration through data and variable-sized
accesses, from Joanne Koong.
2) Bigger batch of BPF verifier improvements to prepare for upcoming BPF
open-coded iterators allowing for less restrictive looping capabilities,
from Andrii Nakryiko.
3) Rework RCU enforcement in the verifier, add kptr_rcu and enforce BPF
programs to NULL-check before passing such pointers into kfunc,
from Alexei Starovoitov.
4) Add support for kptrs in percpu hashmaps, percpu LRU hashmaps and in
local storage maps, from Kumar Kartikeya Dwivedi.
5) Add BPF verifier support for ST instructions in convert_ctx_access()
which will help new -mcpu=v4 clang flag to start emitting them,
from Eduard Zingerman.
6) Make uprobe attachment Android APK aware by supporting attachment
to functions inside ELF objects contained in APKs via function names,
from Daniel Müller.
7) Add a new flag BPF_F_TIMER_ABS flag for bpf_timer_start() helper
to start the timer with absolute expiration value instead of relative
one, from Tero Kristo.
8) Add a new kfunc bpf_cgroup_from_id() to look up cgroups via id,
from Tejun Heo.
9) Extend libbpf to support users manually attaching kprobes/uprobes
in the legacy/perf/link mode, from Menglong Dong.
10) Implement workarounds in the mips BPF JIT for DADDI/R4000,
from Jiaxun Yang.
11) Enable mixing bpf2bpf and tailcalls for the loongarch BPF JIT,
from Hengqi Chen.
12) Extend BPF instruction set doc with describing the encoding of BPF
instructions in terms of how bytes are stored under big/little endian,
from Jose E. Marchesi.
13) Follow-up to enable kfunc support for riscv BPF JIT, from Pu Lehui.
14) Fix bpf_xdp_query() backwards compatibility on old kernels,
from Yonghong Song.
15) Fix BPF selftest cross compilation with CLANG_CROSS_FLAGS,
from Florent Revest.
16) Improve bpf_cpumask_ma to only allocate one bpf_mem_cache,
from Hou Tao.
17) Fix BPF verifier's check_subprogs to not unnecessarily mark
a subprogram with has_tail_call, from Ilya Leoshkevich.
18) Fix arm syscall regs spec in libbpf's bpf_tracing.h, from Puranjay Mohan.
* tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (85 commits)
selftests/bpf: Add test for legacy/perf kprobe/uprobe attach mode
selftests/bpf: Split test_attach_probe into multi subtests
libbpf: Add support to set kprobe/uprobe attach mode
tools/resolve_btfids: Add /libsubcmd to .gitignore
bpf: add support for fixed-size memory pointer returns for kfuncs
bpf: generalize dynptr_get_spi to be usable for iters
bpf: mark PTR_TO_MEM as non-null register type
bpf: move kfunc_call_arg_meta higher in the file
bpf: ensure that r0 is marked scratched after any function call
bpf: fix visit_insn()'s detection of BPF_FUNC_timer_set_callback helper
bpf: clean up visit_insn()'s instruction processing
selftests/bpf: adjust log_fixup's buffer size for proper truncation
bpf: honor env->test_state_freq flag in is_state_visited()
selftests/bpf: enhance align selftest's expected log matching
bpf: improve regsafe() checks for PTR_TO_{MEM,BUF,TP_BUFFER}
bpf: improve stack slot state printing
selftests/bpf: Disassembler tests for verifier.c:convert_ctx_access()
selftests/bpf: test if pointer type is tracked for BPF_ST_MEM
bpf: allow ctx writes using BPF_ST_MEM instruction
bpf: Use separate RCU callbacks for freeing selem
...
====================
Link: https://lore.kernel.org/r/20230307004346.27578-1-daniel@iogearbox.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/bpf/bpf_local_storage.c | 85 | ||||
-rw-r--r-- | kernel/bpf/btf.c | 42 | ||||
-rw-r--r-- | kernel/bpf/cgroup.c | 53 | ||||
-rw-r--r-- | kernel/bpf/cpumask.c | 46 | ||||
-rw-r--r-- | kernel/bpf/hashtab.c | 59 | ||||
-rw-r--r-- | kernel/bpf/helpers.c | 257 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 8 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 974 | ||||
-rw-r--r-- | kernel/trace/bpf_trace.c | 4 |
9 files changed, 1124 insertions, 404 deletions
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 35f4138a54dc..3d320393a12c 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -51,11 +51,21 @@ owner_storage(struct bpf_local_storage_map *smap, void *owner) return map->ops->map_owner_storage_ptr(owner); } +static bool selem_linked_to_storage_lockless(const struct bpf_local_storage_elem *selem) +{ + return !hlist_unhashed_lockless(&selem->snode); +} + static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem) { return !hlist_unhashed(&selem->snode); } +static bool selem_linked_to_map_lockless(const struct bpf_local_storage_elem *selem) +{ + return !hlist_unhashed_lockless(&selem->map_node); +} + static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem) { return !hlist_unhashed(&selem->map_node); @@ -75,6 +85,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, if (selem) { if (value) copy_map_value(&smap->map, SDATA(selem)->data, value); + /* No need to call check_and_init_map_value as memory is zero init */ return selem; } @@ -98,7 +109,28 @@ void bpf_local_storage_free_rcu(struct rcu_head *rcu) kfree_rcu(local_storage, rcu); } -static void bpf_selem_free_rcu(struct rcu_head *rcu) +static void bpf_selem_free_fields_rcu(struct rcu_head *rcu) +{ + struct bpf_local_storage_elem *selem; + struct bpf_local_storage_map *smap; + + selem = container_of(rcu, struct bpf_local_storage_elem, rcu); + /* protected by the rcu_barrier*() */ + smap = rcu_dereference_protected(SDATA(selem)->smap, true); + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); + kfree(selem); +} + +static void bpf_selem_free_fields_trace_rcu(struct rcu_head *rcu) +{ + /* Free directly if Tasks Trace RCU GP also implies RCU GP */ + if (rcu_trace_implies_rcu_gp()) + bpf_selem_free_fields_rcu(rcu); + else + call_rcu(rcu, bpf_selem_free_fields_rcu); +} + +static void bpf_selem_free_trace_rcu(struct rcu_head *rcu) { struct bpf_local_storage_elem *selem; @@ -119,6 +151,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor { struct bpf_local_storage_map *smap; bool free_local_storage; + struct btf_record *rec; void *owner; smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); @@ -159,10 +192,26 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor SDATA(selem)) RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); - if (use_trace_rcu) - call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu); - else - kfree_rcu(selem, rcu); + /* A different RCU callback is chosen whenever we need to free + * additional fields in selem data before freeing selem. + * bpf_local_storage_map_free only executes rcu_barrier to wait for RCU + * callbacks when it has special fields, hence we can only conditionally + * dereference smap, as by this time the map might have already been + * freed without waiting for our call_rcu callback if it did not have + * any special fields. + */ + rec = smap->map.record; + if (use_trace_rcu) { + if (!IS_ERR_OR_NULL(rec)) + call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_fields_trace_rcu); + else + call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu); + } else { + if (!IS_ERR_OR_NULL(rec)) + call_rcu(&selem->rcu, bpf_selem_free_fields_rcu); + else + kfree_rcu(selem, rcu); + } return free_local_storage; } @@ -174,7 +223,7 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, bool free_local_storage = false; unsigned long flags; - if (unlikely(!selem_linked_to_storage(selem))) + if (unlikely(!selem_linked_to_storage_lockless(selem))) /* selem has already been unlinked from sk */ return; @@ -208,7 +257,7 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) struct bpf_local_storage_map_bucket *b; unsigned long flags; - if (unlikely(!selem_linked_to_map(selem))) + if (unlikely(!selem_linked_to_map_lockless(selem))) /* selem has already be unlinked from smap */ return; @@ -420,7 +469,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, err = check_flags(old_sdata, map_flags); if (err) return ERR_PTR(err); - if (old_sdata && selem_linked_to_storage(SELEM(old_sdata))) { + if (old_sdata && selem_linked_to_storage_lockless(SELEM(old_sdata))) { copy_map_value_locked(&smap->map, old_sdata->data, value, false); return old_sdata; @@ -713,6 +762,26 @@ void bpf_local_storage_map_free(struct bpf_map *map, */ synchronize_rcu(); + /* Only delay freeing of smap, buckets are not needed anymore */ kvfree(smap->buckets); + + /* When local storage has special fields, callbacks for + * bpf_selem_free_fields_rcu and bpf_selem_free_fields_trace_rcu will + * keep using the map BTF record, we need to execute an RCU barrier to + * wait for them as the record will be freed right after our map_free + * callback. + */ + if (!IS_ERR_OR_NULL(smap->map.record)) { + rcu_barrier_tasks_trace(); + /* We cannot skip rcu_barrier() when rcu_trace_implies_rcu_gp() + * is true, because while call_rcu invocation is skipped in that + * case in bpf_selem_free_fields_trace_rcu (and all local + * storage maps pass use_trace_rcu = true), there can be + * call_rcu callbacks based on use_trace_rcu = false in the + * while ((selem = ...)) loop above or when owner's free path + * calls bpf_local_storage_unlink_nolock. + */ + rcu_barrier(); + } bpf_map_area_free(smap); } diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index fa22ec79ac0e..a8cb09e5973b 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -207,6 +207,11 @@ enum btf_kfunc_hook { BTF_KFUNC_HOOK_TRACING, BTF_KFUNC_HOOK_SYSCALL, BTF_KFUNC_HOOK_FMODRET, + BTF_KFUNC_HOOK_CGROUP_SKB, + BTF_KFUNC_HOOK_SCHED_ACT, + BTF_KFUNC_HOOK_SK_SKB, + BTF_KFUNC_HOOK_SOCKET_FILTER, + BTF_KFUNC_HOOK_LWT, BTF_KFUNC_HOOK_MAX, }; @@ -3283,9 +3288,9 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, /* Reject extra tags */ if (btf_type_is_type_tag(btf_type_by_id(btf, t->type))) return -EINVAL; - if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off))) + if (!strcmp("kptr_untrusted", __btf_name_by_offset(btf, t->name_off))) type = BPF_KPTR_UNREF; - else if (!strcmp("kptr_ref", __btf_name_by_offset(btf, t->name_off))) + else if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off))) type = BPF_KPTR_REF; else return -EINVAL; @@ -5683,6 +5688,10 @@ again: * int socket_filter_bpf_prog(struct __sk_buff *skb) * { // no fields of skb are ever used } */ + if (strcmp(ctx_tname, "__sk_buff") == 0 && strcmp(tname, "sk_buff") == 0) + return ctx_type; + if (strcmp(ctx_tname, "xdp_md") == 0 && strcmp(tname, "xdp_buff") == 0) + return ctx_type; if (strcmp(ctx_tname, tname)) { /* bpf_user_pt_regs_t is a typedef, so resolve it to * underlying struct and check name again @@ -6154,6 +6163,7 @@ static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf, const char *tname, *mname, *tag_value; u32 vlen, elem_id, mid; + *flag = 0; again: tname = __btf_name_by_offset(btf, t->name_off); if (!btf_type_is_struct(t)) { @@ -6320,6 +6330,15 @@ error: * of this field or inside of this struct */ if (btf_type_is_struct(mtype)) { + if (BTF_INFO_KIND(mtype->info) == BTF_KIND_UNION && + btf_type_vlen(mtype) != 1) + /* + * walking unions yields untrusted pointers + * with exception of __bpf_md_ptr and other + * unions with a single member + */ + *flag |= PTR_UNTRUSTED; + /* our field must be inside that union or struct */ t = mtype; @@ -6364,7 +6383,7 @@ error: stype = btf_type_skip_modifiers(btf, mtype->type, &id); if (btf_type_is_struct(stype)) { *next_btf_id = id; - *flag = tmp_flag; + *flag |= tmp_flag; return WALK_PTR; } } @@ -7704,6 +7723,19 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) return BTF_KFUNC_HOOK_TRACING; case BPF_PROG_TYPE_SYSCALL: return BTF_KFUNC_HOOK_SYSCALL; + case BPF_PROG_TYPE_CGROUP_SKB: + return BTF_KFUNC_HOOK_CGROUP_SKB; + case BPF_PROG_TYPE_SCHED_ACT: + return BTF_KFUNC_HOOK_SCHED_ACT; + case BPF_PROG_TYPE_SK_SKB: + return BTF_KFUNC_HOOK_SK_SKB; + case BPF_PROG_TYPE_SOCKET_FILTER: + return BTF_KFUNC_HOOK_SOCKET_FILTER; + case BPF_PROG_TYPE_LWT_OUT: + case BPF_PROG_TYPE_LWT_IN: + case BPF_PROG_TYPE_LWT_XMIT: + case BPF_PROG_TYPE_LWT_SEG6LOCAL: + return BTF_KFUNC_HOOK_LWT; default: return BTF_KFUNC_HOOK_MAX; } @@ -8335,7 +8367,7 @@ out: bool btf_nested_type_is_trusted(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, - int off) + int off, const char *suffix) { struct btf *btf = reg->btf; const struct btf_type *walk_type, *safe_type; @@ -8352,7 +8384,7 @@ bool btf_nested_type_is_trusted(struct bpf_verifier_log *log, tname = btf_name_by_offset(btf, walk_type->name_off); - ret = snprintf(safe_tname, sizeof(safe_tname), "%s__safe_fields", tname); + ret = snprintf(safe_tname, sizeof(safe_tname), "%s%s", tname, suffix); if (ret < 0) return false; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index bf2fdb33fb31..53edb8ad2471 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -2223,10 +2223,12 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type, BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), treg, si->dst_reg, offsetof(struct bpf_sysctl_kern, ppos)); - *insn++ = BPF_STX_MEM( - BPF_SIZEOF(u32), treg, si->src_reg, + *insn++ = BPF_RAW_INSN( + BPF_CLASS(si->code) | BPF_MEM | BPF_SIZEOF(u32), + treg, si->src_reg, bpf_ctx_narrow_access_offset( - 0, sizeof(u32), sizeof(loff_t))); + 0, sizeof(u32), sizeof(loff_t)), + si->imm); *insn++ = BPF_LDX_MEM( BPF_DW, treg, si->dst_reg, offsetof(struct bpf_sysctl_kern, tmp_reg)); @@ -2376,10 +2378,17 @@ static bool cg_sockopt_is_valid_access(int off, int size, return true; } -#define CG_SOCKOPT_ACCESS_FIELD(T, F) \ - T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \ - si->dst_reg, si->src_reg, \ - offsetof(struct bpf_sockopt_kern, F)) +#define CG_SOCKOPT_READ_FIELD(F) \ + BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sockopt_kern, F)) + +#define CG_SOCKOPT_WRITE_FIELD(F) \ + BPF_RAW_INSN((BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F) | \ + BPF_MEM | BPF_CLASS(si->code)), \ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sockopt_kern, F), \ + si->imm) static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, @@ -2391,25 +2400,25 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type, switch (si->off) { case offsetof(struct bpf_sockopt, sk): - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk); + *insn++ = CG_SOCKOPT_READ_FIELD(sk); break; case offsetof(struct bpf_sockopt, level): if (type == BPF_WRITE) - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level); + *insn++ = CG_SOCKOPT_WRITE_FIELD(level); else - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level); + *insn++ = CG_SOCKOPT_READ_FIELD(level); break; case offsetof(struct bpf_sockopt, optname): if (type == BPF_WRITE) - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname); + *insn++ = CG_SOCKOPT_WRITE_FIELD(optname); else - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname); + *insn++ = CG_SOCKOPT_READ_FIELD(optname); break; case offsetof(struct bpf_sockopt, optlen): if (type == BPF_WRITE) - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen); + *insn++ = CG_SOCKOPT_WRITE_FIELD(optlen); else - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen); + *insn++ = CG_SOCKOPT_READ_FIELD(optlen); break; case offsetof(struct bpf_sockopt, retval): BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0); @@ -2429,9 +2438,11 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx), treg, treg, offsetof(struct task_struct, bpf_ctx)); - *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval), - treg, si->src_reg, - offsetof(struct bpf_cg_run_ctx, retval)); + *insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_MEM | + BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval), + treg, si->src_reg, + offsetof(struct bpf_cg_run_ctx, retval), + si->imm); *insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg, offsetof(struct bpf_sockopt_kern, tmp_reg)); } else { @@ -2447,10 +2458,10 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type, } break; case offsetof(struct bpf_sockopt, optval): - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval); + *insn++ = CG_SOCKOPT_READ_FIELD(optval); break; case offsetof(struct bpf_sockopt, optval_end): - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end); + *insn++ = CG_SOCKOPT_READ_FIELD(optval_end); break; } @@ -2529,10 +2540,6 @@ cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_get_current_comm: return &bpf_get_current_comm_proto; - case BPF_FUNC_get_current_cgroup_id: - return &bpf_get_current_cgroup_id_proto; - case BPF_FUNC_get_current_ancestor_cgroup_id: - return &bpf_get_current_ancestor_cgroup_id_proto; #ifdef CONFIG_CGROUP_NET_CLASSID case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_curr_proto; diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index 52b981512a35..b6587ec40f1b 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -55,7 +55,7 @@ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_create(void) /* cpumask must be the first element so struct bpf_cpumask be cast to struct cpumask. */ BUILD_BUG_ON(offsetof(struct bpf_cpumask, cpumask) != 0); - cpumask = bpf_mem_alloc(&bpf_cpumask_ma, sizeof(*cpumask)); + cpumask = bpf_mem_cache_alloc(&bpf_cpumask_ma); if (!cpumask) return NULL; @@ -123,7 +123,7 @@ __bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask) if (refcount_dec_and_test(&cpumask->usage)) { migrate_disable(); - bpf_mem_free(&bpf_cpumask_ma, cpumask); + bpf_mem_cache_free(&bpf_cpumask_ma, cpumask); migrate_enable(); } } @@ -427,26 +427,26 @@ BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE | KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_cpumask_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) -BTF_ID_FLAGS(func, bpf_cpumask_first, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_set_cpu, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_clear_cpu, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_test_cpu, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_test_and_set_cpu, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_test_and_clear_cpu, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_setall, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_clear, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_and, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_or, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_xor, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_equal, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_intersects, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_subset, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_empty, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_full, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_any, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_any_and, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_first, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_set_cpu, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_clear_cpu, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_test_cpu, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_test_and_set_cpu, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_test_and_clear_cpu, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_setall, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_clear, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_and, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_or, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_xor, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_equal, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_intersects, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_subset, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_empty, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_full, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_any, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_any_and, KF_RCU) BTF_SET8_END(cpumask_kfunc_btf_ids) static const struct btf_kfunc_id_set cpumask_kfunc_set = { @@ -468,7 +468,7 @@ static int __init cpumask_kfunc_init(void) }, }; - ret = bpf_mem_alloc_init(&bpf_cpumask_ma, 0, false); + ret = bpf_mem_alloc_init(&bpf_cpumask_ma, sizeof(struct bpf_cpumask), false); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &cpumask_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &cpumask_kfunc_set); return ret ?: register_btf_id_dtor_kfuncs(cpumask_dtors, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 5dfcb5ad0d06..653aeb481c79 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -249,7 +249,18 @@ static void htab_free_prealloced_fields(struct bpf_htab *htab) struct htab_elem *elem; elem = get_htab_elem(htab, i); - bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); + if (htab_is_percpu(htab)) { + void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size); + int cpu; + + for_each_possible_cpu(cpu) { + bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu)); + cond_resched(); + } + } else { + bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); + cond_resched(); + } cond_resched(); } } @@ -759,9 +770,17 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map, static void check_and_free_fields(struct bpf_htab *htab, struct htab_elem *elem) { - void *map_value = elem->key + round_up(htab->map.key_size, 8); + if (htab_is_percpu(htab)) { + void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size); + int cpu; - bpf_obj_free_fields(htab->map.record, map_value); + for_each_possible_cpu(cpu) + bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu)); + } else { + void *map_value = elem->key + round_up(htab->map.key_size, 8); + + bpf_obj_free_fields(htab->map.record, map_value); + } } /* It is called from the bpf_lru_list when the LRU needs to delete @@ -858,9 +877,9 @@ find_first_elem: static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) { + check_and_free_fields(htab, l); if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr); - check_and_free_fields(htab, l); bpf_mem_cache_free(&htab->ma, l); } @@ -918,14 +937,13 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, { if (!onallcpus) { /* copy true value_size bytes */ - memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); + copy_map_value(&htab->map, this_cpu_ptr(pptr), value); } else { u32 size = round_up(htab->map.value_size, 8); int off = 0, cpu; for_each_possible_cpu(cpu) { - bpf_long_memcpy(per_cpu_ptr(pptr, cpu), - value + off, size); + copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value + off); off += size; } } @@ -940,16 +958,14 @@ static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, * (onallcpus=false always when coming from bpf prog). */ if (!onallcpus) { - u32 size = round_up(htab->map.value_size, 8); int current_cpu = raw_smp_processor_id(); int cpu; for_each_possible_cpu(cpu) { if (cpu == current_cpu) - bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value, - size); - else - memset(per_cpu_ptr(pptr, cpu), 0, size); + copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value); + else /* Since elem is preallocated, we cannot touch special fields */ + zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu)); } } else { pcpu_copy_value(htab, pptr, value, onallcpus); @@ -1575,9 +1591,8 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, pptr = htab_elem_get_ptr(l, key_size); for_each_possible_cpu(cpu) { - bpf_long_memcpy(value + off, - per_cpu_ptr(pptr, cpu), - roundup_value_size); + copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu)); + check_and_init_map_value(&htab->map, value + off); off += roundup_value_size; } } else { @@ -1772,8 +1787,8 @@ again_nocopy: pptr = htab_elem_get_ptr(l, map->key_size); for_each_possible_cpu(cpu) { - bpf_long_memcpy(dst_val + off, - per_cpu_ptr(pptr, cpu), size); + copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu)); + check_and_init_map_value(&htab->map, dst_val + off); off += size; } } else { @@ -2046,9 +2061,9 @@ static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem) roundup_value_size = round_up(map->value_size, 8); pptr = htab_elem_get_ptr(elem, map->key_size); for_each_possible_cpu(cpu) { - bpf_long_memcpy(info->percpu_value_buf + off, - per_cpu_ptr(pptr, cpu), - roundup_value_size); + copy_map_value_long(map, info->percpu_value_buf + off, + per_cpu_ptr(pptr, cpu)); + check_and_init_map_value(map, info->percpu_value_buf + off); off += roundup_value_size; } ctx.value = info->percpu_value_buf; @@ -2292,8 +2307,8 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) */ pptr = htab_elem_get_ptr(l, map->key_size); for_each_possible_cpu(cpu) { - bpf_long_memcpy(value + off, - per_cpu_ptr(pptr, cpu), size); + copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu)); + check_and_init_map_value(map, value + off); off += size; } ret = 0; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 5b278a38ae58..637ac4e92e75 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1264,10 +1264,11 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla { struct bpf_hrtimer *t; int ret = 0; + enum hrtimer_mode mode; if (in_nmi()) return -EOPNOTSUPP; - if (flags) + if (flags > BPF_F_TIMER_ABS) return -EINVAL; __bpf_spin_lock_irqsave(&timer->lock); t = timer->timer; @@ -1275,7 +1276,13 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla ret = -EINVAL; goto out; } - hrtimer_start(&t->timer, ns_to_ktime(nsecs), HRTIMER_MODE_REL_SOFT); + + if (flags & BPF_F_TIMER_ABS) + mode = HRTIMER_MODE_ABS_SOFT; + else + mode = HRTIMER_MODE_REL_SOFT; + + hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode); out: __bpf_spin_unlock_irqrestore(&timer->lock); return ret; @@ -1420,11 +1427,21 @@ static bool bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr) return ptr->size & DYNPTR_RDONLY_BIT; } +void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr) +{ + ptr->size |= DYNPTR_RDONLY_BIT; +} + static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type) { ptr->size |= type << DYNPTR_TYPE_SHIFT; } +static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *ptr) +{ + return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT; +} + u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_SIZE_MASK; @@ -1497,6 +1514,7 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, u32, offset, u64, flags) { + enum bpf_dynptr_type type; int err; if (!src->data || flags) @@ -1506,13 +1524,25 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern if (err) return err; - /* Source and destination may possibly overlap, hence use memmove to - * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr - * pointing to overlapping PTR_TO_MAP_VALUE regions. - */ - memmove(dst, src->data + src->offset + offset, len); + type = bpf_dynptr_get_type(src); - return 0; + switch (type) { + case BPF_DYNPTR_TYPE_LOCAL: + case BPF_DYNPTR_TYPE_RINGBUF: + /* Source and destination may possibly overlap, hence use memmove to + * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr + * pointing to overlapping PTR_TO_MAP_VALUE regions. + */ + memmove(dst, src->data + src->offset + offset, len); + return 0; + case BPF_DYNPTR_TYPE_SKB: + return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len); + case BPF_DYNPTR_TYPE_XDP: + return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len); + default: + WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type); + return -EFAULT; + } } static const struct bpf_func_proto bpf_dynptr_read_proto = { @@ -1529,22 +1559,40 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = { BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src, u32, len, u64, flags) { + enum bpf_dynptr_type type; int err; - if (!dst->data || flags || bpf_dynptr_is_rdonly(dst)) + if (!dst->data || bpf_dynptr_is_rdonly(dst)) return -EINVAL; err = bpf_dynptr_check_off_len(dst, offset, len); if (err) return err; - /* Source and destination may possibly overlap, hence use memmove to - * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr - * pointing to overlapping PTR_TO_MAP_VALUE regions. - */ - memmove(dst->data + dst->offset + offset, src, len); + type = bpf_dynptr_get_type(dst); - return 0; + switch (type) { + case BPF_DYNPTR_TYPE_LOCAL: + case BPF_DYNPTR_TYPE_RINGBUF: + if (flags) + return -EINVAL; + /* Source and destination may possibly overlap, hence use memmove to + * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr + * pointing to overlapping PTR_TO_MAP_VALUE regions. + */ + memmove(dst->data + dst->offset + offset, src, len); + return 0; + case BPF_DYNPTR_TYPE_SKB: + return __bpf_skb_store_bytes(dst->data, dst->offset + offset, src, len, + flags); + case BPF_DYNPTR_TYPE_XDP: + if (flags) + return -EINVAL; + return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len); + default: + WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type); + return -EFAULT; + } } static const struct bpf_func_proto bpf_dynptr_write_proto = { @@ -1560,6 +1608,7 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = { BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) { + enum bpf_dynptr_type type; int err; if (!ptr->data) @@ -1572,7 +1621,20 @@ BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u3 if (bpf_dynptr_is_rdonly(ptr)) return 0; - return (unsigned long)(ptr->data + ptr->offset + offset); + type = bpf_dynptr_get_type(ptr); + + switch (type) { + case BPF_DYNPTR_TYPE_LOCAL: + case BPF_DYNPTR_TYPE_RINGBUF: + return (unsigned long)(ptr->data + ptr->offset + offset); + case BPF_DYNPTR_TYPE_SKB: + case BPF_DYNPTR_TYPE_XDP: + /* skb and xdp dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */ + return 0; + default: + WARN_ONCE(true, "bpf_dynptr_data: unknown dynptr type %d\n", type); + return 0; + } } static const struct bpf_func_proto bpf_dynptr_data_proto = { @@ -1693,6 +1755,10 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_cgrp_storage_get_proto; case BPF_FUNC_cgrp_storage_delete: return &bpf_cgrp_storage_delete_proto; + case BPF_FUNC_get_current_cgroup_id: + return &bpf_get_current_cgroup_id_proto; + case BPF_FUNC_get_current_ancestor_cgroup_id: + return &bpf_get_current_ancestor_cgroup_id_proto; #endif default: break; @@ -2097,10 +2163,28 @@ __bpf_kfunc struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) if (level > cgrp->level || level < 0) return NULL; + /* cgrp's refcnt could be 0 here, but ancestors can still be accessed */ ancestor = cgrp->ancestors[level]; - cgroup_get(ancestor); + if (!cgroup_tryget(ancestor)) + return NULL; return ancestor; } + +/** + * bpf_cgroup_from_id - Find a cgroup from its ID. A cgroup returned by this + * kfunc which is not subsequently stored in a map, must be released by calling + * bpf_cgroup_release(). + * @cgid: cgroup id. + */ +__bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid) +{ + struct cgroup *cgrp; + + cgrp = cgroup_get_from_id(cgid); + if (IS_ERR(cgrp)) + return NULL; + return cgrp; +} #endif /* CONFIG_CGROUPS */ /** @@ -2122,6 +2206,140 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid) return p; } +/** + * bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data. + * @ptr: The dynptr whose data slice to retrieve + * @offset: Offset into the dynptr + * @buffer: User-provided buffer to copy contents into + * @buffer__szk: Size (in bytes) of the buffer. This is the length of the + * requested slice. This must be a constant. + * + * For non-skb and non-xdp type dynptrs, there is no difference between + * bpf_dynptr_slice and bpf_dynptr_data. + * + * If the intention is to write to the data slice, please use + * bpf_dynptr_slice_rdwr. + * + * The user must check that the returned pointer is not null before using it. + * + * Please note that in the case of skb and xdp dynptrs, bpf_dynptr_slice + * does not change the underlying packet data pointers, so a call to + * bpf_dynptr_slice will not invalidate any ctx->data/data_end pointers in + * the bpf program. + * + * Return: NULL if the call failed (eg invalid dynptr), pointer to a read-only + * data slice (can be either direct pointer to the data or a pointer to the user + * provided buffer, with its contents containing the data, if unable to obtain + * direct pointer) + */ +__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset, + void *buffer, u32 buffer__szk) +{ + enum bpf_dynptr_type type; + u32 len = buffer__szk; + int err; + + if (!ptr->data) + return NULL; + + err = bpf_dynptr_check_off_len(ptr, offset, len); + if (err) + return NULL; + + type = bpf_dynptr_get_type(ptr); + + switch (type) { + case BPF_DYNPTR_TYPE_LOCAL: + case BPF_DYNPTR_TYPE_RINGBUF: + return ptr->data + ptr->offset + offset; + case BPF_DYNPTR_TYPE_SKB: + return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer); + case BPF_DYNPTR_TYPE_XDP: + { + void *xdp_ptr = bpf_xdp_pointer(ptr->data, ptr->offset + offset, len); + if (xdp_ptr) + return xdp_ptr; + + bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer, len, false); + return buffer; + } + default: + WARN_ONCE(true, "unknown dynptr type %d\n", type); + return NULL; + } +} + +/** + * bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data. + * @ptr: The dynptr whose data slice to retrieve + * @offset: Offset into the dynptr + * @buffer: User-provided buffer to copy contents into + * @buffer__szk: Size (in bytes) of the buffer. This is the length of the + * requested slice. This must be a constant. + * + * For non-skb and non-xdp type dynptrs, there is no difference between + * bpf_dynptr_slice and bpf_dynptr_data. + * + * The returned pointer is writable and may point to either directly the dynptr + * data at the requested offset or to the buffer if unable to obtain a direct + * data pointer to (example: the requested slice is to the paged area of an skb + * packet). In the case where the returned pointer is to the buffer, the user + * is responsible for persisting writes through calling bpf_dynptr_write(). This + * usually looks something like this pattern: + * + * struct eth_hdr *eth = bpf_dynptr_slice_rdwr(&dynptr, 0, buffer, sizeof(buffer)); + * if (!eth) + * return TC_ACT_SHOT; + * + * // mutate eth header // + * + * if (eth == buffer) + * bpf_dynptr_write(&ptr, 0, buffer, sizeof(buffer), 0); + * + * Please note that, as in the example above, the user must check that the + * returned pointer is not null before using it. + * + * Please also note that in the case of skb and xdp dynptrs, bpf_dynptr_slice_rdwr + * does not change the underlying packet data pointers, so a call to + * bpf_dynptr_slice_rdwr will not invalidate any ctx->data/data_end pointers in + * the bpf program. + * + * Return: NULL if the call failed (eg invalid dynptr), pointer to a + * data slice (can be either direct pointer to the data or a pointer to the user + * provided buffer, with its contents containing the data, if unable to obtain + * direct pointer) + */ +__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 offset, + void *buffer, u32 buffer__szk) +{ + if (!ptr->data || bpf_dynptr_is_rdonly(ptr)) + return NULL; + + /* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice. + * + * For skb-type dynptrs, it is safe to write into the returned pointer + * if the bpf program allows skb data writes. There are two possiblities + * that may occur when calling bpf_dynptr_slice_rdwr: + * + * 1) The requested slice is in the head of the skb. In this case, the + * returned pointer is directly to skb data, and if the skb is cloned, the + * verifier will have uncloned it (see bpf_unclone_prologue()) already. + * The pointer can be directly written into. + * + * 2) Some portion of the requested slice is in the paged buffer area. + * In this case, the requested data will be copied out into the buffer + * and the returned pointer will be a pointer to the buffer. The skb + * will not be pulled. To persist the write, the user will need to call + * bpf_dynptr_write(), which will pull the skb and commit the write. + * + * Similarly for xdp programs, if the requested slice is not across xdp + * fragments, then a direct pointer will be returned, otherwise the data + * will be copied out into the buffer and the user will need to call + * bpf_dynptr_write() to commit changes. + */ + return bpf_dynptr_slice(ptr, offset, buffer, buffer__szk); +} + __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj) { return obj; @@ -2166,7 +2384,8 @@ BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_cgroup_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL) #endif BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) BTF_SET8_END(generic_btf_ids) @@ -2190,6 +2409,8 @@ BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx) BTF_ID_FLAGS(func, bpf_rdonly_cast) BTF_ID_FLAGS(func, bpf_rcu_read_lock) BTF_ID_FLAGS(func, bpf_rcu_read_unlock) +BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL) BTF_SET8_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index adc83cb82f37..ede5f987484f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1059,9 +1059,15 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, case BPF_KPTR_UNREF: case BPF_KPTR_REF: if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_PERCPU_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && + map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY && - map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) { + map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY && + map->map_type != BPF_MAP_TYPE_SK_STORAGE && + map->map_type != BPF_MAP_TYPE_INODE_STORAGE && + map->map_type != BPF_MAP_TYPE_TASK_STORAGE && + map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) { ret = -EOPNOTSUPP; goto free_map_tab; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 272563a0b770..b2116ca78d9a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -268,7 +268,41 @@ struct bpf_call_arg_meta { u32 ret_btf_id; u32 subprogno; struct btf_field *kptr_field; - u8 uninit_dynptr_regno; +}; + +struct bpf_kfunc_call_arg_meta { + /* In parameters */ + struct btf *btf; + u32 func_id; + u32 kfunc_flags; + const struct btf_type *func_proto; + const char *func_name; + /* Out parameters */ + u32 ref_obj_id; + u8 release_regno; + bool r0_rdonly; + u32 ret_btf_id; + u64 r0_size; + u32 subprogno; + struct { + u64 value; + bool found; + } arg_constant; + struct { + struct btf *btf; + u32 btf_id; + } arg_obj_drop; + struct { + struct btf_field *field; + } arg_list_head; + struct { + struct btf_field *field; + } arg_rbtree_root; + struct { + enum bpf_dynptr_type type; + u32 id; + } initialized_dynptr; + u64 mem_size; }; struct btf *btf_vmlinux; @@ -453,7 +487,8 @@ static bool reg_type_not_null(enum bpf_reg_type type) type == PTR_TO_TCP_SOCK || type == PTR_TO_MAP_VALUE || type == PTR_TO_MAP_KEY || - type == PTR_TO_SOCK_COMMON; + type == PTR_TO_SOCK_COMMON || + type == PTR_TO_MEM; } static bool type_is_ptr_alloc_obj(u32 type) @@ -675,37 +710,62 @@ static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_sl return spi - nr_slots + 1 >= 0 && spi < allocated_slots; } -static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +static int stack_slot_obj_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + const char *obj_kind, int nr_slots) { int off, spi; if (!tnum_is_const(reg->var_off)) { - verbose(env, "dynptr has to be at a constant offset\n"); + verbose(env, "%s has to be at a constant offset\n", obj_kind); return -EINVAL; } off = reg->off + reg->var_off.value; if (off % BPF_REG_SIZE) { - verbose(env, "cannot pass in dynptr at an offset=%d\n", off); + verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off); return -EINVAL; } spi = __get_spi(off); - if (spi < 1) { - verbose(env, "cannot pass in dynptr at an offset=%d\n", off); + if (spi + 1 < nr_slots) { + verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off); return -EINVAL; } - if (!is_spi_bounds_valid(func(env, reg), spi, BPF_DYNPTR_NR_SLOTS)) + if (!is_spi_bounds_valid(func(env, reg), spi, nr_slots)) return -ERANGE; return spi; } +static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + return stack_slot_obj_get_spi(env, reg, "dynptr", BPF_DYNPTR_NR_SLOTS); +} + static const char *kernel_type_name(const struct btf* btf, u32 id) { return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); } +static const char *dynptr_type_str(enum bpf_dynptr_type type) +{ + switch (type) { + case BPF_DYNPTR_TYPE_LOCAL: + return "local"; + case BPF_DYNPTR_TYPE_RINGBUF: + return "ringbuf"; + case BPF_DYNPTR_TYPE_SKB: + return "skb"; + case BPF_DYNPTR_TYPE_XDP: + return "xdp"; + case BPF_DYNPTR_TYPE_INVALID: + return "<invalid>"; + default: + WARN_ONCE(1, "unknown dynptr type %d\n", type); + return "<unknown>"; + } +} + static void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno) { env->scratched_regs |= 1U << regno; @@ -751,11 +811,31 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type) return BPF_DYNPTR_TYPE_LOCAL; case DYNPTR_TYPE_RINGBUF: return BPF_DYNPTR_TYPE_RINGBUF; + case DYNPTR_TYPE_SKB: + return BPF_DYNPTR_TYPE_SKB; + case DYNPTR_TYPE_XDP: + return BPF_DYNPTR_TYPE_XDP; default: return BPF_DYNPTR_TYPE_INVALID; } } +static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type) +{ + switch (type) { + case BPF_DYNPTR_TYPE_LOCAL: + return DYNPTR_TYPE_LOCAL; + case BPF_DYNPTR_TYPE_RINGBUF: + return DYNPTR_TYPE_RINGBUF; + case BPF_DYNPTR_TYPE_SKB: + return DYNPTR_TYPE_SKB; + case BPF_DYNPTR_TYPE_XDP: + return DYNPTR_TYPE_XDP; + default: + return 0; + } +} + static bool dynptr_type_refcounted(enum bpf_dynptr_type type) { return type == BPF_DYNPTR_TYPE_RINGBUF; @@ -895,6 +975,14 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re static void __mark_reg_unknown(const struct bpf_verifier_env *env, struct bpf_reg_state *reg); +static void mark_reg_invalid(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + if (!env->allow_ptr_leaks) + __mark_reg_not_init(env, reg); + else + __mark_reg_unknown(env, reg); +} + static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi) { @@ -934,12 +1022,8 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, /* Dynptr slices are only PTR_TO_MEM_OR_NULL and PTR_TO_MEM */ if (dreg->type != (PTR_TO_MEM | PTR_MAYBE_NULL) && dreg->type != PTR_TO_MEM) continue; - if (dreg->dynptr_id == dynptr_id) { - if (!env->allow_ptr_leaks) - __mark_reg_not_init(env, dreg); - else - __mark_reg_unknown(env, dreg); - } + if (dreg->dynptr_id == dynptr_id) + mark_reg_invalid(env, dreg); })); /* Do not release reference state, we are destroying dynptr on stack, @@ -955,39 +1039,49 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, return 0; } -static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - int spi) +static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { + int spi; + if (reg->type == CONST_PTR_TO_DYNPTR) return false; - /* For -ERANGE (i.e. spi not falling into allocated stack slots), we - * will do check_mem_access to check and update stack bounds later, so - * return true for that case. + spi = dynptr_get_spi(env, reg); + + /* -ERANGE (i.e. spi not falling into allocated stack slots) isn't an + * error because this just means the stack state hasn't been updated yet. + * We will do check_mem_access to check and update stack bounds later. */ - if (spi < 0) - return spi == -ERANGE; - /* We allow overwriting existing unreferenced STACK_DYNPTR slots, see - * mark_stack_slots_dynptr which calls destroy_if_dynptr_stack_slot to - * ensure dynptr objects at the slots we are touching are completely - * destructed before we reinitialize them for a new one. For referenced - * ones, destroy_if_dynptr_stack_slot returns an error early instead of - * delaying it until the end where the user will get "Unreleased + if (spi < 0 && spi != -ERANGE) + return false; + + /* We don't need to check if the stack slots are marked by previous + * dynptr initializations because we allow overwriting existing unreferenced + * STACK_DYNPTR slots, see mark_stack_slots_dynptr which calls + * destroy_if_dynptr_stack_slot to ensure dynptr objects at the slots we are + * touching are completely destructed before we reinitialize them for a new + * one. For referenced ones, destroy_if_dynptr_stack_slot returns an error early + * instead of delaying it until the end where the user will get "Unreleased * reference" error. */ return true; } -static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - int spi) +static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); - int i; + int i, spi; - /* This already represents first slot of initialized bpf_dynptr */ + /* This already represents first slot of initialized bpf_dynptr. + * + * CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to + * check_func_arg_reg_off's logic, so we don't need to check its + * offset and alignment. + */ if (reg->type == CONST_PTR_TO_DYNPTR) return true; + spi = dynptr_get_spi(env, reg); if (spi < 0) return false; if (!state->stack[spi].spilled_ptr.dynptr.first_slot) @@ -1143,26 +1237,49 @@ static void print_verifier_state(struct bpf_verifier_env *env, for (j = 0; j < BPF_REG_SIZE; j++) { if (state->stack[i].slot_type[j] != STACK_INVALID) valid = true; - types_buf[j] = slot_type_char[ - state->stack[i].slot_type[j]]; + types_buf[j] = slot_type_char[state->stack[i].slot_type[j]]; } types_buf[BPF_REG_SIZE] = 0; if (!valid) continue; if (!print_all && !stack_slot_scratched(env, i)) continue; - verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, state->stack[i].spilled_ptr.live); - if (is_spilled_reg(&state->stack[i])) { + switch (state->stack[i].slot_type[BPF_REG_SIZE - 1]) { + case STACK_SPILL: reg = &state->stack[i].spilled_ptr; t = reg->type; + + verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); + print_liveness(env, reg->live); verbose(env, "=%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t)); if (t == SCALAR_VALUE && reg->precise) verbose(env, "P"); if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) verbose(env, "%lld", reg->var_off.value + reg->off); - } else { + break; + case STACK_DYNPTR: + i += BPF_DYNPTR_NR_SLOTS - 1; + reg = &state->stack[i].spilled_ptr; + + verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); + print_liveness(env, reg->live); + verbose(env, "=dynptr_%s", dynptr_type_str(reg->dynptr.type)); + if (reg->ref_obj_id) + verbose(env, "(ref_id=%d)", reg->ref_obj_id); + break; + case STACK_MISC: + case STACK_ZERO: + default: + reg = &state->stack[i].spilled_ptr; + + for (j = 0; j < BPF_REG_SIZE; j++) + types_buf[j] = slot_type_char[state->stack[i].slot_type[j]]; + types_buf[BPF_REG_SIZE] = 0; + + verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); + print_liveness(env, reg->live); verbose(env, "=%s", types_buf); + break; } } if (state->acquired_refs && state->refs[0].id) { @@ -1664,6 +1781,12 @@ static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg) reg->type == PTR_TO_PACKET_END; } +static bool reg_is_dynptr_slice_pkt(const struct bpf_reg_state *reg) +{ + return base_type(reg->type) == PTR_TO_MEM && + (reg->type & DYNPTR_TYPE_SKB || reg->type & DYNPTR_TYPE_XDP); +} + /* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */ static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg, enum bpf_reg_type which) @@ -2475,8 +2598,8 @@ static int check_subprogs(struct bpf_verifier_env *env) u8 code = insn[i].code; if (code == (BPF_JMP | BPF_CALL) && - insn[i].imm == BPF_FUNC_tail_call && - insn[i].src_reg != BPF_PSEUDO_CALL) + insn[i].src_reg == 0 && + insn[i].imm == BPF_FUNC_tail_call) subprog[cur_subprog].has_tail_call = true; if (BPF_CLASS(code) == BPF_LD && (BPF_MODE(code) == BPF_ABS || BPF_MODE(code) == BPF_IND)) @@ -3826,6 +3949,8 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, continue; if (type == STACK_MISC) continue; + if (type == STACK_INVALID && env->allow_uninit_stack) + continue; verbose(env, "invalid read from stack off %d+%d size %d\n", off, i, size); return -EACCES; @@ -3863,6 +3988,8 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, continue; if (type == STACK_ZERO) continue; + if (type == STACK_INVALID && env->allow_uninit_stack) + continue; verbose(env, "invalid read from stack off %d+%d size %d\n", off, i, size); return -EACCES; @@ -4175,7 +4302,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno) { const char *targ_name = kernel_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id); - int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED; + int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU; const char *reg_name = ""; /* Only unreferenced case accepts untrusted pointers */ @@ -4242,6 +4369,34 @@ bad_type: return -EINVAL; } +/* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock() + * can dereference RCU protected pointers and result is PTR_TRUSTED. + */ +static bool in_rcu_cs(struct bpf_verifier_env *env) +{ + return env->cur_state->active_rcu_lock || !env->prog->aux->sleepable; +} + +/* Once GCC supports btf_type_tag the following mechanism will be replaced with tag check */ +BTF_SET_START(rcu_protected_types) +BTF_ID(struct, prog_test_ref_kfunc) +BTF_ID(struct, cgroup) +BTF_SET_END(rcu_protected_types) + +static bool rcu_protected_object(const struct btf *btf, u32 btf_id) +{ + if (!btf_is_kernel(btf)) + return false; + return btf_id_set_contains(&rcu_protected_types, btf_id); +} + +static bool rcu_safe_kptr(const struct btf_field *field) +{ + const struct btf_field_kptr *kptr = &field->kptr; + + return field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id); +} + static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, int value_regno, int insn_idx, struct btf_field *kptr_field) @@ -4276,7 +4431,10 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, * value from map as PTR_TO_BTF_ID, with the correct type. */ mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf, - kptr_field->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED); + kptr_field->kptr.btf_id, + rcu_safe_kptr(kptr_field) && in_rcu_cs(env) ? + PTR_MAYBE_NULL | MEM_RCU : + PTR_MAYBE_NULL | PTR_UNTRUSTED); /* For mark_ptr_or_null_reg */ val_reg->id = ++env->id_gen; } else if (class == BPF_STX) { @@ -4999,23 +5157,76 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val) return 0; } -#define BTF_TYPE_SAFE_NESTED(__type) __PASTE(__type, __safe_fields) +#define BTF_TYPE_SAFE_RCU(__type) __PASTE(__type, __safe_rcu) +#define BTF_TYPE_SAFE_TRUSTED(__type) __PASTE(__type, __safe_trusted) -BTF_TYPE_SAFE_NESTED(struct task_struct) { +/* + * Allow list few fields as RCU trusted or full trusted. + * This logic doesn't allow mix tagging and will be removed once GCC supports + * btf_type_tag. + */ + +/* RCU trusted: these fields are trusted in RCU CS and never NULL */ +BTF_TYPE_SAFE_RCU(struct task_struct) { const cpumask_t *cpus_ptr; + struct css_set __rcu *cgroups; + struct task_struct __rcu *real_parent; + struct task_struct *group_leader; }; -static bool nested_ptr_is_trusted(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, - int off) +BTF_TYPE_SAFE_RCU(struct css_set) { + struct cgroup *dfl_cgrp; +}; + +/* full trusted: these fields are trusted even outside of RCU CS and never NULL */ +BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta) { + __bpf_md_ptr(struct seq_file *, seq); +}; + +BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task) { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct task_struct *, task); +}; + +BTF_TYPE_SAFE_TRUSTED(struct linux_binprm) { + struct file *file; +}; + +BTF_TYPE_SAFE_TRUSTED(struct file) { + struct inode *f_inode; +}; + +BTF_TYPE_SAFE_TRUSTED(struct dentry) { + /* no negative dentry-s in places where bpf can see it */ + struct inode *d_inode; +}; + +BTF_TYPE_SAFE_TRUSTED(struct socket) { + struct sock *sk; +}; + +static bool type_is_rcu(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + int off) { - /* If its parent is not trusted, it can't regain its trusted status. */ - if (!is_trusted_reg(reg)) - return false; + BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct task_struct)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct css_set)); - BTF_TYPE_EMIT(BTF_TYPE_SAFE_NESTED(struct task_struct)); + return btf_nested_type_is_trusted(&env->log, reg, off, "__safe_rcu"); +} - return btf_nested_type_is_trusted(&env->log, reg, off); +static bool type_is_trusted(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + int off) +{ + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct linux_binprm)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct file)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct socket)); + + return btf_nested_type_is_trusted(&env->log, reg, off, "__safe_trusted"); } static int check_ptr_to_btf_access(struct bpf_verifier_env *env, @@ -5101,41 +5312,56 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (ret < 0) return ret; - /* If this is an untrusted pointer, all pointers formed by walking it - * also inherit the untrusted flag. - */ - if (type_flag(reg->type) & PTR_UNTRUSTED) - flag |= PTR_UNTRUSTED; - - /* By default any pointer obtained from walking a trusted pointer is no - * longer trusted, unless the field being accessed has explicitly been - * marked as inheriting its parent's state of trust. - * - * An RCU-protected pointer can also be deemed trusted if we are in an - * RCU read region. This case is handled below. - */ - if (nested_ptr_is_trusted(env, reg, off)) - flag |= PTR_TRUSTED; - else - flag &= ~PTR_TRUSTED; + if (ret != PTR_TO_BTF_ID) { + /* just mark; */ - if (flag & MEM_RCU) { - /* Mark value register as MEM_RCU only if it is protected by - * bpf_rcu_read_lock() and the ptr reg is rcu or trusted. MEM_RCU - * itself can already indicate trustedness inside the rcu - * read lock region. Also mark rcu pointer as PTR_MAYBE_NULL since - * it could be null in some cases. + } else if (type_flag(reg->type) & PTR_UNTRUSTED) { + /* If this is an untrusted pointer, all pointers formed by walking it + * also inherit the untrusted flag. */ - if (!env->cur_state->active_rcu_lock || - !(is_trusted_reg(reg) || is_rcu_reg(reg))) - flag &= ~MEM_RCU; - else - flag |= PTR_MAYBE_NULL; - } else if (reg->type & MEM_RCU) { - /* ptr (reg) is marked as MEM_RCU, but the struct field is not tagged - * with __rcu. Mark the flag as PTR_UNTRUSTED conservatively. + flag = PTR_UNTRUSTED; + + } else if (is_trusted_reg(reg) || is_rcu_reg(reg)) { + /* By default any pointer obtained from walking a trusted pointer is no + * longer trusted, unless the field being accessed has explicitly been + * marked as inheriting its parent's state of trust (either full or RCU). + * For example: + * 'cgroups' pointer is untrusted if task->cgroups dereference + * happened in a sleepable program outside of bpf_rcu_read_lock() + * section. In a non-sleepable program it's trusted while in RCU CS (aka MEM_RCU). + * Note bpf_rcu_read_unlock() converts MEM_RCU pointers to PTR_UNTRUSTED. + * + * A regular RCU-protected pointer with __rcu tag can also be deemed + * trusted if we are in an RCU CS. Such pointer can be NULL. */ - flag |= PTR_UNTRUSTED; + if (type_is_trusted(env, reg, off)) { + flag |= PTR_TRUSTED; + } else if (in_rcu_cs(env) && !type_may_be_null(reg->type)) { + if (type_is_rcu(env, reg, off)) { + /* ignore __rcu tag and mark it MEM_RCU */ + flag |= MEM_RCU; + } else if (flag & MEM_RCU) { + /* __rcu tagged pointers can be NULL */ + flag |= PTR_MAYBE_NULL; + } else if (flag & (MEM_PERCPU | MEM_USER)) { + /* keep as-is */ + } else { + /* walking unknown pointers yields untrusted pointer */ + flag = PTR_UNTRUSTED; + } + } else { + /* + * If not in RCU CS or MEM_RCU pointer can be NULL then + * aggressively mark as untrusted otherwise such + * pointers will be plain PTR_TO_BTF_ID without flags + * and will be allowed to be passed into helpers for + * compat reasons. + */ + flag = PTR_UNTRUSTED; + } + } else { + /* Old compat. Deprecated */ + flag &= ~PTR_TRUSTED; } if (atype == BPF_READ && value_regno >= 0) @@ -5754,7 +5980,8 @@ static int check_stack_range_initialized( stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE]; if (*stype == STACK_MISC) goto mark; - if (*stype == STACK_ZERO) { + if ((*stype == STACK_ZERO) || + (*stype == STACK_INVALID && env->allow_uninit_stack)) { if (clobber) { /* helper can write anything into the stack */ *stype = STACK_MISC; @@ -6206,11 +6433,11 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, * Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument * type, and declare it as 'const struct bpf_dynptr *' in their prototype. */ -int process_dynptr_func(struct bpf_verifier_env *env, int regno, - enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) +static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn_idx, + enum bpf_arg_type arg_type) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; - int spi = 0; + int err; /* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*): @@ -6219,15 +6446,6 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno, verbose(env, "verifier internal error: misconfigured dynptr helper type flags\n"); return -EFAULT; } - /* CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to - * check_func_arg_reg_off's logic. We only need to check offset - * and its alignment for PTR_TO_STACK. - */ - if (reg->type == PTR_TO_STACK) { - spi = dynptr_get_spi(env, reg); - if (spi < 0 && spi != -ERANGE) - return spi; - } /* MEM_UNINIT - Points to memory that is an appropriate candidate for * constructing a mutable bpf_dynptr object. @@ -6245,30 +6463,30 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno, * to. */ if (arg_type & MEM_UNINIT) { - if (!is_dynptr_reg_valid_uninit(env, reg, spi)) { + int i; + + if (!is_dynptr_reg_valid_uninit(env, reg)) { verbose(env, "Dynptr has to be an uninitialized dynptr\n"); return -EINVAL; } - /* We only support one dynptr being uninitialized at the moment, - * which is sufficient for the helper functions we have right now. - */ - if (meta->uninit_dynptr_regno) { - verbose(env, "verifier internal error: multiple uninitialized dynptr args\n"); - return -EFAULT; + /* we write BPF_DW bits (8 bytes) at a time */ + for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) { + err = check_mem_access(env, insn_idx, regno, + i, BPF_DW, BPF_WRITE, -1, false); + if (err) + return err; } - meta->uninit_dynptr_regno = regno; + err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx); } else /* MEM_RDONLY and None case from above */ { - int err; - /* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */ if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) { verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n"); return -EINVAL; } - if (!is_dynptr_reg_valid_init(env, reg, spi)) { + if (!is_dynptr_reg_valid_init(env, reg)) { verbose(env, "Expected an initialized dynptr as arg #%d\n", regno); @@ -6277,30 +6495,15 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno, /* Fold modifiers (in this case, MEM_RDONLY) when checking expected type */ if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) { - const char *err_extra = ""; - - switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { - case DYNPTR_TYPE_LOCAL: - err_extra = "local"; - break; - case DYNPTR_TYPE_RINGBUF: - err_extra = "ringbuf"; - break; - default: - err_extra = "<unknown>"; - break; - } verbose(env, "Expected a dynptr of type %s as arg #%d\n", - err_extra, regno); + dynptr_type_str(arg_to_dynptr_type(arg_type)), regno); return -EINVAL; } err = mark_dynptr_read(env, reg); - if (err) - return err; } - return 0; + return err; } static bool arg_type_is_mem_size(enum bpf_arg_type type) @@ -6522,7 +6725,14 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, return -EACCES; found: - if (reg->type == PTR_TO_BTF_ID || reg->type & PTR_TRUSTED) { + if (base_type(reg->type) != PTR_TO_BTF_ID) + return 0; + + switch ((int)reg->type) { + case PTR_TO_BTF_ID: + case PTR_TO_BTF_ID | PTR_TRUSTED: + case PTR_TO_BTF_ID | MEM_RCU: + { /* For bpf_sk_release, it needs to match against first member * 'struct sock_common', hence make an exception for it. This * allows bpf_sk_release to work for multiple socket types. @@ -6558,13 +6768,23 @@ found: return -EACCES; } } - } else if (type_is_alloc(reg->type)) { + break; + } + case PTR_TO_BTF_ID | MEM_ALLOC: if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock) { verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n"); return -EFAULT; } + /* Handled by helper specific checks */ + break; + case PTR_TO_BTF_ID | MEM_PERCPU: + case PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED: + /* Handled by helper specific checks */ + break; + default: + verbose(env, "verifier internal error: invalid PTR_TO_BTF_ID register for type match\n"); + return -EFAULT; } - return 0; } @@ -6651,7 +6871,6 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, case PTR_TO_BTF_ID | MEM_ALLOC: case PTR_TO_BTF_ID | PTR_TRUSTED: case PTR_TO_BTF_ID | MEM_RCU: - case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED: case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF: /* When referenced PTR_TO_BTF_ID is passed to release function, * its fixed offset must be 0. In the other cases, fixed offset @@ -6666,6 +6885,28 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, } } +static struct bpf_reg_state *get_dynptr_arg_reg(struct bpf_verifier_env *env, + const struct bpf_func_proto *fn, + struct bpf_reg_state *regs) +{ + struct bpf_reg_state *state = NULL; + int i; + + for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) + if (arg_type_is_dynptr(fn->arg_type[i])) { + if (state) { + verbose(env, "verifier internal error: multiple dynptr args\n"); + return NULL; + } + state = ®s[BPF_REG_1 + i]; + } + + if (!state) + verbose(env, "verifier internal error: no dynptr arg found\n"); + + return state; +} + static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); @@ -6692,9 +6933,28 @@ static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state return state->stack[spi].spilled_ptr.ref_obj_id; } +static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env, + struct bpf_reg_state *reg) +{ + struct bpf_func_state *state = func(env, reg); + int spi; + + if (reg->type == CONST_PTR_TO_DYNPTR) + return reg->dynptr.type; + + spi = __get_spi(reg->off); + if (spi < 0) { + verbose(env, "verifier internal error: invalid spi when querying dynptr type\n"); + return BPF_DYNPTR_TYPE_INVALID; + } + + return state->stack[spi].spilled_ptr.dynptr.type; +} + static int check_func_arg(struct bpf_verifier_env *env, u32 arg, struct bpf_call_arg_meta *meta, - const struct bpf_func_proto *fn) + const struct bpf_func_proto *fn, + int insn_idx) { u32 regno = BPF_REG_1 + arg; struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; @@ -6907,7 +7167,7 @@ skip_type_check: err = check_mem_size_reg(env, reg, regno, true, meta); break; case ARG_PTR_TO_DYNPTR: - err = process_dynptr_func(env, regno, arg_type, meta); + err = process_dynptr_func(env, regno, insn_idx, arg_type); if (err) return err; break; @@ -7126,22 +7386,26 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, break; case BPF_MAP_TYPE_SK_STORAGE: if (func_id != BPF_FUNC_sk_storage_get && - func_id != BPF_FUNC_sk_storage_delete) + func_id != BPF_FUNC_sk_storage_delete && + func_id != BPF_FUNC_kptr_xchg) goto error; break; case BPF_MAP_TYPE_INODE_STORAGE: if (func_id != BPF_FUNC_inode_storage_get && - func_id != BPF_FUNC_inode_storage_delete) + func_id != BPF_FUNC_inode_storage_delete && + func_id != BPF_FUNC_kptr_xchg) goto error; break; case BPF_MAP_TYPE_TASK_STORAGE: if (func_id != BPF_FUNC_task_storage_get && - func_id != BPF_FUNC_task_storage_delete) + func_id != BPF_FUNC_task_storage_delete && + func_id != BPF_FUNC_kptr_xchg) goto error; break; case BPF_MAP_TYPE_CGRP_STORAGE: if (func_id != BPF_FUNC_cgrp_storage_get && - func_id != BPF_FUNC_cgrp_storage_delete) + func_id != BPF_FUNC_cgrp_storage_delete && + func_id != BPF_FUNC_kptr_xchg) goto error; break; case BPF_MAP_TYPE_BLOOM_FILTER: @@ -7355,6 +7619,9 @@ static int check_func_proto(const struct bpf_func_proto *fn, int func_id) /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] * are now invalid, so turn them into unknown SCALAR_VALUE. + * + * This also applies to dynptr slices belonging to skb and xdp dynptrs, + * since these slices point to packet data. */ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) { @@ -7362,8 +7629,8 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) struct bpf_reg_state *reg; bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ - if (reg_is_pkt_pointer_any(reg)) - __mark_reg_unknown(env, reg); + if (reg_is_pkt_pointer_any(reg) || reg_is_dynptr_slice_pkt(reg)) + mark_reg_invalid(env, reg); })); } @@ -7408,12 +7675,8 @@ static int release_reference(struct bpf_verifier_env *env, return err; bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ - if (reg->ref_obj_id == ref_obj_id) { - if (!env->allow_ptr_leaks) - __mark_reg_not_init(env, reg); - else - __mark_reg_unknown(env, reg); - } + if (reg->ref_obj_id == ref_obj_id) + mark_reg_invalid(env, reg); })); return 0; @@ -7426,7 +7689,7 @@ static void invalidate_non_owning_refs(struct bpf_verifier_env *env) bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({ if (type_is_non_owning_ref(reg->type)) - __mark_reg_unknown(env, reg); + mark_reg_invalid(env, reg); })); } @@ -8197,7 +8460,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn meta.func_id = func_id; /* check args */ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { - err = check_func_arg(env, i, &meta, fn); + err = check_func_arg(env, i, &meta, fn, insn_idx); if (err) return err; } @@ -8222,30 +8485,6 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs = cur_regs(env); - /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot - * be reinitialized by any dynptr helper. Hence, mark_stack_slots_dynptr - * is safe to do directly. - */ - if (meta.uninit_dynptr_regno) { - if (regs[meta.uninit_dynptr_regno].type == CONST_PTR_TO_DYNPTR) { - verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be initialized\n"); - return -EFAULT; - } - /* we write BPF_DW bits (8 bytes) at a time */ - for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) { - err = check_mem_access(env, insn_idx, meta.uninit_dynptr_regno, - i, BPF_DW, BPF_WRITE, -1, false); - if (err) - return err; - } - - err = mark_stack_slots_dynptr(env, ®s[meta.uninit_dynptr_regno], - fn->arg_type[meta.uninit_dynptr_regno - BPF_REG_1], - insn_idx); - if (err) - return err; - } - if (meta.release_regno) { err = -EINVAL; /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot @@ -8330,43 +8569,62 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } break; case BPF_FUNC_dynptr_data: - for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { - if (arg_type_is_dynptr(fn->arg_type[i])) { - struct bpf_reg_state *reg = ®s[BPF_REG_1 + i]; - int id, ref_obj_id; + { + struct bpf_reg_state *reg; + int id, ref_obj_id; - if (meta.dynptr_id) { - verbose(env, "verifier internal error: meta.dynptr_id already set\n"); - return -EFAULT; - } + reg = get_dynptr_arg_reg(env, fn, regs); + if (!reg) + return -EFAULT; - if (meta.ref_obj_id) { - verbose(env, "verifier internal error: meta.ref_obj_id already set\n"); - return -EFAULT; - } - id = dynptr_id(env, reg); - if (id < 0) { - verbose(env, "verifier internal error: failed to obtain dynptr id\n"); - return id; - } + if (meta.dynptr_id) { + verbose(env, "verifier internal error: meta.dynptr_id already set\n"); + return -EFAULT; + } + if (meta.ref_obj_id) { + verbose(env, "verifier internal error: meta.ref_obj_id already set\n"); + return -EFAULT; + } - ref_obj_id = dynptr_ref_obj_id(env, reg); - if (ref_obj_id < 0) { - verbose(env, "verifier internal error: failed to obtain dynptr ref_obj_id\n"); - return ref_obj_id; - } + id = dynptr_id(env, reg); + if (id < 0) { + verbose(env, "verifier internal error: failed to obtain dynptr id\n"); + return id; + } - meta.dynptr_id = id; - meta.ref_obj_id = ref_obj_id; - break; - } + ref_obj_id = dynptr_ref_obj_id(env, reg); + if (ref_obj_id < 0) { + verbose(env, "verifier internal error: failed to obtain dynptr ref_obj_id\n"); + return ref_obj_id; } - if (i == MAX_BPF_FUNC_REG_ARGS) { - verbose(env, "verifier internal error: no dynptr in bpf_dynptr_data()\n"); + + meta.dynptr_id = id; + meta.ref_obj_id = ref_obj_id; + + break; + } + case BPF_FUNC_dynptr_write: + { + enum bpf_dynptr_type dynptr_type; + struct bpf_reg_state *reg; + + reg = get_dynptr_arg_reg(env, fn, regs); + if (!reg) return -EFAULT; - } + + dynptr_type = dynptr_get_type(env, reg); + if (dynptr_type == BPF_DYNPTR_TYPE_INVALID) + return -EFAULT; + + if (dynptr_type == BPF_DYNPTR_TYPE_SKB) + /* this will trigger clear_all_pkt_pointers(), which will + * invalidate all dynptr slices associated with the skb + */ + changes_data = true; + break; + } case BPF_FUNC_user_ringbuf_drain: err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, set_user_ringbuf_callback_state); @@ -8595,36 +8853,6 @@ static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, } } -struct bpf_kfunc_call_arg_meta { - /* In parameters */ - struct btf *btf; - u32 func_id; - u32 kfunc_flags; - const struct btf_type *func_proto; - const char *func_name; - /* Out parameters */ - u32 ref_obj_id; - u8 release_regno; - bool r0_rdonly; - u32 ret_btf_id; - u64 r0_size; - u32 subprogno; - struct { - u64 value; - bool found; - } arg_constant; - struct { - struct btf *btf; - u32 btf_id; - } arg_obj_drop; - struct { - struct btf_field *field; - } arg_list_head; - struct { - struct btf_field *field; - } arg_rbtree_root; -}; - static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta) { return meta->kfunc_flags & KF_ACQUIRE; @@ -8696,6 +8924,19 @@ static bool is_kfunc_arg_mem_size(const struct btf *btf, return __kfunc_param_match_suffix(btf, arg, "__sz"); } +static bool is_kfunc_arg_const_mem_size(const struct btf *btf, + const struct btf_param *arg, + const struct bpf_reg_state *reg) +{ + const struct btf_type *t; + + t = btf_type_skip_modifiers(btf, arg->type, NULL); + if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE) + return false; + + return __kfunc_param_match_suffix(btf, arg, "__szk"); +} + static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg) { return __kfunc_param_match_suffix(btf, arg, "__k"); @@ -8711,6 +8952,11 @@ static bool is_kfunc_arg_alloc_obj(const struct btf *btf, const struct btf_param return __kfunc_param_match_suffix(btf, arg, "__alloc"); } +static bool is_kfunc_arg_uninit(const struct btf *btf, const struct btf_param *arg) +{ + return __kfunc_param_match_suffix(btf, arg, "__uninit"); +} + static bool is_kfunc_arg_scalar_with_name(const struct btf *btf, const struct btf_param *arg, const char *name) @@ -8877,6 +9123,10 @@ enum special_kfunc_type { KF_bpf_rbtree_remove, KF_bpf_rbtree_add, KF_bpf_rbtree_first, + KF_bpf_dynptr_from_skb, + KF_bpf_dynptr_from_xdp, + KF_bpf_dynptr_slice, + KF_bpf_dynptr_slice_rdwr, }; BTF_SET_START(special_kfunc_set) @@ -8891,6 +9141,10 @@ BTF_ID(func, bpf_rdonly_cast) BTF_ID(func, bpf_rbtree_remove) BTF_ID(func, bpf_rbtree_add) BTF_ID(func, bpf_rbtree_first) +BTF_ID(func, bpf_dynptr_from_skb) +BTF_ID(func, bpf_dynptr_from_xdp) +BTF_ID(func, bpf_dynptr_slice) +BTF_ID(func, bpf_dynptr_slice_rdwr) BTF_SET_END(special_kfunc_set) BTF_ID_LIST(special_kfunc_list) @@ -8907,6 +9161,10 @@ BTF_ID(func, bpf_rcu_read_unlock) BTF_ID(func, bpf_rbtree_remove) BTF_ID(func, bpf_rbtree_add) BTF_ID(func, bpf_rbtree_first) +BTF_ID(func, bpf_dynptr_from_skb) +BTF_ID(func, bpf_dynptr_from_xdp) +BTF_ID(func, bpf_dynptr_slice) +BTF_ID(func, bpf_dynptr_slice_rdwr) static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta) { @@ -8986,7 +9244,10 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_callback(env, meta->btf, &args[argno])) return KF_ARG_PTR_TO_CALLBACK; - if (argno + 1 < nargs && is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1])) + + if (argno + 1 < nargs && + (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]) || + is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]))) arg_mem_size = true; /* This is the catch all argument type of register types supported by @@ -9206,7 +9467,6 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_ ptr = reg->map_ptr; break; case PTR_TO_BTF_ID | MEM_ALLOC: - case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED: ptr = reg->btf; break; default: @@ -9455,7 +9715,8 @@ static int process_kf_arg_ptr_to_rbtree_node(struct bpf_verifier_env *env, &meta->arg_rbtree_root.field); } -static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta) +static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, + int insn_idx) { const char *func_name = meta->func_name, *ref_tname; const struct btf *btf = meta->btf; @@ -9538,7 +9799,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EINVAL; } - if (is_kfunc_trusted_args(meta) && + if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) && (register_is_null(reg) || type_may_be_null(reg->type))) { verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i); return -EACCES; @@ -9646,16 +9907,43 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return ret; break; case KF_ARG_PTR_TO_DYNPTR: + { + enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR; + if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) { verbose(env, "arg#%d expected pointer to stack or dynptr_ptr\n", i); return -EINVAL; } - ret = process_dynptr_func(env, regno, ARG_PTR_TO_DYNPTR | MEM_RDONLY, NULL); + if (reg->type == CONST_PTR_TO_DYNPTR) + dynptr_arg_type |= MEM_RDONLY; + + if (is_kfunc_arg_uninit(btf, &args[i])) + dynptr_arg_type |= MEM_UNINIT; + + if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) + dynptr_arg_type |= DYNPTR_TYPE_SKB; + else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp]) + dynptr_arg_type |= DYNPTR_TYPE_XDP; + + ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type); if (ret < 0) return ret; + + if (!(dynptr_arg_type & MEM_UNINIT)) { + int id = dynptr_id(env, reg); + + if (id < 0) { + verbose(env, "verifier internal error: failed to obtain dynptr id\n"); + return id; + } + meta->initialized_dynptr.id = id; + meta->initialized_dynptr.type = dynptr_get_type(env, reg); + } + break; + } case KF_ARG_PTR_TO_LIST_HEAD: if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { @@ -9749,14 +10037,33 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return ret; break; case KF_ARG_PTR_TO_MEM_SIZE: - ret = check_kfunc_mem_size_reg(env, ®s[regno + 1], regno + 1); + { + struct bpf_reg_state *size_reg = ®s[regno + 1]; + const struct btf_param *size_arg = &args[i + 1]; + + ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1); if (ret < 0) { verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1); return ret; } - /* Skip next '__sz' argument */ + + if (is_kfunc_arg_const_mem_size(meta->btf, size_arg, size_reg)) { + if (meta->arg_constant.found) { + verbose(env, "verifier internal error: only one constant argument permitted\n"); + return -EFAULT; + } + if (!tnum_is_const(size_reg->var_off)) { + verbose(env, "R%d must be a known constant\n", regno + 1); + return -EINVAL; + } + meta->arg_constant.found = true; + meta->arg_constant.value = size_reg->var_off.value; + } + + /* Skip next '__sz' or '__szk' argument */ i++; break; + } case KF_ARG_PTR_TO_CALLBACK: meta->subprogno = reg->subprogno; break; @@ -9828,10 +10135,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta); rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta); - if ((rcu_lock || rcu_unlock) && !env->rcu_tag_supported) { - verbose(env, "no vmlinux btf rcu tag support for kfunc %s\n", func_name); - return -EACCES; - } if (env->cur_state->active_rcu_lock) { struct bpf_func_state *state; @@ -9860,7 +10163,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } /* Check the arguments */ - err = check_kfunc_args(env, &meta); + err = check_kfunc_args(env, &meta, insn_idx); if (err < 0) return err; /* In case of release function, we get register number of refcounted @@ -9991,6 +10294,42 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED; regs[BPF_REG_0].btf = desc_btf; regs[BPF_REG_0].btf_id = meta.arg_constant.value; + } else if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice] || + meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) { + enum bpf_type_flag type_flag = get_dynptr_type_flag(meta.initialized_dynptr.type); + + mark_reg_known_zero(env, regs, BPF_REG_0); + + if (!meta.arg_constant.found) { + verbose(env, "verifier internal error: bpf_dynptr_slice(_rdwr) no constant size\n"); + return -EFAULT; + } + + regs[BPF_REG_0].mem_size = meta.arg_constant.value; + + /* PTR_MAYBE_NULL will be added when is_kfunc_ret_null is checked */ + regs[BPF_REG_0].type = PTR_TO_MEM | type_flag; + + if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice]) { + regs[BPF_REG_0].type |= MEM_RDONLY; + } else { + /* this will set env->seen_direct_write to true */ + if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) { + verbose(env, "the prog does not allow writes to packet data\n"); + return -EINVAL; + } + } + + if (!meta.initialized_dynptr.id) { + verbose(env, "verifier internal error: no dynptr id\n"); + return -EFAULT; + } + regs[BPF_REG_0].dynptr_id = meta.initialized_dynptr.id; + + /* we don't need to set BPF_REG_0's ref obj id + * because packet slices are not refcounted (see + * dynptr_type_refcounted) + */ } else { verbose(env, "kernel function %s unhandled dynamic return type\n", meta.func_name); @@ -9998,6 +10337,14 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } else if (!__btf_type_is_struct(ptr_type)) { if (!meta.r0_size) { + __u32 sz; + + if (!IS_ERR(btf_resolve_size(desc_btf, ptr_type, &sz))) { + meta.r0_size = sz; + meta.r0_rdonly = true; + } + } + if (!meta.r0_size) { ptr_type_name = btf_name_by_offset(desc_btf, ptr_type->name_off); verbose(env, @@ -13152,44 +13499,43 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, */ static int visit_insn(int t, struct bpf_verifier_env *env) { - struct bpf_insn *insns = env->prog->insnsi; + struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t]; int ret; - if (bpf_pseudo_func(insns + t)) + if (bpf_pseudo_func(insn)) return visit_func_call_insn(t, insns, env, true); /* All non-branch instructions have a single fall-through edge. */ - if (BPF_CLASS(insns[t].code) != BPF_JMP && - BPF_CLASS(insns[t].code) != BPF_JMP32) + if (BPF_CLASS(insn->code) != BPF_JMP && + BPF_CLASS(insn->code) != BPF_JMP32) return push_insn(t, t + 1, FALLTHROUGH, env, false); - switch (BPF_OP(insns[t].code)) { + switch (BPF_OP(insn->code)) { case BPF_EXIT: return DONE_EXPLORING; case BPF_CALL: - if (insns[t].imm == BPF_FUNC_timer_set_callback) + if (insn->src_reg == 0 && insn->imm == BPF_FUNC_timer_set_callback) /* Mark this call insn as a prune point to trigger * is_state_visited() check before call itself is * processed by __check_func_call(). Otherwise new * async state will be pushed for further exploration. */ mark_prune_point(env, t); - return visit_func_call_insn(t, insns, env, - insns[t].src_reg == BPF_PSEUDO_CALL); + return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL); case BPF_JA: - if (BPF_SRC(insns[t].code) != BPF_K) + if (BPF_SRC(insn->code) != BPF_K) return -EINVAL; /* unconditional jump with single edge */ - ret = push_insn(t, t + insns[t].off + 1, FALLTHROUGH, env, + ret = push_insn(t, t + insn->off + 1, FALLTHROUGH, env, true); if (ret) return ret; - mark_prune_point(env, t + insns[t].off + 1); - mark_jmp_point(env, t + insns[t].off + 1); + mark_prune_point(env, t + insn->off + 1); + mark_jmp_point(env, t + insn->off + 1); return ret; @@ -13201,7 +13547,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env) if (ret) return ret; - return push_insn(t, t + insns[t].off + 1, BRANCH, env, true); + return push_insn(t, t + insn->off + 1, BRANCH, env, true); } } @@ -13877,13 +14223,17 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, tnum_in(rold->var_off, rcur->var_off); case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: + case PTR_TO_MEM: + case PTR_TO_BUF: + case PTR_TO_TP_BUFFER: /* If the new min/max/var_off satisfy the old ones and * everything else matches, we are OK. */ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 && range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off) && - check_ids(rold->id, rcur->id, idmap); + check_ids(rold->id, rcur->id, idmap) && + check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap); case PTR_TO_PACKET_META: case PTR_TO_PACKET: /* We must have at least as much range as the old ptr @@ -13936,6 +14286,10 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID) continue; + if (env->allow_uninit_stack && + old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC) + continue; + /* explored stack has more populated slots than current stack * and these slots were used */ @@ -14311,7 +14665,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * This threshold shouldn't be too high either, since states * at the end of the loop are likely to be useful in pruning. */ - if (env->jmps_processed - env->prev_jmps_processed < 20 && + if (!env->test_state_freq && + env->jmps_processed - env->prev_jmps_processed < 20 && env->insn_processed - env->prev_insn_processed < 100) add_new_state = false; goto miss; @@ -14500,6 +14855,44 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev) !reg_type_mismatch_ok(prev)); } +static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type, + bool allow_trust_missmatch) +{ + enum bpf_reg_type *prev_type = &env->insn_aux_data[env->insn_idx].ptr_type; + + if (*prev_type == NOT_INIT) { + /* Saw a valid insn + * dst_reg = *(u32 *)(src_reg + off) + * save type to validate intersecting paths + */ + *prev_type = type; + } else if (reg_type_mismatch(type, *prev_type)) { + /* Abuser program is trying to use the same insn + * dst_reg = *(u32*) (src_reg + off) + * with different pointer types: + * src_reg == ctx in one branch and + * src_reg == stack|map in some other branch. + * Reject it. + */ + if (allow_trust_missmatch && + base_type(type) == PTR_TO_BTF_ID && + base_type(*prev_type) == PTR_TO_BTF_ID) { + /* + * Have to support a use case when one path through + * the program yields TRUSTED pointer while another + * is UNTRUSTED. Fallback to UNTRUSTED to generate + * BPF_PROBE_MEM. + */ + *prev_type = PTR_TO_BTF_ID | PTR_UNTRUSTED; + } else { + verbose(env, "same insn cannot be used with different pointers\n"); + return -EINVAL; + } + } + + return 0; +} + static int do_check(struct bpf_verifier_env *env) { bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); @@ -14609,7 +15002,7 @@ static int do_check(struct bpf_verifier_env *env) return err; } else if (class == BPF_LDX) { - enum bpf_reg_type *prev_src_type, src_reg_type; + enum bpf_reg_type src_reg_type; /* check for reserved fields is already done */ @@ -14633,29 +15026,11 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; - prev_src_type = &env->insn_aux_data[env->insn_idx].ptr_type; - - if (*prev_src_type == NOT_INIT) { - /* saw a valid insn - * dst_reg = *(u32 *)(src_reg + off) - * save type to validate intersecting paths - */ - *prev_src_type = src_reg_type; - - } else if (reg_type_mismatch(src_reg_type, *prev_src_type)) { - /* ABuser program is trying to use the same insn - * dst_reg = *(u32*) (src_reg + off) - * with different pointer types: - * src_reg == ctx in one branch and - * src_reg == stack|map in some other branch. - * Reject it. - */ - verbose(env, "same insn cannot be used with different pointers\n"); - return -EINVAL; - } - + err = save_aux_ptr_type(env, src_reg_type, true); + if (err) + return err; } else if (class == BPF_STX) { - enum bpf_reg_type *prev_dst_type, dst_reg_type; + enum bpf_reg_type dst_reg_type; if (BPF_MODE(insn->code) == BPF_ATOMIC) { err = check_atomic(env, env->insn_idx, insn); @@ -14688,16 +15063,12 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; - prev_dst_type = &env->insn_aux_data[env->insn_idx].ptr_type; - - if (*prev_dst_type == NOT_INIT) { - *prev_dst_type = dst_reg_type; - } else if (reg_type_mismatch(dst_reg_type, *prev_dst_type)) { - verbose(env, "same insn cannot be used with different pointers\n"); - return -EINVAL; - } - + err = save_aux_ptr_type(env, dst_reg_type, false); + if (err) + return err; } else if (class == BPF_ST) { + enum bpf_reg_type dst_reg_type; + if (BPF_MODE(insn->code) != BPF_MEM || insn->src_reg != BPF_REG_0) { verbose(env, "BPF_ST uses reserved fields\n"); @@ -14708,12 +15079,7 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; - if (is_ctx_reg(env, insn->dst_reg)) { - verbose(env, "BPF_ST stores into R%d %s is not allowed\n", - insn->dst_reg, - reg_type_str(env, reg_state(env, insn->dst_reg)->type)); - return -EACCES; - } + dst_reg_type = regs[insn->dst_reg].type; /* check that memory (dst_reg + off) is writeable */ err = check_mem_access(env, env->insn_idx, insn->dst_reg, @@ -14722,6 +15088,9 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; + err = save_aux_ptr_type(env, dst_reg_type, false); + if (err) + return err; } else if (class == BPF_JMP || class == BPF_JMP32) { u8 opcode = BPF_OP(insn->code); @@ -14756,6 +15125,8 @@ static int do_check(struct bpf_verifier_env *env) err = check_helper_call(env, insn, &env->insn_idx); if (err) return err; + + mark_reg_scratched(env, BPF_REG_0); } else if (opcode == BPF_JA) { if (BPF_SRC(insn->code) != BPF_K || insn->imm != 0 || @@ -15830,14 +16201,12 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++, insn++) { bpf_convert_ctx_access_t convert_ctx_access; - bool ctx_access; if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) || insn->code == (BPF_LDX | BPF_MEM | BPF_H) || insn->code == (BPF_LDX | BPF_MEM | BPF_W) || insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) { type = BPF_READ; - ctx_access = true; } else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) || insn->code == (BPF_STX | BPF_MEM | BPF_H) || insn->code == (BPF_STX | BPF_MEM | BPF_W) || @@ -15847,7 +16216,6 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn->code == (BPF_ST | BPF_MEM | BPF_W) || insn->code == (BPF_ST | BPF_MEM | BPF_DW)) { type = BPF_WRITE; - ctx_access = BPF_CLASS(insn->code) == BPF_STX; } else { continue; } @@ -15870,9 +16238,6 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) continue; } - if (!ctx_access) - continue; - switch ((int)env->insn_aux_data[i + delta].ptr_type) { case PTR_TO_CTX: if (!ops->convert_ctx_access) @@ -16321,6 +16686,17 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); *cnt = 1; + } else if (desc->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) { + bool seen_direct_write = env->seen_direct_write; + bool is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE); + + if (is_rdonly) + insn->imm = BPF_CALL_IMM(bpf_dynptr_from_skb_rdonly); + + /* restore env->seen_direct_write to its original value, since + * may_access_direct_pkt_data mutates it + */ + env->seen_direct_write = seen_direct_write; } return 0; } @@ -17712,8 +18088,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr) env->bypass_spec_v1 = bpf_bypass_spec_v1(); env->bypass_spec_v4 = bpf_bypass_spec_v4(); env->bpf_capable = bpf_capable(); - env->rcu_tag_supported = btf_vmlinux && - btf_find_by_name_kind(btf_vmlinux, "rcu", BTF_KIND_TYPE_TAG) > 0; if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e8da032bb6fc..bcf91bc7bf71 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1453,10 +1453,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) NULL : &bpf_probe_read_compat_str_proto; #endif #ifdef CONFIG_CGROUPS - case BPF_FUNC_get_current_cgroup_id: - return &bpf_get_current_cgroup_id_proto; - case BPF_FUNC_get_current_ancestor_cgroup_id: - return &bpf_get_current_ancestor_cgroup_id_proto; case BPF_FUNC_cgrp_storage_get: return &bpf_cgrp_storage_get_proto; case BPF_FUNC_cgrp_storage_delete: |