From a76ab5731e32d50ff5b1ae97e9dc4b23f41c23f5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 Nov 2024 08:39:07 -0800 Subject: bpf: Find eligible subprogs for private stack support Private stack will be allocated with percpu allocator in jit time. To avoid complexity at runtime, only one copy of private stack is available per cpu per prog. So runtime recursion check is necessary to avoid stack corruption. Current private stack only supports kprobe/perf_event/tp/raw_tp which has recursion check in the kernel, and prog types that use bpf trampoline recursion check. For trampoline related prog types, currently only tracing progs have recursion checking. To avoid complexity, all async_cb subprogs use normal kernel stack including those subprogs used by both main prog subtree and async_cb subtree. Any prog having tail call also uses kernel stack. To avoid jit penalty with private stack support, a subprog stack size threshold is set such that only if the stack size is no less than the threshold, private stack is supported. The current threshold is 64 bytes. This avoids jit penality if the stack usage is small. A useless 'continue' is also removed from a loop in func check_max_stack_depth(). Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20241112163907.2223839-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 5 +++ kernel/bpf/verifier.c | 96 +++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 91 insertions(+), 10 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 233ea78f8f1b..14d9288441f2 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -3045,6 +3045,11 @@ bool __weak bpf_jit_supports_exceptions(void) return false; } +bool __weak bpf_jit_supports_private_stack(void) +{ + return false; +} + void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie) { } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9f5de8d4fbd0..fb23793ac53d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -194,6 +194,8 @@ struct bpf_verifier_stack_elem { #define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512 +#define BPF_PRIV_STACK_MIN_SIZE 64 + static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx); static int release_reference(struct bpf_verifier_env *env, int ref_obj_id); static void invalidate_non_owning_refs(struct bpf_verifier_env *env); @@ -6090,6 +6092,34 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, strict); } +static enum priv_stack_mode bpf_enable_priv_stack(struct bpf_prog *prog) +{ + if (!bpf_jit_supports_private_stack()) + return NO_PRIV_STACK; + + /* bpf_prog_check_recur() checks all prog types that use bpf trampoline + * while kprobe/tp/perf_event/raw_tp don't use trampoline hence checked + * explicitly. + */ + switch (prog->type) { + case BPF_PROG_TYPE_KPROBE: + case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_RAW_TRACEPOINT: + return PRIV_STACK_ADAPTIVE; + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_STRUCT_OPS: + if (bpf_prog_check_recur(prog)) + return PRIV_STACK_ADAPTIVE; + fallthrough; + default: + break; + } + + return NO_PRIV_STACK; +} + static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth) { if (env->prog->jit_requested) @@ -6107,17 +6137,20 @@ static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth) * Since recursion is prevented by check_cfg() this algorithm * only needs a local stack of MAX_CALL_FRAMES to remember callsites */ -static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx) +static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx, + bool priv_stack_supported) { struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn *insn = env->prog->insnsi; - int depth = 0, frame = 0, i, subprog_end; + int depth = 0, frame = 0, i, subprog_end, subprog_depth; bool tail_call_reachable = false; int ret_insn[MAX_CALL_FRAMES]; int ret_prog[MAX_CALL_FRAMES]; int j; i = subprog[idx].start; + if (!priv_stack_supported) + subprog[idx].priv_stack_mode = NO_PRIV_STACK; process_func: /* protect against potential stack overflow that might happen when * bpf2bpf calls get combined with tailcalls. Limit the caller's stack @@ -6144,11 +6177,31 @@ process_func: depth); return -EACCES; } - depth += round_up_stack_depth(env, subprog[idx].stack_depth); - if (depth > MAX_BPF_STACK) { - verbose(env, "combined stack size of %d calls is %d. Too large\n", - frame + 1, depth); - return -EACCES; + + subprog_depth = round_up_stack_depth(env, subprog[idx].stack_depth); + if (priv_stack_supported) { + /* Request private stack support only if the subprog stack + * depth is no less than BPF_PRIV_STACK_MIN_SIZE. This is to + * avoid jit penalty if the stack usage is small. + */ + if (subprog[idx].priv_stack_mode == PRIV_STACK_UNKNOWN && + subprog_depth >= BPF_PRIV_STACK_MIN_SIZE) + subprog[idx].priv_stack_mode = PRIV_STACK_ADAPTIVE; + } + + if (subprog[idx].priv_stack_mode == PRIV_STACK_ADAPTIVE) { + if (subprog_depth > MAX_BPF_STACK) { + verbose(env, "stack size of subprog %d is %d. Too large\n", + idx, subprog_depth); + return -EACCES; + } + } else { + depth += subprog_depth; + if (depth > MAX_BPF_STACK) { + verbose(env, "combined stack size of %d calls is %d. Too large\n", + frame + 1, depth); + return -EACCES; + } } continue_func: subprog_end = subprog[idx + 1].start; @@ -6205,6 +6258,8 @@ continue_func: } i = next_insn; idx = sidx; + if (!priv_stack_supported) + subprog[idx].priv_stack_mode = NO_PRIV_STACK; if (subprog[idx].has_tail_call) tail_call_reachable = true; @@ -6238,7 +6293,8 @@ continue_func: */ if (frame == 0) return 0; - depth -= round_up_stack_depth(env, subprog[idx].stack_depth); + if (subprog[idx].priv_stack_mode != PRIV_STACK_ADAPTIVE) + depth -= round_up_stack_depth(env, subprog[idx].stack_depth); frame--; i = ret_insn[frame]; idx = ret_prog[frame]; @@ -6247,16 +6303,36 @@ continue_func: static int check_max_stack_depth(struct bpf_verifier_env *env) { + enum priv_stack_mode priv_stack_mode = PRIV_STACK_UNKNOWN; struct bpf_subprog_info *si = env->subprog_info; + bool priv_stack_supported; int ret; for (int i = 0; i < env->subprog_cnt; i++) { + if (si[i].has_tail_call) { + priv_stack_mode = NO_PRIV_STACK; + break; + } + } + + if (priv_stack_mode == PRIV_STACK_UNKNOWN) + priv_stack_mode = bpf_enable_priv_stack(env->prog); + + /* All async_cb subprogs use normal kernel stack. If a particular + * subprog appears in both main prog and async_cb subtree, that + * subprog will use normal kernel stack to avoid potential nesting. + * The reverse subprog traversal ensures when main prog subtree is + * checked, the subprogs appearing in async_cb subtrees are already + * marked as using normal kernel stack, so stack size checking can + * be done properly. + */ + for (int i = env->subprog_cnt - 1; i >= 0; i--) { if (!i || si[i].is_async_cb) { - ret = check_max_stack_depth_subprog(env, i); + priv_stack_supported = !i && priv_stack_mode == PRIV_STACK_ADAPTIVE; + ret = check_max_stack_depth_subprog(env, i, priv_stack_supported); if (ret < 0) return ret; } - continue; } return 0; } -- cgit v1.2.3-70-g09d2 From e00931c02568dc6ac76f94b1ab471de05e6fdfe8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 Nov 2024 08:39:12 -0800 Subject: bpf: Enable private stack for eligible subprogs If private stack is used by any subprog, set that subprog prog->aux->jits_use_priv_stack to be true so later jit can allocate private stack for that subprog properly. Also set env->prog->aux->jits_use_priv_stack to be true if any subprog uses private stack. This is a use case for a single main prog (no subprogs) to use private stack, and also a use case for later struct-ops progs where env->prog->aux->jits_use_priv_stack will enable recursion check if any subprog uses private stack. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20241112163912.2224007-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/verifier.c | 11 +++++++++++ 2 files changed, 12 insertions(+) (limited to 'kernel/bpf') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7da41ae2eac8..129b29e85cec 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1523,6 +1523,7 @@ struct bpf_prog_aux { bool exception_cb; bool exception_boundary; bool is_extended; /* true if extended by freplace program */ + bool jits_use_priv_stack; u64 prog_array_member_cnt; /* counts how many times as member of prog_array */ struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */ struct bpf_arena *arena; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fb23793ac53d..176d19ad9d07 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6334,6 +6334,14 @@ static int check_max_stack_depth(struct bpf_verifier_env *env) return ret; } } + + for (int i = 0; i < env->subprog_cnt; i++) { + if (si[i].priv_stack_mode == PRIV_STACK_ADAPTIVE) { + env->prog->aux->jits_use_priv_stack = true; + break; + } + } + return 0; } @@ -20274,6 +20282,9 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->name[0] = 'F'; func[i]->aux->stack_depth = env->subprog_info[i].stack_depth; + if (env->subprog_info[i].priv_stack_mode == PRIV_STACK_ADAPTIVE) + func[i]->aux->jits_use_priv_stack = true; + func[i]->jit_requested = 1; func[i]->blinding_requested = prog->blinding_requested; func[i]->aux->kfunc_tab = prog->aux->kfunc_tab; -- cgit v1.2.3-70-g09d2 From 5bd36da1e37e7a78e8b38efd287de6e1394b7d6e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 Nov 2024 08:39:33 -0800 Subject: bpf: Support private stack for struct_ops progs For struct_ops progs, whether a particular prog uses private stack depends on prog->aux->priv_stack_requested setting before actual insn-level verification for that prog. One particular implementation is to piggyback on struct_ops->check_member(). The next patch has an example for this. The struct_ops->check_member() sets prog->aux->priv_stack_requested to be true which enables private stack usage. The struct_ops prog follows the same rule as kprobe/tracing progs after function bpf_enable_priv_stack(). For example, even a struct_ops prog requests private stack, it could still use normal kernel stack if the stack size is small (< 64 bytes). Similar to tracing progs, nested same cpu same prog run will be skipped. A field (recursion_detected()) is added to bpf_prog_aux structure. If bpf_prog->aux->recursion_detected is implemented by the struct_ops subsystem and nested same cpu/prog happens, the function will be triggered to report an error, collect related info, etc. Acked-by: Tejun Heo Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20241112163933.2224962-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ include/linux/bpf_verifier.h | 1 + kernel/bpf/trampoline.c | 4 ++++ kernel/bpf/verifier.c | 7 ++++++- 4 files changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d32cc373dfd1..10945c8858ce 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1525,9 +1525,11 @@ struct bpf_prog_aux { bool exception_boundary; bool is_extended; /* true if extended by freplace program */ bool jits_use_priv_stack; + bool priv_stack_requested; u64 prog_array_member_cnt; /* counts how many times as member of prog_array */ struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */ struct bpf_arena *arena; + void (*recursion_detected)(struct bpf_prog *prog); /* callback if recursion is detected */ /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; /* function name for valid attach_btf_id */ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index d62bb2ca1828..6b7c91629176 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -879,6 +879,7 @@ static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) case BPF_PROG_TYPE_TRACING: return prog->expected_attach_type != BPF_TRACE_ITER; case BPF_PROG_TYPE_STRUCT_OPS: + return prog->aux->jits_use_priv_stack; case BPF_PROG_TYPE_LSM: return false; default: diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 9f36c049f4c2..a8d188b31da5 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -899,6 +899,8 @@ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tram if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { bpf_prog_inc_misses_counter(prog); + if (prog->aux->recursion_detected) + prog->aux->recursion_detected(prog); return 0; } return bpf_prog_start_time(); @@ -975,6 +977,8 @@ u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { bpf_prog_inc_misses_counter(prog); + if (prog->aux->recursion_detected) + prog->aux->recursion_detected(prog); return 0; } return bpf_prog_start_time(); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 176d19ad9d07..f4c39bb50511 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6110,7 +6110,7 @@ static enum priv_stack_mode bpf_enable_priv_stack(struct bpf_prog *prog) case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_STRUCT_OPS: - if (bpf_prog_check_recur(prog)) + if (prog->aux->priv_stack_requested || bpf_prog_check_recur(prog)) return PRIV_STACK_ADAPTIVE; fallthrough; default: @@ -22053,6 +22053,11 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) } } + if (prog->aux->priv_stack_requested && !bpf_jit_supports_private_stack()) { + verbose(env, "Private stack not supported by jit\n"); + return -EACCES; + } + /* btf_ctx_access() used this to provide argument type info */ prog->aux->ctx_arg_info = st_ops_desc->arg_info[member_idx].info; -- cgit v1.2.3-70-g09d2