From c7603cfa04e7c3a435b31d065f7cbdc829428f6e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 12 Jul 2021 16:06:15 -0700 Subject: bpf: Add ambient BPF runtime context stored in current b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper") fixed the problem with cgroup-local storage use in BPF by pre-allocating per-CPU array of 8 cgroup storage pointers to accommodate possible BPF program preemptions and nested executions. While this seems to work good in practice, it introduces new and unnecessary failure mode in which not all BPF programs might be executed if we fail to find an unused slot for cgroup storage, however unlikely it is. It might also not be so unlikely when/if we allow sleepable cgroup BPF programs in the future. Further, the way that cgroup storage is implemented as ambiently-available property during entire BPF program execution is a convenient way to pass extra information to BPF program and helpers without requiring user code to pass around extra arguments explicitly. So it would be good to have a generic solution that can allow implementing this without arbitrary restrictions. Ideally, such solution would work for both preemptable and sleepable BPF programs in exactly the same way. This patch introduces such solution, bpf_run_ctx. It adds one pointer field (bpf_ctx) to task_struct. This field is maintained by BPF_PROG_RUN family of macros in such a way that it always stays valid throughout BPF program execution. BPF program preemption is handled by remembering previous current->bpf_ctx value locally while executing nested BPF program and restoring old value after nested BPF program finishes. This is handled by two helper functions, bpf_set_run_ctx() and bpf_reset_run_ctx(), which are supposed to be used before and after BPF program runs, respectively. Restoring old value of the pointer handles preemption, while bpf_run_ctx pointer being a property of current task_struct naturally solves this problem for sleepable BPF programs by "following" BPF program execution as it is scheduled in and out of CPU. It would even allow CPU migration of BPF programs, even though it's not currently allowed by BPF infra. This patch cleans up cgroup local storage handling as a first application. The design itself is generic, though, with bpf_run_ctx being an empty struct that is supposed to be embedded into a specific struct for a given BPF program type (bpf_cg_run_ctx in this case). Follow up patches are planned that will expand this mechanism for other uses within tracing BPF programs. To verify that this change doesn't revert the fix to the original cgroup storage issue, I ran the same repro as in the original report ([0]) and didn't get any problems. Replacing bpf_reset_run_ctx(old_run_ctx) with bpf_reset_run_ctx(NULL) triggers the issue pretty quickly (so repro does work). [0] https://lore.kernel.org/bpf/YEEvBUiJl2pJkxTd@krava/ Fixes: b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210712230615.3525979-1-andrii@kernel.org --- net/bpf/test_run.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index cda8375bbbaf..8d46e2962786 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -88,17 +88,19 @@ reset: static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *retval, u32 *time, bool xdp) { - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { NULL }; + struct bpf_prog_array_item item = {.prog = prog}; + struct bpf_run_ctx *old_ctx; + struct bpf_cg_run_ctx run_ctx; struct bpf_test_timer t = { NO_MIGRATE }; enum bpf_cgroup_storage_type stype; int ret; for_each_cgroup_storage_type(stype) { - storage[stype] = bpf_cgroup_storage_alloc(prog, stype); - if (IS_ERR(storage[stype])) { - storage[stype] = NULL; + item.cgroup_storage[stype] = bpf_cgroup_storage_alloc(prog, stype); + if (IS_ERR(item.cgroup_storage[stype])) { + item.cgroup_storage[stype] = NULL; for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_free(storage[stype]); + bpf_cgroup_storage_free(item.cgroup_storage[stype]); return -ENOMEM; } } @@ -107,22 +109,19 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, repeat = 1; bpf_test_timer_enter(&t); + old_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); do { - ret = bpf_cgroup_storage_set(storage); - if (ret) - break; - + run_ctx.prog_item = &item; if (xdp) *retval = bpf_prog_run_xdp(prog, ctx); else *retval = BPF_PROG_RUN(prog, ctx); - - bpf_cgroup_storage_unset(); } while (bpf_test_timer_continue(&t, repeat, &ret, time)); + bpf_reset_run_ctx(old_ctx); bpf_test_timer_leave(&t); for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_free(storage[stype]); + bpf_cgroup_storage_free(item.cgroup_storage[stype]); return ret; } -- cgit v1.2.3-70-g09d2 From 525e2f9fd0229eb10cb460a9e6d978257f24804e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Jul 2021 13:05:41 -0700 Subject: tcp: seq_file: Avoid skipping sk during tcp_seek_last_pos st->bucket stores the current bucket number. st->offset stores the offset within this bucket that is the sk to be seq_show(). Thus, st->offset only makes sense within the same st->bucket. These two variables are an optimization for the common no-lseek case. When resuming the seq_file iteration (i.e. seq_start()), tcp_seek_last_pos() tries to continue from the st->offset at bucket st->bucket. However, it is possible that the bucket pointed by st->bucket has changed and st->offset may end up skipping the whole st->bucket without finding a sk. In this case, tcp_seek_last_pos() currently continues to satisfy the offset condition in the next (and incorrect) bucket. Instead, regardless of the offset value, the first sk of the next bucket should be returned. Thus, "bucket == st->bucket" check is added to tcp_seek_last_pos(). The chance of hitting this is small and the issue is a decade old, so targeting for the next tree. Fixes: a8b690f98baf ("tcp: Fix slowness in read /proc/net/tcp") Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Reviewed-by: Eric Dumazet Acked-by: Kuniyuki Iwashima Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210701200541.1033917-1-kafai@fb.com --- net/ipv4/tcp_ipv4.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b9dc2d6197be..ee85abde968c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2451,6 +2451,7 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos) static void *tcp_seek_last_pos(struct seq_file *seq) { struct tcp_iter_state *st = seq->private; + int bucket = st->bucket; int offset = st->offset; int orig_num = st->num; void *rc = NULL; @@ -2461,7 +2462,7 @@ static void *tcp_seek_last_pos(struct seq_file *seq) break; st->state = TCP_SEQ_STATE_LISTENING; rc = listening_get_next(seq, NULL); - while (offset-- && rc) + while (offset-- && rc && bucket == st->bucket) rc = listening_get_next(seq, rc); if (rc) break; @@ -2472,7 +2473,7 @@ static void *tcp_seek_last_pos(struct seq_file *seq) if (st->bucket > tcp_hashinfo.ehash_mask) break; rc = established_get_first(seq); - while (offset-- && rc) + while (offset-- && rc && bucket == st->bucket) rc = established_get_next(seq, rc); } -- cgit v1.2.3-70-g09d2 From ad2d61376a0517f19f49fc23de9e12d2b06484fc Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Jul 2021 13:05:48 -0700 Subject: tcp: seq_file: Refactor net and family matching This patch refactors the net and family matching into two new helpers, seq_sk_match() and seq_file_family(). seq_file_family() is in the later part of the file to prepare the change of a following patch. Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Reviewed-by: Eric Dumazet Acked-by: Kuniyuki Iwashima Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210701200548.1034629-1-kafai@fb.com --- net/ipv4/tcp_ipv4.c | 68 +++++++++++++++++++++++------------------------------ 1 file changed, 30 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ee85abde968c..f2583c4699fd 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2277,6 +2277,17 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock); #ifdef CONFIG_PROC_FS /* Proc filesystem TCP sock list dumping. */ +static unsigned short seq_file_family(const struct seq_file *seq); + +static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) +{ + unsigned short family = seq_file_family(seq); + + /* AF_UNSPEC is used as a match all */ + return ((family == AF_UNSPEC || family == sk->sk_family) && + net_eq(sock_net(sk), seq_file_net(seq))); +} + /* * Get next listener socket follow cur. If cur is NULL, get first socket * starting from bucket given in st->bucket; when st->bucket is zero the @@ -2284,18 +2295,11 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock); */ static void *listening_get_next(struct seq_file *seq, void *cur) { - struct tcp_seq_afinfo *afinfo; struct tcp_iter_state *st = seq->private; - struct net *net = seq_file_net(seq); struct inet_listen_hashbucket *ilb; struct hlist_nulls_node *node; struct sock *sk = cur; - if (st->bpf_seq_afinfo) - afinfo = st->bpf_seq_afinfo; - else - afinfo = PDE_DATA(file_inode(seq->file)); - if (!sk) { get_head: ilb = &tcp_hashinfo.listening_hash[st->bucket]; @@ -2311,10 +2315,7 @@ get_head: sk = sk_nulls_next(sk); get_sk: sk_nulls_for_each_from(sk, node) { - if (!net_eq(sock_net(sk), net)) - continue; - if (afinfo->family == AF_UNSPEC || - sk->sk_family == afinfo->family) + if (seq_sk_match(seq, sk)) return sk; } spin_unlock(&ilb->lock); @@ -2351,15 +2352,7 @@ static inline bool empty_bucket(const struct tcp_iter_state *st) */ static void *established_get_first(struct seq_file *seq) { - struct tcp_seq_afinfo *afinfo; struct tcp_iter_state *st = seq->private; - struct net *net = seq_file_net(seq); - void *rc = NULL; - - if (st->bpf_seq_afinfo) - afinfo = st->bpf_seq_afinfo; - else - afinfo = PDE_DATA(file_inode(seq->file)); st->offset = 0; for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { @@ -2373,32 +2366,20 @@ static void *established_get_first(struct seq_file *seq) spin_lock_bh(lock); sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { - if ((afinfo->family != AF_UNSPEC && - sk->sk_family != afinfo->family) || - !net_eq(sock_net(sk), net)) { - continue; - } - rc = sk; - goto out; + if (seq_sk_match(seq, sk)) + return sk; } spin_unlock_bh(lock); } -out: - return rc; + + return NULL; } static void *established_get_next(struct seq_file *seq, void *cur) { - struct tcp_seq_afinfo *afinfo; struct sock *sk = cur; struct hlist_nulls_node *node; struct tcp_iter_state *st = seq->private; - struct net *net = seq_file_net(seq); - - if (st->bpf_seq_afinfo) - afinfo = st->bpf_seq_afinfo; - else - afinfo = PDE_DATA(file_inode(seq->file)); ++st->num; ++st->offset; @@ -2406,9 +2387,7 @@ static void *established_get_next(struct seq_file *seq, void *cur) sk = sk_nulls_next(sk); sk_nulls_for_each_from(sk, node) { - if ((afinfo->family == AF_UNSPEC || - sk->sk_family == afinfo->family) && - net_eq(sock_net(sk), net)) + if (seq_sk_match(seq, sk)) return sk; } @@ -2754,6 +2733,19 @@ static const struct seq_operations bpf_iter_tcp_seq_ops = { .stop = bpf_iter_tcp_seq_stop, }; #endif +static unsigned short seq_file_family(const struct seq_file *seq) +{ + const struct tcp_iter_state *st = seq->private; + const struct tcp_seq_afinfo *afinfo = st->bpf_seq_afinfo; + + /* Iterated from bpf_iter. Let the bpf prog to filter instead. */ + if (afinfo) + return AF_UNSPEC; + + /* Iterated from proc fs */ + afinfo = PDE_DATA(file_inode(seq->file)); + return afinfo->family; +} static const struct seq_operations tcp4_seq_ops = { .show = tcp4_seq_show, -- cgit v1.2.3-70-g09d2 From 62001372c2b6cdf2346afb2cf94ed3d950eee64c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Jul 2021 13:05:54 -0700 Subject: bpf: tcp: seq_file: Remove bpf_seq_afinfo from tcp_iter_state A following patch will create a separate struct to store extra bpf_iter state and it will embed the existing tcp_iter_state like this: struct bpf_tcp_iter_state { struct tcp_iter_state state; /* More bpf_iter specific states here ... */ } As a prep work, this patch removes the "struct tcp_seq_afinfo *bpf_seq_afinfo" where its purpose is to tell if it is iterating from bpf_iter instead of proc fs. Currently, if "*bpf_seq_afinfo" is not NULL, it is iterating from bpf_iter. The kernel should not filter by the addr family and leave this filtering decision to the bpf prog. Instead of adding a "*bpf_seq_afinfo" pointer, this patch uses the "seq->op == &bpf_iter_tcp_seq_ops" test to tell if it is iterating from the bpf iter. The bpf_iter_(init|fini)_tcp() is left here to prepare for the change of a following patch. Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Reviewed-by: Eric Dumazet Acked-by: Kuniyuki Iwashima Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210701200554.1034982-1-kafai@fb.com --- include/net/tcp.h | 1 - net/ipv4/tcp_ipv4.c | 25 +++++-------------------- 2 files changed, 5 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 17df9b047ee4..ba3034123e1d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1959,7 +1959,6 @@ struct tcp_iter_state { struct seq_net_private p; enum tcp_seq_states state; struct sock *syn_wait_sk; - struct tcp_seq_afinfo *bpf_seq_afinfo; int bucket, offset, sbucket, num; loff_t last_pos; }; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f2583c4699fd..665f99d14436 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2735,12 +2735,13 @@ static const struct seq_operations bpf_iter_tcp_seq_ops = { #endif static unsigned short seq_file_family(const struct seq_file *seq) { - const struct tcp_iter_state *st = seq->private; - const struct tcp_seq_afinfo *afinfo = st->bpf_seq_afinfo; + const struct tcp_seq_afinfo *afinfo; +#ifdef CONFIG_BPF_SYSCALL /* Iterated from bpf_iter. Let the bpf prog to filter instead. */ - if (afinfo) + if (seq->op == &bpf_iter_tcp_seq_ops) return AF_UNSPEC; +#endif /* Iterated from proc fs */ afinfo = PDE_DATA(file_inode(seq->file)); @@ -2998,27 +2999,11 @@ DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) { - struct tcp_iter_state *st = priv_data; - struct tcp_seq_afinfo *afinfo; - int ret; - - afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN); - if (!afinfo) - return -ENOMEM; - - afinfo->family = AF_UNSPEC; - st->bpf_seq_afinfo = afinfo; - ret = bpf_iter_init_seq_net(priv_data, aux); - if (ret) - kfree(afinfo); - return ret; + return bpf_iter_init_seq_net(priv_data, aux); } static void bpf_iter_fini_tcp(void *priv_data) { - struct tcp_iter_state *st = priv_data; - - kfree(st->bpf_seq_afinfo); bpf_iter_fini_seq_net(priv_data); } -- cgit v1.2.3-70-g09d2 From b72acf4501d7c31e96749f0f5052b3bcb25fc2cb Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Jul 2021 13:06:00 -0700 Subject: tcp: seq_file: Add listening_get_first() The current listening_get_next() is overloaded by passing NULL to the 2nd arg, like listening_get_next(seq, NULL), to mean get_first(). This patch moves some logic from the listening_get_next() into a new function listening_get_first(). It will be equivalent to the current established_get_first() and established_get_next() setup. get_first() is to find a non empty bucket and return the first sk. get_next() is to find the next sk of the current bucket and then resorts to get_first() if the current bucket is exhausted. The next patch is to move the listener seq_file iteration from listening_hash (port only) to lhash2 (port+addr). Separating out listening_get_first() from listening_get_next() here will make the following lhash2 changes cleaner and easier to follow. Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Reviewed-by: Eric Dumazet Acked-by: Kuniyuki Iwashima Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210701200600.1035353-1-kafai@fb.com --- net/ipv4/tcp_ipv4.c | 59 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 665f99d14436..48a0a3873c7a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2288,10 +2288,38 @@ static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) net_eq(sock_net(sk), seq_file_net(seq))); } -/* - * Get next listener socket follow cur. If cur is NULL, get first socket - * starting from bucket given in st->bucket; when st->bucket is zero the - * very first socket in the hash table is returned. +/* Find a non empty bucket (starting from st->bucket) + * and return the first sk from it. + */ +static void *listening_get_first(struct seq_file *seq) +{ + struct tcp_iter_state *st = seq->private; + + st->offset = 0; + for (; st->bucket < INET_LHTABLE_SIZE; st->bucket++) { + struct inet_listen_hashbucket *ilb; + struct hlist_nulls_node *node; + struct sock *sk; + + ilb = &tcp_hashinfo.listening_hash[st->bucket]; + if (hlist_nulls_empty(&ilb->nulls_head)) + continue; + + spin_lock(&ilb->lock); + sk_nulls_for_each(sk, node, &ilb->nulls_head) { + if (seq_sk_match(seq, sk)) + return sk; + } + spin_unlock(&ilb->lock); + } + + return NULL; +} + +/* Find the next sk of "cur" within the same bucket (i.e. st->bucket). + * If "cur" is the last one in the st->bucket, + * call listening_get_first() to return the first sk of the next + * non empty bucket. */ static void *listening_get_next(struct seq_file *seq, void *cur) { @@ -2300,29 +2328,20 @@ static void *listening_get_next(struct seq_file *seq, void *cur) struct hlist_nulls_node *node; struct sock *sk = cur; - if (!sk) { -get_head: - ilb = &tcp_hashinfo.listening_hash[st->bucket]; - spin_lock(&ilb->lock); - sk = sk_nulls_head(&ilb->nulls_head); - st->offset = 0; - goto get_sk; - } - ilb = &tcp_hashinfo.listening_hash[st->bucket]; ++st->num; ++st->offset; sk = sk_nulls_next(sk); -get_sk: + sk_nulls_for_each_from(sk, node) { if (seq_sk_match(seq, sk)) return sk; } + + ilb = &tcp_hashinfo.listening_hash[st->bucket]; spin_unlock(&ilb->lock); - st->offset = 0; - if (++st->bucket < INET_LHTABLE_SIZE) - goto get_head; - return NULL; + ++st->bucket; + return listening_get_first(seq); } static void *listening_get_idx(struct seq_file *seq, loff_t *pos) @@ -2332,7 +2351,7 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos) st->bucket = 0; st->offset = 0; - rc = listening_get_next(seq, NULL); + rc = listening_get_first(seq); while (rc && *pos) { rc = listening_get_next(seq, rc); @@ -2440,7 +2459,7 @@ static void *tcp_seek_last_pos(struct seq_file *seq) if (st->bucket >= INET_LHTABLE_SIZE) break; st->state = TCP_SEQ_STATE_LISTENING; - rc = listening_get_next(seq, NULL); + rc = listening_get_first(seq); while (offset-- && rc && bucket == st->bucket) rc = listening_get_next(seq, rc); if (rc) -- cgit v1.2.3-70-g09d2 From 05c0b35709c58b83d4dc515d2ac52e9c0f197d03 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Jul 2021 13:06:06 -0700 Subject: tcp: seq_file: Replace listening_hash with lhash2 This patch moves the tcp seq_file iteration on listeners from the port only listening_hash to the port+addr lhash2. When iterating from the bpf iter, the next patch will need to lock the socket such that the bpf iter can call setsockopt (e.g. to change the TCP_CONGESTION). To avoid locking the bucket and then locking the sock, the bpf iter will first batch some sockets from the same bucket and then unlock the bucket. If the bucket size is small (which usually is), it is easier to batch the whole bucket such that it is less likely to miss a setsockopt on a socket due to changes in the bucket. However, the port only listening_hash could have many listeners hashed to a bucket (e.g. many individual VIP(s):443 and also multiple by the number of SO_REUSEPORT). We have seen bucket size in tens of thousands range. Also, the chance of having changes in some popular port buckets (e.g. 443) is also high. The port+addr lhash2 was introduced to solve this large listener bucket issue. Also, the listening_hash usage has already been replaced with lhash2 in the fast path inet[6]_lookup_listener(). This patch follows the same direction on moving to lhash2 and iterates the lhash2 instead of listening_hash. Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Reviewed-by: Eric Dumazet Acked-by: Kuniyuki Iwashima Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210701200606.1035783-1-kafai@fb.com --- include/net/inet_hashtables.h | 6 ++++++ net/ipv4/tcp_ipv4.c | 35 ++++++++++++++++++----------------- 2 files changed, 24 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index ca6a3ea9057e..f72ec113ae56 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -160,6 +160,12 @@ struct inet_hashinfo { ____cacheline_aligned_in_smp; }; +#define inet_lhash2_for_each_icsk_continue(__icsk) \ + hlist_for_each_entry_continue(__icsk, icsk_listen_portaddr_node) + +#define inet_lhash2_for_each_icsk(__icsk, list) \ + hlist_for_each_entry(__icsk, list, icsk_listen_portaddr_node) + #define inet_lhash2_for_each_icsk_rcu(__icsk, list) \ hlist_for_each_entry_rcu(__icsk, list, icsk_listen_portaddr_node) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 48a0a3873c7a..d38b4379dca4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2296,21 +2296,22 @@ static void *listening_get_first(struct seq_file *seq) struct tcp_iter_state *st = seq->private; st->offset = 0; - for (; st->bucket < INET_LHTABLE_SIZE; st->bucket++) { - struct inet_listen_hashbucket *ilb; - struct hlist_nulls_node *node; + for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) { + struct inet_listen_hashbucket *ilb2; + struct inet_connection_sock *icsk; struct sock *sk; - ilb = &tcp_hashinfo.listening_hash[st->bucket]; - if (hlist_nulls_empty(&ilb->nulls_head)) + ilb2 = &tcp_hashinfo.lhash2[st->bucket]; + if (hlist_empty(&ilb2->head)) continue; - spin_lock(&ilb->lock); - sk_nulls_for_each(sk, node, &ilb->nulls_head) { + spin_lock(&ilb2->lock); + inet_lhash2_for_each_icsk(icsk, &ilb2->head) { + sk = (struct sock *)icsk; if (seq_sk_match(seq, sk)) return sk; } - spin_unlock(&ilb->lock); + spin_unlock(&ilb2->lock); } return NULL; @@ -2324,22 +2325,22 @@ static void *listening_get_first(struct seq_file *seq) static void *listening_get_next(struct seq_file *seq, void *cur) { struct tcp_iter_state *st = seq->private; - struct inet_listen_hashbucket *ilb; - struct hlist_nulls_node *node; + struct inet_listen_hashbucket *ilb2; + struct inet_connection_sock *icsk; struct sock *sk = cur; ++st->num; ++st->offset; - sk = sk_nulls_next(sk); - - sk_nulls_for_each_from(sk, node) { + icsk = inet_csk(sk); + inet_lhash2_for_each_icsk_continue(icsk) { + sk = (struct sock *)icsk; if (seq_sk_match(seq, sk)) return sk; } - ilb = &tcp_hashinfo.listening_hash[st->bucket]; - spin_unlock(&ilb->lock); + ilb2 = &tcp_hashinfo.lhash2[st->bucket]; + spin_unlock(&ilb2->lock); ++st->bucket; return listening_get_first(seq); } @@ -2456,7 +2457,7 @@ static void *tcp_seek_last_pos(struct seq_file *seq) switch (st->state) { case TCP_SEQ_STATE_LISTENING: - if (st->bucket >= INET_LHTABLE_SIZE) + if (st->bucket > tcp_hashinfo.lhash2_mask) break; st->state = TCP_SEQ_STATE_LISTENING; rc = listening_get_first(seq); @@ -2541,7 +2542,7 @@ void tcp_seq_stop(struct seq_file *seq, void *v) switch (st->state) { case TCP_SEQ_STATE_LISTENING: if (v != SEQ_START_TOKEN) - spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock); + spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock); break; case TCP_SEQ_STATE_ESTABLISHED: if (v) -- cgit v1.2.3-70-g09d2 From 04c7820b776f1c4b48698574c47de9e940d368e8 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Jul 2021 13:06:13 -0700 Subject: bpf: tcp: Bpf iter batching and lock_sock This patch does batching and lock_sock for the bpf tcp iter. It does not affect the proc fs iteration. With bpf-tcp-cc, new algo rollout happens more often. Instead of restarting the application to pick up the new tcp-cc, the next patch will allow bpf iter to do setsockopt(TCP_CONGESTION). This requires locking the sock. Also, unlike the proc iteration (cat /proc/net/tcp[6]), the bpf iter can inspect all fields of a tcp_sock. It will be useful to have a consistent view on some of the fields (e.g. the ones reported in tcp_get_info() that also acquires the sock lock). Double lock: locking the bucket first and then locking the sock could lead to deadlock. This patch takes a batching approach similar to inet_diag. While holding the bucket lock, it batch a number of sockets into an array first and then unlock the bucket. Before doing show(), it then calls lock_sock_fast(). In a machine with ~400k connections, the maximum number of sk in a bucket of the established hashtable is 7. 0.02% of the established connections fall into this bucket size. For listen hash (port+addr lhash2), the bucket is usually very small also except for the SO_REUSEPORT use case which the userspace could have one SO_REUSEPORT socket per thread. While batching is used, it can also minimize the chance of missing sock in the setsockopt use case if the whole bucket is batched. This patch will start with a batch array with INIT_BATCH_SZ (16) which will be enough for the most common cases. bpf_iter_tcp_batch() will try to realloc to a larger array to handle exception case (e.g. the SO_REUSEPORT case in the lhash2). Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Reviewed-by: Eric Dumazet Acked-by: Kuniyuki Iwashima Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210701200613.1036157-1-kafai@fb.com --- net/ipv4/tcp_ipv4.c | 237 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 231 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d38b4379dca4..84ac0135d389 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2687,6 +2687,15 @@ out: } #ifdef CONFIG_BPF_SYSCALL +struct bpf_tcp_iter_state { + struct tcp_iter_state state; + unsigned int cur_sk; + unsigned int end_sk; + unsigned int max_sk; + struct sock **batch; + bool st_bucket_done; +}; + struct bpf_iter__tcp { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct sock_common *, sk_common); @@ -2705,16 +2714,204 @@ static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, return bpf_iter_run_prog(prog, &ctx); } +static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) +{ + while (iter->cur_sk < iter->end_sk) + sock_put(iter->batch[iter->cur_sk++]); +} + +static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, + unsigned int new_batch_sz) +{ + struct sock **new_batch; + + new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, + GFP_USER | __GFP_NOWARN); + if (!new_batch) + return -ENOMEM; + + bpf_iter_tcp_put_batch(iter); + kvfree(iter->batch); + iter->batch = new_batch; + iter->max_sk = new_batch_sz; + + return 0; +} + +static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, + struct sock *start_sk) +{ + struct bpf_tcp_iter_state *iter = seq->private; + struct tcp_iter_state *st = &iter->state; + struct inet_connection_sock *icsk; + unsigned int expected = 1; + struct sock *sk; + + sock_hold(start_sk); + iter->batch[iter->end_sk++] = start_sk; + + icsk = inet_csk(start_sk); + inet_lhash2_for_each_icsk_continue(icsk) { + sk = (struct sock *)icsk; + if (seq_sk_match(seq, sk)) { + if (iter->end_sk < iter->max_sk) { + sock_hold(sk); + iter->batch[iter->end_sk++] = sk; + } + expected++; + } + } + spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock); + + return expected; +} + +static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq, + struct sock *start_sk) +{ + struct bpf_tcp_iter_state *iter = seq->private; + struct tcp_iter_state *st = &iter->state; + struct hlist_nulls_node *node; + unsigned int expected = 1; + struct sock *sk; + + sock_hold(start_sk); + iter->batch[iter->end_sk++] = start_sk; + + sk = sk_nulls_next(start_sk); + sk_nulls_for_each_from(sk, node) { + if (seq_sk_match(seq, sk)) { + if (iter->end_sk < iter->max_sk) { + sock_hold(sk); + iter->batch[iter->end_sk++] = sk; + } + expected++; + } + } + spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); + + return expected; +} + +static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) +{ + struct bpf_tcp_iter_state *iter = seq->private; + struct tcp_iter_state *st = &iter->state; + unsigned int expected; + bool resized = false; + struct sock *sk; + + /* The st->bucket is done. Directly advance to the next + * bucket instead of having the tcp_seek_last_pos() to skip + * one by one in the current bucket and eventually find out + * it has to advance to the next bucket. + */ + if (iter->st_bucket_done) { + st->offset = 0; + st->bucket++; + if (st->state == TCP_SEQ_STATE_LISTENING && + st->bucket > tcp_hashinfo.lhash2_mask) { + st->state = TCP_SEQ_STATE_ESTABLISHED; + st->bucket = 0; + } + } + +again: + /* Get a new batch */ + iter->cur_sk = 0; + iter->end_sk = 0; + iter->st_bucket_done = false; + + sk = tcp_seek_last_pos(seq); + if (!sk) + return NULL; /* Done */ + + if (st->state == TCP_SEQ_STATE_LISTENING) + expected = bpf_iter_tcp_listening_batch(seq, sk); + else + expected = bpf_iter_tcp_established_batch(seq, sk); + + if (iter->end_sk == expected) { + iter->st_bucket_done = true; + return sk; + } + + if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) { + resized = true; + goto again; + } + + return sk; +} + +static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) +{ + /* bpf iter does not support lseek, so it always + * continue from where it was stop()-ped. + */ + if (*pos) + return bpf_iter_tcp_batch(seq); + + return SEQ_START_TOKEN; +} + +static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_tcp_iter_state *iter = seq->private; + struct tcp_iter_state *st = &iter->state; + struct sock *sk; + + /* Whenever seq_next() is called, the iter->cur_sk is + * done with seq_show(), so advance to the next sk in + * the batch. + */ + if (iter->cur_sk < iter->end_sk) { + /* Keeping st->num consistent in tcp_iter_state. + * bpf_iter_tcp does not use st->num. + * meta.seq_num is used instead. + */ + st->num++; + /* Move st->offset to the next sk in the bucket such that + * the future start() will resume at st->offset in + * st->bucket. See tcp_seek_last_pos(). + */ + st->offset++; + sock_put(iter->batch[iter->cur_sk++]); + } + + if (iter->cur_sk < iter->end_sk) + sk = iter->batch[iter->cur_sk]; + else + sk = bpf_iter_tcp_batch(seq); + + ++*pos; + /* Keeping st->last_pos consistent in tcp_iter_state. + * bpf iter does not do lseek, so st->last_pos always equals to *pos. + */ + st->last_pos = *pos; + return sk; +} + static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; struct sock *sk = v; + bool slow; uid_t uid; + int ret; if (v == SEQ_START_TOKEN) return 0; + if (sk_fullsock(sk)) + slow = lock_sock_fast(sk); + + if (unlikely(sk_unhashed(sk))) { + ret = SEQ_SKIP; + goto unlock; + } + if (sk->sk_state == TCP_TIME_WAIT) { uid = 0; } else if (sk->sk_state == TCP_NEW_SYN_RECV) { @@ -2728,11 +2925,18 @@ static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) meta.seq = seq; prog = bpf_iter_get_info(&meta, false); - return tcp_prog_seq_show(prog, &meta, v, uid); + ret = tcp_prog_seq_show(prog, &meta, v, uid); + +unlock: + if (sk_fullsock(sk)) + unlock_sock_fast(sk, slow); + return ret; + } static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) { + struct bpf_tcp_iter_state *iter = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; @@ -2743,13 +2947,16 @@ static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) (void)tcp_prog_seq_show(prog, &meta, v, 0); } - tcp_seq_stop(seq, v); + if (iter->cur_sk < iter->end_sk) { + bpf_iter_tcp_put_batch(iter); + iter->st_bucket_done = false; + } } static const struct seq_operations bpf_iter_tcp_seq_ops = { .show = bpf_iter_tcp_seq_show, - .start = tcp_seq_start, - .next = tcp_seq_next, + .start = bpf_iter_tcp_seq_start, + .next = bpf_iter_tcp_seq_next, .stop = bpf_iter_tcp_seq_stop, }; #endif @@ -3017,21 +3224,39 @@ static struct pernet_operations __net_initdata tcp_sk_ops = { DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, struct sock_common *sk_common, uid_t uid) +#define INIT_BATCH_SZ 16 + static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) { - return bpf_iter_init_seq_net(priv_data, aux); + struct bpf_tcp_iter_state *iter = priv_data; + int err; + + err = bpf_iter_init_seq_net(priv_data, aux); + if (err) + return err; + + err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ); + if (err) { + bpf_iter_fini_seq_net(priv_data); + return err; + } + + return 0; } static void bpf_iter_fini_tcp(void *priv_data) { + struct bpf_tcp_iter_state *iter = priv_data; + bpf_iter_fini_seq_net(priv_data); + kvfree(iter->batch); } static const struct bpf_iter_seq_info tcp_seq_info = { .seq_ops = &bpf_iter_tcp_seq_ops, .init_seq_private = bpf_iter_init_tcp, .fini_seq_private = bpf_iter_fini_tcp, - .seq_priv_size = sizeof(struct tcp_iter_state), + .seq_priv_size = sizeof(struct bpf_tcp_iter_state), }; static struct bpf_iter_reg tcp_reg_info = { -- cgit v1.2.3-70-g09d2 From 3cee6fb8e69ecd79be891c89a94974c48a25a437 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Jul 2021 13:06:19 -0700 Subject: bpf: tcp: Support bpf_(get|set)sockopt in bpf tcp iter This patch allows bpf tcp iter to call bpf_(get|set)sockopt. To allow a specific bpf iter (tcp here) to call a set of helpers, get_func_proto function pointer is added to bpf_iter_reg. The bpf iter is a tracing prog which currently requires CAP_PERFMON or CAP_SYS_ADMIN, so this patch does not impose other capability checks for bpf_(get|set)sockopt. Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Reviewed-by: Eric Dumazet Acked-by: Kuniyuki Iwashima Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210701200619.1036715-1-kafai@fb.com --- include/linux/bpf.h | 8 ++++++++ kernel/bpf/bpf_iter.c | 22 ++++++++++++++++++++++ kernel/trace/bpf_trace.c | 7 ++++++- net/core/filter.c | 34 ++++++++++++++++++++++++++++++++++ net/ipv4/tcp_ipv4.c | 15 +++++++++++++++ 5 files changed, 85 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 978ebd16ae60..c8cc09013210 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1442,6 +1442,9 @@ typedef void (*bpf_iter_show_fdinfo_t) (const struct bpf_iter_aux_info *aux, struct seq_file *seq); typedef int (*bpf_iter_fill_link_info_t)(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info); +typedef const struct bpf_func_proto * +(*bpf_iter_get_func_proto_t)(enum bpf_func_id func_id, + const struct bpf_prog *prog); enum bpf_iter_feature { BPF_ITER_RESCHED = BIT(0), @@ -1454,6 +1457,7 @@ struct bpf_iter_reg { bpf_iter_detach_target_t detach_target; bpf_iter_show_fdinfo_t show_fdinfo; bpf_iter_fill_link_info_t fill_link_info; + bpf_iter_get_func_proto_t get_func_proto; u32 ctx_arg_info_size; u32 feature; struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; @@ -1476,6 +1480,8 @@ struct bpf_iter__bpf_map_elem { int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info); bool bpf_iter_prog_supported(struct bpf_prog *prog); +const struct bpf_func_proto * +bpf_iter_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_prog *prog); int bpf_iter_new_fd(struct bpf_link *link); bool bpf_link_is_iter(struct bpf_link *link); @@ -2050,6 +2056,8 @@ extern const struct bpf_func_proto bpf_task_storage_get_proto; extern const struct bpf_func_proto bpf_task_storage_delete_proto; extern const struct bpf_func_proto bpf_for_each_map_elem_proto; extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; +extern const struct bpf_func_proto bpf_sk_setsockopt_proto; +extern const struct bpf_func_proto bpf_sk_getsockopt_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 2d4fbdbb194e..2e9d47bb40ff 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -360,6 +360,28 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) return supported; } +const struct bpf_func_proto * +bpf_iter_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + const struct bpf_iter_target_info *tinfo; + const struct bpf_func_proto *fn = NULL; + + mutex_lock(&targets_mutex); + list_for_each_entry(tinfo, &targets, list) { + if (tinfo->btf_id == prog->aux->attach_btf_id) { + const struct bpf_iter_reg *reg_info; + + reg_info = tinfo->reg_info; + if (reg_info->get_func_proto) + fn = reg_info->get_func_proto(func_id, prog); + break; + } + } + mutex_unlock(&targets_mutex); + + return fn; +} + static void bpf_iter_link_release(struct bpf_link *link) { struct bpf_iter_link *iter_link = diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1f22ce1fa971..c5e0b6a64091 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1461,6 +1461,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) const struct bpf_func_proto * tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *fn; + switch (func_id) { #ifdef CONFIG_NET case BPF_FUNC_skb_output: @@ -1501,7 +1503,10 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_d_path: return &bpf_d_path_proto; default: - return raw_tp_prog_func_proto(func_id, prog); + fn = raw_tp_prog_func_proto(func_id, prog); + if (!fn && prog->expected_attach_type == BPF_TRACE_ITER) + fn = bpf_iter_get_func_proto(func_id, prog); + return fn; } } diff --git a/net/core/filter.c b/net/core/filter.c index 3b4986e96e9c..faf29fd82276 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5016,6 +5016,40 @@ err_clear: return -EINVAL; } +BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + return _bpf_setsockopt(sk, level, optname, optval, optlen); +} + +const struct bpf_func_proto bpf_sk_setsockopt_proto = { + .func = bpf_sk_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sk_getsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + return _bpf_getsockopt(sk, level, optname, optval, optlen); +} + +const struct bpf_func_proto bpf_sk_getsockopt_proto = { + .func = bpf_sk_getsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_UNINIT_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx, int, level, int, optname, char *, optval, int, optlen) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 84ac0135d389..f9c6e47141fd 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3259,6 +3259,20 @@ static const struct bpf_iter_seq_info tcp_seq_info = { .seq_priv_size = sizeof(struct bpf_tcp_iter_state), }; +static const struct bpf_func_proto * +bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, + const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_setsockopt: + return &bpf_sk_setsockopt_proto; + case BPF_FUNC_getsockopt: + return &bpf_sk_getsockopt_proto; + default: + return NULL; + } +} + static struct bpf_iter_reg tcp_reg_info = { .target = "tcp", .ctx_arg_info_size = 1, @@ -3266,6 +3280,7 @@ static struct bpf_iter_reg tcp_reg_info = { { offsetof(struct bpf_iter__tcp, sk_common), PTR_TO_BTF_ID_OR_NULL }, }, + .get_func_proto = bpf_iter_tcp_get_func_proto, .seq_info = &tcp_seq_info, }; -- cgit v1.2.3-70-g09d2 From 0b846445985895e75958ecd59061fd7bf77e0c3f Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 23 Jul 2021 11:36:30 -0700 Subject: unix_bpf: Fix a potential deadlock in unix_dgram_bpf_recvmsg() As Eric noticed, __unix_dgram_recvmsg() may acquire u->iolock too, so we have to release it before calling this function. Fixes: 9825d866ce0d ("af_unix: Implement unix_dgram_bpf_recvmsg()") Reported-by: Eric Dumazet Signed-off-by: Cong Wang Signed-off-by: Andrii Nakryiko Acked-by: Jakub Sitnicki Acked-by: John Fastabend --- net/unix/unix_bpf.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c index db0cda29fb2f..177e883f451e 100644 --- a/net/unix/unix_bpf.c +++ b/net/unix/unix_bpf.c @@ -44,7 +44,7 @@ static int unix_dgram_bpf_recvmsg(struct sock *sk, struct msghdr *msg, { struct unix_sock *u = unix_sk(sk); struct sk_psock *psock; - int copied, ret; + int copied; psock = sk_psock_get(sk); if (unlikely(!psock)) @@ -53,8 +53,9 @@ static int unix_dgram_bpf_recvmsg(struct sock *sk, struct msghdr *msg, mutex_lock(&u->iolock); if (!skb_queue_empty(&sk->sk_receive_queue) && sk_psock_queue_empty(psock)) { - ret = __unix_dgram_recvmsg(sk, msg, len, flags); - goto out; + mutex_unlock(&u->iolock); + sk_psock_put(sk, psock); + return __unix_dgram_recvmsg(sk, msg, len, flags); } msg_bytes_ready: @@ -68,16 +69,15 @@ msg_bytes_ready: if (data) { if (!sk_psock_queue_empty(psock)) goto msg_bytes_ready; - ret = __unix_dgram_recvmsg(sk, msg, len, flags); - goto out; + mutex_unlock(&u->iolock); + sk_psock_put(sk, psock); + return __unix_dgram_recvmsg(sk, msg, len, flags); } copied = -EAGAIN; } - ret = copied; -out: mutex_unlock(&u->iolock); sk_psock_put(sk, psock); - return ret; + return copied; } static struct proto *unix_prot_saved __read_mostly; -- cgit v1.2.3-70-g09d2