diff options
author | Jakub Kicinski <kuba@kernel.org> | 2023-05-24 21:57:57 -0700 |
---|---|---|
committer | Jakub Kicinski <kuba@kernel.org> | 2023-05-24 21:57:57 -0700 |
commit | 0c615f1cc3b333775b9c0b56e369f8dbca1e0226 (patch) | |
tree | 8799101c67d85957c7354eb95f6cd020706b33dc /net/ipv4 | |
parent | 878ecb0897f4737a4c9401f3523fd49589025671 (diff) | |
parent | f726e03564ef4e754dd93beb54303e2e1671049e (diff) |
Merge tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf
Daniel Borkmann says:
====================
pull-request: bpf 2023-05-24
We've added 19 non-merge commits during the last 10 day(s) which contain
a total of 20 files changed, 738 insertions(+), 448 deletions(-).
The main changes are:
1) Batch of BPF sockmap fixes found when running against NGINX TCP tests,
from John Fastabend.
2) Fix a memleak in the LRU{,_PERCPU} hash map when bucket locking fails,
from Anton Protopopov.
3) Init the BPF offload table earlier than just late_initcall,
from Jakub Kicinski.
4) Fix ctx access mask generation for 32-bit narrow loads of 64-bit fields,
from Will Deacon.
5) Remove a now unsupported __fallthrough in BPF samples,
from Andrii Nakryiko.
6) Fix a typo in pkg-config call for building sign-file,
from Jeremy Sowden.
* tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf:
bpf, sockmap: Test progs verifier error with latest clang
bpf, sockmap: Test FIONREAD returns correct bytes in rx buffer with drops
bpf, sockmap: Test FIONREAD returns correct bytes in rx buffer
bpf, sockmap: Test shutdown() correctly exits epoll and recv()=0
bpf, sockmap: Build helper to create connected socket pair
bpf, sockmap: Pull socket helpers out of listen test for general use
bpf, sockmap: Incorrectly handling copied_seq
bpf, sockmap: Wake up polling after data copy
bpf, sockmap: TCP data stall on recv before accept
bpf, sockmap: Handle fin correctly
bpf, sockmap: Improved check for empty queue
bpf, sockmap: Reschedule is now done through backlog
bpf, sockmap: Convert schedule_work into delayed_work
bpf, sockmap: Pass skb ownership through read_skb
bpf: fix a memory leak in the LRU and LRU_PERCPU hash maps
bpf: Fix mask generation for 32-bit narrow loads of 64-bit fields
samples/bpf: Drop unnecessary fallthrough
bpf: netdev: init the offload table earlier
selftests/bpf: Fix pkg-config call building sign-file
====================
Link: https://lore.kernel.org/r/20230524170839.13905-1-daniel@iogearbox.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/tcp.c | 11 |
-rw-r--r-- | net/ipv4/tcp_bpf.c | 79 |
-rw-r--r-- | net/ipv4/udp.c | 7 |
3 files changed, 81 insertions, 16 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4d6392c16b7a..a60f6f4e7cd9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1571,7 +1571,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) * calculation of whether or not we must ACK for the sake of * a window update. */ -static void __tcp_cleanup_rbuf(struct sock *sk, int copied) +void __tcp_cleanup_rbuf(struct sock *sk, int copied) { struct tcp_sock *tp = tcp_sk(sk); bool time_to_ack = false; @@ -1773,7 +1773,6 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk)); tcp_flags = TCP_SKB_CB(skb)->tcp_flags; used = recv_actor(sk, skb); - consume_skb(skb); if (used < 0) { if (!copied) copied = used; @@ -1787,14 +1786,6 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) break; } } - WRITE_ONCE(tp->copied_seq, seq); - - tcp_rcv_space_adjust(sk); - - /* Clean up data we have read: This will do ACK frames. */ - if (copied > 0) - __tcp_cleanup_rbuf(sk, copied); - return copied; } EXPORT_SYMBOL(tcp_read_skb); diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 2e9547467edb..5f93918c063c 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -11,6 +11,24 @@ #include <net/inet_common.h> #include <net/tls.h> +void tcp_eat_skb(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tcp; + int copied; + + if (!skb || !skb->len || !sk_is_tcp(sk)) + return; + + if (skb_bpf_strparser(skb)) + return; + + tcp = tcp_sk(sk); + copied = tcp->copied_seq + skb->len; + WRITE_ONCE(tcp->copied_seq, copied); + tcp_rcv_space_adjust(sk); + __tcp_cleanup_rbuf(sk, skb->len); +} + static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, u32 apply_bytes, int flags) { @@ -174,14 +192,34 @@ static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock, return ret; } +static bool is_next_msg_fin(struct sk_psock *psock) +{ + struct scatterlist *sge; + struct sk_msg *msg_rx; + int i; + + msg_rx = 
sk_psock_peek_msg(psock); + i = msg_rx->sg.start; + sge = sk_msg_elem(msg_rx, i); + if (!sge->length) { + struct sk_buff *skb = msg_rx->skb; + + if (skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + return true; + } + return false; +} + static int tcp_bpf_recvmsg_parser(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { + struct tcp_sock *tcp = tcp_sk(sk); + u32 seq = tcp->copied_seq; struct sk_psock *psock; - int copied; + int copied = 0; if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); @@ -194,8 +232,43 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk, return tcp_recvmsg(sk, msg, len, flags, addr_len); lock_sock(sk); + + /* We may have received data on the sk_receive_queue pre-accept and + * then we can not use read_skb in this context because we haven't + * assigned a sk_socket yet so have no link to the ops. The work-around + * is to check the sk_receive_queue and in these cases read skbs off + * queue again. The read_skb hook is not running at this point because + * of lock_sock so we avoid having multiple runners in read_skb. + */ + if (unlikely(!skb_queue_empty(&sk->sk_receive_queue))) { + tcp_data_ready(sk); + /* This handles the ENOMEM errors if we both receive data + * pre accept and are already under memory pressure. At least + * let user know to retry. + */ + if (unlikely(!skb_queue_empty(&sk->sk_receive_queue))) { + copied = -EAGAIN; + goto out; + } + } + msg_bytes_ready: copied = sk_msg_recvmsg(sk, psock, msg, len, flags); + /* The typical case for EFAULT is the socket was gracefully + * shutdown with a FIN pkt. So check here the other case is + * some error on copy_page_to_iter which would be unexpected. + * On fin return correct return code to zero. 
+ */ + if (copied == -EFAULT) { + bool is_fin = is_next_msg_fin(psock); + + if (is_fin) { + copied = 0; + seq++; + goto out; + } + } + seq += copied; if (!copied) { long timeo; int data; @@ -233,6 +306,10 @@ msg_bytes_ready: copied = -EAGAIN; } out: + WRITE_ONCE(tcp->copied_seq, seq); + tcp_rcv_space_adjust(sk); + if (copied > 0) + __tcp_cleanup_rbuf(sk, copied); release_sock(sk); sk_psock_put(sk, psock); return copied; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index aa32afd871ee..9482def1f310 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1818,7 +1818,7 @@ EXPORT_SYMBOL(__skb_recv_udp); int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { struct sk_buff *skb; - int err, copied; + int err; try_again: skb = skb_recv_udp(sk, MSG_DONTWAIT, &err); @@ -1837,10 +1837,7 @@ try_again: } WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk)); - copied = recv_actor(sk, skb); - kfree_skb(skb); - - return copied; + return recv_actor(sk, skb); } EXPORT_SYMBOL(udp_read_skb); |