From 60557969951304dad829f2829019907dfb43ecb3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet <edumazet@google.com> Date: Thu, 28 Mar 2024 14:40:29 +0000 Subject: udp: annotate data-race in __udp_enqueue_schedule_skb() sk->sk_rcvbuf is read locklessly twice, while other threads could change its value. Use a READ_ONCE() to annotate the race. Signed-off-by: Eric Dumazet <edumazet@google.com> Link: https://lore.kernel.org/r/20240328144032.1864988-2-edumazet@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- net/ipv4/udp.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 531882f321f2..6a39e7fa0616 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1492,13 +1492,14 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) struct sk_buff_head *list = &sk->sk_receive_queue; int rmem, err = -ENOMEM; spinlock_t *busy = NULL; - int size; + int size, rcvbuf; - /* try to avoid the costly atomic add/sub pair when the receive - * queue is full; always allow at least a packet + /* Immediately drop when the receive queue is full. + * Always allow at least one packet. */ rmem = atomic_read(&sk->sk_rmem_alloc); - if (rmem > sk->sk_rcvbuf) + rcvbuf = READ_ONCE(sk->sk_rcvbuf); + if (rmem > rcvbuf) goto drop; /* Under mem pressure, it might be helpful to help udp_recvmsg() @@ -1507,7 +1508,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) * - Less cache line misses at copyout() time * - Less work at consume_skb() (less alien page frag freeing) */ - if (rmem > (sk->sk_rcvbuf >> 1)) { + if (rmem > (rcvbuf >> 1)) { skb_condense(skb); busy = busylock_acquire(sk); -- cgit v1.2.3-70-g09d2 From 6a1f12dd85a8b24f871dfcf467378660af9c064d Mon Sep 17 00:00:00 2001 From: Eric Dumazet <edumazet@google.com> Date: Thu, 28 Mar 2024 14:40:30 +0000 Subject: udp: relax atomic operation on sk->sk_rmem_alloc atomic_add_return() is more expensive than atomic_add() and seems overkill in UDP rx fast path. Signed-off-by: Eric Dumazet <edumazet@google.com> Link: https://lore.kernel.org/r/20240328144032.1864988-3-edumazet@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- net/ipv4/udp.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 6a39e7fa0616..19d7db4563ac 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1516,12 +1516,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) size = skb->truesize; udp_set_dev_scratch(skb); - /* we drop only if the receive buf is full and the receive - * queue contains some other skb - */ - rmem = atomic_add_return(size, &sk->sk_rmem_alloc); - if (rmem > (size + (unsigned int)sk->sk_rcvbuf)) - goto uncharge_drop; + atomic_add(size, &sk->sk_rmem_alloc); spin_lock(&list->lock); err = udp_rmem_schedule(sk, size); -- cgit v1.2.3-70-g09d2 From 612b1c0dec5bc7367f90fc508448b8d0d7c05414 Mon Sep 17 00:00:00 2001 From: Eric Dumazet <edumazet@google.com> Date: Thu, 28 Mar 2024 14:40:31 +0000 Subject: udp: avoid calling sock_def_readable() if possible sock_def_readable() is quite expensive (particularly when ep_poll_callback() is in the picture). We must call sk->sk_data_ready() when : - receive queue was empty, or - SO_PEEK_OFF is enabled on the socket, or - sk->sk_data_ready is not sock_def_readable. We still need to call sk_wake_async(). Signed-off-by: Eric Dumazet <edumazet@google.com> Reviewed-by: Willem de Bruijn <willemb@google.com> Acked-by: Paolo Abeni <pabeni@redhat.com> Link: https://lore.kernel.org/r/20240328144032.1864988-4-edumazet@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- net/ipv4/udp.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 19d7db4563ac..143043cd2dcb 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1492,6 +1492,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) struct sk_buff_head *list = &sk->sk_receive_queue; int rmem, err = -ENOMEM; spinlock_t *busy = NULL; + bool becomes_readable; int size, rcvbuf; /* Immediately drop when the receive queue is full. @@ -1532,12 +1533,19 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) */ sock_skb_set_dropcount(sk, skb); + becomes_readable = skb_queue_empty(list); __skb_queue_tail(list, skb); spin_unlock(&list->lock); - if (!sock_flag(sk, SOCK_DEAD)) - INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk); - + if (!sock_flag(sk, SOCK_DEAD)) { + if (becomes_readable || + sk->sk_data_ready != sock_def_readable || + READ_ONCE(sk->sk_peek_off) >= 0) + INDIRECT_CALL_1(sk->sk_data_ready, + sock_def_readable, sk); + else + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + } busylock_release(busy); return 0; -- cgit v1.2.3-70-g09d2 From 1abe267f173eae7ae76cf56232292e9641eb652f Mon Sep 17 00:00:00 2001 From: Eric Dumazet <edumazet@google.com> Date: Thu, 28 Mar 2024 14:40:32 +0000 Subject: net: add sk_wake_async_rcu() helper While looking at UDP receive performance, I saw sk_wake_async() was no longer inlined. This matters at least on AMD Zen1-4 platforms (see SRSO) This might be because rcu_read_lock() and rcu_read_unlock() are no longer nops in recent kernels ? Add sk_wake_async_rcu() variant, which must be called from contexts already holding rcu lock. As SOCK_FASYNC is deprecated in modern days, use unlikely() to give a hint to the compiler. sk_wake_async_rcu() is properly inlined from __udp_enqueue_schedule_skb() and sock_def_readable(). Signed-off-by: Eric Dumazet <edumazet@google.com> Link: https://lore.kernel.org/r/20240328144032.1864988-5-edumazet@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- crypto/af_alg.c | 4 ++-- include/net/sock.h | 6 ++++++ net/atm/common.c | 2 +- net/core/sock.c | 8 ++++---- net/dccp/output.c | 2 +- net/ipv4/udp.c | 2 +- net/iucv/af_iucv.c | 2 +- net/rxrpc/af_rxrpc.c | 2 +- net/sctp/socket.c | 2 +- net/smc/smc_rx.c | 4 ++-- net/unix/af_unix.c | 2 +- 11 files changed, 21 insertions(+), 15 deletions(-) diff --git a/crypto/af_alg.c b/crypto/af_alg.c index 68cc9290cabe..5bc6d0fa7498 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -847,7 +847,7 @@ void af_alg_wmem_wakeup(struct sock *sk) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLRDNORM | EPOLLRDBAND); - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(af_alg_wmem_wakeup); @@ -914,7 +914,7 @@ static void af_alg_data_wakeup(struct sock *sk) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | EPOLLRDNORM | EPOLLRDBAND); - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); rcu_read_unlock(); } diff --git a/include/net/sock.h b/include/net/sock.h index f57bfd8a2ad2..2253eefe2848 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2513,6 +2513,12 @@ static inline void sk_wake_async(const struct sock *sk, int how, int band) } } +static inline void sk_wake_async_rcu(const struct sock *sk, int how, int band) +{ + if (unlikely(sock_flag(sk, SOCK_FASYNC))) + sock_wake_async(rcu_dereference(sk->sk_wq), how, band); +} + /* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might * need sizeof(sk_buff) + MTU + padding, unless net driver perform copybreak. * Note: for send buffers, TCP works better if we can build two skbs at diff --git a/net/atm/common.c b/net/atm/common.c index 2a1ec014e901..9b75699992ff 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -116,7 +116,7 @@ static void vcc_write_space(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible(&wq->wait); - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); diff --git a/net/core/sock.c b/net/core/sock.c index 0963689a5950..5ed411231fc7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3338,7 +3338,7 @@ static void sock_def_error_report(struct sock *sk) wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_poll(&wq->wait, EPOLLERR); - sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); + sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR); rcu_read_unlock(); } @@ -3353,7 +3353,7 @@ void sock_def_readable(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | EPOLLRDNORM | EPOLLRDBAND); - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); } @@ -3373,7 +3373,7 @@ static void sock_def_write_space(struct sock *sk) EPOLLWRNORM | EPOLLWRBAND); /* Should agree with poll, otherwise some programs break */ - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); @@ -3398,7 +3398,7 @@ static void sock_def_write_space_wfree(struct sock *sk) EPOLLWRNORM | EPOLLWRBAND); /* Should agree with poll, otherwise some programs break */ - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } } diff --git a/net/dccp/output.c b/net/dccp/output.c index fd2eb148d24d..5c2e24f3c39b 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -204,7 +204,7 @@ void dccp_write_space(struct sock *sk) wake_up_interruptible(&wq->wait); /* Should agree with poll, otherwise some programs break */ if (sock_writeable(sk)) - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); rcu_read_unlock(); } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 143043cd2dcb..11460d751e73 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1544,7 +1544,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk); else - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); } busylock_release(busy); return 0; diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 7c8c3adcac6e..c951bb9cc2e0 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -184,7 +184,7 @@ static void iucv_sock_wake_msglim(struct sock *sk) wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_all(&wq->wait); - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); rcu_read_unlock(); } diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 5222bc97d192..f4844683e120 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -65,7 +65,7 @@ static void rxrpc_write_space(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible(&wq->wait); - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index c67679a41044..e416b6d3d270 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -9276,7 +9276,7 @@ void sctp_data_ready(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLRDNORM | EPOLLRDBAND); - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); } diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 9a2f3638d161..f0cbe77a80b4 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -42,10 +42,10 @@ static void smc_rx_wake_up(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | EPOLLRDNORM | EPOLLRDBAND); - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); if ((sk->sk_shutdown == SHUTDOWN_MASK) || (sk->sk_state == SMC_CLOSED)) - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_HUP); rcu_read_unlock(); } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 27ca50ab1cd1..533fb682c954 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -546,7 +546,7 @@ static void unix_write_space(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); } -- cgit v1.2.3-70-g09d2