diff options
Diffstat (limited to 'net/ipv4/udp.c')
| -rw-r--r-- | net/ipv4/udp.c | 78 |
1 files changed, 46 insertions, 32 deletions
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index b32cf2eeeb41..49c622e743e8 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -326,6 +326,8 @@ found: goto fail_unlock; } + sock_set_flag(sk, SOCK_RCU_FREE); + sk_add_node_rcu(sk, &hslot->head); hslot->count++; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); @@ -342,7 +344,7 @@ found: hslot2->count++; spin_unlock(&hslot2->lock); } - sock_set_flag(sk, SOCK_RCU_FREE); + error = 0; fail_unlock: spin_unlock_bh(&hslot->lock); @@ -427,15 +429,21 @@ static struct sock *udp4_lib_lookup2(struct net *net, { struct sock *sk, *result; int score, badness; + bool need_rescore; result = NULL; badness = 0; udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { - score = compute_score(sk, net, saddr, sport, - daddr, hnum, dif, sdif); + need_rescore = false; +rescore: + score = compute_score(need_rescore ? result : sk, net, saddr, + sport, daddr, hnum, dif, sdif); if (score > badness) { badness = score; + if (need_rescore) + continue; + if (sk->sk_state == TCP_ESTABLISHED) { result = sk; continue; @@ -456,9 +464,14 @@ static struct sock *udp4_lib_lookup2(struct net *net, if (IS_ERR(result)) continue; - badness = compute_score(result, net, saddr, sport, - daddr, hnum, dif, sdif); - + /* compute_score is too long of a function to be + * inlined, and calling it again here yields + * measureable overhead for some + * workloads. Work around it by jumping + * backwards to rescore 'result'. + */ + need_rescore = true; + goto rescore; } } return result; @@ -927,8 +940,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, kfree_skb(skb); return -EINVAL; } - if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite || - dst_xfrm(skb_dst(skb))) { + if (is_udplite || dst_xfrm(skb_dst(skb))) { kfree_skb(skb); return -EIO; } @@ -1207,7 +1219,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } if (connected) - rt = (struct rtable *)sk_dst_check(sk, 0); + rt = dst_rtable(sk_dst_check(sk, 0)); if (!rt) { struct net *net = sock_net(sk); @@ -1501,13 +1513,15 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) struct sk_buff_head *list = &sk->sk_receive_queue; int rmem, err = -ENOMEM; spinlock_t *busy = NULL; - int size; + bool becomes_readable; + int size, rcvbuf; - /* try to avoid the costly atomic add/sub pair when the receive - * queue is full; always allow at least a packet + /* Immediately drop when the receive queue is full. + * Always allow at least one packet. */ rmem = atomic_read(&sk->sk_rmem_alloc); - if (rmem > sk->sk_rcvbuf) + rcvbuf = READ_ONCE(sk->sk_rcvbuf); + if (rmem > rcvbuf) goto drop; /* Under mem pressure, it might be helpful to help udp_recvmsg() @@ -1516,7 +1530,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) * - Less cache line misses at copyout() time * - Less work at consume_skb() (less alien page frag freeing) */ - if (rmem > (sk->sk_rcvbuf >> 1)) { + if (rmem > (rcvbuf >> 1)) { skb_condense(skb); busy = busylock_acquire(sk); @@ -1524,12 +1538,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) size = skb->truesize; udp_set_dev_scratch(skb); - /* we drop only if the receive buf is full and the receive - * queue contains some other skb - */ - rmem = atomic_add_return(size, &sk->sk_rmem_alloc); - if (rmem > (size + (unsigned int)sk->sk_rcvbuf)) - goto uncharge_drop; + atomic_add(size, &sk->sk_rmem_alloc); spin_lock(&list->lock); err = udp_rmem_schedule(sk, size); @@ -1545,12 +1554,19 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) */ sock_skb_set_dropcount(sk, skb); + becomes_readable = skb_queue_empty(list); __skb_queue_tail(list, skb); spin_unlock(&list->lock); - if (!sock_flag(sk, SOCK_DEAD)) - INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk); - + if (!sock_flag(sk, SOCK_DEAD)) { + if (becomes_readable || + sk->sk_data_ready != sock_def_readable || + READ_ONCE(sk->sk_peek_off) >= 0) + INDIRECT_CALL_1(sk->sk_data_ready, + sock_def_readable, sk); + else + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); + } busylock_release(busy); return 0; @@ -2058,8 +2074,8 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) drop_reason = SKB_DROP_REASON_PROTO_MEM; } UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - kfree_skb_reason(skb, drop_reason); - trace_udp_fail_queue_rcv_skb(rc, sk); + trace_udp_fail_queue_rcv_skb(rc, sk, skb); + sk_skb_reason_drop(sk, skb, drop_reason); return -1; } @@ -2181,7 +2197,7 @@ csum_error: drop: __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); atomic_inc(&sk->sk_drops); - kfree_skb_reason(skb, drop_reason); + sk_skb_reason_drop(sk, skb, drop_reason); return -1; } @@ -2215,7 +2231,7 @@ bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) struct dst_entry *old; if (dst_hold_safe(dst)) { - old = xchg((__force struct dst_entry **)&sk->sk_rx_dst, dst); + old = unrcu_pointer(xchg(&sk->sk_rx_dst, RCU_INITIALIZER(dst))); dst_release(old); return old != dst; } @@ -2368,7 +2384,7 @@ static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int proto) { - struct sock *sk; + struct sock *sk = NULL; struct udphdr *uh; unsigned short ulen; struct rtable *rt = skb_rtable(skb); @@ -2445,7 +2461,7 @@ no_sk: * Hmm. We got an UDP packet to a port to which we * don't wanna listen. Ignore it. */ - kfree_skb_reason(skb, drop_reason); + sk_skb_reason_drop(sk, skb, drop_reason); return 0; short_packet: @@ -2470,7 +2486,7 @@ csum_error: __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE); drop: __UDP_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); - kfree_skb_reason(skb, drop_reason); + sk_skb_reason_drop(sk, skb, drop_reason); return 0; } @@ -2697,8 +2713,6 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, #ifdef CONFIG_XFRM case UDP_ENCAP_ESPINUDP: set_xfrm_gro_udp_encap_rcv(val, sk->sk_family, sk); - fallthrough; - case UDP_ENCAP_ESPINUDP_NON_IKE: #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) WRITE_ONCE(up->encap_rcv, |
