author     Dave Airlie <airlied@redhat.com>   2023-10-23 18:20:06 +1000
committer  Dave Airlie <airlied@redhat.com>   2023-10-23 18:20:06 +1000
commit     7cd62eab9babd1fed9c497141650b31168f4f430 (patch)
tree       85ef2122ca0d88b2b598f45eb071fcd1f4faa3d2 /net/ipv4
parent     035fdc38c1f6ddce3544d8a489548e6cc4de508a (diff)
parent     05d3ef8bba77c1b5f98d941d8b2d4aeab8118ef1 (diff)
BackMerge tag 'v6.6-rc7' into drm-next
This is needed to add the msm PR, which is based on a newer base.
Signed-off-by: Dave Airlie <airlied@redhat.com>
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/af_inet.c              | 10
-rw-r--r--  net/ipv4/esp4.c                 |  4
-rw-r--r--  net/ipv4/fib_semantics.c        | 15
-rw-r--r--  net/ipv4/fib_trie.c             |  4
-rw-r--r--  net/ipv4/inet_connection_sock.c |  1
-rw-r--r--  net/ipv4/inet_hashtables.c      | 24
-rw-r--r--  net/ipv4/route.c                |  6
-rw-r--r--  net/ipv4/tcp.c                  | 26
-rw-r--r--  net/ipv4/tcp_bpf.c              | 16
-rw-r--r--  net/ipv4/tcp_input.c            | 13
-rw-r--r--  net/ipv4/tcp_ipv4.c             |  1
-rw-r--r--  net/ipv4/tcp_output.c           | 33
-rw-r--r--  net/ipv4/tcp_recovery.c         |  2
13 files changed, 102 insertions, 53 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 3d2e30e20473..2713c9b06c4c 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -597,7 +597,6 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
 
         add_wait_queue(sk_sleep(sk), &wait);
         sk->sk_write_pending += writebias;
-        sk->sk_wait_pending++;
 
         /* Basic assumption: if someone sets sk->sk_err, he _must_
          * change state of the socket from TCP_SYN_*.
@@ -613,7 +612,6 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
         }
         remove_wait_queue(sk_sleep(sk), &wait);
         sk->sk_write_pending -= writebias;
-        sk->sk_wait_pending--;
         return timeo;
 }
 
@@ -642,6 +640,7 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
                         return -EINVAL;
 
                 if (uaddr->sa_family == AF_UNSPEC) {
+                        sk->sk_disconnects++;
                         err = sk->sk_prot->disconnect(sk, flags);
                         sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
                         goto out;
@@ -696,6 +695,7 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
                 int writebias = (sk->sk_protocol == IPPROTO_TCP) &&
                                 tcp_sk(sk)->fastopen_req &&
                                 tcp_sk(sk)->fastopen_req->data ? 1 : 0;
+                int dis = sk->sk_disconnects;
 
                 /* Error code is set above */
                 if (!timeo || !inet_wait_for_connect(sk, timeo, writebias))
@@ -704,6 +704,11 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
                 err = sock_intr_errno(timeo);
                 if (signal_pending(current))
                         goto out;
+
+                if (dis != sk->sk_disconnects) {
+                        err = -EPIPE;
+                        goto out;
+                }
         }
 
         /* Connection was closed by RST, timeout, ICMP error
@@ -725,6 +730,7 @@ out:
 sock_error:
         err = sock_error(sk) ? : -ECONNABORTED;
         sock->state = SS_UNCONNECTED;
+        sk->sk_disconnects++;
         if (sk->sk_prot->disconnect(sk, flags))
                 sock->state = SS_DISCONNECTING;
         goto out;
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 2be2d4922557..d18f0f092fe7 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -732,7 +732,9 @@ static inline int esp_remove_trailer(struct sk_buff *skb)
                 skb->csum = csum_block_sub(skb->csum, csumdiff,
                                            skb->len - trimlen);
         }
-        pskb_trim(skb, skb->len - trimlen);
+        ret = pskb_trim(skb, skb->len - trimlen);
+        if (unlikely(ret))
+                return ret;
 
         ret = nexthdr[1];
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index eafa4a033515..5eb1b8d302bb 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1325,15 +1325,18 @@ __be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc,
                                  unsigned char scope)
 {
         struct fib_nh *nh;
+        __be32 saddr;
 
         if (nhc->nhc_family != AF_INET)
                 return inet_select_addr(nhc->nhc_dev, 0, scope);
 
         nh = container_of(nhc, struct fib_nh, nh_common);
-        nh->nh_saddr = inet_select_addr(nh->fib_nh_dev, nh->fib_nh_gw4, scope);
-        nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid);
+        saddr = inet_select_addr(nh->fib_nh_dev, nh->fib_nh_gw4, scope);
 
-        return nh->nh_saddr;
+        WRITE_ONCE(nh->nh_saddr, saddr);
+        WRITE_ONCE(nh->nh_saddr_genid, atomic_read(&net->ipv4.dev_addr_genid));
+
+        return saddr;
 }
 
 __be32 fib_result_prefsrc(struct net *net, struct fib_result *res)
@@ -1347,8 +1350,9 @@ __be32 fib_result_prefsrc(struct net *net, struct fib_result *res)
                 struct fib_nh *nh;
 
                 nh = container_of(nhc, struct fib_nh, nh_common);
-                if (nh->nh_saddr_genid == atomic_read(&net->ipv4.dev_addr_genid))
-                        return nh->nh_saddr;
+                if (READ_ONCE(nh->nh_saddr_genid) ==
+                    atomic_read(&net->ipv4.dev_addr_genid))
+                        return READ_ONCE(nh->nh_saddr);
         }
 
         return fib_info_update_nhc_saddr(net, nhc, res->fi->fib_scope);
@@ -1887,6 +1891,7 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local)
                         continue;
                 if (fi->fib_prefsrc == local) {
                         fi->fib_flags |= RTNH_F_DEAD;
+                        fi->pfsrc_removed = true;
                         ret++;
                 }
         }
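A note on the fib_semantics.c hunk above: nh_saddr is a lockless cache validated by a generation counter, so the fix computes into a local and publishes value then generation with WRITE_ONCE(), paired with READ_ONCE() on the reader side. Below is a minimal userspace sketch of the same generation-validated cache, using C11 atomics in place of the kernel's (weaker) READ_ONCE/WRITE_ONCE annotations; all names are illustrative, not kernel API:

    #include <stdatomic.h>
    #include <stdio.h>

    /* Illustrative stand-ins for the kernel's dev_addr_genid / nh_saddr pair. */
    static _Atomic unsigned int dev_addr_genid; /* bumped on address changes */
    static _Atomic unsigned int cached_genid;
    static _Atomic unsigned int cached_saddr;

    /* Pretend source-address lookup (inet_select_addr() in the kernel). */
    static unsigned int recompute_saddr(void)
    {
        return 0x0a000001; /* 10.0.0.1 */
    }

    /* Writer, as in the patch: compute into a local, publish the value
     * before the generation, so a reader that sees a current generation
     * also sees a source address at least that fresh. */
    static unsigned int update_saddr(void)
    {
        unsigned int saddr = recompute_saddr();

        atomic_store(&cached_saddr, saddr);
        atomic_store(&cached_genid, atomic_load(&dev_addr_genid));
        return saddr;
    }

    /* Reader: trust the cache only while the generation still matches. */
    static unsigned int get_saddr(void)
    {
        if (atomic_load(&cached_genid) == atomic_load(&dev_addr_genid))
            return atomic_load(&cached_saddr);
        return update_saddr();
    }

    int main(void)
    {
        printf("%#x\n", get_saddr());
        atomic_fetch_add(&dev_addr_genid, 1); /* simulate an address change */
        printf("%#x\n", get_saddr());
        return 0;
    }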
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index d13fb9e76b97..9bdfdab906fe 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2027,6 +2027,7 @@ void fib_table_flush_external(struct fib_table *tb)
 int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all)
 {
         struct trie *t = (struct trie *)tb->tb_data;
+        struct nl_info info = { .nl_net = net };
         struct key_vector *pn = t->kv;
         unsigned long cindex = 1;
         struct hlist_node *tmp;
@@ -2089,6 +2090,9 @@ int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all)
 
                         fib_notify_alias_delete(net, n->key, &n->leaf, fa,
                                                 NULL);
+                        if (fi->pfsrc_removed)
+                                rtmsg_fib(RTM_DELROUTE, htonl(n->key), fa,
+                                          KEYLENGTH - fa->fa_slen, tb->tb_id, &info, 0);
                         hlist_del_rcu(&fa->fa_list);
                         fib_release_info(fa->fa_info);
                         alias_free_mem_rcu(fa);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index aeebe8816689..394a498c2823 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -1145,7 +1145,6 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
         if (newsk) {
                 struct inet_connection_sock *newicsk = inet_csk(newsk);
 
-                newsk->sk_wait_pending = 0;
                 inet_sk_set_state(newsk, TCP_SYN_RECV);
                 newicsk->icsk_bind_hash = NULL;
                 newicsk->icsk_bind2_hash = NULL;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index c32f5e28758b..598c1b114d2c 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -149,8 +149,14 @@ static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2,
                                          const struct sock *sk)
 {
 #if IS_ENABLED(CONFIG_IPV6)
-        if (sk->sk_family != tb2->family)
-                return false;
+        if (sk->sk_family != tb2->family) {
+                if (sk->sk_family == AF_INET)
+                        return ipv6_addr_v4mapped(&tb2->v6_rcv_saddr) &&
+                                tb2->v6_rcv_saddr.s6_addr32[3] == sk->sk_rcv_saddr;
+
+                return ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr) &&
+                        sk->sk_v6_rcv_saddr.s6_addr32[3] == tb2->rcv_saddr;
+        }
 
         if (sk->sk_family == AF_INET6)
                 return ipv6_addr_equal(&tb2->v6_rcv_saddr,
@@ -819,19 +825,7 @@ static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb,
             tb->l3mdev != l3mdev)
                 return false;
 
-#if IS_ENABLED(CONFIG_IPV6)
-        if (sk->sk_family != tb->family) {
-                if (sk->sk_family == AF_INET)
-                        return ipv6_addr_v4mapped(&tb->v6_rcv_saddr) &&
-                                tb->v6_rcv_saddr.s6_addr32[3] == sk->sk_rcv_saddr;
-
-                return false;
-        }
-
-        if (sk->sk_family == AF_INET6)
-                return ipv6_addr_equal(&tb->v6_rcv_saddr, &sk->sk_v6_rcv_saddr);
-#endif
-        return tb->rcv_saddr == sk->sk_rcv_saddr;
+        return inet_bind2_bucket_addr_match(tb, sk);
 }
 
 bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net,
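For the inet_hashtables.c change above: bucket matching is now symmetric for IPv4-mapped IPv6 addresses, where ::ffff:a.b.c.d carries the IPv4 address in the last 32 bits (s6_addr32[3] in kernel terms). A standalone userspace check of that layout, using the POSIX IN6_IS_ADDR_V4MAPPED() macro rather than the kernel's ipv6_addr_v4mapped(); names and setup are illustrative only:

    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Does the v6 bucket address match a v4 socket address (both network
     * byte order)? Rough analogue of the kernel's v4-mapped comparison. */
    static bool v4_matches_v6_bucket(uint32_t v4_be, const struct in6_addr *a6)
    {
        uint32_t tail;

        if (!IN6_IS_ADDR_V4MAPPED(a6)) /* 80 zero bits, then 0xffff */
            return false;
        memcpy(&tail, &a6->s6_addr[12], sizeof(tail)); /* == s6_addr32[3] */
        return tail == v4_be;
    }

    int main(void)
    {
        struct in6_addr mapped;
        struct in_addr v4;

        inet_pton(AF_INET, "192.0.2.1", &v4);
        inet_pton(AF_INET6, "::ffff:192.0.2.1", &mapped);
        printf("match: %d\n", v4_matches_v6_bucket(v4.s_addr, &mapped)); /* 1 */
        return 0;
    }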
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 66f419e7f9a7..b214b5a2e045 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1213,6 +1213,7 @@ EXPORT_INDIRECT_CALLABLE(ipv4_dst_check);
 
 static void ipv4_send_dest_unreach(struct sk_buff *skb)
 {
+        struct net_device *dev;
         struct ip_options opt;
         int res;
 
@@ -1230,7 +1231,8 @@ static void ipv4_send_dest_unreach(struct sk_buff *skb)
                 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
 
                 rcu_read_lock();
-                res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
+                dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
+                res = __ip_options_compile(dev_net(dev), &opt, skb, NULL);
                 rcu_read_unlock();
 
                 if (res)
@@ -3415,6 +3417,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
                             fa->fa_type == fri.type) {
                                 fri.offload = READ_ONCE(fa->offload);
                                 fri.trap = READ_ONCE(fa->trap);
+                                fri.offload_failed =
+                                        READ_ONCE(fa->offload_failed);
                                 break;
                         }
                 }
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0c3040a63ebd..d3456cf840de 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -831,7 +831,9 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
                                  */
                                 if (!skb_queue_empty(&sk->sk_receive_queue))
                                         break;
-                                sk_wait_data(sk, &timeo, NULL);
+                                ret = sk_wait_data(sk, &timeo, NULL);
+                                if (ret < 0)
+                                        break;
                                 if (signal_pending(current)) {
                                         ret = sock_intr_errno(timeo);
                                         break;
@@ -1621,16 +1623,13 @@ EXPORT_SYMBOL(tcp_read_sock);
 
 int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
 {
-        struct tcp_sock *tp = tcp_sk(sk);
-        u32 seq = tp->copied_seq;
         struct sk_buff *skb;
         int copied = 0;
-        u32 offset;
 
         if (sk->sk_state == TCP_LISTEN)
                 return -ENOTCONN;
 
-        while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
+        while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
                 u8 tcp_flags;
                 int used;
 
@@ -1643,13 +1642,10 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
                         copied = used;
                         break;
                 }
-                seq += used;
                 copied += used;
 
-                if (tcp_flags & TCPHDR_FIN) {
-                        ++seq;
+                if (tcp_flags & TCPHDR_FIN)
                         break;
-                }
         }
         return copied;
 }
@@ -2448,7 +2444,11 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
                         __sk_flush_backlog(sk);
                 } else {
                         tcp_cleanup_rbuf(sk, copied);
-                        sk_wait_data(sk, &timeo, last);
+                        err = sk_wait_data(sk, &timeo, last);
+                        if (err < 0) {
+                                err = copied ? : err;
+                                goto out;
+                        }
                 }
 
                 if ((flags & MSG_PEEK) &&
@@ -2972,12 +2972,6 @@ int tcp_disconnect(struct sock *sk, int flags)
         int old_state = sk->sk_state;
         u32 seq;
 
-        /* Deny disconnect if other threads are blocked in sk_wait_event()
-         * or inet_wait_for_connect().
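In the tcp_recvmsg_locked() hunk above, err = copied ? : err; uses GNU C's omitted-middle-operand conditional, equivalent to copied ? copied : err: a wait failure is surfaced only when nothing was copied yet, otherwise the partial byte count is returned. A tiny sketch of that return convention (function name illustrative):

    #include <stdio.h>

    /* Result convention of the patched tcp_recvmsg_locked(): a wait error
     * becomes the return value only if no bytes were copied; otherwise the
     * partial count wins. (Kernel spelling: copied ? : wait_err) */
    static int recv_result(int copied, int wait_err)
    {
        return copied ? copied : wait_err;
    }

    int main(void)
    {
        printf("%d\n", recv_result(128, -32)); /* partial read -> 128 */
        printf("%d\n", recv_result(0, -32));   /* nothing read -> -32 (EPIPE) */
        return 0;
    }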
-         */
-        if (sk->sk_wait_pending)
-                return -EBUSY;
-
         if (old_state != TCP_CLOSE)
                 tcp_set_state(sk, TCP_CLOSE);
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 81f0dff69e0b..53b0d62fd2c2 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -222,6 +222,7 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk,
                                   int *addr_len)
 {
         struct tcp_sock *tcp = tcp_sk(sk);
+        int peek = flags & MSG_PEEK;
         u32 seq = tcp->copied_seq;
         struct sk_psock *psock;
         int copied = 0;
@@ -306,15 +307,22 @@ msg_bytes_ready:
                 }
 
                 data = tcp_msg_wait_data(sk, psock, timeo);
+                if (data < 0) {
+                        copied = data;
+                        goto unlock;
+                }
                 if (data && !sk_psock_queue_empty(psock))
                         goto msg_bytes_ready;
                 copied = -EAGAIN;
         }
 out:
-        WRITE_ONCE(tcp->copied_seq, seq);
+        if (!peek)
+                WRITE_ONCE(tcp->copied_seq, seq);
         tcp_rcv_space_adjust(sk);
         if (copied > 0)
                 __tcp_cleanup_rbuf(sk, copied);
+
+unlock:
         release_sock(sk);
         sk_psock_put(sk, psock);
         return copied;
@@ -349,6 +357,10 @@ msg_bytes_ready:
                 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
                 data = tcp_msg_wait_data(sk, psock, timeo);
+                if (data < 0) {
+                        ret = data;
+                        goto unlock;
+                }
                 if (data) {
                         if (!sk_psock_queue_empty(psock))
                                 goto msg_bytes_ready;
@@ -359,6 +371,8 @@ msg_bytes_ready:
                 copied = -EAGAIN;
         }
         ret = copied;
+
+unlock:
         release_sock(sk);
         sk_psock_put(sk, psock);
         return ret;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 06fe1cf645d5..8afb0950a697 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -253,6 +253,19 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
                 if (unlikely(len > icsk->icsk_ack.rcv_mss +
                                    MAX_TCP_OPTION_SPACE))
                         tcp_gro_dev_warn(sk, skb, len);
+                /* If the skb has a len of exactly 1*MSS and has the PSH bit
+                 * set then it is likely the end of an application write. So
+                 * more data may not be arriving soon, and yet the data sender
+                 * may be waiting for an ACK if cwnd-bound or using TX zero
+                 * copy. So we set ICSK_ACK_PUSHED here so that
+                 * tcp_cleanup_rbuf() will send an ACK immediately if the app
+                 * reads all of the data and is not ping-pong. If len > MSS
+                 * then this logic does not matter (and does not hurt) because
+                 * tcp_cleanup_rbuf() will always ACK immediately if the app
+                 * reads data and there is more than an MSS of unACKed data.
+                 */
+                if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_PSH)
+                        icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
         } else {
                 /* Otherwise, we make more careful check taking into account,
                  * that SACKs block is variable.
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 27140e5cdc06..4167e8a48b60 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1869,6 +1869,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
 #ifdef CONFIG_TLS_DEVICE
             tail->decrypted != skb->decrypted ||
 #endif
+            !mptcp_skb_can_collapse(tail, skb) ||
             thtail->doff != th->doff ||
             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
                 goto no_coalesce;
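The tcp_bpf.c fix above keeps copied_seq in place for MSG_PEEK reads, which is the normal socket contract: peeked bytes must remain readable. The contract itself can be seen with plain sockets; here an AF_UNIX socketpair stands in for a TCP socket with queued data (illustrative, not the patched code path):

    #include <stdio.h>
    #include <sys/socket.h>
    #include <unistd.h>

    int main(void)
    {
        int sv[2];
        char buf[16];
        ssize_t n1, n2;

        if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) < 0)
            return 1;
        write(sv[0], "hello", 5);

        /* MSG_PEEK must not consume: the same bytes remain readable. */
        n1 = recv(sv[1], buf, sizeof(buf), MSG_PEEK);
        n2 = recv(sv[1], buf, sizeof(buf), 0);
        printf("peeked %zd, then read %zd\n", n1, n2); /* 5, then 5 */

        close(sv[0]);
        close(sv[1]);
        return 0;
    }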
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index ccfc8bbf7455..f0723460753c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -177,8 +177,7 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
 }
 
 /* Account for an ACK we sent. */
-static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
-                                      u32 rcv_nxt)
+static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt)
 {
         struct tcp_sock *tp = tcp_sk(sk);
 
@@ -192,7 +191,7 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
 
         if (unlikely(rcv_nxt != tp->rcv_nxt))
                 return;  /* Special ACK sent by DCTCP to reflect ECN */
-        tcp_dec_quickack_mode(sk, pkts);
+        tcp_dec_quickack_mode(sk);
         inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
 }
 
@@ -1387,7 +1386,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
                                sk, skb);
 
         if (likely(tcb->tcp_flags & TCPHDR_ACK))
-                tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);
+                tcp_event_ack_sent(sk, rcv_nxt);
 
         if (skb->len != tcp_header_size) {
                 tcp_event_data_sent(tp, sk);
@@ -2457,6 +2456,7 @@ static int tcp_mtu_probe(struct sock *sk)
 
         /* build the payload, and be prepared to abort if this fails. */
         if (tcp_clone_payload(sk, nskb, probe_size)) {
+                tcp_skb_tsorted_anchor_cleanup(nskb);
                 consume_skb(nskb);
                 return -1;
         }
@@ -2542,6 +2542,18 @@ static bool tcp_pacing_check(struct sock *sk)
         return true;
 }
 
+static bool tcp_rtx_queue_empty_or_single_skb(const struct sock *sk)
+{
+        const struct rb_node *node = sk->tcp_rtx_queue.rb_node;
+
+        /* No skb in the rtx queue. */
+        if (!node)
+                return true;
+
+        /* Only one skb in rtx queue. */
+        return !node->rb_left && !node->rb_right;
+}
+
 /* TCP Small Queues :
  * Control number of packets in qdisc/devices to two packets / or ~1 ms.
  * (These limits are doubled for retransmits)
@@ -2579,12 +2591,12 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
                 limit += extra_bytes;
         }
         if (refcount_read(&sk->sk_wmem_alloc) > limit) {
-                /* Always send skb if rtx queue is empty.
+                /* Always send skb if rtx queue is empty or has one skb.
                  * No need to wait for TX completion to call us back,
                  * after softirq/tasklet schedule.
                  * This helps when TX completions are delayed too much.
                  */
-                if (tcp_rtx_queue_empty(sk))
+                if (tcp_rtx_queue_empty_or_single_skb(sk))
                         return false;
 
                 set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
@@ -2788,7 +2800,7 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
 {
         struct inet_connection_sock *icsk = inet_csk(sk);
         struct tcp_sock *tp = tcp_sk(sk);
-        u32 timeout, rto_delta_us;
+        u32 timeout, timeout_us, rto_delta_us;
         int early_retrans;
 
         /* Don't do any loss probe on a Fast Open connection before 3WHS
@@ -2812,11 +2824,12 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
          * sample is available then probe after TCP_TIMEOUT_INIT.
          */
         if (tp->srtt_us) {
-                timeout = usecs_to_jiffies(tp->srtt_us >> 2);
+                timeout_us = tp->srtt_us >> 2;
                 if (tp->packets_out == 1)
-                        timeout += TCP_RTO_MIN;
+                        timeout_us += tcp_rto_min_us(sk);
                 else
-                        timeout += TCP_TIMEOUT_MIN;
+                        timeout_us += TCP_TIMEOUT_MIN_US;
+                timeout = usecs_to_jiffies(timeout_us);
         } else {
                 timeout = TCP_TIMEOUT_INIT;
         }
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index acf4869c5d3b..bba10110fbbc 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -104,7 +104,7 @@ bool tcp_rack_mark_lost(struct sock *sk)
         tp->rack.advanced = 0;
         tcp_rack_detect_loss(sk, &timeout);
         if (timeout) {
-                timeout = usecs_to_jiffies(timeout) + TCP_TIMEOUT_MIN;
+                timeout = usecs_to_jiffies(timeout + TCP_TIMEOUT_MIN_US);
                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT,
                                           timeout, inet_csk(sk)->icsk_rto);
         }
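On the tcp_recovery.c one-liner above: the minimum-delay floor is now added in microseconds before the jiffies conversion instead of as a 2-jiffy addend afterwards; since a jiffy can be 10 ms at HZ=100, the old form inflated short RACK reorder timeouts. A rough model of the difference, assuming TCP_TIMEOUT_MIN is 2 jiffies, TCP_TIMEOUT_MIN_US is 2000 us, and that usecs_to_jiffies() rounds up:

    #include <stdio.h>

    /* Model of usecs_to_jiffies() (rounds up) for a configurable HZ. */
    static unsigned long us_to_jiffies(unsigned long us, unsigned long hz)
    {
        unsigned long us_per_jiffy = 1000000UL / hz;

        return (us + us_per_jiffy - 1) / us_per_jiffy;
    }

    int main(void)
    {
        unsigned long timeout_us = 100; /* a short RACK reorder timeout */
        unsigned long hz = 100;         /* 10 ms per jiffy */

        /* Old: convert, then add a 2-jiffy floor. */
        unsigned long old_j = us_to_jiffies(timeout_us, hz) + 2;
        /* New: add the 2000 us floor, then convert. */
        unsigned long new_j = us_to_jiffies(timeout_us + 2000, hz);

        printf("old=%lu jiffies, new=%lu jiffies\n", old_j, new_j);
        return 0;
    }

With a 100 us timeout at HZ=100 this prints old=3 jiffies (30 ms) versus new=1 jiffy (10 ms), which is the inflation the patch removes.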