From 02db55718d53f9d426cee504c27fb768e9ed4ffe Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 10 Dec 2017 17:55:02 -0800 Subject: tcp: do not overshoot window_clamp in tcp_rcv_space_adjust() While rcvbuf is properly clamped by tcp_rmem[2], rcvwin is left to a potentially too big value. It has no serious effect, since : 1) tcp_grow_window() has very strict checks. 2) window_clamp can be mangled by user space to any value anyway. tcp_init_buffer_space() and companions use tcp_full_space(), we use tcp_win_from_space() to avoid reloading sk->sk_rcvbuf Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Wei Wang Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9550cc42de2d..746a6773c482 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -631,7 +631,7 @@ void tcp_rcv_space_adjust(struct sock *sk) sk->sk_rcvbuf = rcvbuf; /* Make the window clamp follow along. */ - tp->window_clamp = rcvwin; + tp->window_clamp = tcp_win_from_space(sk, rcvbuf); } } tp->rcvq_space.space = copied; -- cgit v1.2.3-70-g09d2 From 607065bad9931e72207b0cac365d7d4abc06bd99 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 10 Dec 2017 17:55:03 -0800 Subject: tcp: avoid integer overflows in tcp_rcv_space_adjust() When using large tcp_rmem[2] values (I did tests with 500 MB), I noticed overflows while computing rcvwin. Lets fix this before the following patch. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Wei Wang Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/linux/tcp.h | 2 +- net/ipv4/tcp_input.c | 12 +++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index ca4a6361389b..4f93f0953c41 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -344,7 +344,7 @@ struct tcp_sock { /* Receiver queue space */ struct { - int space; + u32 space; u32 seq; u64 time; } rcvq_space; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 746a6773c482..2900e58738cd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -576,8 +576,8 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, void tcp_rcv_space_adjust(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + u32 copied; int time; - int copied; tcp_mstamp_refresh(tp); time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time); @@ -600,12 +600,13 @@ void tcp_rcv_space_adjust(struct sock *sk) if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - int rcvwin, rcvmem, rcvbuf; + int rcvmem, rcvbuf; + u64 rcvwin; /* minimal window to cope with packet losses, assuming * steady state. Add some cushion because of small variations. */ - rcvwin = (copied << 1) + 16 * tp->advmss; + rcvwin = ((u64)copied << 1) + 16 * tp->advmss; /* If rate increased by 25%, * assume slow start, rcvwin = 3 * copied @@ -625,8 +626,9 @@ void tcp_rcv_space_adjust(struct sock *sk) while (tcp_win_from_space(sk, rcvmem) < tp->advmss) rcvmem += 128; - rcvbuf = min(rcvwin / tp->advmss * rcvmem, - sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); + do_div(rcvwin, tp->advmss); + rcvbuf = min_t(u64, rcvwin * rcvmem, + sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); if (rcvbuf > sk->sk_rcvbuf) { sk->sk_rcvbuf = rcvbuf; -- cgit v1.2.3-70-g09d2 From c3916ad9320eed8eacd7c0b2cf7f881efceda892 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 10 Dec 2017 17:55:04 -0800 Subject: tcp: smoother receiver autotuning Back in linux-3.13 (commit b0983d3c9b13 ("tcp: fix dynamic right sizing")) I addressed the pressing issues we had with receiver autotuning. But DRS suffers from extra latencies caused by rcv_rtt_est.rtt_us drifts. One common problem happens during slow start, since the apparent RTT measured by the receiver can be inflated by ~50%, at the end of one packet train. Also, a single drop can delay read() calls by one RTT, meaning tcp_rcv_space_adjust() can be called one RTT too late. By replacing the tri-modal heuristic with a continuous function, we can offset the effects of not growing 'at the optimal time'. The curve of the function matches prior behavior if the space increased by 25% and 50% exactly. Cost of added multiply/divide is small, considering a TCP flow typically would run this part of the code few times in its life. I tested this patch with 100 ms RTT / 1% loss link, 100 runs of (netperf -l 5), and got an average throughput of 4600 Mbit instead of 1700 Mbit. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Wei Wang Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2900e58738cd..fefb46c16de7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -601,26 +601,17 @@ void tcp_rcv_space_adjust(struct sock *sk) if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { int rcvmem, rcvbuf; - u64 rcvwin; + u64 rcvwin, grow; /* minimal window to cope with packet losses, assuming * steady state. Add some cushion because of small variations. */ rcvwin = ((u64)copied << 1) + 16 * tp->advmss; - /* If rate increased by 25%, - * assume slow start, rcvwin = 3 * copied - * If rate increased by 50%, - * assume sender can use 2x growth, rcvwin = 4 * copied - */ - if (copied >= - tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) { - if (copied >= - tp->rcvq_space.space + (tp->rcvq_space.space >> 1)) - rcvwin <<= 1; - else - rcvwin += (rcvwin >> 1); - } + /* Accommodate for sender rate increase (eg. slow start) */ + grow = rcvwin * (copied - tp->rcvq_space.space); + do_div(grow, tp->rcvq_space.space); + rcvwin += (grow << 1); rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); while (tcp_win_from_space(sk, rcvmem) < tp->advmss) -- cgit v1.2.3-70-g09d2 From c3fde1bd28f7c720d7bc587e85e54706df4f8163 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 29 Dec 2017 11:45:51 +0900 Subject: net: tcp: Add trace events for TCP congestion window tracing This adds an event to trace TCP stat variables with slightly intrusive trace-event. This uses ftrace/perf event log buffer to trace those state, no needs to prepare own ring-buffer, nor custom user apps. User can use ftrace to trace this event as below; # cd /sys/kernel/debug/tracing # echo 1 > events/tcp/tcp_probe/enable (run workloads) # cat trace Signed-off-by: Masami Hiramatsu Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 97 ++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_input.c | 3 ++ 2 files changed, 100 insertions(+) (limited to 'net/ipv4/tcp_input.c') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index bb00459d2d4d..b5ae3fbb74c8 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM tcp @@ -8,6 +9,7 @@ #include #include #include +#include #define TP_STORE_V4MAPPED(__entry, saddr, daddr) \ do { \ @@ -254,6 +256,101 @@ TRACE_EVENT(tcp_retransmit_synack, __entry->saddr_v6, __entry->daddr_v6) ); + +#define TP_STORE_ADDR_PORTS_V4(__entry, inet, sk) \ + do { \ + struct sockaddr_in *v4 = (void *)__entry->saddr; \ + \ + v4->sin_family = AF_INET; \ + v4->sin_port = inet->inet_sport; \ + v4->sin_addr.s_addr = inet->inet_saddr; \ + v4 = (void *)__entry->daddr; \ + v4->sin_family = AF_INET; \ + v4->sin_port = inet->inet_dport; \ + v4->sin_addr.s_addr = inet->inet_daddr; \ + } while (0) + +#if IS_ENABLED(CONFIG_IPV6) + +#define TP_STORE_ADDR_PORTS(__entry, inet, sk) \ + do { \ + if (sk->sk_family == AF_INET6) { \ + struct sockaddr_in6 *v6 = (void *)__entry->saddr; \ + \ + v6->sin6_family = AF_INET6; \ + v6->sin6_port = inet->inet_sport; \ + v6->sin6_addr = inet6_sk(sk)->saddr; \ + v6 = (void *)__entry->daddr; \ + v6->sin6_family = AF_INET6; \ + v6->sin6_port = inet->inet_dport; \ + v6->sin6_addr = sk->sk_v6_daddr; \ + } else \ + TP_STORE_ADDR_PORTS_V4(__entry, inet, sk); \ + } while (0) + +#else + +#define TP_STORE_ADDR_PORTS(__entry, inet, sk) \ + TP_STORE_ADDR_PORTS_V4(__entry, inet, sk); + +#endif + +TRACE_EVENT(tcp_probe, + + TP_PROTO(struct sock *sk, struct sk_buff *skb), + + TP_ARGS(sk, skb), + + TP_STRUCT__entry( + /* sockaddr_in6 is always bigger than sockaddr_in */ + __array(__u8, saddr, sizeof(struct sockaddr_in6)) + __array(__u8, daddr, sizeof(struct sockaddr_in6)) + __field(__u16, sport) + __field(__u16, dport) + __field(__u32, mark) + __field(__u16, length) + __field(__u32, snd_nxt) + __field(__u32, snd_una) + __field(__u32, snd_cwnd) + __field(__u32, ssthresh) + __field(__u32, snd_wnd) + __field(__u32, srtt) + __field(__u32, rcv_wnd) + ), + + TP_fast_assign( + const struct tcp_sock *tp = tcp_sk(sk); + const struct inet_sock *inet = inet_sk(sk); + + memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); + memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); + + TP_STORE_ADDR_PORTS(__entry, inet, sk); + + /* For filtering use */ + __entry->sport = ntohs(inet->inet_sport); + __entry->dport = ntohs(inet->inet_dport); + __entry->mark = skb->mark; + + __entry->length = skb->len; + __entry->snd_nxt = tp->snd_nxt; + __entry->snd_una = tp->snd_una; + __entry->snd_cwnd = tp->snd_cwnd; + __entry->snd_wnd = tp->snd_wnd; + __entry->rcv_wnd = tp->rcv_wnd; + __entry->ssthresh = tcp_current_ssthresh(sk); + __entry->srtt = tp->srtt_us >> 3; + ), + + TP_printk("src=%pISpc dest=%pISpc mark=%#x length=%d snd_nxt=%#x " + "snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u " + "rcv_wnd=%u", + __entry->saddr, __entry->daddr, __entry->mark, + __entry->length, __entry->snd_nxt, __entry->snd_una, + __entry->snd_cwnd, __entry->ssthresh, __entry->snd_wnd, + __entry->srtt, __entry->rcv_wnd) +); + #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4d55c4b338ee..ff71b18d9682 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5299,6 +5299,9 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, unsigned int len = skb->len; struct tcp_sock *tp = tcp_sk(sk); + /* TCP congestion window tracking */ + trace_tcp_probe(sk, skb); + tcp_mstamp_refresh(tp); if (unlikely(!sk->sk_rx_dst)) inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); -- cgit v1.2.3-70-g09d2 From eb36be0fd55e0a6f2cb3226acd711b2c7a2d7d09 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 17 Jan 2018 12:11:00 -0800 Subject: tcp: avoid min-RTT overestimation from delayed ACKs This patch avoids having TCP sender or congestion control overestimate the min RTT by orders of magnitude. This happens when all the samples in the windowed filter are one-packet transfer like small request and health-check like chit-chat, which is farily common for applications using persistent connections. This patch tries to conservatively labels and skip RTT samples obtained from this type of workload. Signed-off-by: Yuchung Cheng Signed-off-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ff71b18d9682..2c6797134553 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -97,6 +97,7 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE; #define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ #define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ #define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */ +#define FLAG_ACK_MAYBE_DELAYED 0x10000 /* Likely a delayed ACK */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) @@ -2857,11 +2858,18 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, *rexmit = REXMIT_LOST; } -static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) +static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag) { u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ; struct tcp_sock *tp = tcp_sk(sk); + if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) { + /* If the remote keeps returning delayed ACKs, eventually + * the min filter would pick it up and overestimate the + * prop. delay when it expires. Skip suspected delayed ACKs. + */ + return; + } minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32, rtt_us ? : jiffies_to_usecs(1)); } @@ -2901,7 +2909,7 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag, * always taken together with ACK, SACK, or TS-opts. Any negative * values will be skipped with the seq_rtt_us < 0 check above. */ - tcp_update_rtt_min(sk, ca_rtt_us); + tcp_update_rtt_min(sk, ca_rtt_us, flag); tcp_rtt_estimator(sk, seq_rtt_us); tcp_set_rto(sk); @@ -3125,6 +3133,17 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) { seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt); ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt); + + if (pkts_acked == 1 && last_in_flight < tp->mss_cache && + last_in_flight && !prior_sacked && fully_acked && + sack->rate->prior_delivered + 1 == tp->delivered && + !(flag & (FLAG_CA_ALERT | FLAG_SYN_ACKED))) { + /* Conservatively mark a delayed ACK. It's typically + * from a lone runt packet over the round trip to + * a receiver w/o out-of-order or CE events. + */ + flag |= FLAG_ACK_MAYBE_DELAYED; + } } if (sack->first_sackt) { sack_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->first_sackt); -- cgit v1.2.3-70-g09d2 From e42866031ff03c89a5bdd2056c76dd6cb41c3d35 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 17 Jan 2018 12:11:01 -0800 Subject: tcp: avoid min RTT bloat by skipping RTT from delayed-ACK in BBR A persistent connection may send tiny amount of data (e.g. health-check) for a long period of time. BBR's windowed min RTT filter may only see RTT samples from delayed ACKs causing BBR to grossly over-estimate the path delay depending how much the ACK was delayed at the receiver. This patch skips RTT samples that are likely coming from delayed ACKs. Note that it is possible the sender never obtains a valid measure to set the min RTT. In this case BBR will continue to set cwnd to initial window which seems fine because the connection is thin stream. Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Acked-by: Priyaranjan Jha Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + net/ipv4/tcp_bbr.c | 3 ++- net/ipv4/tcp_input.c | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index 6939e69d3c37..5a1d26a18599 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -953,6 +953,7 @@ struct rate_sample { u32 prior_in_flight; /* in flight before this ACK */ bool is_app_limited; /* is sample from packet with bubble in pipe? */ bool is_retrans; /* is sample from retransmission? */ + bool is_ack_delayed; /* is this (likely) a delayed ACK? */ }; struct tcp_congestion_ops { diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 8322f26e770e..785712be5b0d 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -766,7 +766,8 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) filter_expired = after(tcp_jiffies32, bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); if (rs->rtt_us >= 0 && - (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) { + (rs->rtt_us <= bbr->min_rtt_us || + (filter_expired && !rs->is_ack_delayed))) { bbr->min_rtt_us = rs->rtt_us; bbr->min_rtt_stamp = tcp_jiffies32; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2c6797134553..cfa51cfd2d99 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3633,6 +3633,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ lost = tp->lost - lost; /* freshly marked lost */ + rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); tcp_xmit_recovery(sk, rexmit); -- cgit v1.2.3-70-g09d2