From 00483690552c5fb6aa30bf3acb75b0ee89b4c0fd Mon Sep 17 00:00:00 2001 From: Jon Maxwell Date: Thu, 10 May 2018 16:53:51 +1000 Subject: tcp: Add mark for TIMEWAIT sockets This version has some suggestions by Eric Dumazet: - Use a local variable for the mark in IPv6 instead of ctl_sk to avoid SMP races. - Use the more elegant "IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark" statement. - Factorize code as sk_fullsock() check is not necessary. Aidan McGurn from Openwave Mobility systems reported the following bug: "Marked routing is broken on customer deployment. Its effects are large increase in Uplink retransmissions caused by the client never receiving the final ACK to their FINACK - this ACK misses the mark and routes out of the incorrect route." Currently marks are added to sk_buffs for replies when the "fwmark_reflect" sysctl is enabled. But not for TW sockets that had sk->sk_mark set via setsockopt(SO_MARK..). Fix this in IPv4/v6 by adding tw->tw_mark for TIME_WAIT sockets. Copy the the original sk->sk_mark in __inet_twsk_hashdance() to the new tw->tw_mark location. Then progate this so that the skb gets sent with the correct mark. Do the same for resets. Give the "fwmark_reflect" sysctl precedence over sk->sk_mark so that netfilter rules are still honored. Signed-off-by: Jon Maxwell Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f70586b50838..caf23de88f8a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -621,6 +621,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) struct sock *sk1 = NULL; #endif struct net *net; + struct sock *ctl_sk; /* Never send a reset in response to a reset. */ if (th->rst) @@ -723,11 +724,16 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) arg.tos = ip_hdr(skb)->tos; arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); local_bh_disable(); - ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), + ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk); + if (sk) + ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? + inet_twsk(sk)->tw_mark : sk->sk_mark; + ip_send_unicast_reply(ctl_sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); + ctl_sk->sk_mark = 0; __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); local_bh_enable(); @@ -759,6 +765,7 @@ static void tcp_v4_send_ack(const struct sock *sk, } rep; struct net *net = sock_net(sk); struct ip_reply_arg arg; + struct sock *ctl_sk; memset(&rep.th, 0, sizeof(struct tcphdr)); memset(&arg, 0, sizeof(arg)); @@ -809,11 +816,16 @@ static void tcp_v4_send_ack(const struct sock *sk, arg.tos = tos; arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); local_bh_disable(); - ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), + ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk); + if (sk) + ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? + inet_twsk(sk)->tw_mark : sk->sk_mark; + ip_send_unicast_reply(ctl_sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); + ctl_sk->sk_mark = 0; __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); local_bh_enable(); } -- cgit v1.2.3-70-g09d2 From 6d82aa242092d73c6d2e210cfaf0ebfbe6de1ccf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 17 May 2018 14:47:28 -0700 Subject: tcp: add tcp_comp_sack_delay_ns sysctl This per netns sysctl allows for TCP SACK compression fine-tuning. Its default value is 1,000,000, or 1 ms to meet TSO autosizing period. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 7 +++++++ include/net/netns/ipv4.h | 1 + net/ipv4/sysctl_net_ipv4.c | 7 +++++++ net/ipv4/tcp_input.c | 4 ++-- net/ipv4/tcp_ipv4.c | 1 + 5 files changed, 18 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index ea304a23c8d7..7ba952959bca 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -525,6 +525,13 @@ tcp_rmem - vector of 3 INTEGERs: min, default, max tcp_sack - BOOLEAN Enable select acknowledgments (SACKS). +tcp_comp_sack_delay_ns - LONG INTEGER + TCP tries to reduce number of SACK sent, using a timer + based on 5% of SRTT, capped by this sysctl, in nano seconds. + The default is 1ms, based on TSO autosizing period. + + Default : 1,000,000 ns (1 ms) + tcp_slow_start_after_idle - BOOLEAN If set, provide RFC2861 behavior and time out the congestion window after an idle period. An idle period is defined at diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 8491bc9c86b1..927318243cfa 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -160,6 +160,7 @@ struct netns_ipv4 { int sysctl_tcp_pacing_ca_ratio; int sysctl_tcp_wmem[3]; int sysctl_tcp_rmem[3]; + unsigned long sysctl_tcp_comp_sack_delay_ns; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 4b195bac8ac0..11fbfdc1566e 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1151,6 +1151,13 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &one, }, + { + .procname = "tcp_comp_sack_delay_ns", + .data = &init_net.ipv4.sysctl_tcp_comp_sack_delay_ns, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, { .procname = "udp_rmem_min", .data = &init_net.ipv4.sysctl_udp_rmem_min, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index cc2ac5346b92..6a1dae38c955 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5113,13 +5113,13 @@ send_now: if (hrtimer_is_queued(&tp->compressed_ack_timer)) return; - /* compress ack timer : 5 % of rtt, but no more than 1 ms */ + /* compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns */ rtt = tp->rcv_rtt_est.rtt_us; if (tp->srtt_us && tp->srtt_us < rtt) rtt = tp->srtt_us; - delay = min_t(unsigned long, NSEC_PER_MSEC, + delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns, rtt * (NSEC_PER_USEC >> 3)/20); sock_hold(sk); hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay), diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index caf23de88f8a..a3f4647341db 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2572,6 +2572,7 @@ static int __net_init tcp_sk_init(struct net *net) init_net.ipv4.sysctl_tcp_wmem, sizeof(init_net.ipv4.sysctl_tcp_wmem)); } + net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; -- cgit v1.2.3-70-g09d2 From 9c21d2fc41c0c8930600c9dd83eadac2e336fcfa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 17 May 2018 14:47:29 -0700 Subject: tcp: add tcp_comp_sack_nr sysctl This per netns sysctl allows for TCP SACK compression fine-tuning. This limits number of SACK that can be compressed. Using 0 disables SACK compression. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 6 ++++++ include/net/netns/ipv4.h | 1 + net/ipv4/sysctl_net_ipv4.c | 10 ++++++++++ net/ipv4/tcp_input.c | 3 ++- net/ipv4/tcp_ipv4.c | 1 + 5 files changed, 20 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 7ba952959bca..924bd51327b7 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -532,6 +532,12 @@ tcp_comp_sack_delay_ns - LONG INTEGER Default : 1,000,000 ns (1 ms) +tcp_comp_sack_nr - INTEGER + Max numer of SACK that can be compressed. + Using 0 disables SACK compression. + + Detault : 44 + tcp_slow_start_after_idle - BOOLEAN If set, provide RFC2861 behavior and time out the congestion window after an idle period. An idle period is defined at diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 927318243cfa..661348f23ea5 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -160,6 +160,7 @@ struct netns_ipv4 { int sysctl_tcp_pacing_ca_ratio; int sysctl_tcp_wmem[3]; int sysctl_tcp_rmem[3]; + int sysctl_tcp_comp_sack_nr; unsigned long sysctl_tcp_comp_sack_delay_ns; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 11fbfdc1566e..d2eed3ddcb0a 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -46,6 +46,7 @@ static int tcp_syn_retries_min = 1; static int tcp_syn_retries_max = MAX_TCP_SYNCNT; static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; +static int comp_sack_nr_max = 255; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -1158,6 +1159,15 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, + { + .procname = "tcp_comp_sack_nr", + .data = &init_net.ipv4.sysctl_tcp_comp_sack_nr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &comp_sack_nr_max, + }, { .procname = "udp_rmem_min", .data = &init_net.ipv4.sysctl_udp_rmem_min, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6a1dae38c955..aebb29ab2fdf 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5106,7 +5106,8 @@ send_now: return; } - if (!tcp_is_sack(tp) || tp->compressed_ack >= 44) + if (!tcp_is_sack(tp) || + tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr) goto send_now; tp->compressed_ack++; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a3f4647341db..adbdb503db0c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2573,6 +2573,7 @@ static int __net_init tcp_sk_init(struct net *net) sizeof(init_net.ipv4.sysctl_tcp_wmem)); } net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; + net->ipv4.sysctl_tcp_comp_sack_nr = 44; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; -- cgit v1.2.3-70-g09d2 From 3d97d88e8091f3501e016f6b4ce45a32c4b8f2f6 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Tue, 29 May 2018 23:27:31 +0800 Subject: tcp: minor optimization around tcp_hdr() usage in receive path This is additional to the commit ea1627c20c34 ("tcp: minor optimizations around tcp_hdr() usage"). At this point, skb->data is same with tcp_hdr() as tcp header has not been pulled yet. So use the less expensive one to get the tcp header. Remove the third parameter of tcp_rcv_established() and put it into the function body. Furthermore, the local variables are listed as a reverse christmas tree :) Cc: Eric Dumazet Signed-off-by: Yafang Shao Signed-off-by: David S. Miller --- include/net/tcp.h | 3 +-- include/trace/events/tcp.h | 5 +++-- net/ipv4/tcp_input.c | 6 +++--- net/ipv4/tcp_ipv4.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index 952d842a604a..029a51b91742 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -334,8 +334,7 @@ void tcp_write_timer_handler(struct sock *sk); void tcp_delack_timer_handler(struct sock *sk); int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg); int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); -void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th); +void tcp_rcv_established(struct sock *sk, struct sk_buff *skb); void tcp_rcv_space_adjust(struct sock *sk); int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); void tcp_twsk_destructor(struct sock *sk); diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 703abb6e11fa..ac55b328d61b 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -248,8 +248,9 @@ TRACE_EVENT(tcp_probe, ), TP_fast_assign( - const struct tcp_sock *tp = tcp_sk(sk); + const struct tcphdr *th = (const struct tcphdr *)skb->data; const struct inet_sock *inet = inet_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); @@ -261,7 +262,7 @@ TRACE_EVENT(tcp_probe, __entry->dport = ntohs(inet->inet_dport); __entry->mark = skb->mark; - __entry->data_len = skb->len - tcp_hdrlen(skb); + __entry->data_len = skb->len - __tcp_hdrlen(th); __entry->snd_nxt = tp->snd_nxt; __entry->snd_una = tp->snd_una; __entry->snd_cwnd = tp->snd_cwnd; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 1191cac72109..d5ffb573ca4d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5390,11 +5390,11 @@ discard: * the rest is checked inline. Fast processing is turned on in * tcp_data_queue when everything is OK. */ -void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th) +void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) { - unsigned int len = skb->len; + const struct tcphdr *th = (const struct tcphdr *)skb->data; struct tcp_sock *tp = tcp_sk(sk); + unsigned int len = skb->len; /* TCP congestion window tracking */ trace_tcp_probe(sk, skb); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index adbdb503db0c..749b0ef9f405 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1486,7 +1486,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) sk->sk_rx_dst = NULL; } } - tcp_rcv_established(sk, skb, tcp_hdr(skb)); + tcp_rcv_established(sk, skb); return 0; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 7d47c2b550a9..8764a63abd91 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1322,7 +1322,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) } } - tcp_rcv_established(sk, skb, tcp_hdr(skb)); + tcp_rcv_established(sk, skb); if (opt_skb) goto ipv6_pktoptions; return 0; -- cgit v1.2.3-70-g09d2 From 79e9fed460385a3d8ba0b5782e9e74405cb199b1 Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Sun, 3 Jun 2018 10:41:17 -0700 Subject: net-tcp: extend tcp_tw_reuse sysctl to enable loopback only optimization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This changes the /proc/sys/net/ipv4/tcp_tw_reuse from a boolean to an integer. It now takes the values 0, 1 and 2, where 0 and 1 behave as before, while 2 enables timewait socket reuse only for sockets that we can prove are loopback connections: ie. bound to 'lo' interface or where one of source or destination IPs is 127.0.0.0/8, ::ffff:127.0.0.0/104 or ::1. This enables quicker reuse of ephemeral ports for loopback connections - where tcp_tw_reuse is 100% safe from a protocol perspective (this assumes no artificially induced packet loss on 'lo'). This also makes estblishing many loopback connections *much* faster (allocating ports out of the first half of the ephemeral port range is significantly faster, then allocating from the second half) Without this change in a 32K ephemeral port space my sample program (it just establishes and closes [::1]:ephemeral -> [::1]:server_port connections in a tight loop) fails after 32765 connections in 24 seconds. With it enabled 50000 connections only take 4.7 seconds. This is particularly problematic for IPv6 where we only have one local address and cannot play tricks with varying source IP from 127.0.0.0/8 pool. Signed-off-by: Maciej Żenczykowski Cc: Neal Cardwell Cc: Yuchung Cheng Cc: Wei Wang Change-Id: I0377961749979d0301b7b62871a32a4b34b654e1 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 10 +++++++--- net/ipv4/sysctl_net_ipv4.c | 5 ++++- net/ipv4/tcp_ipv4.c | 35 +++++++++++++++++++++++++++++++--- 3 files changed, 43 insertions(+), 7 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 924bd51327b7..6841c74eac00 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -667,11 +667,15 @@ tcp_tso_win_divisor - INTEGER building larger TSO frames. Default: 3 -tcp_tw_reuse - BOOLEAN - Allow to reuse TIME-WAIT sockets for new connections when it is - safe from protocol viewpoint. Default value is 0. +tcp_tw_reuse - INTEGER + Enable reuse of TIME-WAIT sockets for new connections when it is + safe from protocol viewpoint. + 0 - disable + 1 - global enable + 2 - enable for loopback traffic only It should not be changed without advice/request of technical experts. + Default: 2 tcp_window_scaling - BOOLEAN Enable window scaling as defined in RFC1323. diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index d2eed3ddcb0a..d06247ba08b2 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -30,6 +30,7 @@ static int zero; static int one = 1; +static int two = 2; static int four = 4; static int thousand = 1000; static int gso_max_segs = GSO_MAX_SEGS; @@ -845,7 +846,9 @@ static struct ctl_table ipv4_net_table[] = { .data = &init_net.ipv4.sysctl_tcp_tw_reuse, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, }, { .procname = "tcp_max_tw_buckets", diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 749b0ef9f405..633963e228bc 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -110,8 +110,38 @@ static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) { + const struct inet_timewait_sock *tw = inet_twsk(sktw); const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); struct tcp_sock *tp = tcp_sk(sk); + int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse; + + if (reuse == 2) { + /* Still does not detect *everything* that goes through + * lo, since we require a loopback src or dst address + * or direct binding to 'lo' interface. + */ + bool loopback = false; + if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX) + loopback = true; +#if IS_ENABLED(CONFIG_IPV6) + if (tw->tw_family == AF_INET6) { + if (ipv6_addr_loopback(&tw->tw_v6_daddr) || + (ipv6_addr_v4mapped(&tw->tw_v6_daddr) && + (tw->tw_v6_daddr.s6_addr[12] == 127)) || + ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || + (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) && + (tw->tw_v6_rcv_saddr.s6_addr[12] == 127))) + loopback = true; + } else +#endif + { + if (ipv4_is_loopback(tw->tw_daddr) || + ipv4_is_loopback(tw->tw_rcv_saddr)) + loopback = true; + } + if (!loopback) + reuse = 0; + } /* With PAWS, it is safe from the viewpoint of data integrity. Even without PAWS it is safe provided sequence @@ -125,8 +155,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) and use initial timestamp retrieved from peer table. */ if (tcptw->tw_ts_recent_stamp && - (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse && - get_seconds() - tcptw->tw_ts_recent_stamp > 1))) { + (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) { tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; if (tp->write_seq == 0) tp->write_seq = 1; @@ -2529,7 +2558,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_orphan_retries = 0; net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; - net->ipv4.sysctl_tcp_tw_reuse = 0; + net->ipv4.sysctl_tcp_tw_reuse = 2; cnt = tcp_hashinfo.ehash_mask + 1; net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2; -- cgit v1.2.3-70-g09d2