diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-02-10 20:01:30 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-02-10 20:01:30 -0800 |
commit | c5ce28df0e7c01a1de23c36ebdefcd803f2b6cbb (patch) | |
tree | 9830baf38832769e1cf621708889111bbe3c93df /net/ipv4/tcp_input.c | |
parent | 29afc4e9a408f2304e09c6dd0dbcfbd2356d0faa (diff) | |
parent | 9399f0c51489ae8c16d6559b82a452fdc1895e91 (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
1) More iov_iter conversion work from Al Viro.
[ The "crypto: switch af_alg_make_sg() to iov_iter" commit was
wrong, and this pull actually adds an extra commit on top of the
branch I'm pulling to fix that up, so that the pre-merge state is
ok. - Linus ]
2) Various optimizations to the ipv4 forwarding information base trie
lookup implementation. From Alexander Duyck.
3) Remove sock_iocb altogether, from CHristoph Hellwig.
4) Allow congestion control algorithm selection via routing metrics.
From Daniel Borkmann.
5) Make ipv4 uncached route list per-cpu, from Eric Dumazet.
6) Handle rfs hash collisions more gracefully, also from Eric Dumazet.
7) Add xmit_more support to r8169, e1000, and e1000e drivers. From
Florian Westphal.
8) Transparent Ethernet Bridging support for GRO, from Jesse Gross.
9) Add BPF packet actions to packet scheduler, from Jiri Pirko.
10) Add support for uniqu flow IDs to openvswitch, from Joe Stringer.
11) New NetCP ethernet driver, from Muralidharan Karicheri and Wingman
Kwok.
12) More sanely handle out-of-window dupacks, which can result in
serious ACK storms. From Neal Cardwell.
13) Various rhashtable bug fixes and enhancements, from Herbert Xu,
Patrick McHardy, and Thomas Graf.
14) Support xmit_more in be2net, from Sathya Perla.
15) Group Policy extensions for vxlan, from Thomas Graf.
16) Remove Checksum Offload support for vxlan, from Tom Herbert.
17) Like ipv4, support lockless transmit over ipv6 UDP sockets. From
Vlad Yasevich.
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1494+1 commits)
crypto: fix af_alg_make_sg() conversion to iov_iter
ipv4: Namespecify TCP PMTU mechanism
i40e: Fix for stats init function call in Rx setup
tcp: don't include Fast Open option in SYN-ACK on pure SYN-data
openvswitch: Only set TUNNEL_VXLAN_OPT if VXLAN-GBP metadata is set
ipv6: Make __ipv6_select_ident static
ipv6: Fix fragment id assignment on LE arches.
bridge: Fix inability to add non-vlan fdb entry
net: Mellanox: Delete unnecessary checks before the function call "vunmap"
cxgb4: Add support in cxgb4 to get expansion rom version via ethtool
ethtool: rename reserved1 memeber in ethtool_drvinfo for expansion ROM version
net: dsa: Remove redundant phy_attach()
IB/mlx4: Reset flow support for IB kernel ULPs
IB/mlx4: Always use the correct port for mirrored multicast attachments
net/bonding: Fix potential bad memory access during bonding events
tipc: remove tipc_snprintf
tipc: nl compat add noop and remove legacy nl framework
tipc: convert legacy nl stats show to nl compat
tipc: convert legacy nl net id get to nl compat
tipc: convert legacy nl net id set to nl compat
...
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r-- | net/ipv4/tcp_input.c | 88 |
1 files changed, 51 insertions, 37 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 075ab4d5af5e..8fdd27b17306 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -100,6 +100,7 @@ int sysctl_tcp_thin_dupack __read_mostly; int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; int sysctl_tcp_early_retrans __read_mostly = 3; +int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ @@ -3183,8 +3184,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->fackets_out -= min(pkts_acked, tp->fackets_out); - if (ca_ops->pkts_acked) - ca_ops->pkts_acked(sk, pkts_acked, ca_seq_rtt_us); + if (ca_ops->pkts_acked) { + long rtt_us = min_t(ulong, ca_seq_rtt_us, sack_rtt_us); + ca_ops->pkts_acked(sk, pkts_acked, rtt_us); + } } else if (skb && rtt_update && sack_rtt_us >= 0 && sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) { @@ -3319,13 +3322,22 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 } /* RFC 5961 7 [ACK Throttling] */ -static void tcp_send_challenge_ack(struct sock *sk) +static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) { /* unprotected vars, we dont care of overwrites */ static u32 challenge_timestamp; static unsigned int challenge_count; - u32 now = jiffies / HZ; + struct tcp_sock *tp = tcp_sk(sk); + u32 now; + + /* First check our per-socket dupack rate limit. */ + if (tcp_oow_rate_limited(sock_net(sk), skb, + LINUX_MIB_TCPACKSKIPPEDCHALLENGE, + &tp->last_oow_ack_time)) + return; + /* Then check the check host-wide RFC 5961 rate limit. */ + now = jiffies / HZ; if (now != challenge_timestamp) { challenge_timestamp = now; challenge_count = 0; @@ -3358,34 +3370,34 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) } /* This routine deals with acks during a TLP episode. + * We mark the end of a TLP episode on receiving TLP dupack or when + * ack is after tlp_high_seq. * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe. */ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) { struct tcp_sock *tp = tcp_sk(sk); - bool is_tlp_dupack = (ack == tp->tlp_high_seq) && - !(flag & (FLAG_SND_UNA_ADVANCED | - FLAG_NOT_DUP | FLAG_DATA_SACKED)); - /* Mark the end of TLP episode on receiving TLP dupack or when - * ack is after tlp_high_seq. - */ - if (is_tlp_dupack) { - tp->tlp_high_seq = 0; + if (before(ack, tp->tlp_high_seq)) return; - } - if (after(ack, tp->tlp_high_seq)) { + if (flag & FLAG_DSACKING_ACK) { + /* This DSACK means original and TLP probe arrived; no loss */ + tp->tlp_high_seq = 0; + } else if (after(ack, tp->tlp_high_seq)) { + /* ACK advances: there was a loss, so reduce cwnd. Reset + * tlp_high_seq in tcp_init_cwnd_reduction() + */ + tcp_init_cwnd_reduction(sk); + tcp_set_ca_state(sk, TCP_CA_CWR); + tcp_end_cwnd_reduction(sk); + tcp_try_keep_open(sk); + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPLOSSPROBERECOVERY); + } else if (!(flag & (FLAG_SND_UNA_ADVANCED | + FLAG_NOT_DUP | FLAG_DATA_SACKED))) { + /* Pure dupack: original and TLP probe arrived; no loss */ tp->tlp_high_seq = 0; - /* Don't reduce cwnd if DSACK arrives for TLP retrans. */ - if (!(flag & FLAG_DSACKING_ACK)) { - tcp_init_cwnd_reduction(sk); - tcp_set_ca_state(sk, TCP_CA_CWR); - tcp_end_cwnd_reduction(sk); - tcp_try_keep_open(sk); - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPLOSSPROBERECOVERY); - } } } @@ -3421,7 +3433,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (before(ack, prior_snd_una)) { /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ if (before(ack, prior_snd_una - tp->max_window)) { - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, skb); return -1; } goto old_ack; @@ -4990,7 +5002,10 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, tcp_paws_discard(sk, skb)) { if (!th->rst) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); - tcp_send_dupack(sk, skb); + if (!tcp_oow_rate_limited(sock_net(sk), skb, + LINUX_MIB_TCPACKSKIPPEDPAWS, + &tp->last_oow_ack_time)) + tcp_send_dupack(sk, skb); goto discard; } /* Reset is accepted even if it did not pass PAWS. */ @@ -5007,7 +5022,10 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, if (!th->rst) { if (th->syn) goto syn_challenge; - tcp_send_dupack(sk, skb); + if (!tcp_oow_rate_limited(sock_net(sk), skb, + LINUX_MIB_TCPACKSKIPPEDSEQ, + &tp->last_oow_ack_time)) + tcp_send_dupack(sk, skb); } goto discard; } @@ -5023,7 +5041,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) tcp_reset(sk); else - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, skb); goto discard; } @@ -5037,7 +5055,7 @@ syn_challenge: if (syn_inerr) TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, skb); goto discard; } @@ -5870,10 +5888,9 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) * TCP ECN negotiation. * * Exception: tcp_ca wants ECN. This is required for DCTCP - * congestion control; it requires setting ECT on all packets, - * including SYN. We inverse the test in this case: If our - * local socket wants ECN, but peer only set ece/cwr (but not - * ECT in IP header) its probably a non-DCTCP aware sender. + * congestion control: Linux DCTCP asserts ECT on all packets, + * including SYN, which is most optimal solution; however, + * others, such as FreeBSD do not. */ static void tcp_ecn_create_request(struct request_sock *req, const struct sk_buff *skb, @@ -5883,18 +5900,15 @@ static void tcp_ecn_create_request(struct request_sock *req, const struct tcphdr *th = tcp_hdr(skb); const struct net *net = sock_net(listen_sk); bool th_ecn = th->ece && th->cwr; - bool ect, need_ecn, ecn_ok; + bool ect, ecn_ok; if (!th_ecn) return; ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); - need_ecn = tcp_ca_needs_ecn(listen_sk); ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN); - if (!ect && !need_ecn && ecn_ok) - inet_rsk(req)->ecn_ok = 1; - else if (ect && need_ecn) + if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk)) inet_rsk(req)->ecn_ok = 1; } |