summaryrefslogtreecommitdiff
path: root/net/ipv4/tcp_input.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--net/ipv4/tcp_input.c227
1 files changed, 137 insertions, 90 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a140d9f7a0a3..e37488d3453f 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -72,6 +72,7 @@
#include <linux/prefetch.h>
#include <net/dst.h>
#include <net/tcp.h>
+#include <net/proto_memory.h>
#include <net/inet_common.h>
#include <linux/ipsec.h>
#include <asm/unaligned.h>
@@ -237,9 +238,14 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
*/
if (unlikely(len != icsk->icsk_ack.rcv_mss)) {
u64 val = (u64)skb->len << TCP_RMEM_TO_WIN_SCALE;
+ u8 old_ratio = tcp_sk(sk)->scaling_ratio;
do_div(val, skb->truesize);
tcp_sk(sk)->scaling_ratio = val ? val : 1;
+
+ if (old_ratio != tcp_sk(sk)->scaling_ratio)
+ WRITE_ONCE(tcp_sk(sk)->window_clamp,
+ tcp_win_from_space(sk, sk->sk_rcvbuf));
}
icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
tcp_sk(sk)->advmss);
@@ -563,19 +569,20 @@ static void tcp_init_buffer_space(struct sock *sk)
maxwin = tcp_full_space(sk);
if (tp->window_clamp >= maxwin) {
- tp->window_clamp = maxwin;
+ WRITE_ONCE(tp->window_clamp, maxwin);
if (tcp_app_win && maxwin > 4 * tp->advmss)
- tp->window_clamp = max(maxwin -
- (maxwin >> tcp_app_win),
- 4 * tp->advmss);
+ WRITE_ONCE(tp->window_clamp,
+ max(maxwin - (maxwin >> tcp_app_win),
+ 4 * tp->advmss));
}
/* Force reservation of one segment. */
if (tcp_app_win &&
tp->window_clamp > 2 * tp->advmss &&
tp->window_clamp + tp->advmss > maxwin)
- tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);
+ WRITE_ONCE(tp->window_clamp,
+ max(2 * tp->advmss, maxwin - tp->advmss));
tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
tp->snd_cwnd_stamp = tcp_jiffies32;
@@ -773,7 +780,8 @@ void tcp_rcv_space_adjust(struct sock *sk)
WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
/* Make the window clamp follow along. */
- tp->window_clamp = tcp_win_from_space(sk, rcvbuf);
+ WRITE_ONCE(tp->window_clamp,
+ tcp_win_from_space(sk, rcvbuf));
}
}
tp->rcvq_space.space = copied;
@@ -911,7 +919,7 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
tp->rtt_seq = tp->snd_nxt;
tp->mdev_max_us = tcp_rto_min_us(sk);
- tcp_bpf_rtt(sk);
+ tcp_bpf_rtt(sk, mrtt_us, srtt);
}
} else {
/* no previous measure. */
@@ -921,7 +929,7 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
tp->mdev_max_us = tp->rttvar_us;
tp->rtt_seq = tp->snd_nxt;
- tcp_bpf_rtt(sk);
+ tcp_bpf_rtt(sk, mrtt_us, srtt);
}
tp->srtt_us = max(1U, srtt);
}
@@ -2126,8 +2134,16 @@ void tcp_clear_retrans(struct tcp_sock *tp)
static inline void tcp_init_undo(struct tcp_sock *tp)
{
tp->undo_marker = tp->snd_una;
+
/* Retransmission still in flight may cause DSACKs later. */
- tp->undo_retrans = tp->retrans_out ? : -1;
+ /* First, account for regular retransmits in flight: */
+ tp->undo_retrans = tp->retrans_out;
+ /* Next, account for TLP retransmits in flight: */
+ if (tp->tlp_high_seq && tp->tlp_retrans)
+ tp->undo_retrans++;
+ /* Finally, avoid 0, because undo_retrans==0 means "can undo now": */
+ if (!tp->undo_retrans)
+ tp->undo_retrans = -1;
}
static bool tcp_is_rack(const struct sock *sk)
@@ -2206,6 +2222,7 @@ void tcp_enter_loss(struct sock *sk)
tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->snd_nxt;
+ tp->tlp_high_seq = 0;
tcp_ecn_queue_cwr(tp);
/* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
@@ -2779,13 +2796,37 @@ static void tcp_mtup_probe_success(struct sock *sk)
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS);
}
+/* Sometimes we deduce that packets have been dropped due to reasons other than
+ * congestion, like path MTU reductions or failed client TFO attempts. In these
+ * cases we call this function to retransmit as many packets as cwnd allows,
+ * without reducing cwnd. Given that retransmits will set retrans_stamp to a
+ * non-zero value (and may do so in a later calling context due to TSQ), we
+ * also enter CA_Loss so that we track when all retransmitted packets are ACKed
+ * and clear retrans_stamp when that happens (to ensure later recurring RTOs
+ * are using the correct retrans_stamp and don't declare ETIMEDOUT
+ * prematurely).
+ */
+static void tcp_non_congestion_loss_retransmit(struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (icsk->icsk_ca_state != TCP_CA_Loss) {
+ tp->high_seq = tp->snd_nxt;
+ tp->snd_ssthresh = tcp_current_ssthresh(sk);
+ tp->prior_ssthresh = 0;
+ tp->undo_marker = 0;
+ tcp_set_ca_state(sk, TCP_CA_Loss);
+ }
+ tcp_xmit_retransmit_queue(sk);
+}
+
/* Do a simple retransmit without using the backoff mechanisms in
* tcp_timer. This is used for path mtu discovery.
* The socket is already locked here.
*/
void tcp_simple_retransmit(struct sock *sk)
{
- const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
int mss;
@@ -2825,14 +2866,7 @@ void tcp_simple_retransmit(struct sock *sk)
* in network, but units changed and effective
* cwnd/ssthresh really reduced now.
*/
- if (icsk->icsk_ca_state != TCP_CA_Loss) {
- tp->high_seq = tp->snd_nxt;
- tp->snd_ssthresh = tcp_current_ssthresh(sk);
- tp->prior_ssthresh = 0;
- tp->undo_marker = 0;
- tcp_set_ca_state(sk, TCP_CA_Loss);
- }
- tcp_xmit_retransmit_queue(sk);
+ tcp_non_congestion_loss_retransmit(sk);
}
EXPORT_SYMBOL(tcp_simple_retransmit);
@@ -3057,7 +3091,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
return;
if (tcp_try_undo_dsack(sk))
- tcp_try_keep_open(sk);
+ tcp_try_to_open(sk, flag);
tcp_identify_packet_loss(sk, ack_flag);
if (icsk->icsk_ca_state != TCP_CA_Recovery) {
@@ -3539,7 +3573,7 @@ static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
const struct inet_connection_sock *icsk = inet_csk(sk);
if (icsk->icsk_ca_ops->cong_control) {
- icsk->icsk_ca_ops->cong_control(sk, rs);
+ icsk->icsk_ca_ops->cong_control(sk, ack, flag, rs);
return;
}
@@ -3575,8 +3609,10 @@ static void tcp_snd_sne_update(struct tcp_sock *tp, u32 ack)
ao = rcu_dereference_protected(tp->ao_info,
lockdep_sock_is_held((struct sock *)tp));
- if (ao && ack < tp->snd_una)
+ if (ao && ack < tp->snd_una) {
ao->snd_sne++;
+ trace_tcp_ao_snd_sne_update((struct sock *)tp, ao->snd_sne);
+ }
#endif
}
@@ -3601,8 +3637,10 @@ static void tcp_rcv_sne_update(struct tcp_sock *tp, u32 seq)
ao = rcu_dereference_protected(tp->ao_info,
lockdep_sock_is_held((struct sock *)tp));
- if (ao && seq < tp->rcv_nxt)
+ if (ao && seq < tp->rcv_nxt) {
ao->rcv_sne++;
+ trace_tcp_ao_rcv_sne_update((struct sock *)tp, ao->rcv_sne);
+ }
#endif
}
@@ -4204,6 +4242,13 @@ void tcp_parse_options(const struct net *net,
*/
break;
#endif
+#ifdef CONFIG_TCP_AO
+ case TCPOPT_AO:
+ /* TCP AO has already been checked
+ * (see tcp_inbound_ao_hash()).
+ */
+ break;
+#endif
case TCPOPT_FASTOPEN:
tcp_parse_fastopen_option(
opsize - TCPOLEN_FASTOPEN_BASE,
@@ -4433,9 +4478,26 @@ static enum skb_drop_reason tcp_sequence(const struct tcp_sock *tp,
return SKB_NOT_DROPPED_YET;
}
+
+void tcp_done_with_error(struct sock *sk, int err)
+{
+ /* This barrier is coupled with smp_rmb() in tcp_poll() */
+ WRITE_ONCE(sk->sk_err, err);
+ smp_wmb();
+
+ tcp_write_queue_purge(sk);
+ tcp_done(sk);
+
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk_error_report(sk);
+}
+EXPORT_SYMBOL(tcp_done_with_error);
+
/* When we get a reset we do this. */
void tcp_reset(struct sock *sk, struct sk_buff *skb)
{
+ int err;
+
trace_tcp_receive_reset(sk);
/* mptcp can't tell us to ignore reset pkts,
@@ -4447,24 +4509,17 @@ void tcp_reset(struct sock *sk, struct sk_buff *skb)
/* We want the right error as BSD sees it (and indeed as we do). */
switch (sk->sk_state) {
case TCP_SYN_SENT:
- WRITE_ONCE(sk->sk_err, ECONNREFUSED);
+ err = ECONNREFUSED;
break;
case TCP_CLOSE_WAIT:
- WRITE_ONCE(sk->sk_err, EPIPE);
+ err = EPIPE;
break;
case TCP_CLOSE:
return;
default:
- WRITE_ONCE(sk->sk_err, ECONNRESET);
+ err = ECONNRESET;
}
- /* This barrier is coupled with smp_rmb() in tcp_poll() */
- smp_wmb();
-
- tcp_write_queue_purge(sk);
- tcp_done(sk);
-
- if (!sock_flag(sk, SOCK_DEAD))
- sk_error_report(sk);
+ tcp_done_with_error(sk, err);
}
/*
@@ -4800,13 +4855,8 @@ static bool tcp_try_coalesce(struct sock *sk,
if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
return false;
- if (!mptcp_skb_can_collapse(to, from))
- return false;
-
-#ifdef CONFIG_TLS_DEVICE
- if (from->decrypted != to->decrypted)
+ if (!tcp_skb_can_collapse_rx(to, from))
return false;
-#endif
if (!skb_try_coalesce(to, from, fragstolen, &delta))
return false;
@@ -4848,7 +4898,7 @@ static void tcp_drop_reason(struct sock *sk, struct sk_buff *skb,
enum skb_drop_reason reason)
{
sk_drops_add(sk, skb);
- kfree_skb_reason(skb, reason);
+ sk_skb_reason_drop(sk, skb, reason);
}
/* This one checks to see if we can put data from the
@@ -5174,6 +5224,16 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
*/
if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
if (tcp_receive_window(tp) == 0) {
+ /* Some stacks are known to send bare FIN packets
+ * in a loop even if we send RWIN 0 in our ACK.
+ * Accepting this FIN does not hurt memory pressure
+ * because the FIN flag will simply be merged to the
+ * receive queue tail skb in most cases.
+ */
+ if (!skb->len &&
+ (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
+ goto queue_and_out;
+
reason = SKB_DROP_REASON_TCP_ZEROWINDOW;
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
goto out_of_window;
@@ -5188,7 +5248,7 @@ queue_and_out:
inet_csk_schedule_ack(sk);
sk->sk_data_ready(sk);
- if (skb_queue_len(&sk->sk_receive_queue)) {
+ if (skb_queue_len(&sk->sk_receive_queue) && skb->len) {
reason = SKB_DROP_REASON_PROTO_MEM;
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
goto drop;
@@ -5351,7 +5411,7 @@ restart:
break;
}
- if (n && n != tail && mptcp_skb_can_collapse(skb, n) &&
+ if (n && n != tail && tcp_skb_can_collapse_rx(skb, n) &&
TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
end_of_skbs = false;
break;
@@ -5375,9 +5435,7 @@ restart:
break;
memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
-#ifdef CONFIG_TLS_DEVICE
- nskb->decrypted = skb->decrypted;
-#endif
+ skb_copy_decrypted(nskb, skb);
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
if (list)
__skb_queue_before(list, skb, nskb);
@@ -5404,13 +5462,9 @@ restart:
skb = tcp_collapse_one(sk, skb, list, root);
if (!skb ||
skb == tail ||
- !mptcp_skb_can_collapse(nskb, skb) ||
+ !tcp_skb_can_collapse_rx(nskb, skb) ||
(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
goto end;
-#ifdef CONFIG_TLS_DEVICE
- if (skb->decrypted != nskb->decrypted)
- goto end;
-#endif
}
}
}
@@ -5949,6 +6003,11 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
* RFC 5961 4.2 : Send a challenge ack
*/
if (th->syn) {
+ if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack &&
+ TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq &&
+ TCP_SKB_CB(skb)->seq + 1 == tp->rcv_nxt &&
+ TCP_SKB_CB(skb)->ack_seq == tp->snd_nxt)
+ goto pass;
syn_challenge:
if (syn_inerr)
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
@@ -5958,6 +6017,7 @@ syn_challenge:
goto discard;
}
+pass:
bpf_skops_parse_hdr(sk, skb);
return true;
@@ -6288,7 +6348,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
tp->fastopen_client_fail = TFO_DATA_NOT_ACKED;
skb_rbtree_walk_from(data)
tcp_mark_skb_lost(sk, data);
- tcp_xmit_retransmit_queue(sk);
+ tcp_non_congestion_loss_retransmit(sk);
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPFASTOPENACTIVEFAIL);
return true;
@@ -6426,7 +6486,8 @@ consume:
if (!tp->rx_opt.wscale_ok) {
tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
- tp->window_clamp = min(tp->window_clamp, 65535U);
+ WRITE_ONCE(tp->window_clamp,
+ min(tp->window_clamp, 65535U));
}
if (tp->rx_opt.saw_tstamp) {
@@ -6973,35 +7034,10 @@ static void tcp_openreq_init(struct request_sock *req,
#endif
}
-struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
- struct sock *sk_listener,
- bool attach_listener)
-{
- struct request_sock *req = reqsk_alloc(ops, sk_listener,
- attach_listener);
-
- if (req) {
- struct inet_request_sock *ireq = inet_rsk(req);
-
- ireq->ireq_opt = NULL;
-#if IS_ENABLED(CONFIG_IPV6)
- ireq->pktopts = NULL;
-#endif
- atomic64_set(&ireq->ir_cookie, 0);
- ireq->ireq_state = TCP_NEW_SYN_RECV;
- write_pnet(&ireq->ireq_net, sock_net(sk_listener));
- ireq->ireq_family = sk_listener->sk_family;
- req->timeout = TCP_TIMEOUT_INIT;
- }
-
- return req;
-}
-EXPORT_SYMBOL(inet_reqsk_alloc);
-
/*
* Return true if a syncookie should be sent
*/
-static bool tcp_syn_flood_action(const struct sock *sk, const char *proto)
+static bool tcp_syn_flood_action(struct sock *sk, const char *proto)
{
struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
const char *msg = "Dropping request";
@@ -7102,7 +7138,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
struct sock *sk, struct sk_buff *skb)
{
struct tcp_fastopen_cookie foc = { .len = -1 };
- __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
struct tcp_options_received tmp_opt;
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
@@ -7112,21 +7147,28 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
struct dst_entry *dst;
struct flowi fl;
u8 syncookies;
+ u32 isn;
#ifdef CONFIG_TCP_AO
const struct tcp_ao_hdr *aoh;
#endif
- syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);
+ isn = __this_cpu_read(tcp_tw_isn);
+ if (isn) {
+ /* TW buckets are converted to open requests without
+ * limitations, they conserve resources and peer is
+ * evidently real one.
+ */
+ __this_cpu_write(tcp_tw_isn, 0);
+ } else {
+ syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);
- /* TW buckets are converted to open requests without
- * limitations, they conserve resources and peer is
- * evidently real one.
- */
- if ((syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) && !isn) {
- want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name);
- if (!want_cookie)
- goto drop;
+ if (syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) {
+ want_cookie = tcp_syn_flood_action(sk,
+ rsk_ops->slab_name);
+ if (!want_cookie)
+ goto drop;
+ }
}
if (sk_acceptq_is_full(sk)) {
@@ -7165,7 +7207,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
/* Note: tcp_v6_init_req() might override ir_iif for link locals */
inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);
- dst = af_ops->route_req(sk, skb, &fl, req);
+ dst = af_ops->route_req(sk, skb, &fl, req, isn);
if (!dst)
goto drop_and_free;
@@ -7242,7 +7284,12 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_rsk(req)->tfo_listener = false;
if (!want_cookie) {
req->timeout = tcp_timeout_init((struct sock *)req);
- inet_csk_reqsk_queue_hash_add(sk, req, req->timeout);
+ if (unlikely(!inet_csk_reqsk_queue_hash_add(sk, req,
+ req->timeout))) {
+ reqsk_free(req);
+ return 0;
+ }
+
}
af_ops->send_synack(sk, dst, &fl, req, &foc,
!want_cookie ? TCP_SYNACK_NORMAL :