Diffstat (limited to 'net/ipv4/tcp_input.c')
 net/ipv4/tcp_input.c | 316
 1 file changed, 144 insertions(+), 172 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c71d49ce0c93..39c393cc0fd3 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -79,12 +79,13 @@
 int sysctl_tcp_timestamps __read_mostly = 1;
 int sysctl_tcp_window_scaling __read_mostly = 1;
 int sysctl_tcp_sack __read_mostly = 1;
-int sysctl_tcp_fack __read_mostly = 1;
+int sysctl_tcp_fack __read_mostly;
 int sysctl_tcp_max_reordering __read_mostly = 300;
 int sysctl_tcp_dsack __read_mostly = 1;
 int sysctl_tcp_app_win __read_mostly = 31;
 int sysctl_tcp_adv_win_scale __read_mostly = 1;
 EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
+EXPORT_SYMBOL(sysctl_tcp_timestamps);
 
 /* rfc5961 challenge ack rate limiting */
 int sysctl_tcp_challenge_ack_limit = 1000;
@@ -94,9 +95,6 @@ int sysctl_tcp_rfc1337 __read_mostly;
 int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
 int sysctl_tcp_frto __read_mostly = 2;
 int sysctl_tcp_min_rtt_wlen __read_mostly = 300;
-
-int sysctl_tcp_thin_dupack __read_mostly;
-
 int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
 int sysctl_tcp_early_retrans __read_mostly = 3;
 int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
@@ -903,8 +901,6 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
 		tcp_disable_fack(tp);
 	}
 
-	if (metric > 0)
-		tcp_disable_early_retrans(tp);
 	tp->rack.reord = 1;
 }
 
@@ -915,10 +911,6 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
 	    before(TCP_SKB_CB(skb)->seq,
 		   TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
 		tp->retransmit_skb_hint = skb;
-
-	if (!tp->lost_out ||
-	    after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high))
-		tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
 }
 
 /* Sum the number of packets on the wire we have marked as lost.
@@ -1134,6 +1126,7 @@ struct tcp_sacktag_state {
 	 */
 	struct skb_mstamp first_sackt;
 	struct skb_mstamp last_sackt;
+	struct skb_mstamp ack_time; /* Timestamp when the S/ACK was received */
 	struct rate_sample *rate;
 	int	flag;
 };
@@ -1216,7 +1209,8 @@ static u8 tcp_sacktag_one(struct sock *sk,
 		return sacked;
 
 	if (!(sacked & TCPCB_SACKED_ACKED)) {
-		tcp_rack_advance(tp, xmit_time, sacked);
+		tcp_rack_advance(tp, sacked, end_seq,
+				 xmit_time, &state->ack_time);
 
 		if (sacked & TCPCB_SACKED_RETRANS) {
 			/* If the segment is not tagged as lost,
@@ -1936,7 +1930,6 @@ void tcp_enter_loss(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct net *net = sock_net(sk);
 	struct sk_buff *skb;
-	bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
 	bool is_reneg;			/* is receiver reneging on SACKs? */
 	bool mark_lost;
 
@@ -1981,7 +1974,6 @@ void tcp_enter_loss(struct sock *sk)
 			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
 			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
 			tp->lost_out += tcp_skb_pcount(skb);
-			tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
 		}
 	}
 	tcp_verify_left_out(tp);
@@ -1997,13 +1989,15 @@ void tcp_enter_loss(struct sock *sk)
 	tp->high_seq = tp->snd_nxt;
 	tcp_ecn_queue_cwr(tp);
 
-	/* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
-	 * loss recovery is underway except recurring timeout(s) on
-	 * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
+	/* F-RTO RFC5682 sec 3.1 step 1 mandates disabling F-RTO
+	 * if a previous recovery is underway, otherwise it may incorrectly
+	 * call a timeout spurious if some previously retransmitted packets
+	 * are s/acked (sec 3.2). We do not apply that restriction since
+	 * retransmitted skbs are permanently tagged with TCPCB_EVER_RETRANS
+	 * so FLAG_ORIG_SACK_ACKED is always correct. But we do disable F-RTO
+	 * on PMTU discovery to avoid sending new data.
 	 */
-	tp->frto = sysctl_tcp_frto &&
-		   (new_recovery || icsk->icsk_retransmits) &&
-		   !inet_csk(sk)->icsk_mtup.probe_size;
+	tp->frto = sysctl_tcp_frto && !inet_csk(sk)->icsk_mtup.probe_size;
 }
 
 /* If ACK arrived pointing to a remembered SACK, it means that our
@@ -2055,30 +2049,6 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
 	return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
 }
 
-static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	unsigned long delay;
-
-	/* Delay early retransmit and entering fast recovery for
-	 * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples
-	 * available, or RTO is scheduled to fire first.
-	 */
-	if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 ||
-	    (flag & FLAG_ECE) || !tp->srtt_us)
-		return false;
-
-	delay = max(usecs_to_jiffies(tp->srtt_us >> 5),
-		    msecs_to_jiffies(2));
-
-	if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
-		return false;
-
-	inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay,
-				  TCP_RTO_MAX);
-	return true;
-}
-
 /* Linux NewReno/SACK/FACK/ECN state machine.
  * --------------------------------------
  *
@@ -2126,10 +2096,26 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
  *	F.e. after RTO, when all the queue is considered as lost,
  *	lost_out = packets_out and in_flight = retrans_out.
  *
- *		Essentially, we have now two algorithms counting
+ *		Essentially, we have now a few algorithms detecting
  *		lost packets.
  *
- *		FACK: It is the simplest heuristics. As soon as we decided
+ *		If the receiver supports SACK:
+ *
+ *		RFC6675/3517: It is the conventional algorithm. A packet is
+ *		considered lost if the number of higher sequence packets
+ *		SACKed is greater than or equal to the DUPACK threshold
+ *		(reordering). This is implemented in tcp_mark_head_lost and
+ *		tcp_update_scoreboard.
+ *
+ *		RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm
+ *		(2017-) that checks timing instead of counting DUPACKs.
+ *		Essentially a packet is considered lost if it's not S/ACKed
+ *		after RTT + reordering_window, where both metrics are
+ *		dynamically measured and adjusted. This is implemented in
+ *		tcp_rack_mark_lost.
+ *
+ *		FACK (Disabled by default. Subsumed by RACK):
+ *		It is the simplest heuristics. As soon as we decided
 *		that something is lost, we decide that _all_ not SACKed
 *		packets until the most forward SACK are lost. I.e.
 *		lost_out = fackets_out - sacked_out and left_out = fackets_out.
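The RACK rule described in the comment above condenses to a small timing predicate. The sketch below is a hedged, stand-alone illustration, not the kernel's implementation: the helper name and microsecond parameters are assumptions, and the real code additionally tracks the send time of the most recently delivered packet.

#include <stdbool.h>
#include <stdint.h>

/* Illustrative RACK-style loss test: an unacked packet is deemed lost
 * once it has gone unacknowledged for longer than RTT plus an adaptive
 * reordering window, measured when the latest (s)ack arrived. */
static bool rack_deems_lost(uint64_t xmit_time_us, uint64_t ack_time_us,
			    uint64_t rtt_us, uint64_t reo_wnd_us)
{
	return ack_time_us > xmit_time_us + rtt_us + reo_wnd_us;
}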
@@ -2138,16 +2124,14 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
 *		takes place. We use FACK by default until reordering
 *		is suspected on the path to this destination.
 *
- *		NewReno: when Recovery is entered, we assume that one segment
+ *		If the receiver does not support SACK:
+ *
+ *		NewReno (RFC6582): in Recovery we assume that one segment
 *		is lost (classic Reno). While we are in Recovery and
 *		a partial ACK arrives, we assume that one more packet
 *		is lost (NewReno). This heuristics are the same in NewReno
 *		and SACK.
 *
- *  Imagine, that's all! Forget about all this shamanism about CWND inflation
- *  deflation etc.  CWND is real congestion window, never inflated, changes
- *  only according to classic VJ rules.
- *
 * Really tricky (and requiring careful tuning) part of algorithm
 * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
 * The first determines the moment _when_ we should reduce CWND and,
@@ -2175,8 +2159,6 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
 static bool tcp_time_to_recover(struct sock *sk, int flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	__u32 packets_out;
-	int tcp_reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
 
 	/* Trick#1: The loss is proven. */
 	if (tp->lost_out)
@@ -2186,39 +2168,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
 	if (tcp_dupack_heuristics(tp) > tp->reordering)
 		return true;
 
-	/* Trick#4: It is still not OK... But will it be useful to delay
-	 * recovery more?
-	 */
-	packets_out = tp->packets_out;
-	if (packets_out <= tp->reordering &&
-	    tp->sacked_out >= max_t(__u32, packets_out/2, tcp_reordering) &&
-	    !tcp_may_send_now(sk)) {
-		/* We have nothing to send. This connection is limited
-		 * either by receiver window or by application.
-		 */
-		return true;
-	}
-
-	/* If a thin stream is detected, retransmit after first
-	 * received dupack. Employ only if SACK is supported in order
-	 * to avoid possible corner-case series of spurious retransmissions
-	 * Use only if there are no unsent data.
-	 */
-	if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
-	    tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
-	    tcp_is_sack(tp) && !tcp_send_head(sk))
-		return true;
-
-	/* Trick#6: TCP early retransmit, per RFC5827.  To avoid spurious
-	 * retransmissions due to small network reorderings, we implement
-	 * Mitigation A.3 in the RFC and delay the retransmission for a short
-	 * interval if appropriate.
-	 */
-	if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
-	    (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) &&
-	    !tcp_may_send_now(sk))
-		return !tcp_pause_early_retransmit(sk, flag);
-
 	return false;
 }
 
@@ -2414,10 +2363,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
 	if (tp->prior_ssthresh) {
 		const struct inet_connection_sock *icsk = inet_csk(sk);
 
-		if (icsk->icsk_ca_ops->undo_cwnd)
-			tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
-		else
-			tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1);
+		tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
 
 		if (tp->prior_ssthresh > tp->snd_ssthresh) {
 			tp->snd_ssthresh = tp->prior_ssthresh;
@@ -2523,8 +2469,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk)
 	tcp_ecn_queue_cwr(tp);
 }
 
-static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked,
-			       int flag)
+void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int sndcnt = 0;
@@ -2692,7 +2637,7 @@ void tcp_simple_retransmit(struct sock *sk)
 }
 EXPORT_SYMBOL(tcp_simple_retransmit);
 
-static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
+void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int mib_idx;
@@ -2728,14 +2673,18 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
 	    tcp_try_undo_loss(sk, false))
 		return;
 
-	if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
-		/* Step 3.b. A timeout is spurious if not all data are
-		 * lost, i.e., never-retransmitted data are (s)acked.
-		 */
-		if ((flag & FLAG_ORIG_SACK_ACKED) &&
-		    tcp_try_undo_loss(sk, true))
-			return;
+	/* The ACK (s)acks some never-retransmitted data meaning not all
+	 * the data packets before the timeout were lost. Therefore we
+	 * undo the congestion window and state. This is essentially
+	 * the operation in F-RTO (RFC5682 section 3.1 step 3.b). Since
+	 * a retransmitted skb is permanently marked, we can apply such an
+	 * operation even if F-RTO was not used.
+	 */
+	if ((flag & FLAG_ORIG_SACK_ACKED) &&
+	    tcp_try_undo_loss(sk, tp->undo_marker))
+		return;
+	if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
 		if (after(tp->snd_nxt, tp->high_seq)) {
 			if (flag & FLAG_DATA_SACKED || is_dupack)
 				tp->frto = 0; /* Step 3.a. loss was real */
@@ -2802,6 +2751,21 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked)
 	return false;
 }
 
+static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag,
+				   const struct skb_mstamp *ack_time)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* Use RACK to detect loss */
+	if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) {
+		u32 prior_retrans = tp->retrans_out;
+
+		tcp_rack_mark_lost(sk, ack_time);
+		if (prior_retrans > tp->retrans_out)
+			*ack_flag |= FLAG_LOST_RETRANS;
+	}
+}
+
 /* Process an event, which can update packets-in-flight not trivially.
  * Main goal of this function is to calculate new estimate for left_out,
  * taking into account both packets sitting in receiver's buffer and
 *
@@ -2815,7 +2779,8 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked)
 * tcp_xmit_retransmit_queue().
 */
 static void tcp_fastretrans_alert(struct sock *sk, const int acked,
-				  bool is_dupack, int *ack_flag, int *rexmit)
+				  bool is_dupack, int *ack_flag, int *rexmit,
+				  const struct skb_mstamp *ack_time)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
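The new undo condition in tcp_process_loss() above rests on one observation. The toy predicate below restates it under assumed names (illustrative only, not kernel code): if this ACK (s)acks data that was sent before the RTO and never retransmitted, the original flight was delivered, so the timeout was spurious.

#include <stdbool.h>

/* ever_retrans plays the role of the permanent TCPCB_EVER_RETRANS tag;
 * both field names here are hypothetical. */
struct acked_pkt {
	bool sent_before_rto;	/* transmitted before the timeout fired */
	bool ever_retrans;	/* ever retransmitted, even once */
};

static bool rto_was_spurious(const struct acked_pkt *p)
{
	/* a never-retransmitted packet from before the timeout arrived,
	 * so the network delivered the original transmission */
	return p->sent_before_rto && !p->ever_retrans;
}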
@@ -2866,13 +2831,6 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 		}
 	}
 
-	/* Use RACK to detect loss */
-	if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS &&
-	    tcp_rack_mark_lost(sk)) {
-		flag |= FLAG_LOST_RETRANS;
-		*ack_flag |= FLAG_LOST_RETRANS;
-	}
-
 	/* E. Process state. */
 	switch (icsk->icsk_ca_state) {
 	case TCP_CA_Recovery:
@@ -2890,11 +2848,13 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 			tcp_try_keep_open(sk);
 			return;
 		}
+		tcp_rack_identify_loss(sk, ack_flag, ack_time);
 		break;
 	case TCP_CA_Loss:
 		tcp_process_loss(sk, flag, is_dupack, rexmit);
-		if (icsk->icsk_ca_state != TCP_CA_Open &&
-		    !(flag & FLAG_LOST_RETRANS))
+		tcp_rack_identify_loss(sk, ack_flag, ack_time);
+		if (!(icsk->icsk_ca_state == TCP_CA_Open ||
+		      (*ack_flag & FLAG_LOST_RETRANS)))
 			return;
 		/* Change state if cwnd is undone or retransmits are lost */
 	default:
@@ -2908,6 +2868,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 		if (icsk->icsk_ca_state <= TCP_CA_Disorder)
 			tcp_try_undo_dsack(sk);
 
+		tcp_rack_identify_loss(sk, ack_flag, ack_time);
 		if (!tcp_time_to_recover(sk, flag)) {
 			tcp_try_to_open(sk, flag);
 			return;
@@ -3026,7 +2987,7 @@ void tcp_rearm_rto(struct sock *sk)
 	} else {
 		u32 rto = inet_csk(sk)->icsk_rto;
 		/* Offset the time elapsed after installing regular RTO */
-		if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+		if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
 		    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
 			struct sk_buff *skb = tcp_write_queue_head(sk);
 			const u32 rto_time_stamp =
@@ -3043,24 +3004,6 @@ void tcp_rearm_rto(struct sock *sk)
 	}
 }
 
-/* This function is called when the delayed ER timer fires. TCP enters
- * fast recovery and performs fast-retransmit.
- */
-void tcp_resume_early_retransmit(struct sock *sk)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	tcp_rearm_rto(sk);
-
-	/* Stop if ER is disabled after the delayed ER timer is scheduled */
-	if (!tp->do_early_retrans)
-		return;
-
-	tcp_enter_recovery(sk, false);
-	tcp_update_scoreboard(sk, 1);
-	tcp_xmit_retransmit_queue(sk);
-}
-
 /* If we get here, the whole TSO packet has not been acked. */
 static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
 {
@@ -3103,11 +3046,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
 */
 static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 			       u32 prior_snd_una, int *acked,
-			       struct tcp_sacktag_state *sack,
-			       struct skb_mstamp *now)
+			       struct tcp_sacktag_state *sack)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct skb_mstamp first_ackt, last_ackt;
+	struct skb_mstamp *now = &sack->ack_time;
 	struct tcp_sock *tp = tcp_sk(sk);
 	u32 prior_sacked = tp->sacked_out;
 	u32 reord = tp->packets_out;
@@ -3167,7 +3110,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 		} else if (tcp_is_sack(tp)) {
 			tp->delivered += acked_pcount;
 			if (!tcp_skb_spurious_retrans(tp, skb))
-				tcp_rack_advance(tp, &skb->skb_mstamp, sacked);
+				tcp_rack_advance(tp, sacked, scb->end_seq,
+						 &skb->skb_mstamp,
+						 &sack->ack_time);
 		}
 
 		if (sacked & TCPCB_LOST)
 			tp->lost_out -= acked_pcount;
@@ -3201,6 +3146,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 			tp->lost_skb_hint = NULL;
 	}
 
+	if (!skb)
+		tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
+
 	if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
 		tp->snd_up = tp->snd_una;
 
@@ -3371,9 +3319,7 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
 	u32 delta = ack - tp->snd_una;
 
 	sock_owned_by_me((struct sock *)tp);
-	u64_stats_update_begin_raw(&tp->syncp);
 	tp->bytes_acked += delta;
-	u64_stats_update_end_raw(&tp->syncp);
 	tp->snd_una = ack;
 }
 
@@ -3383,9 +3329,7 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
 	u32 delta = seq - tp->rcv_nxt;
 
 	sock_owned_by_me((struct sock *)tp);
-	u64_stats_update_begin_raw(&tp->syncp);
 	tp->bytes_received += delta;
-	u64_stats_update_end_raw(&tp->syncp);
 	tp->rcv_nxt = seq;
 }
 
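tcp_snd_una_update() and tcp_rcv_nxt_update() above rely on unsigned 32-bit wrap-around: "ack - tp->snd_una" yields the correct byte count modulo 2^32 even when the sequence space wraps. A self-contained demonstration:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t snd_una = 0xffffff00u;	/* 256 bytes below the wrap point */
	uint32_t ack     = 0x00000100u;	/* 256 bytes past the wrap point */
	uint32_t delta   = ack - snd_una;	/* computed modulo 2^32 */

	assert(delta == 512);	/* correct despite the wrap */
	return 0;
}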
@@ -3598,7 +3542,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	u32 lost = tp->lost;
 	int acked = 0; /* Number of packets newly acked */
 	int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
-	struct skb_mstamp now;
 
 	sack_state.first_sackt.v64 = 0;
 	sack_state.rate = &rs;
@@ -3624,10 +3567,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	if (after(ack, tp->snd_nxt))
 		goto invalid_ack;
 
-	skb_mstamp_get(&now);
+	skb_mstamp_get(&sack_state.ack_time);
 
-	if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
-	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
+	if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
 		tcp_rearm_rto(sk);
 
 	if (after(ack, prior_snd_una)) {
@@ -3692,34 +3634,34 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	/* See if we can take anything off of the retransmit queue. */
 	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
-				    &sack_state, &now);
+				    &sack_state);
 
 	if (tcp_ack_is_dubious(sk, flag)) {
 		is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
-		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
+		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
+				      &sack_state.ack_time);
 	}
 
 	if (tp->tlp_high_seq)
 		tcp_process_tlp_ack(sk, ack, flag);
 
-	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
-		struct dst_entry *dst = __sk_dst_get(sk);
-		if (dst)
-			dst_confirm(dst);
-	}
+	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
+		sk_dst_confirm(sk);
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS)
 		tcp_schedule_loss_probe(sk);
 	delivered = tp->delivered - delivered;	/* freshly ACKed or SACKed */
 	lost = tp->lost - lost;			/* freshly marked lost */
-	tcp_rate_gen(sk, delivered, lost, &now, &rs);
-	tcp_cong_control(sk, ack, delivered, flag, &rs);
+	tcp_rate_gen(sk, delivered, lost, &sack_state.ack_time,
+		     sack_state.rate);
+	tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
 	tcp_xmit_recovery(sk, rexmit);
 	return 1;
 
 no_queue:
 	/* If data was DSACKed, see if we can undo a cwnd reduction. */
 	if (flag & FLAG_DSACKING_ACK)
-		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
+		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
+				      &sack_state.ack_time);
 	/* If this ack opens up a zero window, clear backoff.  It was
 	 * being used to time the probes, and is probably far higher than
 	 * it needs to be for normal retransmission.
@@ -3740,9 +3682,11 @@ old_ack:
 	 * If data was DSACKed, see if we can undo a cwnd reduction.
 	 */
 	if (TCP_SKB_CB(skb)->sacked) {
+		skb_mstamp_get(&sack_state.ack_time);
 		flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
 						&sack_state);
-		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
+		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
+				      &sack_state.ack_time);
 		tcp_xmit_recovery(sk, rexmit);
 	}
 
@@ -4560,6 +4504,7 @@ add_sack:
 end:
 	if (skb) {
 		tcp_grow_window(sk, skb);
+		skb_condense(skb);
 		skb_set_owner_r(skb, sk);
 	}
 }
@@ -5081,10 +5026,13 @@ static void tcp_check_space(struct sock *sk)
 	if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
 		sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
 		/* pairs with tcp_poll() */
-		smp_mb__after_atomic();
+		smp_mb();
 		if (sk->sk_socket &&
-		    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+		    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
 			tcp_new_space(sk);
+			if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+				tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
+		}
 	}
 }
 
@@ -5249,6 +5197,23 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
 	return err;
 }
 
+/* Accept RST for rcv_nxt - 1 after a FIN.
+ * When tcp connections are abruptly terminated from Mac OSX (via ^C), a
+ * FIN is sent followed by a RST packet. The RST is sent with the same
+ * sequence number as the FIN, and thus according to RFC 5961 a challenge
+ * ACK should be sent. However, Mac OSX rate limits replies to challenge
+ * ACKs on the closed socket. In addition, middleboxes can drop either the
+ * challenge ACK or a subsequent RST.
+ */
+static bool tcp_reset_check(const struct sock *sk, const struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	return unlikely(TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - 1) &&
+			(1 << sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK |
+					       TCPF_CLOSING));
+}
+
 /* Does PAWS and seqno based validation of an incoming segment, flags will
  * play significant role here.
  */
@@ -5287,20 +5252,25 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
 						  LINUX_MIB_TCPACKSKIPPEDSEQ,
 						  &tp->last_oow_ack_time))
 				tcp_send_dupack(sk, skb);
+		} else if (tcp_reset_check(sk, skb)) {
+			tcp_reset(sk);
 		}
 		goto discard;
 	}
 
 	/* Step 2: check RST bit */
 	if (th->rst) {
-		/* RFC 5961 3.2 (extend to match against SACK too if available):
-		 * If seq num matches RCV.NXT or the right-most SACK block,
+		/* RFC 5961 3.2 (extend to match against (RCV.NXT - 1) after a
+		 * FIN and SACK too if available):
+		 * If seq num matches RCV.NXT or (RCV.NXT - 1) after a FIN, or
+		 * the right-most SACK block,
 		 * then
 		 *     RESET the connection
 		 * else
 		 *     Send a challenge ACK
 		 */
-		if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
+		if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt ||
+		    tcp_reset_check(sk, skb)) {
 			rst_seq_match = true;
 		} else if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) {
 			struct tcp_sack_block *sp = &tp->selective_acks[0];
@@ -5916,9 +5886,15 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 		if (th->syn) {
 			if (th->fin)
 				goto discard;
-			if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
-				return 1;
+			/* It is possible that we process SYN packets from backlog,
+			 * so we need to make sure to disable BH right there.
+			 */
+			local_bh_disable();
+			acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;
+			local_bh_enable();
+			if (!acceptable)
+				return 1;
 			consume_skb(skb);
 			return 0;
 		}
@@ -6022,7 +5998,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 		break;
 
 	case TCP_FIN_WAIT1: {
-		struct dst_entry *dst;
 		int tmo;
 
 		/* If we enter the TCP_FIN_WAIT1 state and we are a
@@ -6049,9 +6024,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 		tcp_set_state(sk, TCP_FIN_WAIT2);
 		sk->sk_shutdown |= SEND_SHUTDOWN;
 
-		dst = __sk_dst_get(sk);
-		if (dst)
-			dst_confirm(dst);
+		sk_dst_confirm(sk);
 
 		if (!sock_flag(sk, SOCK_DEAD)) {
 			/* Wake up lingering close() */
@@ -6318,13 +6291,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 			goto drop;
 	}
 
-
-	/* Accept backlog is full. If we have already queued enough
-	 * of warm entries in syn queue, drop request. It is better than
-	 * clogging syn queue with openreqs with exponentially increasing
-	 * timeout.
-	 */
-	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
+	if (sk_acceptq_is_full(sk)) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
 		goto drop;
 	}
 
@@ -6334,6 +6301,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		goto drop;
 
 	tcp_rsk(req)->af_specific = af_ops;
+	tcp_rsk(req)->ts_off = 0;
 
 	tcp_clear_options(&tmp_opt);
 	tmp_opt.mss_clamp = af_ops->mss_clamp;
@@ -6355,6 +6323,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	if (security_inet_conn_request(sk, skb, req))
 		goto drop_and_free;
 
+	if (isn && tmp_opt.tstamp_ok)
+		af_ops->init_seq(skb, &tcp_rsk(req)->ts_off);
+
 	if (!want_cookie && !isn) {
 		/* VJ's idea. We save last timestamp seen
 		 * from the destination in peer table, when entering
@@ -6365,7 +6336,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		 * timewait bucket, so that all the necessary checks
 		 * are made in the function processing timewait state.
 		 */
-		if (tcp_death_row.sysctl_tw_recycle) {
+		if (net->ipv4.tcp_death_row.sysctl_tw_recycle) {
 			bool strict;
 
 			dst = af_ops->route_req(sk, &fl, req, &strict);
@@ -6379,8 +6350,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		}
 		/* Kill the following clause, if you dislike this way. */
 		else if (!net->ipv4.sysctl_tcp_syncookies &&
-			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
-			  (sysctl_max_syn_backlog >> 2)) &&
+			 (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
+			  (net->ipv4.sysctl_max_syn_backlog >> 2)) &&
 			 !tcp_peer_is_proven(req, dst, false,
 					     tmp_opt.saw_tstamp)) {
 			/* Without syncookies last quarter of
@@ -6395,7 +6366,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 			goto drop_and_release;
 		}
 
-		isn = af_ops->init_seq(skb);
+		isn = af_ops->init_seq(skb, &tcp_rsk(req)->ts_off);
 	}
 
 	if (!dst) {
 		dst = af_ops->route_req(sk, &fl, req, NULL);
@@ -6407,6 +6378,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 
 	if (want_cookie) {
 		isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
+		tcp_rsk(req)->ts_off = 0;
 		req->cookie_ts = tmp_opt.tstamp_ok;
 		if (!tmp_opt.tstamp_ok)
 			inet_rsk(req)->ecn_ok = 0;
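The ts_off plumbing added to tcp_conn_request() gives every connection its own timestamp offset, derived alongside the initial sequence number. A hedged sketch of the idea follows; all names are illustrative, and the toy mix function stands in for the keyed cryptographic hash of the 4-tuple that the kernel actually uses.

#include <stdint.h>

/* Toy flow hash: NOT cryptographic, purely for illustration. */
static uint32_t toy_flow_hash(uint32_t saddr, uint32_t daddr,
			      uint16_t sport, uint16_t dport,
			      uint32_t secret)
{
	uint32_t h = saddr ^ daddr ^ (((uint32_t)sport << 16) | dport);

	return (h ^ secret) * 2654435761u;	/* Knuth multiplicative mix */
}

/* One hash seeds both the ISN and the per-flow timestamp offset, so a
 * connection's TSval sequence starts at an unpredictable point yet
 * stays constant for the flow's lifetime. */
static uint32_t toy_init_seq(uint32_t flow_hash, uint32_t *ts_off)
{
	*ts_off = flow_hash;
	return flow_hash;	/* real ISNs also add a clock component */
}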
