Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--  net/ipv4/tcp_input.c  204
1 file changed, 121 insertions, 83 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3ca2139a130b..25a89eaa669d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -688,6 +688,34 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
 	}
 }
 
+/* Set the sk_pacing_rate to allow proper sizing of TSO packets.
+ * Note: TCP stack does not yet implement pacing.
+ * FQ packet scheduler can be used to implement cheap but effective
+ * TCP pacing, to smooth the burst on large writes when packets
+ * in flight is significantly lower than cwnd (or rwin)
+ */
+static void tcp_update_pacing_rate(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	u64 rate;
+
+	/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
+	rate = (u64)tp->mss_cache * 2 * (HZ << 3);
+
+	rate *= max(tp->snd_cwnd, tp->packets_out);
+
+	/* Correction for small srtt : minimum srtt being 8 (1 jiffy << 3),
+	 * be conservative and assume srtt = 1 (125 us instead of 1.25 ms)
+	 * We probably need usec resolution in the future.
+	 * Note: This also takes care of possible srtt=0 case,
+	 * when tcp_rtt_estimator() was not yet called.
+	 */
+	if (tp->srtt > 8 + 2)
+		do_div(rate, tp->srtt);
+
+	sk->sk_pacing_rate = min_t(u64, rate, ~0U);
+}
+
 /* Calculate rto without backoff.  This is the second half of Van Jacobson's
  * routine referred to above.
  */
@@ -1048,6 +1076,7 @@ struct tcp_sacktag_state {
 	int reord;
 	int fack_count;
 	int flag;
+	s32 rtt; /* RTT measured by SACKing never-retransmitted data */
 };
 
 /* Check if skb is fully within the SACK block. In presence of GSO skbs,
@@ -1108,7 +1137,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
 static u8 tcp_sacktag_one(struct sock *sk,
 			  struct tcp_sacktag_state *state, u8 sacked,
 			  u32 start_seq, u32 end_seq,
-			  bool dup_sack, int pcount)
+			  int dup_sack, int pcount, u32 xmit_time)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int fack_count = state->fack_count;
@@ -1148,6 +1177,9 @@ static u8 tcp_sacktag_one(struct sock *sk,
 							   state->reord);
 				if (!after(end_seq, tp->high_seq))
 					state->flag |= FLAG_ORIG_SACK_ACKED;
+				/* Pick the earliest sequence sacked for RTT */
+				if (state->rtt < 0)
+					state->rtt = tcp_time_stamp - xmit_time;
 			}
 
 			if (sacked & TCPCB_LOST) {
@@ -1205,7 +1237,8 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 	 * tcp_highest_sack_seq() when skb is highest_sack.
 	 */
 	tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
-			start_seq, end_seq, dup_sack, pcount);
+			start_seq, end_seq, dup_sack, pcount,
+			TCP_SKB_CB(skb)->when);
 
 	if (skb == tp->lost_skb_hint)
 		tp->lost_cnt_hint += pcount;
@@ -1479,7 +1512,8 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
 						TCP_SKB_CB(skb)->seq,
 						TCP_SKB_CB(skb)->end_seq,
 						dup_sack,
-						tcp_skb_pcount(skb));
+						tcp_skb_pcount(skb),
+						TCP_SKB_CB(skb)->when);
 
 			if (!before(TCP_SKB_CB(skb)->seq,
 				    tcp_highest_sack_seq(tp)))
@@ -1536,7 +1570,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl
 
 static int
 tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
-			u32 prior_snd_una)
+			u32 prior_snd_una, s32 *sack_rtt)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	const unsigned char *ptr = (skb_transport_header(ack_skb) +
@@ -1554,6 +1588,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
 
 	state.flag = 0;
 	state.reord = tp->packets_out;
+	state.rtt = -1;
 
 	if (!tp->sacked_out) {
 		if (WARN_ON(tp->fackets_out))
@@ -1737,6 +1772,7 @@ out:
 	WARN_ON((int)tp->retrans_out < 0);
 	WARN_ON((int)tcp_packets_in_flight(tp) < 0);
 #endif
+	*sack_rtt = state.rtt;
 	return state.flag;
 }
 
@@ -1869,8 +1905,13 @@ void tcp_enter_loss(struct sock *sk, int how)
 	}
 	tcp_verify_left_out(tp);
 
-	tp->reordering = min_t(unsigned int, tp->reordering,
-			       sysctl_tcp_reordering);
+	/* Timeout in disordered state after receiving substantial DUPACKs
+	 * suggests that the degree of reordering is over-estimated.
+	 */
+	if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
+	    tp->sacked_out >= sysctl_tcp_reordering)
+		tp->reordering = min_t(unsigned int, tp->reordering,
+				       sysctl_tcp_reordering);
 	tcp_set_ca_state(sk, TCP_CA_Loss);
 	tp->high_seq = tp->snd_nxt;
 	TCP_ECN_queue_cwr(tp);
@@ -2472,8 +2513,6 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
 
 	if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
 		tcp_try_keep_open(sk);
-		if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
-			tcp_moderate_cwnd(tp);
 	} else {
 		tcp_cwnd_reduction(sk, prior_unsacked, 0);
 	}
@@ -2792,65 +2831,51 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 	tcp_xmit_retransmit_queue(sk);
 }
 
-void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt)
+static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
+				      s32 seq_rtt, s32 sack_rtt)
 {
-	tcp_rtt_estimator(sk, seq_rtt);
-	tcp_set_rto(sk);
-	inet_csk(sk)->icsk_backoff = 0;
-}
-EXPORT_SYMBOL(tcp_valid_rtt_meas);
+	const struct tcp_sock *tp = tcp_sk(sk);
+
+	/* Prefer RTT measured from ACK's timing to TS-ECR. This is because
+	 * broken middle-boxes or peers may corrupt TS-ECR fields. But
+	 * Karn's algorithm forbids taking RTT if some retransmitted data
+	 * is acked (RFC6298).
+	 */
+	if (flag & FLAG_RETRANS_DATA_ACKED)
+		seq_rtt = -1;
+
+	if (seq_rtt < 0)
+		seq_rtt = sack_rtt;
 
-/* Read draft-ietf-tcplw-high-performance before mucking
- * with this code. (Supersedes RFC1323)
- */
-static void tcp_ack_saw_tstamp(struct sock *sk, int flag)
-{
 	/* RTTM Rule: A TSecr value received in a segment is used to
 	 * update the averaged RTT measurement only if the segment
 	 * acknowledges some new data, i.e., only if it advances the
 	 * left edge of the send window.
-	 *
 	 * See draft-ietf-tcplw-high-performance-00, section 3.3.
-	 * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
-	 *
-	 * Changed: reset backoff as soon as we see the first valid sample.
-	 * If we do not, we get strongly overestimated rto. With timestamps
-	 * samples are accepted even from very old segments: f.e., when rtt=1
-	 * increases to 8, we retransmit 5 times and after 8 seconds delayed
-	 * answer arrives rto becomes 120 seconds! If at least one of segments
-	 * in window is lost... Voila.	 			--ANK (010210)
 	 */
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	tcp_valid_rtt_meas(sk, tcp_time_stamp - tp->rx_opt.rcv_tsecr);
-}
+	if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
+		seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
 
-static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag)
-{
-	/* We don't have a timestamp. Can only use
-	 * packets that are not retransmitted to determine
-	 * rtt estimates. Also, we must not reset the
-	 * backoff for rto until we get a non-retransmitted
-	 * packet. This allows us to deal with a situation
-	 * where the network delay has increased suddenly.
-	 * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
-	 */
+	if (seq_rtt < 0)
+		return false;
 
-	if (flag & FLAG_RETRANS_DATA_ACKED)
-		return;
+	tcp_rtt_estimator(sk, seq_rtt);
+	tcp_set_rto(sk);
 
-	tcp_valid_rtt_meas(sk, seq_rtt);
+	/* RFC6298: only reset backoff on valid RTT measurement. */
+	inet_csk(sk)->icsk_backoff = 0;
+	return true;
 }
 
-static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
-				      const s32 seq_rtt)
+/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
+static void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
 {
-	const struct tcp_sock *tp = tcp_sk(sk);
-	/* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
-	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
-		tcp_ack_saw_tstamp(sk, flag);
-	else if (seq_rtt >= 0)
-		tcp_ack_no_tstamp(sk, seq_rtt, flag);
+	struct tcp_sock *tp = tcp_sk(sk);
+	s32 seq_rtt = -1;
+
+	if (tp->lsndtime && !tp->total_retrans)
+		seq_rtt = tcp_time_stamp - tp->lsndtime;
+	tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1);
 }
 
 static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
@@ -2939,7 +2964,7 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
  * arrived at the other end.
  */
 static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
-			       u32 prior_snd_una)
+			       u32 prior_snd_una, s32 sack_rtt)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2978,8 +3003,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 			if (sacked & TCPCB_SACKED_RETRANS)
 				tp->retrans_out -= acked_pcount;
 			flag |= FLAG_RETRANS_DATA_ACKED;
-			ca_seq_rtt = -1;
-			seq_rtt = -1;
 		} else {
 			ca_seq_rtt = now - scb->when;
 			last_ackt = skb->tstamp;
@@ -3031,6 +3054,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 	if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
 		flag |= FLAG_SACK_RENEGING;
 
+	if (tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt) ||
+	    (flag & FLAG_ACKED))
+		tcp_rearm_rto(sk);
+
 	if (flag & FLAG_ACKED) {
 		const struct tcp_congestion_ops *ca_ops
 			= inet_csk(sk)->icsk_ca_ops;
@@ -3040,9 +3067,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 			tcp_mtup_probe_success(sk);
 		}
 
-		tcp_ack_update_rtt(sk, flag, seq_rtt);
-		tcp_rearm_rto(sk);
-
 		if (tcp_is_reno(tp)) {
 			tcp_remove_reno_sacks(sk, pkts_acked);
 		} else {
@@ -3130,11 +3154,22 @@ static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
 		inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
 }
 
+/* Decide wheather to run the increase function of congestion control. */
 static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
 {
-	const struct tcp_sock *tp = tcp_sk(sk);
-	return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
-		!tcp_in_cwnd_reduction(sk);
+	if (tcp_in_cwnd_reduction(sk))
+		return false;
+
+	/* If reordering is high then always grow cwnd whenever data is
+	 * delivered regardless of its ordering. Otherwise stay conservative
+	 * and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/
+	 * new SACK or ECE mark may first advance cwnd here and later reduce
+	 * cwnd in tcp_fastretrans_alert() based on more states.
+	 */
+	if (tcp_sk(sk)->reordering > sysctl_tcp_reordering)
+		return flag & FLAG_FORWARD_PROGRESS;
+
+	return flag & FLAG_DATA_ACKED;
 }
 
 /* Check that window update is acceptable.
@@ -3269,11 +3304,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	u32 ack_seq = TCP_SKB_CB(skb)->seq;
 	u32 ack = TCP_SKB_CB(skb)->ack_seq;
 	bool is_dupack = false;
-	u32 prior_in_flight;
+	u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt;
 	u32 prior_fackets;
 	int prior_packets = tp->packets_out;
 	const int prior_unsacked = tp->packets_out - tp->sacked_out;
 	int acked = 0; /* Number of packets newly acked */
+	s32 sack_rtt = -1;
 
 	/* If the ack is older than previous acks
 	 * then we can probably ignore it.
@@ -3330,7 +3366,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
 
 		if (TCP_SKB_CB(skb)->sacked)
-			flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
+			flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
+							&sack_rtt);
 
 		if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
 			flag |= FLAG_ECE;
@@ -3349,21 +3386,18 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	/* See if we can take anything off of the retransmit queue. */
 	acked = tp->packets_out;
-	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una);
+	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, sack_rtt);
 	acked -= tp->packets_out;
 
+	/* Advance cwnd if state allows */
+	if (tcp_may_raise_cwnd(sk, flag))
+		tcp_cong_avoid(sk, ack, prior_in_flight);
+
 	if (tcp_ack_is_dubious(sk, flag)) {
-		/* Advance CWND, if state allows this. */
-		if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag))
-			tcp_cong_avoid(sk, ack, prior_in_flight);
 		is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
 		tcp_fastretrans_alert(sk, acked, prior_unsacked,
 				      is_dupack, flag);
-	} else {
-		if (flag & FLAG_DATA_ACKED)
-			tcp_cong_avoid(sk, ack, prior_in_flight);
 	}
-
 	if (tp->tlp_high_seq)
 		tcp_process_tlp_ack(sk, ack, flag);
 
@@ -3375,6 +3409,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS)
 		tcp_schedule_loss_probe(sk);
 
+	if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd)
+		tcp_update_pacing_rate(sk);
 	return 1;
 
 no_queue:
@@ -3402,7 +3438,8 @@ old_ack:
 	 * If data was DSACKed, see if we can undo a cwnd reduction.
 	 */
 	if (TCP_SKB_CB(skb)->sacked) {
-		flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
+		flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
+						&sack_rtt);
 		tcp_fastretrans_alert(sk, acked, prior_unsacked,
 				      is_dupack, flag);
 	}
@@ -4102,6 +4139,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 		if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
 			__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
 		} else {
+			tcp_grow_window(sk, skb);
 			kfree_skb_partial(skb, fragstolen);
 			skb = NULL;
 		}
@@ -4177,8 +4215,10 @@ add_sack:
 	if (tcp_is_sack(tp))
 		tcp_sack_new_ofo_skb(sk, seq, end_seq);
 end:
-	if (skb)
+	if (skb) {
+		tcp_grow_window(sk, skb);
 		skb_set_owner_r(skb, sk);
+	}
 }
 
 static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
@@ -5013,8 +5053,8 @@ discard:
  *	the rest is checked inline. Fast processing is turned on in
  *	tcp_data_queue when everything is OK.
  */
-int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
-			const struct tcphdr *th, unsigned int len)
+void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
+			 const struct tcphdr *th, unsigned int len)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -5091,7 +5131,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 				tcp_ack(sk, skb, 0);
 				__kfree_skb(skb);
 				tcp_data_snd_check(sk);
-				return 0;
+				return;
 			} else { /* Header too small */
 				TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
 				goto discard;
@@ -5184,7 +5224,7 @@ no_ack:
 			if (eaten)
 				kfree_skb_partial(skb, fragstolen);
 			sk->sk_data_ready(sk, 0);
-			return 0;
+			return;
 		}
 	}
 
@@ -5200,7 +5240,7 @@ slow_path:
 	 */
 
 	if (!tcp_validate_incoming(sk, skb, th, 1))
-		return 0;
+		return;
 
 step5:
 	if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
@@ -5216,7 +5256,7 @@ step5:
 
 	tcp_data_snd_check(sk);
 	tcp_ack_snd_check(sk);
-	return 0;
+	return;
 
 csum_error:
 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
@@ -5224,7 +5264,6 @@ csum_error:
 
 discard:
 	__kfree_skb(skb);
-	return 0;
 }
 EXPORT_SYMBOL(tcp_rcv_established);
 
@@ -5627,9 +5666,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		 * so release it.
 		 */
 		if (req) {
-			tcp_synack_rtt_meas(sk, req);
 			tp->total_retrans = req->num_retrans;
-
 			reqsk_fastopen_remove(sk, req, false);
 		} else {
 			/* Make sure socket is routed, for correct metrics. */
@@ -5654,6 +5691,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
 		tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
 		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
+		tcp_synack_rtt_meas(sk, req);
 
 		if (tp->rx_opt.tstamp_ok)
 			tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
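Editor's note: the tcp_update_pacing_rate() hunk above sets sk_pacing_rate to roughly 200 % of the current delivery rate (mss * cwnd / srtt), with srtt stored in jiffies scaled by 8. The standalone userspace sketch below reproduces that arithmetic under stated assumptions: HZ is taken as 1000, the pacing_rate_bytes_per_sec() helper is hypothetical, and plain 64-bit division/clamping stands in for the kernel's do_div() and min_t(); it is not kernel code.

/* Userspace sketch of the sk_pacing_rate arithmetic shown in the diff.
 * Assumes HZ = 1000; srtt is in jiffies << 3, as in struct tcp_sock.
 */
#include <stdint.h>
#include <stdio.h>

#define HZ 1000U

static uint64_t pacing_rate_bytes_per_sec(uint32_t mss, uint32_t cwnd,
                                          uint32_t packets_out, uint32_t srtt)
{
	/* 200% of mss * cwnd / srtt; the (HZ << 3) factor cancels the
	 * jiffies<<3 scaling of srtt and converts to per-second units. */
	uint64_t rate = (uint64_t)mss * 2 * (HZ << 3);

	rate *= (cwnd > packets_out) ? cwnd : packets_out;

	/* Guard against tiny/zero srtt, mirroring the "srtt > 8 + 2" check */
	if (srtt > 8 + 2)
		rate /= srtt;

	/* sk_pacing_rate is clamped to 32 bits, as min_t(u64, rate, ~0U) does */
	return rate > UINT32_MAX ? UINT32_MAX : rate;
}

int main(void)
{
	/* mss 1448, cwnd 10, srtt 100 ms = 100 jiffies = 800 once scaled
	 * -> 2 * 14480 bytes per 0.1 s = 289600 bytes/sec */
	printf("%llu bytes/sec\n",
	       (unsigned long long)pacing_rate_bytes_per_sec(1448, 10, 0, 800));
	return 0;
}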
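The reworked tcp_ack_update_rtt() above prefers an RTT sample taken from the ACK's own timing, falls back to the RTT measured from SACKing never-retransmitted data, and only then to the TS-ECR echo; Karn's rule discards ACK timing whenever retransmitted data is acked. A minimal sketch of that selection order follows; pick_rtt_sample() and its flag constant are illustrative stand-ins, not the kernel's definitions.

/* Sketch of the RTT-source preference: ACK timing (unless Karn's rule
 * applies), then SACK-derived RTT, then TS-ECR as a last resort.
 */
#include <stdbool.h>
#include <stdint.h>

#define FLAG_RETRANS_DATA_ACKED 0x1	/* illustrative value only */

static bool pick_rtt_sample(int flag, int32_t seq_rtt, int32_t sack_rtt,
                            bool saw_tstamp, int32_t tsecr_rtt,
                            int32_t *out_rtt)
{
	/* Karn's algorithm: discard ACK timing if retransmitted data is acked */
	if (flag & FLAG_RETRANS_DATA_ACKED)
		seq_rtt = -1;

	if (seq_rtt < 0)
		seq_rtt = sack_rtt;	/* RTT from never-retransmitted SACKed data */

	if (seq_rtt < 0 && saw_tstamp)
		seq_rtt = tsecr_rtt;	/* TS-ECR, least trusted (middleboxes) */

	if (seq_rtt < 0)
		return false;		/* no usable sample on this ACK */

	*out_rtt = seq_rtt;
	return true;
}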
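The new tcp_may_raise_cwnd() changes when the congestion-control increase function runs: never during cwnd reduction, on any forward progress when the estimated reordering is high, and otherwise only on in-order cumulative ACKs (RFC 5681). A rough sketch of that decision is shown below; may_raise_cwnd() and the flag values are hypothetical, chosen only to mirror the logic in the diff.

/* Sketch of the cwnd-raise decision added by the diff. */
#include <stdbool.h>

#define FLAG_DATA_ACKED       0x04	/* illustrative values only */
#define FLAG_FORWARD_PROGRESS 0x08

static bool may_raise_cwnd(bool in_cwnd_reduction, unsigned int reordering,
                           unsigned int reordering_thresh, int flag)
{
	if (in_cwnd_reduction)
		return false;

	/* High reordering: grow on any forward progress (new SACK or ACK). */
	if (reordering > reordering_thresh)
		return flag & FLAG_FORWARD_PROGRESS;

	/* Otherwise only grow on in-order cumulative ACKs. */
	return flag & FLAG_DATA_ACKED;
}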
