Diffstat (limited to 'net/ipv4/tcp_output.c')

 net/ipv4/tcp_output.c | 349 ++++++++++++++++++++++++++++++++------------
 1 file changed, 245 insertions(+), 104 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index ccfc8bbf7455..f558c054cf6e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -170,15 +170,14 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
 	tp->lsndtime = now;
 
 	/* If it is a reply for ato after last received
-	 * packet, enter pingpong mode.
+	 * packet, increase pingpong count.
 	 */
 	if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
-		inet_csk_enter_pingpong_mode(sk);
+		inet_csk_inc_pingpong_cnt(sk);
 }
 
 /* Account for an ACK we sent. */
-static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
-				      u32 rcv_nxt)
+static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -192,7 +191,7 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
 	if (unlikely(rcv_nxt != tp->rcv_nxt))
 		return;  /* Special ACK sent by DCTCP to reflect ECN */
-	tcp_dec_quickack_mode(sk, pkts);
+	tcp_dec_quickack_mode(sk);
 	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
 }
 
@@ -423,6 +422,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
 #define OPTION_FAST_OPEN_COOKIE	BIT(8)
 #define OPTION_SMC		BIT(9)
 #define OPTION_MPTCP		BIT(10)
+#define OPTION_AO		BIT(11)
 
 static void smc_options_write(__be32 *ptr, u16 *options)
 {
@@ -615,19 +615,52 @@ static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
  * (but it may well be that other scenarios fail similarly).
  */
 static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
-			      struct tcp_out_options *opts)
+			      const struct tcp_request_sock *tcprsk,
+			      struct tcp_out_options *opts,
+			      struct tcp_key *key)
 {
 	__be32 *ptr = (__be32 *)(th + 1);
 	u16 options = opts->options;	/* mungable copy */
 
-	if (unlikely(OPTION_MD5 & options)) {
+	if (tcp_key_is_md5(key)) {
 		*ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 			       (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
 		/* overload cookie hash location */
 		opts->hash_location = (__u8 *)ptr;
 		ptr += 4;
-	}
+	} else if (tcp_key_is_ao(key)) {
+#ifdef CONFIG_TCP_AO
+		u8 maclen = tcp_ao_maclen(key->ao_key);
+
+		if (tcprsk) {
+			u8 aolen = maclen + sizeof(struct tcp_ao_hdr);
+
+			*ptr++ = htonl((TCPOPT_AO << 24) | (aolen << 16) |
+				       (tcprsk->ao_keyid << 8) |
+				       (tcprsk->ao_rcv_next));
+		} else {
+			struct tcp_ao_key *rnext_key;
+			struct tcp_ao_info *ao_info;
+
+			ao_info = rcu_dereference_check(tp->ao_info,
+				lockdep_sock_is_held(&tp->inet_conn.icsk_inet.sk));
+			rnext_key = READ_ONCE(ao_info->rnext_key);
+			if (WARN_ON_ONCE(!rnext_key))
+				goto out_ao;
+			*ptr++ = htonl((TCPOPT_AO << 24) |
+				       (tcp_ao_len(key->ao_key) << 16) |
+				       (key->ao_key->sndid << 8) |
+				       (rnext_key->rcvid));
+		}
+		opts->hash_location = (__u8 *)ptr;
+		ptr += maclen / sizeof(*ptr);
+		if (unlikely(maclen % sizeof(*ptr))) {
+			memset(ptr, TCPOPT_NOP, sizeof(*ptr));
+			ptr++;
+		}
+out_ao:
+#endif
+	}
 	if (unlikely(opts->mss)) {
 		*ptr++ = htonl((TCPOPT_MSS << 24) |
 			       (TCPOLEN_MSS << 16) |
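For reference, the first word the TCP-AO branch above emits is the RFC 5925 option header: kind (TCPOPT_AO, 29), total option length, KeyID, and RNextKeyID, packed with the same htonl() idiom the MD5 branch uses. Below is a minimal userspace sketch of that word packing, assuming a 4-byte tcp_ao_hdr; the helper name pack_ao_hdr() is hypothetical, not a kernel function.

#include <stdint.h>
#include <arpa/inet.h>

#define TCPOPT_AO 29	/* RFC 5925 option kind */

/* Pack kind, length, KeyID and RNextKeyID into one network-order word,
 * mirroring the htonl() expression in tcp_options_write() above.
 */
static uint32_t pack_ao_hdr(uint8_t maclen, uint8_t keyid, uint8_t rnext)
{
	uint8_t aolen = maclen + 4;	/* kind + len + keyid + rnext */

	return htonl((TCPOPT_AO << 24) | (aolen << 16) |
		     (keyid << 8) | rnext);
}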
@@ -768,23 +801,25 @@ static void mptcp_set_option_cond(const struct request_sock *req,
  */
 static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 				    struct tcp_out_options *opts,
-				    struct tcp_md5sig_key **md5)
+				    struct tcp_key *key)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned int remaining = MAX_TCP_OPTION_SPACE;
 	struct tcp_fastopen_request *fastopen = tp->fastopen_req;
+	bool timestamps;
 
-	*md5 = NULL;
-#ifdef CONFIG_TCP_MD5SIG
-	if (static_branch_unlikely(&tcp_md5_needed.key) &&
-	    rcu_access_pointer(tp->md5sig_info)) {
-		*md5 = tp->af_specific->md5_lookup(sk, sk);
-		if (*md5) {
-			opts->options |= OPTION_MD5;
-			remaining -= TCPOLEN_MD5SIG_ALIGNED;
+	/* Better than switch (key.type) as it has static branches */
+	if (tcp_key_is_md5(key)) {
+		timestamps = false;
+		opts->options |= OPTION_MD5;
+		remaining -= TCPOLEN_MD5SIG_ALIGNED;
+	} else {
+		timestamps = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps);
+		if (tcp_key_is_ao(key)) {
+			opts->options |= OPTION_AO;
+			remaining -= tcp_ao_len(key->ao_key);
 		}
 	}
-#endif
 
 	/* We always get an MSS option.  The option bytes which will be seen in
 	 * normal data packets should timestamps be used, must be in the MSS
@@ -798,9 +833,9 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 	opts->mss = tcp_advertise_mss(sk);
 	remaining -= TCPOLEN_MSS_ALIGNED;
 
-	if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps) && !*md5)) {
+	if (likely(timestamps)) {
 		opts->options |= OPTION_TS;
-		opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
+		opts->tsval = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) + tp->tsoffset;
 		opts->tsecr = tp->rx_opt.ts_recent;
 		remaining -= TCPOLEN_TSTAMP_ALIGNED;
 	}
@@ -851,7 +886,7 @@ static unsigned int tcp_synack_options(const struct sock *sk,
 				       struct request_sock *req,
 				       unsigned int mss, struct sk_buff *skb,
 				       struct tcp_out_options *opts,
-				       const struct tcp_md5sig_key *md5,
+				       const struct tcp_key *key,
 				       struct tcp_fastopen_cookie *foc,
 				       enum tcp_synack_type synack_type,
 				       struct sk_buff *syn_skb)
@@ -859,8 +894,7 @@ static unsigned int tcp_synack_options(const struct sock *sk,
 	struct inet_request_sock *ireq = inet_rsk(req);
 	unsigned int remaining = MAX_TCP_OPTION_SPACE;
 
-#ifdef CONFIG_TCP_MD5SIG
-	if (md5) {
+	if (tcp_key_is_md5(key)) {
 		opts->options |= OPTION_MD5;
 		remaining -= TCPOLEN_MD5SIG_ALIGNED;
 
@@ -871,8 +905,11 @@ static unsigned int tcp_synack_options(const struct sock *sk,
 		 */
 		if (synack_type != TCP_SYNACK_COOKIE)
 			ireq->tstamp_ok &= !ireq->sack_ok;
+	} else if (tcp_key_is_ao(key)) {
+		opts->options |= OPTION_AO;
+		remaining -= tcp_ao_len(key->ao_key);
+		ireq->tstamp_ok &= !ireq->sack_ok;
 	}
-#endif
 
 	/* We always send an MSS option. */
 	opts->mss = mss;
@@ -885,7 +922,8 @@
 	}
 	if (likely(ireq->tstamp_ok)) {
 		opts->options |= OPTION_TS;
-		opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off;
+		opts->tsval = tcp_skb_timestamp_ts(tcp_rsk(req)->req_usec_ts, skb) +
+			      tcp_rsk(req)->ts_off;
 		opts->tsecr = READ_ONCE(req->ts_recent);
 		remaining -= TCPOLEN_TSTAMP_ALIGNED;
 	}
@@ -922,7 +960,7 @@
  */
 static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,
 					struct tcp_out_options *opts,
-					struct tcp_md5sig_key **md5)
+					struct tcp_key *key)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned int size = 0;
@@ -930,21 +968,19 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
 
 	opts->options = 0;
 
-	*md5 = NULL;
-#ifdef CONFIG_TCP_MD5SIG
-	if (static_branch_unlikely(&tcp_md5_needed.key) &&
-	    rcu_access_pointer(tp->md5sig_info)) {
-		*md5 = tp->af_specific->md5_lookup(sk, sk);
-		if (*md5) {
-			opts->options |= OPTION_MD5;
-			size += TCPOLEN_MD5SIG_ALIGNED;
-		}
+	/* Better than switch (key.type) as it has static branches */
+	if (tcp_key_is_md5(key)) {
+		opts->options |= OPTION_MD5;
+		size += TCPOLEN_MD5SIG_ALIGNED;
+	} else if (tcp_key_is_ao(key)) {
+		opts->options |= OPTION_AO;
+		size += tcp_ao_len(key->ao_key);
 	}
-#endif
 
 	if (likely(tp->rx_opt.tstamp_ok)) {
 		opts->options |= OPTION_TS;
-		opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0;
+		opts->tsval = skb ? tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) +
+				    tp->tsoffset : 0;
 		opts->tsecr = tp->rx_opt.ts_recent;
 		size += TCPOLEN_TSTAMP_ALIGNED;
 	}
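To make the option budgeting above concrete: everything must fit in MAX_TCP_OPTION_SPACE (40 bytes). A small standalone sketch of the arithmetic, assuming the usual kernel constants (4 bytes for aligned MSS, 12 for aligned timestamps) and a TCP-AO option carrying a 96-bit MAC, as with HMAC-SHA1 truncation per RFC 5925:

#include <stdio.h>

#define MAX_TCP_OPTION_SPACE	40
#define TCPOLEN_MSS_ALIGNED	4
#define TCPOLEN_TSTAMP_ALIGNED	12

int main(void)
{
	unsigned int remaining = MAX_TCP_OPTION_SPACE;
	unsigned int ao_len = 4 + 12;	/* tcp_ao_hdr + 96-bit MAC */

	remaining -= TCPOLEN_MSS_ALIGNED;	/* SYN always carries MSS */
	remaining -= TCPOLEN_TSTAMP_ALIGNED;	/* timestamps, if enabled */
	remaining -= ao_len;			/* TCP-AO option */
	printf("bytes left for SACK etc.: %u\n", remaining);	/* 8 */
	return 0;
}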
@@ -1077,7 +1113,8 @@ static void tcp_tasklet_func(struct tasklet_struct *t)
 #define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED |		\
 			  TCPF_WRITE_TIMER_DEFERRED |	\
 			  TCPF_DELACK_TIMER_DEFERRED |	\
-			  TCPF_MTU_REDUCED_DEFERRED)
+			  TCPF_MTU_REDUCED_DEFERRED |	\
+			  TCPF_ACK_DEFERRED)
 /**
  * tcp_release_cb - tcp release_sock() callback
  * @sk: socket
@@ -1101,16 +1138,6 @@ void tcp_release_cb(struct sock *sk)
 		tcp_tsq_write(sk);
 		__sock_put(sk);
 	}
-	/* Here begins the tricky part :
-	 * We are called from release_sock() with :
-	 * 1) BH disabled
-	 * 2) sk_lock.slock spinlock held
-	 * 3) socket owned by us (sk->sk_lock.owned == 1)
-	 *
-	 * But following code is meant to be called from BH handlers,
-	 * so we should keep BH disabled, but early release socket ownership
-	 */
-	sock_release_ownership(sk);
 
 	if (flags & TCPF_WRITE_TIMER_DEFERRED) {
 		tcp_write_timer_handler(sk);
@@ -1124,6 +1151,8 @@
 		inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
 		__sock_put(sk);
 	}
+	if ((flags & TCPF_ACK_DEFERRED) && inet_csk_ack_scheduled(sk))
+		tcp_send_ack(sk);
 }
 EXPORT_SYMBOL(tcp_release_cb);
 
@@ -1208,7 +1237,7 @@ static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	if (sk->sk_pacing_status != SK_PACING_NONE) {
-		unsigned long rate = sk->sk_pacing_rate;
+		unsigned long rate = READ_ONCE(sk->sk_pacing_rate);
 
 		/* Original sch_fq does not pace first 10 MSS
 		 * Note that tp->data_segs_out overflows after 2^32 packets,
@@ -1251,7 +1280,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 	struct tcp_out_options opts;
 	unsigned int tcp_options_size, tcp_header_size;
 	struct sk_buff *oskb = NULL;
-	struct tcp_md5sig_key *md5;
+	struct tcp_key key;
 	struct tcphdr *th;
 	u64 prior_wstamp;
 	int err;
@@ -1283,11 +1312,11 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 	tcb = TCP_SKB_CB(skb);
 	memset(&opts, 0, sizeof(opts));
 
+	tcp_get_current_key(sk, &key);
 	if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
-		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
+		tcp_options_size = tcp_syn_options(sk, skb, &opts, &key);
 	} else {
-		tcp_options_size = tcp_established_options(sk, skb, &opts,
-							   &md5);
+		tcp_options_size = tcp_established_options(sk, skb, &opts, &key);
 		/* Force a PSH flag on all (GSO) packets to expedite GRO flush
 		 * at receiver : This slightly improve GRO performance.
 		 * Note that we do not force the PSH flag for non GSO packets,
@@ -1332,7 +1361,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 	skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
 
-	skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);
+	skb_set_dst_pending_confirm(skb, READ_ONCE(sk->sk_dst_pending_confirm));
 
 	/* Build TCP header and checksum it. */
 	th = (struct tcphdr *)skb->data;
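The new TCPF_ACK_DEFERRED bit follows the existing release_cb pattern: a BH path that finds the socket owned records pending work in sk->sk_tsq_flags, and tcp_release_cb() replays it once the owner releases the socket. A userspace sketch of that pattern follows; every name in it is an illustrative stand-in, not kernel API.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum { ACK_DEFERRED = 1u << 0 };	/* plays the role of TCPF_ACK_DEFERRED */

static _Atomic unsigned int tsq_flags;

/* Fast path: if the "socket" is owned, record the work instead of doing it. */
static void bh_wants_ack(bool socket_owned)
{
	if (socket_owned)
		atomic_fetch_or(&tsq_flags, ACK_DEFERRED);
	else
		puts("send ACK now");
}

/* Runs when the owner releases the socket, like tcp_release_cb(). */
static void release_cb(void)
{
	unsigned int flags = atomic_exchange(&tsq_flags, 0);

	if (flags & ACK_DEFERRED)
		puts("send deferred ACK");
}

int main(void)
{
	bh_wants_ack(true);	/* socket owned: ACK is deferred */
	release_cb();		/* deferred ACK goes out here */
	return 0;
}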
@@ -1368,16 +1397,25 @@
 		th->window	= htons(min(tp->rcv_wnd, 65535U));
 	}
 
-	tcp_options_write(th, tp, &opts);
+	tcp_options_write(th, tp, NULL, &opts, &key);
 
+	if (tcp_key_is_md5(&key)) {
 #ifdef CONFIG_TCP_MD5SIG
-	/* Calculate the MD5 hash, as we have all we need now */
-	if (md5) {
+		/* Calculate the MD5 hash, as we have all we need now */
 		sk_gso_disable(sk);
 		tp->af_specific->calc_md5_hash(opts.hash_location,
-					       md5, sk, skb);
-	}
+					       key.md5_key, sk, skb);
 #endif
+	} else if (tcp_key_is_ao(&key)) {
+		int err;
+
+		err = tcp_ao_transmit_skb(sk, skb, key.ao_key, th,
+					  opts.hash_location);
+		if (err) {
+			kfree_skb_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED);
+			return -ENOMEM;
+		}
+	}
 
 	/* BPF prog is the last one writing header option */
 	bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts);
@@ -1387,7 +1425,7 @@
 			   sk, skb);
 
 	if (likely(tcb->tcp_flags & TCPHDR_ACK))
-		tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);
+		tcp_event_ack_sent(sk, rcv_nxt);
 
 	if (skb->len != tcp_header_size) {
 		tcp_event_data_sent(tp, sk);
@@ -1704,14 +1742,6 @@ static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
 	 */
 	mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
 
-	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
-	if (icsk->icsk_af_ops->net_frag_header_len) {
-		const struct dst_entry *dst = __sk_dst_get(sk);
-
-		if (dst && dst_allfrag(dst))
-			mss_now -= icsk->icsk_af_ops->net_frag_header_len;
-	}
-
 	/* Clamp it (mss_clamp does not include tcp options) */
 	if (mss_now > tp->rx_opt.mss_clamp)
 		mss_now = tp->rx_opt.mss_clamp;
@@ -1739,21 +1769,11 @@ int tcp_mss_to_mtu(struct sock *sk, int mss)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
-	int mtu;
 
-	mtu = mss +
+	return mss +
 	      tp->tcp_header_len +
 	      icsk->icsk_ext_hdr_len +
 	      icsk->icsk_af_ops->net_header_len;
-
-	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
-	if (icsk->icsk_af_ops->net_frag_header_len) {
-		const struct dst_entry *dst = __sk_dst_get(sk);
-
-		if (dst && dst_allfrag(dst))
-			mtu += icsk->icsk_af_ops->net_frag_header_len;
-	}
-	return mtu;
 }
 EXPORT_SYMBOL(tcp_mss_to_mtu);
 
@@ -1828,7 +1848,7 @@ unsigned int tcp_current_mss(struct sock *sk)
 	u32 mss_now;
 	unsigned int header_len;
 	struct tcp_out_options opts;
-	struct tcp_md5sig_key *md5;
+	struct tcp_key key;
 
 	mss_now = tp->mss_cache;
 
@@ -1837,8 +1857,8 @@
 		if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
 			mss_now = tcp_sync_mss(sk, mtu);
 	}
-
-	header_len = tcp_established_options(sk, NULL, &opts, &md5) +
+	tcp_get_current_key(sk, &key);
+	header_len = tcp_established_options(sk, NULL, &opts, &key) +
 		     sizeof(struct tcphdr);
 	/* The mss_cache is sized based on tp->tcp_header_len, which assumes
 	 * some common options. If this is an odd packet (because we have SACK
@@ -1980,7 +2000,7 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
 	unsigned long bytes;
 	u32 r;
 
-	bytes = sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift);
+	bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift);
 
 	r = tcp_min_rtt(tcp_sk(sk)) >> READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_rtt_log);
 	if (r < BITS_PER_TYPE(sk->sk_gso_max_size))
@@ -2457,6 +2477,7 @@ static int tcp_mtu_probe(struct sock *sk)
 
 	/* build the payload, and be prepared to abort if this fails. */
 	if (tcp_clone_payload(sk, nskb, probe_size)) {
+		tcp_skb_tsorted_anchor_cleanup(nskb);
 		consume_skb(nskb);
 		return -1;
 	}
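With the RTAX_FEATURE_ALLFRAG terms gone, tcp_mss_to_mtu() above is plain addition and mirrors the subtraction in __tcp_mtu_to_mss(). A worked example of the round trip, assuming IPv4 without options and a 32-byte TCP header (20-byte base plus 12 for aligned timestamps):

#include <assert.h>
#include <stdio.h>

int main(void)
{
	int net_header_len = 20;	/* IPv4 header, no options */
	int tcp_header_len = 20 + 12;	/* base header + aligned timestamps */
	int ext_hdr_len = 0;		/* no IPsec etc. */
	int mtu = 1500;

	/* mtu -> mss, then back, following the simplified formulas above */
	int mss = mtu - net_header_len - tcp_header_len - ext_hdr_len;
	int back = mss + tcp_header_len + ext_hdr_len + net_header_len;

	printf("mss=%d mtu=%d\n", mss, back);	/* mss=1448 mtu=1500 */
	assert(back == mtu);			/* exact inverses now */
	return 0;
}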
@@ -2542,6 +2563,18 @@ static bool tcp_pacing_check(struct sock *sk)
 	return true;
 }
 
+static bool tcp_rtx_queue_empty_or_single_skb(const struct sock *sk)
+{
+	const struct rb_node *node = sk->tcp_rtx_queue.rb_node;
+
+	/* No skb in the rtx queue. */
+	if (!node)
+		return true;
+
+	/* Only one skb in rtx queue. */
+	return !node->rb_left && !node->rb_right;
+}
+
 /* TCP Small Queues :
  * Control number of packets in qdisc/devices to two packets / or ~1 ms.
  * (These limits are doubled for retransmits)
@@ -2560,7 +2593,7 @@
 	limit = max_t(unsigned long,
 		      2 * skb->truesize,
-		      sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift));
+		      READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift));
 	if (sk->sk_pacing_status == SK_PACING_NONE)
 		limit = min_t(unsigned long, limit,
 			      READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes));
@@ -2568,7 +2601,8 @@
 
 	if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
 	    tcp_sk(sk)->tcp_tx_delay) {
-		u64 extra_bytes = (u64)sk->sk_pacing_rate * tcp_sk(sk)->tcp_tx_delay;
+		u64 extra_bytes = (u64)READ_ONCE(sk->sk_pacing_rate) *
+				  tcp_sk(sk)->tcp_tx_delay;
 
 		/* TSQ is based on skb truesize sum (sk_wmem_alloc), so we
 		 * approximate our needs assuming an ~100% skb->truesize overhead.
@@ -2579,12 +2613,12 @@
 		limit += extra_bytes;
 	}
 	if (refcount_read(&sk->sk_wmem_alloc) > limit) {
-		/* Always send skb if rtx queue is empty.
+		/* Always send skb if rtx queue is empty or has one skb.
 		 * No need to wait for TX completion to call us back,
 		 * after softirq/tasklet schedule.
 		 * This helps when TX completions are delayed too much.
 		 */
-		if (tcp_rtx_queue_empty(sk))
+		if (tcp_rtx_queue_empty_or_single_skb(sk))
 			return false;
 
 		set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
@@ -2788,7 +2822,7 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
-	u32 timeout, rto_delta_us;
+	u32 timeout, timeout_us, rto_delta_us;
 	int early_retrans;
 
 	/* Don't do any loss probe on a Fast Open connection before 3WHS
@@ -2812,11 +2846,12 @@
 	 * sample is available then probe after TCP_TIMEOUT_INIT.
 	 */
 	if (tp->srtt_us) {
-		timeout = usecs_to_jiffies(tp->srtt_us >> 2);
+		timeout_us = tp->srtt_us >> 2;
 		if (tp->packets_out == 1)
-			timeout += TCP_RTO_MIN;
+			timeout_us += tcp_rto_min_us(sk);
 		else
-			timeout += TCP_TIMEOUT_MIN;
+			timeout_us += TCP_TIMEOUT_MIN_US;
+		timeout = usecs_to_jiffies(timeout_us);
 	} else {
 		timeout = TCP_TIMEOUT_INIT;
 	}
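A worked example of the revised TLP arithmetic above, entirely in microseconds: tp->srtt_us stores 8 times the smoothed RTT, so srtt_us >> 2 is 2*SRTT. The 200 ms rto_min and 2 ms TCP_TIMEOUT_MIN_US values below are common defaults, assumed here purely for illustration.

#include <stdio.h>

int main(void)
{
	unsigned int srtt_us = 40000 * 8;	/* SRTT = 40 ms, stored <<3 */
	unsigned int rto_min_us = 200000;	/* ~tcp_rto_min_us() default */
	unsigned int timeout_min_us = 2000;	/* ~TCP_TIMEOUT_MIN_US */
	unsigned int packets_out = 1;

	unsigned int timeout_us = srtt_us >> 2;	/* 2 * SRTT = 80 ms */

	/* With a single packet in flight, pad by rto_min instead of 2 ms. */
	timeout_us += (packets_out == 1) ? rto_min_us : timeout_min_us;
	printf("TLP fires after %u us\n", timeout_us);	/* 280000 */
	return 0;
}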
@@ -3372,7 +3407,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 
 	/* Save stamp of the first (attempted) retransmit. */
 	if (!tp->retrans_stamp)
-		tp->retrans_stamp = tcp_skb_timestamp(skb);
+		tp->retrans_stamp = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb);
 
 	if (tp->undo_retrans < 0)
 		tp->undo_retrans = 0;
@@ -3620,8 +3655,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 {
 	struct inet_request_sock *ireq = inet_rsk(req);
 	const struct tcp_sock *tp = tcp_sk(sk);
-	struct tcp_md5sig_key *md5 = NULL;
 	struct tcp_out_options opts;
+	struct tcp_key key = {};
 	struct sk_buff *skb;
 	int tcp_header_size;
 	struct tcphdr *th;
@@ -3658,6 +3693,8 @@
 	mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
 
 	memset(&opts, 0, sizeof(opts));
+	if (tcp_rsk(req)->req_usec_ts < 0)
+		tcp_rsk(req)->req_usec_ts = dst_tcp_usec_ts(dst);
 	now = tcp_clock_ns();
 #ifdef CONFIG_SYN_COOKIES
 	if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok))
@@ -3671,16 +3708,48 @@
 		tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
 	}
 
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
 	rcu_read_lock();
-	md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
 #endif
+	if (tcp_rsk_used_ao(req)) {
+#ifdef CONFIG_TCP_AO
+		struct tcp_ao_key *ao_key = NULL;
+		u8 maclen = tcp_rsk(req)->maclen;
+		u8 keyid = tcp_rsk(req)->ao_keyid;
+
+		ao_key = tcp_sk(sk)->af_specific->ao_lookup(sk, req_to_sk(req),
+							    keyid, -1);
+		/* If there is no matching key - avoid sending anything,
+		 * especially usigned segments. It could try harder and lookup
+		 * for another peer-matching key, but the peer has requested
+		 * ao_keyid (RFC5925 RNextKeyID), so let's keep it simple here.
+		 */
+		if (unlikely(!ao_key || tcp_ao_maclen(ao_key) != maclen)) {
+			u8 key_maclen = ao_key ? tcp_ao_maclen(ao_key) : 0;
+
+			rcu_read_unlock();
+			kfree_skb(skb);
+			net_warn_ratelimited("TCP-AO: the keyid %u with maclen %u|%u from SYN packet is not present - not sending SYNACK\n",
+					     keyid, maclen, key_maclen);
+			return NULL;
+		}
+		key.ao_key = ao_key;
+		key.type = TCP_KEY_AO;
+#endif
+	} else {
+#ifdef CONFIG_TCP_MD5SIG
+		key.md5_key = tcp_rsk(req)->af_specific->req_md5_lookup(sk,
+							req_to_sk(req));
+		if (key.md5_key)
+			key.type = TCP_KEY_MD5;
+#endif
+	}
 	skb_set_hash(skb, READ_ONCE(tcp_rsk(req)->txhash), PKT_HASH_TYPE_L4);
 	/* bpf program will be interested in the tcp_flags */
 	TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK;
-	tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
-					     foc, synack_type,
-					     syn_skb) + sizeof(*th);
+	tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts,
+					     &key, foc, synack_type, syn_skb) +
+					     sizeof(*th);
 
 	skb_push(skb, tcp_header_size);
 	skb_reset_transport_header(skb);
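The SYNACK path above honours the RNextKeyID the peer sent and refuses to reply rather than sign with a key whose MAC length differs from what the SYN advertised, since that would change the option layout the client computed. A self-contained sketch of that policy follows; the types and lookup are stand-ins, not the kernel's tcp_ao API.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct ao_key { unsigned char keyid, maclen; };

/* hypothetical table lookup by the peer's RNextKeyID */
static const struct ao_key *ao_lookup(const struct ao_key *keys, size_t n,
				      unsigned char keyid)
{
	for (size_t i = 0; i < n; i++)
		if (keys[i].keyid == keyid)
			return &keys[i];
	return NULL;
}

static bool can_send_synack(const struct ao_key *keys, size_t n,
			    unsigned char req_keyid, unsigned char req_maclen)
{
	const struct ao_key *key = ao_lookup(keys, n, req_keyid);

	/* Missing key or MAC-length mismatch: drop instead of guessing. */
	return key && key->maclen == req_maclen;
}

int main(void)
{
	const struct ao_key keys[] = { { .keyid = 5, .maclen = 12 } };

	printf("%d %d %d\n",
	       can_send_synack(keys, 1, 5, 12),		/* 1: match */
	       can_send_synack(keys, 1, 5, 16),		/* 0: maclen differs */
	       can_send_synack(keys, 1, 7, 12));	/* 0: unknown keyid */
	return 0;
}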
@@ -3700,15 +3769,24 @@
 
 	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
 	th->window = htons(min(req->rsk_rcv_wnd, 65535U));
-	tcp_options_write(th, NULL, &opts);
+	tcp_options_write(th, NULL, tcp_rsk(req), &opts, &key);
 	th->doff = (tcp_header_size >> 2);
 	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
 
-#ifdef CONFIG_TCP_MD5SIG
 	/* Okay, we have all we need - do the md5 hash if needed */
-	if (md5)
+	if (tcp_key_is_md5(&key)) {
+#ifdef CONFIG_TCP_MD5SIG
 		tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
-					       md5, req_to_sk(req), skb);
+					key.md5_key, req_to_sk(req), skb);
+#endif
+	} else if (tcp_key_is_ao(&key)) {
+#ifdef CONFIG_TCP_AO
+		tcp_rsk(req)->af_specific->ao_synack_hash(opts.hash_location,
+					key.ao_key, req, skb,
+					opts.hash_location - (u8 *)th, 0);
+#endif
+	}
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
 	rcu_read_unlock();
 #endif
 
@@ -3756,6 +3834,8 @@ static void tcp_connect_init(struct sock *sk)
 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps))
 		tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;
 
+	tcp_ao_connect_init(sk);
+
 	/* If user gave his TCP_MAXSEG, record it to clamp */
 	if (tp->rx_opt.user_mss)
 		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
@@ -3938,6 +4018,53 @@ int tcp_connect(struct sock *sk)
 
 	tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL);
 
+#if defined(CONFIG_TCP_MD5SIG) && defined(CONFIG_TCP_AO)
+	/* Has to be checked late, after setting daddr/saddr/ops.
+	 * Return error if the peer has both a md5 and a tcp-ao key
+	 * configured as this is ambiguous.
+	 */
+	if (unlikely(rcu_dereference_protected(tp->md5sig_info,
+					       lockdep_sock_is_held(sk)))) {
+		bool needs_ao = !!tp->af_specific->ao_lookup(sk, sk, -1, -1);
+		bool needs_md5 = !!tp->af_specific->md5_lookup(sk, sk);
+		struct tcp_ao_info *ao_info;
+
+		ao_info = rcu_dereference_check(tp->ao_info,
+						lockdep_sock_is_held(sk));
+		if (ao_info) {
+			/* This is an extra check: tcp_ao_required() in
+			 * tcp_v{4,6}_parse_md5_keys() should prevent adding
+			 * md5 keys on ao_required socket.
+			 */
+			needs_ao |= ao_info->ao_required;
+			WARN_ON_ONCE(ao_info->ao_required && needs_md5);
+		}
+		if (needs_md5 && needs_ao)
+			return -EKEYREJECTED;
+
+		/* If we have a matching md5 key and no matching tcp-ao key
+		 * then free up ao_info if allocated.
+		 */
+		if (needs_md5) {
+			tcp_ao_destroy_sock(sk, false);
+		} else if (needs_ao) {
+			tcp_clear_md5_list(sk);
+			kfree(rcu_replace_pointer(tp->md5sig_info, NULL,
+						  lockdep_sock_is_held(sk)));
+		}
+	}
+#endif
+#ifdef CONFIG_TCP_AO
+	if (unlikely(rcu_dereference_protected(tp->ao_info,
+					       lockdep_sock_is_held(sk)))) {
+		/* Don't allow connecting if ao is configured but no
+		 * matching key is found.
+		 */
+		if (!tp->af_specific->ao_lookup(sk, sk, -1, -1))
+			return -EKEYREJECTED;
+	}
+#endif
+
 	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
 		return -EHOSTUNREACH; /* Routing failure or similar. */
 
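The connect-time check above reduces to a small decision table over which key types match the peer. A sketch of that table, with the tcp_ao_destroy_sock()/tcp_clear_md5_list() side effects elided:

#include <stdbool.h>
#include <stdio.h>

/* Mirrors the needs_md5/needs_ao outcomes in tcp_connect() above. */
static const char *connect_key_policy(bool needs_md5, bool needs_ao)
{
	if (needs_md5 && needs_ao)
		return "-EKEYREJECTED (ambiguous)";
	if (needs_md5)
		return "use MD5, drop ao_info";
	if (needs_ao)
		return "use TCP-AO, drop md5 keys";
	return "neither: plain TCP";
}

int main(void)
{
	for (int md5 = 0; md5 <= 1; md5++)
		for (int ao = 0; ao <= 1; ao++)
			printf("md5=%d ao=%d -> %s\n", md5, ao,
			       connect_key_policy(md5, ao));
	return 0;
}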
@@ -3954,7 +4081,7 @@
 
 	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
 	tcp_mstamp_refresh(tp);
-	tp->retrans_stamp = tcp_time_stamp(tp);
+	tp->retrans_stamp = tcp_time_stamp_ts(tp);
 	tcp_connect_queue_skb(sk, buff);
 	tcp_ecn_send_syn(sk, buff);
 	tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
@@ -3984,6 +4111,20 @@
 }
 EXPORT_SYMBOL(tcp_connect);
 
+u32 tcp_delack_max(const struct sock *sk)
+{
+	const struct dst_entry *dst = __sk_dst_get(sk);
+	u32 delack_max = inet_csk(sk)->icsk_delack_max;
+
+	if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) {
+		u32 rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN);
+		u32 delack_from_rto_min = max_t(int, 1, rto_min - 1);
+
+		delack_max = min_t(u32, delack_max, delack_from_rto_min);
+	}
+	return delack_max;
+}
+
 /* Send out a delayed ack, the caller does the policy checking
  * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
  * for details.
@@ -4019,7 +4160,7 @@ void tcp_send_delayed_ack(struct sock *sk)
 		ato = min(ato, max_ato);
 	}
 
-	ato = min_t(u32, ato, inet_csk(sk)->icsk_delack_max);
+	ato = min_t(u32, ato, tcp_delack_max(sk));
 
 	/* Stay within the limit we were given */
 	timeout = jiffies + ato;
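A worked example of the new tcp_delack_max() clamp above: a route with a locked rto_min metric caps the delayed-ACK timer at rto_min - 1 (never below 1), so the delayed ACK cannot arrive later than the peer's retransmit timer would fire. The jiffies values below assume HZ=1000 purely for illustration.

#include <stdio.h>

/* Stand-in for tcp_delack_max(); all inputs are in jiffies. */
static unsigned int delack_max(unsigned int icsk_delack_max,
			       int rto_min, int rto_min_locked)
{
	if (rto_min_locked) {
		/* max_t(int, 1, rto_min - 1) in the patch above */
		int from_rto_min = rto_min - 1 > 1 ? rto_min - 1 : 1;

		if ((unsigned int)from_rto_min < icsk_delack_max)
			return from_rto_min;
	}
	return icsk_delack_max;
}

int main(void)
{
	/* default delack max ~200 ms; route locks rto_min to 40 ms */
	printf("delack_max = %u jiffies\n", delack_max(200, 40, 1)); /* 39 */
	return 0;
}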