From 5e514f1cba090e1c8fff03e92a175eccfe46305f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 May 2024 12:52:50 +0000 Subject: tcp: add tcp_done_with_error() helper tcp_reset() ends with a sequence that is carefuly ordered. We need to fix [e]poll bugs in the following patches, it makes sense to use a common helper. Suggested-by: Neal Cardwell Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Link: https://lore.kernel.org/r/20240528125253.1966136-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9c04a9c8be9d..5aadf64e554d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4436,9 +4436,26 @@ static enum skb_drop_reason tcp_sequence(const struct tcp_sock *tp, return SKB_NOT_DROPPED_YET; } + +void tcp_done_with_error(struct sock *sk, int err) +{ + /* This barrier is coupled with smp_rmb() in tcp_poll() */ + WRITE_ONCE(sk->sk_err, err); + smp_wmb(); + + tcp_write_queue_purge(sk); + tcp_done(sk); + + if (!sock_flag(sk, SOCK_DEAD)) + sk_error_report(sk); +} +EXPORT_SYMBOL(tcp_done_with_error); + /* When we get a reset we do this. */ void tcp_reset(struct sock *sk, struct sk_buff *skb) { + int err; + trace_tcp_receive_reset(sk); /* mptcp can't tell us to ignore reset pkts, @@ -4450,24 +4467,17 @@ void tcp_reset(struct sock *sk, struct sk_buff *skb) /* We want the right error as BSD sees it (and indeed as we do). */ switch (sk->sk_state) { case TCP_SYN_SENT: - WRITE_ONCE(sk->sk_err, ECONNREFUSED); + err = ECONNREFUSED; break; case TCP_CLOSE_WAIT: - WRITE_ONCE(sk->sk_err, EPIPE); + err = EPIPE; break; case TCP_CLOSE: return; default: - WRITE_ONCE(sk->sk_err, ECONNRESET); + err = ECONNRESET; } - /* This barrier is coupled with smp_rmb() in tcp_poll() */ - smp_wmb(); - - tcp_write_queue_purge(sk); - tcp_done(sk); - - if (!sock_flag(sk, SOCK_DEAD)) - sk_error_report(sk); + tcp_done_with_error(sk, err); } /* -- cgit v1.2.3-70-g09d2 From 071115301838c6c265065dd5d6bf43a9a987a550 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 30 May 2024 16:36:14 -0700 Subject: tcp: wrap mptcp and decrypted checks into tcp_skb_can_collapse_rx() tcp_skb_can_collapse() checks for conditions which don't make sense on input. Because of this we ended up sprinkling a few pairs of mptcp_skb_can_collapse() and skb_cmp_decrypted() calls on the input path. Group them in a new helper. This should make it less likely that someone will check mptcp and not decrypted or vice versa when adding new code. This implicitly adds a decrypted check early in tcp_collapse(). AFAIU this will very slightly increase our ability to collapse packets under memory pressure, not a real bug. Signed-off-by: Jakub Kicinski Reviewed-by: Eric Dumazet Reviewed-by: Matthieu Baerts (NGI0) Reviewed-by: Willem de Bruijn Signed-off-by: Paolo Abeni --- include/net/tcp.h | 7 +++++++ net/ipv4/tcp_input.c | 11 +++-------- net/ipv4/tcp_ipv4.c | 3 +-- 3 files changed, 11 insertions(+), 10 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index 32815a40dea1..32741856da01 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1071,6 +1071,13 @@ static inline bool tcp_skb_can_collapse(const struct sk_buff *to, skb_pure_zcopy_same(to, from)); } +static inline bool tcp_skb_can_collapse_rx(const struct sk_buff *to, + const struct sk_buff *from) +{ + return likely(mptcp_skb_can_collapse(to, from) && + !skb_cmp_decrypted(to, from)); +} + /* Events passed to congestion control interface */ enum tcp_ca_event { CA_EVENT_TX_START, /* first transmit when no packets in flight */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5aadf64e554d..212b6fd0caf7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4813,10 +4813,7 @@ static bool tcp_try_coalesce(struct sock *sk, if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) return false; - if (!mptcp_skb_can_collapse(to, from)) - return false; - - if (skb_cmp_decrypted(from, to)) + if (!tcp_skb_can_collapse_rx(to, from)) return false; if (!skb_try_coalesce(to, from, fragstolen, &delta)) @@ -5372,7 +5369,7 @@ restart: break; } - if (n && n != tail && mptcp_skb_can_collapse(skb, n) && + if (n && n != tail && tcp_skb_can_collapse_rx(skb, n) && TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) { end_of_skbs = false; break; @@ -5423,11 +5420,9 @@ restart: skb = tcp_collapse_one(sk, skb, list, root); if (!skb || skb == tail || - !mptcp_skb_can_collapse(nskb, skb) || + !tcp_skb_can_collapse_rx(nskb, skb) || (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) goto end; - if (skb_cmp_decrypted(skb, nskb)) - goto end; } } } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 59d5b064f233..04044605cadf 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2044,8 +2044,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || ((TCP_SKB_CB(tail)->tcp_flags ^ TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || - !mptcp_skb_can_collapse(tail, skb) || - skb_cmp_decrypted(tail, skb) || + !tcp_skb_can_collapse_rx(tail, skb) || thtail->doff != th->doff || memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) goto no_coalesce; -- cgit v1.2.3-70-g09d2 From adbe695a9765fb704d2ac0d3e284f28bcc8b5bf3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 5 Jun 2024 07:15:52 +0000 Subject: tcp: move inet_reqsk_alloc() close to inet_reqsk_clone() inet_reqsk_alloc() does not belong to tcp_input.c, move it to inet_connection_sock.c instead. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/ipv4/inet_connection_sock.c | 25 +++++++++++++++++++++++++ net/ipv4/tcp_input.c | 25 ------------------------- 2 files changed, 25 insertions(+), 25 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index d81f74ce0f02..a9d2e6308910 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -911,6 +911,31 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) } EXPORT_SYMBOL(inet_rtx_syn_ack); +struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, + struct sock *sk_listener, + bool attach_listener) +{ + struct request_sock *req = reqsk_alloc(ops, sk_listener, + attach_listener); + + if (req) { + struct inet_request_sock *ireq = inet_rsk(req); + + ireq->ireq_opt = NULL; +#if IS_ENABLED(CONFIG_IPV6) + ireq->pktopts = NULL; +#endif + atomic64_set(&ireq->ir_cookie, 0); + ireq->ireq_state = TCP_NEW_SYN_RECV; + write_pnet(&ireq->ireq_net, sock_net(sk_listener)); + ireq->ireq_family = sk_listener->sk_family; + req->timeout = TCP_TIMEOUT_INIT; + } + + return req; +} +EXPORT_SYMBOL(inet_reqsk_alloc); + static struct request_sock *inet_reqsk_clone(struct request_sock *req, struct sock *sk) { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 212b6fd0caf7..eb187450e4d7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6986,31 +6986,6 @@ static void tcp_openreq_init(struct request_sock *req, #endif } -struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, - struct sock *sk_listener, - bool attach_listener) -{ - struct request_sock *req = reqsk_alloc(ops, sk_listener, - attach_listener); - - if (req) { - struct inet_request_sock *ireq = inet_rsk(req); - - ireq->ireq_opt = NULL; -#if IS_ENABLED(CONFIG_IPV6) - ireq->pktopts = NULL; -#endif - atomic64_set(&ireq->ir_cookie, 0); - ireq->ireq_state = TCP_NEW_SYN_RECV; - write_pnet(&ireq->ireq_net, sock_net(sk_listener)); - ireq->ireq_family = sk_listener->sk_family; - req->timeout = TCP_TIMEOUT_INIT; - } - - return req; -} -EXPORT_SYMBOL(inet_reqsk_alloc); - /* * Return true if a syncookie should be sent */ -- cgit v1.2.3-70-g09d2 From 96be3dcd013df6aa79acf32e739c0a35b89a4f50 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Fri, 7 Jun 2024 00:25:58 +0100 Subject: net/tcp: Add tcp-md5 and tcp-ao tracepoints Instead of forcing userspace to parse dmesg (that's what currently is happening, at least in codebase of my current company), provide a better way, that can be enabled/disabled in runtime. Currently, there are already tcp events, add hashing related ones there, too. Rasdaemon currently exercises net_dev_xmit_timeout, devlink_health_report, but it'll be trivial to teach it to deal with failed hashes. Otherwise, BGP may trace/log them itself. Especially exciting for possible investigations is key rotation (RNext_key requests). Suggested-by: Jakub Kicinski Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 317 +++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp.c | 17 +++ net/ipv4/tcp_ao.c | 13 ++ net/ipv4/tcp_input.c | 8 +- net/ipv4/tcp_output.c | 2 + 5 files changed, 355 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 49b5ee091cf6..1c8bd8e186b8 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -411,6 +411,323 @@ TRACE_EVENT(tcp_cong_state_set, __entry->cong_state) ); +DECLARE_EVENT_CLASS(tcp_hash_event, + + TP_PROTO(const struct sock *sk, const struct sk_buff *skb), + + TP_ARGS(sk, skb), + + TP_STRUCT__entry( + __field(__u64, net_cookie) + __field(const void *, skbaddr) + __field(const void *, skaddr) + __field(int, state) + + /* sockaddr_in6 is always bigger than sockaddr_in */ + __array(__u8, saddr, sizeof(struct sockaddr_in6)) + __array(__u8, daddr, sizeof(struct sockaddr_in6)) + __field(int, l3index) + + __field(__u16, sport) + __field(__u16, dport) + __field(__u16, family) + + __field(bool, fin) + __field(bool, syn) + __field(bool, rst) + __field(bool, psh) + __field(bool, ack) + ), + + TP_fast_assign( + const struct tcphdr *th = (const struct tcphdr *)skb->data; + + __entry->net_cookie = sock_net(sk)->net_cookie; + __entry->skbaddr = skb; + __entry->skaddr = sk; + __entry->state = sk->sk_state; + + memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); + memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); + TP_STORE_ADDR_PORTS_SKB(skb, th, __entry->saddr, __entry->daddr); + __entry->l3index = inet_sdif(skb) ? inet_iif(skb) : 0; + + /* For filtering use */ + __entry->sport = ntohs(th->source); + __entry->dport = ntohs(th->dest); + __entry->family = sk->sk_family; + + __entry->fin = th->fin; + __entry->syn = th->syn; + __entry->rst = th->rst; + __entry->psh = th->psh; + __entry->ack = th->ack; + ), + + TP_printk("net=%llu state=%s family=%s src=%pISpc dest=%pISpc L3index=%d [%c%c%c%c%c]", + __entry->net_cookie, + show_tcp_state_name(__entry->state), + show_family_name(__entry->family), + __entry->saddr, __entry->daddr, + __entry->l3index, + __entry->fin ? 'F' : ' ', + __entry->syn ? 'S' : ' ', + __entry->rst ? 'R' : ' ', + __entry->psh ? 'P' : ' ', + __entry->ack ? '.' : ' ') +); + +DEFINE_EVENT(tcp_hash_event, tcp_hash_bad_header, + + TP_PROTO(const struct sock *sk, const struct sk_buff *skb), + TP_ARGS(sk, skb) +); + +DEFINE_EVENT(tcp_hash_event, tcp_hash_md5_required, + + TP_PROTO(const struct sock *sk, const struct sk_buff *skb), + TP_ARGS(sk, skb) +); + +DEFINE_EVENT(tcp_hash_event, tcp_hash_md5_unexpected, + + TP_PROTO(const struct sock *sk, const struct sk_buff *skb), + TP_ARGS(sk, skb) +); + +DEFINE_EVENT(tcp_hash_event, tcp_hash_md5_mismatch, + + TP_PROTO(const struct sock *sk, const struct sk_buff *skb), + TP_ARGS(sk, skb) +); + +DEFINE_EVENT(tcp_hash_event, tcp_hash_ao_required, + + TP_PROTO(const struct sock *sk, const struct sk_buff *skb), + TP_ARGS(sk, skb) +); + +DECLARE_EVENT_CLASS(tcp_ao_event, + + TP_PROTO(const struct sock *sk, const struct sk_buff *skb, + const __u8 keyid, const __u8 rnext, const __u8 maclen), + + TP_ARGS(sk, skb, keyid, rnext, maclen), + + TP_STRUCT__entry( + __field(__u64, net_cookie) + __field(const void *, skbaddr) + __field(const void *, skaddr) + __field(int, state) + + /* sockaddr_in6 is always bigger than sockaddr_in */ + __array(__u8, saddr, sizeof(struct sockaddr_in6)) + __array(__u8, daddr, sizeof(struct sockaddr_in6)) + __field(int, l3index) + + __field(__u16, sport) + __field(__u16, dport) + __field(__u16, family) + + __field(bool, fin) + __field(bool, syn) + __field(bool, rst) + __field(bool, psh) + __field(bool, ack) + + __field(__u8, keyid) + __field(__u8, rnext) + __field(__u8, maclen) + ), + + TP_fast_assign( + const struct tcphdr *th = (const struct tcphdr *)skb->data; + + __entry->net_cookie = sock_net(sk)->net_cookie; + __entry->skbaddr = skb; + __entry->skaddr = sk; + __entry->state = sk->sk_state; + + memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); + memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); + TP_STORE_ADDR_PORTS_SKB(skb, th, __entry->saddr, __entry->daddr); + __entry->l3index = inet_sdif(skb) ? inet_iif(skb) : 0; + + /* For filtering use */ + __entry->sport = ntohs(th->source); + __entry->dport = ntohs(th->dest); + __entry->family = sk->sk_family; + + __entry->fin = th->fin; + __entry->syn = th->syn; + __entry->rst = th->rst; + __entry->psh = th->psh; + __entry->ack = th->ack; + + __entry->keyid = keyid; + __entry->rnext = rnext; + __entry->maclen = maclen; + ), + + TP_printk("net=%llu state=%s family=%s src=%pISpc dest=%pISpc L3index=%d [%c%c%c%c%c] keyid=%u rnext=%u maclen=%u", + __entry->net_cookie, + show_tcp_state_name(__entry->state), + show_family_name(__entry->family), + __entry->saddr, __entry->daddr, + __entry->l3index, + __entry->fin ? 'F' : ' ', + __entry->syn ? 'S' : ' ', + __entry->rst ? 'R' : ' ', + __entry->psh ? 'P' : ' ', + __entry->ack ? '.' : ' ', + __entry->keyid, __entry->rnext, __entry->maclen) +); + +DEFINE_EVENT(tcp_ao_event, tcp_ao_handshake_failure, + TP_PROTO(const struct sock *sk, const struct sk_buff *skb, + const __u8 keyid, const __u8 rnext, const __u8 maclen), + TP_ARGS(sk, skb, keyid, rnext, maclen) +); + +DEFINE_EVENT(tcp_ao_event, tcp_ao_wrong_maclen, + TP_PROTO(const struct sock *sk, const struct sk_buff *skb, + const __u8 keyid, const __u8 rnext, const __u8 maclen), + TP_ARGS(sk, skb, keyid, rnext, maclen) +); + +DEFINE_EVENT(tcp_ao_event, tcp_ao_mismatch, + TP_PROTO(const struct sock *sk, const struct sk_buff *skb, + const __u8 keyid, const __u8 rnext, const __u8 maclen), + TP_ARGS(sk, skb, keyid, rnext, maclen) +); + +DEFINE_EVENT(tcp_ao_event, tcp_ao_key_not_found, + TP_PROTO(const struct sock *sk, const struct sk_buff *skb, + const __u8 keyid, const __u8 rnext, const __u8 maclen), + TP_ARGS(sk, skb, keyid, rnext, maclen) +); + +DEFINE_EVENT(tcp_ao_event, tcp_ao_rnext_request, + TP_PROTO(const struct sock *sk, const struct sk_buff *skb, + const __u8 keyid, const __u8 rnext, const __u8 maclen), + TP_ARGS(sk, skb, keyid, rnext, maclen) +); + +DECLARE_EVENT_CLASS(tcp_ao_event_sk, + + TP_PROTO(const struct sock *sk, const __u8 keyid, const __u8 rnext), + + TP_ARGS(sk, keyid, rnext), + + TP_STRUCT__entry( + __field(__u64, net_cookie) + __field(const void *, skaddr) + __field(int, state) + + /* sockaddr_in6 is always bigger than sockaddr_in */ + __array(__u8, saddr, sizeof(struct sockaddr_in6)) + __array(__u8, daddr, sizeof(struct sockaddr_in6)) + + __field(__u16, sport) + __field(__u16, dport) + __field(__u16, family) + + __field(__u8, keyid) + __field(__u8, rnext) + ), + + TP_fast_assign( + const struct inet_sock *inet = inet_sk(sk); + + __entry->net_cookie = sock_net(sk)->net_cookie; + __entry->skaddr = sk; + __entry->state = sk->sk_state; + + memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); + memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); + TP_STORE_ADDR_PORTS(__entry, inet, sk); + + /* For filtering use */ + __entry->sport = ntohs(inet->inet_sport); + __entry->dport = ntohs(inet->inet_dport); + __entry->family = sk->sk_family; + + __entry->keyid = keyid; + __entry->rnext = rnext; + ), + + TP_printk("net=%llu state=%s family=%s src=%pISpc dest=%pISpc keyid=%u rnext=%u", + __entry->net_cookie, + show_tcp_state_name(__entry->state), + show_family_name(__entry->family), + __entry->saddr, __entry->daddr, + __entry->keyid, __entry->rnext) +); + +DEFINE_EVENT(tcp_ao_event_sk, tcp_ao_synack_no_key, + TP_PROTO(const struct sock *sk, const __u8 keyid, const __u8 rnext), + TP_ARGS(sk, keyid, rnext) +); + +DECLARE_EVENT_CLASS(tcp_ao_event_sne, + + TP_PROTO(const struct sock *sk, __u32 new_sne), + + TP_ARGS(sk, new_sne), + + TP_STRUCT__entry( + __field(__u64, net_cookie) + __field(const void *, skaddr) + __field(int, state) + + /* sockaddr_in6 is always bigger than sockaddr_in */ + __array(__u8, saddr, sizeof(struct sockaddr_in6)) + __array(__u8, daddr, sizeof(struct sockaddr_in6)) + + __field(__u16, sport) + __field(__u16, dport) + __field(__u16, family) + + __field(__u32, new_sne) + ), + + TP_fast_assign( + const struct inet_sock *inet = inet_sk(sk); + + __entry->net_cookie = sock_net(sk)->net_cookie; + __entry->skaddr = sk; + __entry->state = sk->sk_state; + + memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); + memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); + TP_STORE_ADDR_PORTS(__entry, inet, sk); + + /* For filtering use */ + __entry->sport = ntohs(inet->inet_sport); + __entry->dport = ntohs(inet->inet_dport); + __entry->family = sk->sk_family; + + __entry->new_sne = new_sne; + ), + + TP_printk("net=%llu state=%s family=%s src=%pISpc dest=%pISpc sne=%u", + __entry->net_cookie, + show_tcp_state_name(__entry->state), + show_family_name(__entry->family), + __entry->saddr, __entry->daddr, + __entry->new_sne) +); + +DEFINE_EVENT(tcp_ao_event_sne, tcp_ao_snd_sne_update, + TP_PROTO(const struct sock *sk, __u32 new_sne), + TP_ARGS(sk, new_sne) +); + +DEFINE_EVENT(tcp_ao_event_sne, tcp_ao_rcv_sne_update, + TP_PROTO(const struct sock *sk, __u32 new_sne), + TP_ARGS(sk, new_sne) +); + #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 17a4a8e4855d..73152ce1367e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -282,6 +282,7 @@ #include #include #include +#include #include /* Track pending CMSGs. */ @@ -4484,6 +4485,7 @@ tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, if (!key && hash_location) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); tcp_hash_fail("Unexpected MD5 Hash found", family, skb, ""); + trace_tcp_hash_md5_unexpected(sk, skb); return SKB_DROP_REASON_TCP_MD5UNEXPECTED; } @@ -4513,6 +4515,7 @@ tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, l3index); } } + trace_tcp_hash_md5_mismatch(sk, skb); return SKB_DROP_REASON_TCP_MD5FAILURE; } return SKB_NOT_DROPPED_YET; @@ -4544,15 +4547,27 @@ tcp_inbound_hash(struct sock *sk, const struct request_sock *req, if (tcp_parse_auth_options(th, &md5_location, &aoh)) { tcp_hash_fail("TCP segment has incorrect auth options set", family, skb, ""); + trace_tcp_hash_bad_header(sk, skb); return SKB_DROP_REASON_TCP_AUTH_HDR; } if (req) { if (tcp_rsk_used_ao(req) != !!aoh) { + u8 keyid, rnext, maclen; + + if (aoh) { + keyid = aoh->keyid; + rnext = aoh->rnext_keyid; + maclen = tcp_ao_hdr_maclen(aoh); + } else { + keyid = rnext = maclen = 0; + } + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD); tcp_hash_fail("TCP connection can't start/end using TCP-AO", family, skb, "%s", !aoh ? "missing AO" : "AO signed"); + trace_tcp_ao_handshake_failure(sk, skb, keyid, rnext, maclen); return SKB_DROP_REASON_TCP_AOFAILURE; } } @@ -4572,12 +4587,14 @@ tcp_inbound_hash(struct sock *sk, const struct request_sock *req, if (tcp_ao_required(sk, saddr, family, l3index, true)) { tcp_hash_fail("AO hash is required, but not found", family, skb, "L3 index %d", l3index); + trace_tcp_hash_ao_required(sk, skb); return SKB_DROP_REASON_TCP_AONOTFOUND; } if (unlikely(tcp_md5_do_lookup(sk, l3index, saddr, family))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); tcp_hash_fail("MD5 Hash not found", family, skb, "L3 index %d", l3index); + trace_tcp_hash_md5_required(sk, skb); return SKB_DROP_REASON_TCP_MD5NOTFOUND; } return SKB_NOT_DROPPED_YET; diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index 50ae43c92829..1e5087c6cd7d 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -16,6 +16,7 @@ #include #include #include +#include DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_ao_needed, HZ); @@ -895,6 +896,8 @@ tcp_ao_verify_hash(const struct sock *sk, const struct sk_buff *skb, tcp_hash_fail("AO hash wrong length", family, skb, "%u != %d L3index: %d", maclen, tcp_ao_maclen(key), l3index); + trace_tcp_ao_wrong_maclen(sk, skb, aoh->keyid, + aoh->rnext_keyid, maclen); return SKB_DROP_REASON_TCP_AOFAILURE; } @@ -911,6 +914,8 @@ tcp_ao_verify_hash(const struct sock *sk, const struct sk_buff *skb, atomic64_inc(&key->pkt_bad); tcp_hash_fail("AO hash mismatch", family, skb, "L3index: %d", l3index); + trace_tcp_ao_mismatch(sk, skb, aoh->keyid, + aoh->rnext_keyid, maclen); kfree(hash_buf); return SKB_DROP_REASON_TCP_AOFAILURE; } @@ -927,6 +932,7 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, int l3index, const struct tcp_ao_hdr *aoh) { const struct tcphdr *th = tcp_hdr(skb); + u8 maclen = tcp_ao_hdr_maclen(aoh); u8 *phash = (u8 *)(aoh + 1); /* hash goes just after the header */ struct tcp_ao_info *info; enum skb_drop_reason ret; @@ -941,6 +947,8 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOKEYNOTFOUND); tcp_hash_fail("AO key not found", family, skb, "keyid: %u L3index: %d", aoh->keyid, l3index); + trace_tcp_ao_key_not_found(sk, skb, aoh->keyid, + aoh->rnext_keyid, maclen); return SKB_DROP_REASON_TCP_AOUNEXPECTED; } @@ -981,6 +989,9 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, current_key = READ_ONCE(info->current_key); /* Key rotation: the peer asks us to use new key (RNext) */ if (unlikely(aoh->rnext_keyid != current_key->sndid)) { + trace_tcp_ao_rnext_request(sk, skb, current_key->sndid, + aoh->rnext_keyid, + tcp_ao_hdr_maclen(aoh)); /* If the key is not found we do nothing. */ key = tcp_ao_established_key(info, aoh->rnext_keyid, -1); if (key) @@ -1048,6 +1059,8 @@ key_not_found: atomic64_inc(&info->counters.key_not_found); tcp_hash_fail("Requested by the peer AO key id not found", family, skb, "L3index: %d", l3index); + trace_tcp_ao_key_not_found(sk, skb, aoh->keyid, + aoh->rnext_keyid, maclen); return SKB_DROP_REASON_TCP_AOKEYNOTFOUND; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eb187450e4d7..d0a1e34d69f6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3578,8 +3578,10 @@ static void tcp_snd_sne_update(struct tcp_sock *tp, u32 ack) ao = rcu_dereference_protected(tp->ao_info, lockdep_sock_is_held((struct sock *)tp)); - if (ao && ack < tp->snd_una) + if (ao && ack < tp->snd_una) { ao->snd_sne++; + trace_tcp_ao_snd_sne_update((struct sock *)tp, ao->snd_sne); + } #endif } @@ -3604,8 +3606,10 @@ static void tcp_rcv_sne_update(struct tcp_sock *tp, u32 seq) ao = rcu_dereference_protected(tp->ao_info, lockdep_sock_is_held((struct sock *)tp)); - if (ao && seq < tp->rcv_nxt) + if (ao && seq < tp->rcv_nxt) { ao->rcv_sne++; + trace_tcp_ao_rcv_sne_update((struct sock *)tp, ao->rcv_sne); + } #endif } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 090fb0c24599..16c48df8df4c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3768,6 +3768,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, #ifdef CONFIG_TCP_AO struct tcp_ao_key *ao_key = NULL; u8 keyid = tcp_rsk(req)->ao_keyid; + u8 rnext = tcp_rsk(req)->ao_rcv_next; ao_key = tcp_sk(sk)->af_specific->ao_lookup(sk, req_to_sk(req), keyid, -1); @@ -3777,6 +3778,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, * ao_keyid (RFC5925 RNextKeyID), so let's keep it simple here. */ if (unlikely(!ao_key)) { + trace_tcp_ao_synack_no_key(sk, keyid, rnext); rcu_read_unlock(); kfree_skb(skb); net_warn_ratelimited("TCP-AO: the keyid %u from SYN packet is not present - not sending SYNACK\n", -- cgit v1.2.3-70-g09d2 From 46a02aa357529d7b038096955976b14f7c44aa23 Mon Sep 17 00:00:00 2001 From: Yan Zhai Date: Mon, 17 Jun 2024 11:09:20 -0700 Subject: tcp: use sk_skb_reason_drop to free rx packets Replace kfree_skb_reason with sk_skb_reason_drop and pass the receiving socket to the tracepoint. Reported-by: kernel test robot Closes: https://lore.kernel.org/r/202406011539.jhwBd7DX-lkp@intel.com/ Signed-off-by: Yan Zhai Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/ipv4/syncookies.c | 2 +- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_ipv4.c | 6 +++--- net/ipv6/syncookies.c | 2 +- net/ipv6/tcp_ipv6.c | 6 +++--- 5 files changed, 9 insertions(+), 9 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index b61d36810fe3..1948d15f1f28 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -496,6 +496,6 @@ out: out_free: reqsk_free(req); out_drop: - kfree_skb_reason(skb, reason); + sk_skb_reason_drop(sk, skb, reason); return NULL; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d0a1e34d69f6..f513d1f927ad 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4860,7 +4860,7 @@ static void tcp_drop_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason) { sk_drops_add(sk, skb); - kfree_skb_reason(skb, reason); + sk_skb_reason_drop(sk, skb, reason); } /* This one checks to see if we can put data from the diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index de0c8f43448a..8e49d69279d5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1932,7 +1932,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) reset: tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason)); discard: - kfree_skb_reason(skb, reason); + sk_skb_reason_drop(sk, skb, reason); /* Be careful here. If this function gets more complicated and * gcc suffers from register pressure on the x86, sk (in %ebx) * might be destroyed here. This current version compiles correctly, @@ -2168,8 +2168,8 @@ int tcp_v4_rcv(struct sk_buff *skb) int dif = inet_iif(skb); const struct iphdr *iph; const struct tcphdr *th; + struct sock *sk = NULL; bool refcounted; - struct sock *sk; int ret; u32 isn; @@ -2368,7 +2368,7 @@ bad_packet: discard_it: SKB_DR_OR(drop_reason, NOT_SPECIFIED); /* Discard frame. */ - kfree_skb_reason(skb, drop_reason); + sk_skb_reason_drop(sk, skb, drop_reason); return 0; discard_and_relse: diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index bfad1e89b6a6..9d83eadd308b 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -275,6 +275,6 @@ out: out_free: reqsk_free(req); out_drop: - kfree_skb_reason(skb, reason); + sk_skb_reason_drop(sk, skb, reason); return NULL; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 223b71790e44..200fea92f12f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1674,7 +1674,7 @@ reset: discard: if (opt_skb) __kfree_skb(opt_skb); - kfree_skb_reason(skb, reason); + sk_skb_reason_drop(sk, skb, reason); return 0; csum_err: reason = SKB_DROP_REASON_TCP_CSUM; @@ -1747,8 +1747,8 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) int dif = inet6_iif(skb); const struct tcphdr *th; const struct ipv6hdr *hdr; + struct sock *sk = NULL; bool refcounted; - struct sock *sk; int ret; u32 isn; struct net *net = dev_net(skb->dev); @@ -1940,7 +1940,7 @@ bad_packet: discard_it: SKB_DR_OR(drop_reason, NOT_SPECIFIED); - kfree_skb_reason(skb, drop_reason); + sk_skb_reason_drop(sk, skb, drop_reason); return 0; discard_and_relse: -- cgit v1.2.3-70-g09d2 From 23e89e8ee7be73e21200947885a6d3a109a2c58d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 10 Jul 2024 10:12:45 -0700 Subject: tcp: Don't drop SYN+ACK for simultaneous connect(). RFC 9293 states that in the case of simultaneous connect(), the connection gets established when SYN+ACK is received. [0] TCP Peer A TCP Peer B 1. CLOSED CLOSED 2. SYN-SENT --> ... 3. SYN-RECEIVED <-- <-- SYN-SENT 4. ... --> SYN-RECEIVED 5. SYN-RECEIVED --> ... 6. ESTABLISHED <-- <-- SYN-RECEIVED 7. ... --> ESTABLISHED However, since commit 0c24604b68fc ("tcp: implement RFC 5961 4.2"), such a SYN+ACK is dropped in tcp_validate_incoming() and responded with Challenge ACK. For example, the write() syscall in the following packetdrill script fails with -EAGAIN, and wrong SNMP stats get incremented. 0 socket(..., SOCK_STREAM|SOCK_NONBLOCK, IPPROTO_TCP) = 3 +0 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) +0 > S 0:0(0) +0 < S 0:0(0) win 1000 +0 > S. 0:0(0) ack 1 +0 < S. 0:0(0) ack 1 win 1000 +0 write(3, ..., 100) = 100 +0 > P. 1:101(100) ack 1 -- # packetdrill cross-synack.pkt cross-synack.pkt:13: runtime error in write call: Expected result 100 but got -1 with errno 11 (Resource temporarily unavailable) # nstat ... TcpExtTCPChallengeACK 1 0.0 TcpExtTCPSYNChallenge 1 0.0 The problem is that bpf_skops_established() is triggered by the Challenge ACK instead of SYN+ACK. This causes the bpf prog to miss the chance to check if the peer supports a TCP option that is expected to be exchanged in SYN and SYN+ACK. Let's accept a bare SYN+ACK for active-open TCP_SYN_RECV sockets to avoid such a situation. Note that tcp_ack_snd_check() in tcp_rcv_state_process() is skipped not to send an unnecessary ACK, but this could be a bit risky for net.git, so this targets for net-next. Link: https://www.rfc-editor.org/rfc/rfc9293.html#section-3.5-7 [0] Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20240710171246.87533-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e0f54b9be850..ff9ab3d01ced 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5998,6 +5998,11 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, * RFC 5961 4.2 : Send a challenge ack */ if (th->syn) { + if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack && + TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq && + TCP_SKB_CB(skb)->seq + 1 == tp->rcv_nxt && + TCP_SKB_CB(skb)->ack_seq == tp->snd_nxt) + goto pass; syn_challenge: if (syn_inerr) TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); @@ -6007,6 +6012,7 @@ syn_challenge: goto discard; } +pass: bpf_skops_parse_hdr(sk, skb); return true; @@ -6813,6 +6819,9 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tcp_fast_path_on(tp); if (sk->sk_shutdown & SEND_SHUTDOWN) tcp_shutdown(sk, SEND_SHUTDOWN); + + if (sk->sk_socket) + goto consume; break; case TCP_FIN_WAIT1: { -- cgit v1.2.3-70-g09d2