From eda7acddf8080bb2d022a8d4b8b2345eb80c63ec Mon Sep 17 00:00:00 2001
From: Peter Krystad
Date: Tue, 21 Jan 2020 16:56:16 -0800
Subject: mptcp: Handle MPTCP TCP options

Add hooks to parse and format the MP_CAPABLE option.

This option is handled according to MPTCP version 0 (RFC6824).
MPTCP version 1 MP_CAPABLE (RFC6824bis/RFC8684) will be added later
in coordination with related code changes.

Co-developed-by: Matthieu Baerts
Signed-off-by: Matthieu Baerts
Co-developed-by: Florian Westphal
Signed-off-by: Florian Westphal
Co-developed-by: Davide Caratti
Signed-off-by: Davide Caratti
Signed-off-by: Peter Krystad
Signed-off-by: Christoph Paasch
Signed-off-by: David S. Miller
---
 include/linux/tcp.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/linux/tcp.h')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index ca6f01531e64..52798ab00394 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -78,6 +78,16 @@ struct tcp_sack_block {
 #define TCP_SACK_SEEN	(1 << 0)   /*1 = peer is SACK capable, */
 #define TCP_DSACK_SEEN	(1 << 2)   /*1 = DSACK was received from peer*/
 
+#if IS_ENABLED(CONFIG_MPTCP)
+struct mptcp_options_received {
+	u64	sndr_key;
+	u64	rcvr_key;
+	u8	mp_capable : 1,
+		mp_join : 1,
+		dss : 1;
+};
+#endif
+
 struct tcp_options_received {
 /*	PAWS/RTTM data	*/
 	int	ts_recent_stamp;/* Time we stored ts_recent (for aging) */
@@ -95,6 +105,9 @@ struct tcp_options_received {
 	u8	num_sacks;	/* Number of SACK blocks		*/
 	u16	user_mss;	/* mss requested by user in ioctl	*/
 	u16	mss_clamp;	/* Maximal mss, negotiated at connection setup */
+#if IS_ENABLED(CONFIG_MPTCP)
+	struct mptcp_options_received	mptcp;
+#endif
 };
 
 static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
@@ -104,6 +117,11 @@ static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
 #if IS_ENABLED(CONFIG_SMC)
 	rx_opt->smc_ok = 0;
 #endif
+#if IS_ENABLED(CONFIG_MPTCP)
+	rx_opt->mptcp.mp_capable = 0;
+	rx_opt->mptcp.mp_join = 0;
+	rx_opt->mptcp.dss = 0;
+#endif
 }
 
 /* This is the max number of SACKS that we'll generate and process. It's safe
--
cgit v1.2.3-70-g09d2

From 2303f994b3e187091fd08148066688b08f837efc Mon Sep 17 00:00:00 2001
From: Peter Krystad
Date: Tue, 21 Jan 2020 16:56:17 -0800
Subject: mptcp: Associate MPTCP context with TCP socket

Use ULP to associate a subflow_context structure with each TCP subflow
socket. Creating these sockets requires new bind and connect functions
to make sure ULP is set up immediately when the subflow sockets are
created.

Co-developed-by: Florian Westphal
Signed-off-by: Florian Westphal
Co-developed-by: Matthieu Baerts
Signed-off-by: Matthieu Baerts
Co-developed-by: Davide Caratti
Signed-off-by: Davide Caratti
Co-developed-by: Paolo Abeni
Signed-off-by: Paolo Abeni
Signed-off-by: Peter Krystad
Signed-off-by: Christoph Paasch
Signed-off-by: David S. Miller
---
 include/linux/tcp.h  |   3 ++
 net/mptcp/Makefile   |   2 +-
 net/mptcp/protocol.c | 132 ++++++++++++++++++++++++++++++++++++++++++++++++---
 net/mptcp/protocol.h |  26 ++++++++++
 net/mptcp/subflow.c  | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 275 insertions(+), 7 deletions(-)
 create mode 100644 net/mptcp/subflow.c

(limited to 'include/linux/tcp.h')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 52798ab00394..877947475814 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -397,6 +397,9 @@ struct tcp_sock {
 	u32	mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG
 			   * while socket was owned by user.
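
Both tcp.h hunks above come from a view limited to include/linux/tcp.h, so the parse/format hooks the first changelog mentions live in net/mptcp/options.c and are not shown here. As a rough sketch of what a version-0 MP_CAPABLE parser has to produce in struct mptcp_options_received (option lengths per RFC 6824; the function name is invented, and ptr is assumed to point just past the option's kind/length bytes, as mptcp_get_options() does later in this series):

	#include <asm/unaligned.h>

	static void sketch_parse_mp_capable(const unsigned char *ptr, int opsize,
					    struct mptcp_options_received *mp_opt)
	{
		/* v0 MP_CAPABLE: subtype/version byte, flags byte, then the
		 * 64-bit sender key; the third ACK additionally carries the
		 * 64-bit receiver key. Total option length is 12 or 20.
		 */
		if (opsize != 12 && opsize != 20)
			return;

		mp_opt->mp_capable = 1;
		mp_opt->sndr_key = get_unaligned_be64(ptr + 2);
		if (opsize == 20)
			mp_opt->rcvr_key = get_unaligned_be64(ptr + 10);
	}
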
*/ +#if IS_ENABLED(CONFIG_MPTCP) + bool is_mptcp; +#endif #ifdef CONFIG_TCP_MD5SIG /* TCP AF-Specific parts; only used by MD5 Signature support so far */ diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index 27a846263f08..e1ee5aade8b0 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_MPTCP) += mptcp.o -mptcp-y := protocol.o options.o +mptcp-y := protocol.o subflow.o options.o diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 5e24e7cf7d70..294b03a0393a 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -17,6 +17,53 @@ #include #include "protocol.h" +#define MPTCP_SAME_STATE TCP_MAX_STATES + +/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not + * completed yet or has failed, return the subflow socket. + * Otherwise return NULL. + */ +static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk) +{ + if (!msk->subflow) + return NULL; + + return msk->subflow; +} + +static bool __mptcp_can_create_subflow(const struct mptcp_sock *msk) +{ + return ((struct sock *)msk)->sk_state == TCP_CLOSE; +} + +static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + struct socket *ssock; + int err; + + ssock = __mptcp_nmpc_socket(msk); + if (ssock) + goto set_state; + + if (!__mptcp_can_create_subflow(msk)) + return ERR_PTR(-EINVAL); + + err = mptcp_subflow_create_socket(sk, &ssock); + if (err) + return ERR_PTR(err); + + msk->subflow = ssock; + subflow = mptcp_subflow_ctx(ssock->sk); + subflow->request_mptcp = 1; + +set_state: + if (state != MPTCP_SAME_STATE) + inet_sk_state_store(sk, state); + return ssock; +} + static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -48,12 +95,14 @@ static int mptcp_init_sock(struct sock *sk) static void mptcp_close(struct sock *sk, long timeout) { struct mptcp_sock *msk = mptcp_sk(sk); + struct socket *ssock; inet_sk_state_store(sk, TCP_CLOSE); - if (msk->subflow) { - pr_debug("subflow=%p", msk->subflow->sk); - sock_release(msk->subflow); + ssock = __mptcp_nmpc_socket(msk); + if (ssock) { + pr_debug("subflow=%p", mptcp_subflow_ctx(ssock->sk)); + sock_release(ssock); } sock_orphan(sk); @@ -67,7 +116,8 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *saddr, int len) saddr->sa_family = AF_INET; - pr_debug("msk=%p, subflow=%p", msk, msk->subflow->sk); + pr_debug("msk=%p, subflow=%p", msk, + mptcp_subflow_ctx(msk->subflow->sk)); err = kernel_connect(msk->subflow, saddr, len, 0); @@ -93,15 +143,79 @@ static struct proto mptcp_prot = { .no_autobind = true, }; +static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct mptcp_sock *msk = mptcp_sk(sock->sk); + struct socket *ssock; + int err = -ENOTSUPP; + + if (uaddr->sa_family != AF_INET) // @@ allow only IPv4 for now + return err; + + lock_sock(sock->sk); + ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE); + if (IS_ERR(ssock)) { + err = PTR_ERR(ssock); + goto unlock; + } + + err = ssock->ops->bind(ssock, uaddr, addr_len); + +unlock: + release_sock(sock->sk); + return err; +} + +static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct mptcp_sock *msk = mptcp_sk(sock->sk); + struct socket *ssock; + int err; + + lock_sock(sock->sk); + ssock = __mptcp_socket_create(msk, TCP_SYN_SENT); + if (IS_ERR(ssock)) { + err = 
PTR_ERR(ssock); + goto unlock; + } + + err = ssock->ops->connect(ssock, uaddr, addr_len, flags); + inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk)); + +unlock: + release_sock(sock->sk); + return err; +} + +static __poll_t mptcp_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait) +{ + __poll_t mask = 0; + + return mask; +} + +static struct proto_ops mptcp_stream_ops; + static struct inet_protosw mptcp_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_MPTCP, .prot = &mptcp_prot, - .ops = &inet_stream_ops, + .ops = &mptcp_stream_ops, + .flags = INET_PROTOSW_ICSK, }; void __init mptcp_init(void) { + mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo; + mptcp_stream_ops = inet_stream_ops; + mptcp_stream_ops.bind = mptcp_bind; + mptcp_stream_ops.connect = mptcp_stream_connect; + mptcp_stream_ops.poll = mptcp_poll; + + mptcp_subflow_init(); + if (proto_register(&mptcp_prot, 1) != 0) panic("Failed to register MPTCP proto.\n"); @@ -109,13 +223,14 @@ void __init mptcp_init(void) } #if IS_ENABLED(CONFIG_MPTCP_IPV6) +static struct proto_ops mptcp_v6_stream_ops; static struct proto mptcp_v6_prot; static struct inet_protosw mptcp_v6_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_MPTCP, .prot = &mptcp_v6_prot, - .ops = &inet6_stream_ops, + .ops = &mptcp_v6_stream_ops, .flags = INET_PROTOSW_ICSK, }; @@ -133,6 +248,11 @@ int mptcpv6_init(void) if (err) return err; + mptcp_v6_stream_ops = inet6_stream_ops; + mptcp_v6_stream_ops.bind = mptcp_bind; + mptcp_v6_stream_ops.connect = mptcp_stream_connect; + mptcp_v6_stream_ops.poll = mptcp_poll; + err = inet6_register_protosw(&mptcp_v6_protosw); if (err) proto_unregister(&mptcp_v6_prot); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index c59cf8b220b0..543d4d5d8985 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -48,4 +48,30 @@ static inline struct mptcp_sock *mptcp_sk(const struct sock *sk) return (struct mptcp_sock *)sk; } +/* MPTCP subflow context */ +struct mptcp_subflow_context { + u32 request_mptcp : 1; /* send MP_CAPABLE */ + struct sock *tcp_sock; /* tcp sk backpointer */ + struct sock *conn; /* parent mptcp_sock */ + struct rcu_head rcu; +}; + +static inline struct mptcp_subflow_context * +mptcp_subflow_ctx(const struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + /* Use RCU on icsk_ulp_data only for sock diag code */ + return (__force struct mptcp_subflow_context *)icsk->icsk_ulp_data; +} + +static inline struct sock * +mptcp_subflow_tcp_sock(const struct mptcp_subflow_context *subflow) +{ + return subflow->tcp_sock; +} + +void mptcp_subflow_init(void); +int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock); + #endif /* __MPTCP_PROTOCOL_H */ diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c new file mode 100644 index 000000000000..bf8139353653 --- /dev/null +++ b/net/mptcp/subflow.c @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2017 - 2019, Intel Corporation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "protocol.h" + +int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) +{ + struct mptcp_subflow_context *subflow; + struct net *net = sock_net(sk); + struct socket *sf; + int err; + + err = sock_create_kern(net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sf); + if (err) + return err; + + lock_sock(sf->sk); + + /* kernel sockets do not by default acquire net ref, but TCP timer + * needs it. 
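
The kernel-socket detail cuts both ways: subflow_ulp_init() below refuses to attach the "mptcp" ULP to any socket that was not created with sock_create_kern(), failing with -EOPNOTSUPP, so userspace cannot set it directly. A tiny userspace probe of that guard (a sketch; assumes a kernel built with CONFIG_MPTCP and a libc that exposes TCP_ULP):

	#include <netinet/in.h>
	#include <netinet/tcp.h>	/* TCP_ULP, in glibc since ~2.27 */
	#include <stdio.h>
	#include <sys/socket.h>

	int main(void)
	{
		int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);

		if (fd < 0)
			return 1;

		/* expected to fail with EOPNOTSUPP: the "mptcp" ULP only
		 * accepts sockets created in-kernel
		 */
		if (setsockopt(fd, IPPROTO_TCP, TCP_ULP, "mptcp",
			       sizeof("mptcp")) < 0)
			perror("TCP_ULP mptcp");
		return 0;
	}
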
+ */ + sf->sk->sk_net_refcnt = 1; + get_net(net); + this_cpu_add(*net->core.sock_inuse, 1); + err = tcp_set_ulp(sf->sk, "mptcp"); + release_sock(sf->sk); + + if (err) + return err; + + subflow = mptcp_subflow_ctx(sf->sk); + pr_debug("subflow=%p", subflow); + + *new_sock = sf; + subflow->conn = sk; + + return 0; +} + +static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk, + gfp_t priority) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct mptcp_subflow_context *ctx; + + ctx = kzalloc(sizeof(*ctx), priority); + if (!ctx) + return NULL; + + rcu_assign_pointer(icsk->icsk_ulp_data, ctx); + + pr_debug("subflow=%p", ctx); + + ctx->tcp_sock = sk; + + return ctx; +} + +static int subflow_ulp_init(struct sock *sk) +{ + struct mptcp_subflow_context *ctx; + struct tcp_sock *tp = tcp_sk(sk); + int err = 0; + + /* disallow attaching ULP to a socket unless it has been + * created with sock_create_kern() + */ + if (!sk->sk_kern_sock) { + err = -EOPNOTSUPP; + goto out; + } + + ctx = subflow_create_ctx(sk, GFP_KERNEL); + if (!ctx) { + err = -ENOMEM; + goto out; + } + + pr_debug("subflow=%p, family=%d", ctx, sk->sk_family); + + tp->is_mptcp = 1; +out: + return err; +} + +static void subflow_ulp_release(struct sock *sk) +{ + struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk); + + if (!ctx) + return; + + kfree_rcu(ctx, rcu); +} + +static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = { + .name = "mptcp", + .owner = THIS_MODULE, + .init = subflow_ulp_init, + .release = subflow_ulp_release, +}; + +void mptcp_subflow_init(void) +{ + if (tcp_register_ulp(&subflow_ulp_ops) != 0) + panic("MPTCP: failed to register subflows to ULP\n"); +} -- cgit v1.2.3-70-g09d2 From cec37a6e41aae7bf3df9a3da783380a4d9325fd8 Mon Sep 17 00:00:00 2001 From: Peter Krystad Date: Tue, 21 Jan 2020 16:56:18 -0800 Subject: mptcp: Handle MP_CAPABLE options for outgoing connections Add hooks to tcp_output.c to add MP_CAPABLE to an outgoing SYN request, to capture the MP_CAPABLE in the received SYN-ACK, to add MP_CAPABLE to the final ACK of the three-way handshake. Use the .sk_rx_dst_set() handler in the subflow proto to capture when the responding SYN-ACK is received and notify the MPTCP connection layer. Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Co-developed-by: Florian Westphal Signed-off-by: Florian Westphal Signed-off-by: Peter Krystad Signed-off-by: Christoph Paasch Signed-off-by: David S. 
Miller --- include/linux/tcp.h | 3 + include/net/mptcp.h | 57 +++++++++++ net/ipv4/tcp_input.c | 6 ++ net/ipv4/tcp_output.c | 44 +++++++++ net/ipv6/tcp_ipv6.c | 6 ++ net/mptcp/options.c | 100 +++++++++++++++++++ net/mptcp/protocol.c | 163 +++++++++++++++++++++++++----- net/mptcp/protocol.h | 40 +++++++- net/mptcp/subflow.c | 268 +++++++++++++++++++++++++++++++++++++++++++++++++- 9 files changed, 663 insertions(+), 24 deletions(-) (limited to 'include/linux/tcp.h') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 877947475814..e9ee06d887fa 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -137,6 +137,9 @@ struct tcp_request_sock { const struct tcp_request_sock_ops *af_specific; u64 snt_synack; /* first SYNACK sent time */ bool tfo_listener; +#if IS_ENABLED(CONFIG_MPTCP) + bool is_mptcp; +#endif u32 txhash; u32 rcv_isn; u32 snt_isn; diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 3daec2ceb3ff..eabc57c3fde4 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -39,8 +39,27 @@ struct mptcp_out_options { void mptcp_init(void); +static inline bool sk_is_mptcp(const struct sock *sk) +{ + return tcp_sk(sk)->is_mptcp; +} + +static inline bool rsk_is_mptcp(const struct request_sock *req) +{ + return tcp_rsk(req)->is_mptcp; +} + void mptcp_parse_option(const unsigned char *ptr, int opsize, struct tcp_options_received *opt_rx); +bool mptcp_syn_options(struct sock *sk, unsigned int *size, + struct mptcp_out_options *opts); +void mptcp_rcv_synsent(struct sock *sk); +bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, + struct mptcp_out_options *opts); +bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, + unsigned int *size, unsigned int remaining, + struct mptcp_out_options *opts); + void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts); /* move the skb extension owership, with the assumption that 'to' is @@ -89,11 +108,47 @@ static inline void mptcp_init(void) { } +static inline bool sk_is_mptcp(const struct sock *sk) +{ + return false; +} + +static inline bool rsk_is_mptcp(const struct request_sock *req) +{ + return false; +} + static inline void mptcp_parse_option(const unsigned char *ptr, int opsize, struct tcp_options_received *opt_rx) { } +static inline bool mptcp_syn_options(struct sock *sk, unsigned int *size, + struct mptcp_out_options *opts) +{ + return false; +} + +static inline void mptcp_rcv_synsent(struct sock *sk) +{ +} + +static inline bool mptcp_synack_options(const struct request_sock *req, + unsigned int *size, + struct mptcp_out_options *opts) +{ + return false; +} + +static inline bool mptcp_established_options(struct sock *sk, + struct sk_buff *skb, + unsigned int *size, + unsigned int remaining, + struct mptcp_out_options *opts) +{ + return false; +} + static inline void mptcp_skb_ext_move(struct sk_buff *to, const struct sk_buff *from) { @@ -107,6 +162,8 @@ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, #endif /* CONFIG_MPTCP */ +void mptcp_handle_ipv6_mapped(struct sock *sk, bool mapped); + #if IS_ENABLED(CONFIG_MPTCP_IPV6) int mptcpv6_init(void); #elif IS_ENABLED(CONFIG_IPV6) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3458ee13e6f0..5165c8de47ee 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5978,6 +5978,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); + if (sk_is_mptcp(sk)) + mptcp_rcv_synsent(sk); + /* Remember, 
tcp_poll() does not lock socket! * Change state from SYN-SENT only after copied_seq * is initialized. */ @@ -6600,6 +6603,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_rsk(req)->af_specific = af_ops; tcp_rsk(req)->ts_off = 0; +#if IS_ENABLED(CONFIG_MPTCP) + tcp_rsk(req)->is_mptcp = 0; +#endif tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = af_ops->mss_clamp; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0f0984f39f67..5456076166da 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -597,6 +597,22 @@ static void smc_set_option_cond(const struct tcp_sock *tp, #endif } +static void mptcp_set_option_cond(const struct request_sock *req, + struct tcp_out_options *opts, + unsigned int *remaining) +{ + if (rsk_is_mptcp(req)) { + unsigned int size; + + if (mptcp_synack_options(req, &size, &opts->mptcp)) { + if (*remaining >= size) { + opts->options |= OPTION_MPTCP; + *remaining -= size; + } + } + } +} + /* Compute TCP options for SYN packets. This is not the final * network wire format yet. */ @@ -666,6 +682,15 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, smc_set_option(tp, opts, &remaining); + if (sk_is_mptcp(sk)) { + unsigned int size; + + if (mptcp_syn_options(sk, &size, &opts->mptcp)) { + opts->options |= OPTION_MPTCP; + remaining -= size; + } + } + return MAX_TCP_OPTION_SPACE - remaining; } @@ -727,6 +752,8 @@ static unsigned int tcp_synack_options(const struct sock *sk, } } + mptcp_set_option_cond(req, opts, &remaining); + smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); return MAX_TCP_OPTION_SPACE - remaining; @@ -764,6 +791,23 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb size += TCPOLEN_TSTAMP_ALIGNED; } + /* MPTCP options have precedence over SACK for the limited TCP + * option space because a MPTCP connection would be forced to + * fall back to regular TCP if a required multipath option is + * missing. SACK still gets a chance to use whatever space is + * left. 
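
Concretely, using MAX_TCP_OPTION_SPACE (40 bytes) and the aligned option sizes from net/tcp.h, plus TCPOLEN_MPTCP_MPC_ACK = 20 from this series, the tight case is the third handshake ACK with timestamps enabled (an illustrative fragment, not upstream code):

	unsigned int remaining = MAX_TCP_OPTION_SPACE;	/* 40 bytes total */

	remaining -= TCPOLEN_TSTAMP_ALIGNED;		/* 40 - 12 = 28 */
	remaining -= TCPOLEN_MPTCP_MPC_ACK;		/* 28 - 20 =  8 */
	/* 8 < TCPOLEN_SACK_BASE_ALIGNED + TCPOLEN_SACK_PERBLOCK (4 + 8),
	 * so once MP_CAPABLE has claimed its space not even one SACK
	 * block fits on this segment; SACK gets its chance again on
	 * later segments that carry smaller or no MPTCP options.
	 */
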
+ */ + if (sk_is_mptcp(sk)) { + unsigned int remaining = MAX_TCP_OPTION_SPACE - size; + unsigned int opt_size = 0; + + if (mptcp_established_options(sk, skb, &opt_size, remaining, + &opts->mptcp)) { + opts->options |= OPTION_MPTCP; + size += opt_size; + } + } + eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; if (unlikely(eff_sacks)) { const unsigned int remaining = MAX_TCP_OPTION_SPACE - size; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 60068ffde1d9..33a578a3eb3a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -238,6 +238,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; icsk->icsk_af_ops = &ipv6_mapped; + if (sk_is_mptcp(sk)) + mptcp_handle_ipv6_mapped(sk, true); sk->sk_backlog_rcv = tcp_v4_do_rcv; #ifdef CONFIG_TCP_MD5SIG tp->af_specific = &tcp_sock_ipv6_mapped_specific; @@ -248,6 +250,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (err) { icsk->icsk_ext_hdr_len = exthdrlen; icsk->icsk_af_ops = &ipv6_specific; + if (sk_is_mptcp(sk)) + mptcp_handle_ipv6_mapped(sk, false); sk->sk_backlog_rcv = tcp_v6_do_rcv; #ifdef CONFIG_TCP_MD5SIG tp->af_specific = &tcp_sock_ipv6_specific; @@ -1203,6 +1207,8 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * newnp->saddr = newsk->sk_v6_rcv_saddr; inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; + if (sk_is_mptcp(newsk)) + mptcp_handle_ipv6_mapped(newsk, true); newsk->sk_backlog_rcv = tcp_v4_do_rcv; #ifdef CONFIG_TCP_MD5SIG newtp->af_specific = &tcp_sock_ipv6_mapped_specific; diff --git a/net/mptcp/options.c b/net/mptcp/options.c index b7a31c0e5283..52ff2301b68b 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -72,14 +72,114 @@ void mptcp_parse_option(const unsigned char *ptr, int opsize, } } +void mptcp_get_options(const struct sk_buff *skb, + struct tcp_options_received *opt_rx) +{ + const unsigned char *ptr; + const struct tcphdr *th = tcp_hdr(skb); + int length = (th->doff * 4) - sizeof(struct tcphdr); + + ptr = (const unsigned char *)(th + 1); + + while (length > 0) { + int opcode = *ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize = *ptr++; + if (opsize < 2) /* "silly options" */ + return; + if (opsize > length) + return; /* don't parse partial options */ + if (opcode == TCPOPT_MPTCP) + mptcp_parse_option(ptr, opsize, opt_rx); + ptr += opsize - 2; + length -= opsize; + } + } +} + +bool mptcp_syn_options(struct sock *sk, unsigned int *size, + struct mptcp_out_options *opts) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + if (subflow->request_mptcp) { + pr_debug("local_key=%llu", subflow->local_key); + opts->suboptions = OPTION_MPTCP_MPC_SYN; + opts->sndr_key = subflow->local_key; + *size = TCPOLEN_MPTCP_MPC_SYN; + return true; + } + return false; +} + +void mptcp_rcv_synsent(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct tcp_sock *tp = tcp_sk(sk); + + pr_debug("subflow=%p", subflow); + if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) { + subflow->mp_capable = 1; + subflow->remote_key = tp->rx_opt.mptcp.sndr_key; + } else { + tcp_sk(sk)->is_mptcp = 0; + } +} + +bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, + unsigned int *size, unsigned int remaining, + struct mptcp_out_options *opts) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + if 
(subflow->mp_capable && !subflow->fourth_ack) { + opts->suboptions = OPTION_MPTCP_MPC_ACK; + opts->sndr_key = subflow->local_key; + opts->rcvr_key = subflow->remote_key; + *size = TCPOLEN_MPTCP_MPC_ACK; + subflow->fourth_ack = 1; + pr_debug("subflow=%p, local_key=%llu, remote_key=%llu", + subflow, subflow->local_key, subflow->remote_key); + return true; + } + return false; +} + +bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, + struct mptcp_out_options *opts) +{ + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + + if (subflow_req->mp_capable) { + opts->suboptions = OPTION_MPTCP_MPC_SYNACK; + opts->sndr_key = subflow_req->local_key; + *size = TCPOLEN_MPTCP_MPC_SYNACK; + pr_debug("subflow_req=%p, local_key=%llu", + subflow_req, subflow_req->local_key); + return true; + } + return false; +} + void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) { if ((OPTION_MPTCP_MPC_SYN | + OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & opts->suboptions) { u8 len; if (OPTION_MPTCP_MPC_SYN & opts->suboptions) len = TCPOLEN_MPTCP_MPC_SYN; + else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) + len = TCPOLEN_MPTCP_MPC_SYNACK; else len = TCPOLEN_MPTCP_MPC_ACK; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 294b03a0393a..bdd58da1e4f6 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -25,12 +25,28 @@ */ static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk) { - if (!msk->subflow) + if (!msk->subflow || mptcp_subflow_ctx(msk->subflow->sk)->fourth_ack) return NULL; return msk->subflow; } +/* if msk has a single subflow, and the mp_capable handshake is failed, + * return it. + * Otherwise returns NULL + */ +static struct socket *__mptcp_tcp_fallback(const struct mptcp_sock *msk) +{ + struct socket *ssock = __mptcp_nmpc_socket(msk); + + sock_owned_by_me((const struct sock *)msk); + + if (!ssock || sk_is_mptcp(ssock->sk)) + return NULL; + + return ssock; +} + static bool __mptcp_can_create_subflow(const struct mptcp_sock *msk) { return ((struct sock *)msk)->sk_state == TCP_CLOSE; @@ -56,6 +72,7 @@ static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state) msk->subflow = ssock; subflow = mptcp_subflow_ctx(ssock->sk); + list_add(&subflow->node, &msk->conn_list); subflow->request_mptcp = 1; set_state: @@ -64,66 +81,169 @@ set_state: return ssock; } +static struct sock *mptcp_subflow_get(const struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + + sock_owned_by_me((const struct sock *)msk); + + mptcp_for_each_subflow(msk, subflow) { + return mptcp_subflow_tcp_sock(subflow); + } + + return NULL; +} + static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *subflow = msk->subflow; + struct socket *ssock; + struct sock *ssk; + int ret; if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) return -EOPNOTSUPP; - return sock_sendmsg(subflow, msg); + lock_sock(sk); + ssock = __mptcp_tcp_fallback(msk); + if (ssock) { + pr_debug("fallback passthrough"); + ret = sock_sendmsg(ssock, msg); + release_sock(sk); + return ret; + } + + ssk = mptcp_subflow_get(msk); + if (!ssk) { + release_sock(sk); + return -ENOTCONN; + } + + ret = sock_sendmsg(ssk->sk_socket, msg); + + release_sock(sk); + return ret; } static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *subflow = msk->subflow; + 
struct socket *ssock; + struct sock *ssk; + int copied = 0; if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT)) return -EOPNOTSUPP; - return sock_recvmsg(subflow, msg, flags); + lock_sock(sk); + ssock = __mptcp_tcp_fallback(msk); + if (ssock) { + pr_debug("fallback-read subflow=%p", + mptcp_subflow_ctx(ssock->sk)); + copied = sock_recvmsg(ssock, msg, flags); + release_sock(sk); + return copied; + } + + ssk = mptcp_subflow_get(msk); + if (!ssk) { + release_sock(sk); + return -ENOTCONN; + } + + copied = sock_recvmsg(ssk->sk_socket, msg, flags); + + release_sock(sk); + + return copied; +} + +/* subflow sockets can be either outgoing (connect) or incoming + * (accept). + * + * Outgoing subflows use in-kernel sockets. + * Incoming subflows do not have their own 'struct socket' allocated, + * so we need to use tcp_close() after detaching them from the mptcp + * parent socket. + */ +static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, + struct mptcp_subflow_context *subflow, + long timeout) +{ + struct socket *sock = READ_ONCE(ssk->sk_socket); + + list_del(&subflow->node); + + if (sock && sock != sk->sk_socket) { + /* outgoing subflow */ + sock_release(sock); + } else { + /* incoming subflow */ + tcp_close(ssk, timeout); + } } static int mptcp_init_sock(struct sock *sk) { + struct mptcp_sock *msk = mptcp_sk(sk); + + INIT_LIST_HEAD(&msk->conn_list); + return 0; } static void mptcp_close(struct sock *sk, long timeout) { + struct mptcp_subflow_context *subflow, *tmp; struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *ssock; inet_sk_state_store(sk, TCP_CLOSE); - ssock = __mptcp_nmpc_socket(msk); - if (ssock) { - pr_debug("subflow=%p", mptcp_subflow_ctx(ssock->sk)); - sock_release(ssock); + lock_sock(sk); + + list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + __mptcp_close_ssk(sk, ssk, subflow, timeout); } - sock_orphan(sk); - sock_put(sk); + release_sock(sk); + sk_common_release(sk); } -static int mptcp_connect(struct sock *sk, struct sockaddr *saddr, int len) +static int mptcp_get_port(struct sock *sk, unsigned short snum) { struct mptcp_sock *msk = mptcp_sk(sk); - int err; + struct socket *ssock; - saddr->sa_family = AF_INET; + ssock = __mptcp_nmpc_socket(msk); + pr_debug("msk=%p, subflow=%p", msk, ssock); + if (WARN_ON_ONCE(!ssock)) + return -EINVAL; - pr_debug("msk=%p, subflow=%p", msk, - mptcp_subflow_ctx(msk->subflow->sk)); + return inet_csk_get_port(ssock->sk, snum); +} - err = kernel_connect(msk->subflow, saddr, len, 0); +void mptcp_finish_connect(struct sock *ssk) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk; + struct sock *sk; - sk->sk_state = TCP_ESTABLISHED; + subflow = mptcp_subflow_ctx(ssk); - return err; + if (!subflow->mp_capable) + return; + + sk = subflow->conn; + msk = mptcp_sk(sk); + + /* the socket is not connected yet, no msk/subflow ops can access/race + * accessing the field below + */ + WRITE_ONCE(msk->remote_key, subflow->remote_key); + WRITE_ONCE(msk->local_key, subflow->local_key); } static struct proto mptcp_prot = { @@ -132,13 +252,12 @@ static struct proto mptcp_prot = { .init = mptcp_init_sock, .close = mptcp_close, .accept = inet_csk_accept, - .connect = mptcp_connect, .shutdown = tcp_shutdown, .sendmsg = mptcp_sendmsg, .recvmsg = mptcp_recvmsg, .hash = inet_hash, .unhash = inet_unhash, - .get_port = inet_csk_get_port, + .get_port = mptcp_get_port, .obj_size = sizeof(struct mptcp_sock), .no_autobind = true, }; diff --git a/net/mptcp/protocol.h 
b/net/mptcp/protocol.h index 543d4d5d8985..bd66e7415515 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -40,19 +40,47 @@ struct mptcp_sock { /* inet_connection_sock must be the first member */ struct inet_connection_sock sk; + u64 local_key; + u64 remote_key; + struct list_head conn_list; struct socket *subflow; /* outgoing connect/listener/!mp_capable */ }; +#define mptcp_for_each_subflow(__msk, __subflow) \ + list_for_each_entry(__subflow, &((__msk)->conn_list), node) + static inline struct mptcp_sock *mptcp_sk(const struct sock *sk) { return (struct mptcp_sock *)sk; } +struct mptcp_subflow_request_sock { + struct tcp_request_sock sk; + u8 mp_capable : 1, + mp_join : 1, + backup : 1; + u64 local_key; + u64 remote_key; +}; + +static inline struct mptcp_subflow_request_sock * +mptcp_subflow_rsk(const struct request_sock *rsk) +{ + return (struct mptcp_subflow_request_sock *)rsk; +} + /* MPTCP subflow context */ struct mptcp_subflow_context { - u32 request_mptcp : 1; /* send MP_CAPABLE */ + struct list_head node;/* conn_list of subflows */ + u64 local_key; + u64 remote_key; + u32 request_mptcp : 1, /* send MP_CAPABLE */ + mp_capable : 1, /* remote is MPTCP capable */ + fourth_ack : 1, /* send initial DSS */ + conn_finished : 1; struct sock *tcp_sock; /* tcp sk backpointer */ struct sock *conn; /* parent mptcp_sock */ + const struct inet_connection_sock_af_ops *icsk_af_ops; struct rcu_head rcu; }; @@ -74,4 +102,14 @@ mptcp_subflow_tcp_sock(const struct mptcp_subflow_context *subflow) void mptcp_subflow_init(void); int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock); +extern const struct inet_connection_sock_af_ops ipv4_specific; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +extern const struct inet_connection_sock_af_ops ipv6_specific; +#endif + +void mptcp_get_options(const struct sk_buff *skb, + struct tcp_options_received *opt_rx); + +void mptcp_finish_connect(struct sock *sk); + #endif /* __MPTCP_PROTOCOL_H */ diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index bf8139353653..df3192305967 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -12,9 +12,188 @@ #include #include #include +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +#include +#endif #include #include "protocol.h" +static void subflow_init_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb) +{ + struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + struct tcp_options_received rx_opt; + + pr_debug("subflow_req=%p, listener=%p", subflow_req, listener); + + memset(&rx_opt.mptcp, 0, sizeof(rx_opt.mptcp)); + mptcp_get_options(skb, &rx_opt); + + subflow_req->mp_capable = 0; + +#ifdef CONFIG_TCP_MD5SIG + /* no MPTCP if MD5SIG is enabled on this socket or we may run out of + * TCP option space. 
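
The space argument can be made concrete. A rough SYN-ACK budget, using the aligned sizes from net/tcp.h and TCPOLEN_MPTCP_MPC_SYNACK = 12 from this series (a back-of-the-envelope sketch, not a guarantee of the exact option mix):

	/* hypothetical SYN-ACK carrying both MD5 and MP_CAPABLE:
	 *
	 *	TCPOLEN_MD5SIG_ALIGNED		20
	 *	TCPOLEN_MSS			 4
	 *	window scale (aligned)		 4
	 *	TCPOLEN_MPTCP_MPC_SYNACK	12
	 *	total				40 == MAX_TCP_OPTION_SPACE
	 *
	 * nothing would be left for timestamps (12) or SACK-permitted (4),
	 * so the listener simply never advertises MPTCP when MD5 is in use.
	 */
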
+ */ + if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) + return; +#endif + + if (rx_opt.mptcp.mp_capable && listener->request_mptcp) { + subflow_req->mp_capable = 1; + subflow_req->remote_key = rx_opt.mptcp.sndr_key; + } +} + +static void subflow_v4_init_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb) +{ + tcp_rsk(req)->is_mptcp = 1; + + tcp_request_sock_ipv4_ops.init_req(req, sk_listener, skb); + + subflow_init_req(req, sk_listener, skb); +} + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +static void subflow_v6_init_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb) +{ + tcp_rsk(req)->is_mptcp = 1; + + tcp_request_sock_ipv6_ops.init_req(req, sk_listener, skb); + + subflow_init_req(req, sk_listener, skb); +} +#endif + +static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + subflow->icsk_af_ops->sk_rx_dst_set(sk, skb); + + if (subflow->conn && !subflow->conn_finished) { + pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk), + subflow->remote_key); + mptcp_finish_connect(sk); + subflow->conn_finished = 1; + } +} + +static struct request_sock_ops subflow_request_sock_ops; +static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops; + +static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + pr_debug("subflow=%p", subflow); + + /* Never answer to SYNs sent to broadcast or multicast */ + if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) + goto drop; + + return tcp_conn_request(&subflow_request_sock_ops, + &subflow_request_sock_ipv4_ops, + sk, skb); +drop: + tcp_listendrop(sk); + return 0; +} + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops; +static struct inet_connection_sock_af_ops subflow_v6_specific; +static struct inet_connection_sock_af_ops subflow_v6m_specific; + +static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + pr_debug("subflow=%p", subflow); + + if (skb->protocol == htons(ETH_P_IP)) + return subflow_v4_conn_request(sk, skb); + + if (!ipv6_unicast_destination(skb)) + goto drop; + + return tcp_conn_request(&subflow_request_sock_ops, + &subflow_request_sock_ipv6_ops, sk, skb); + +drop: + tcp_listendrop(sk); + return 0; /* don't send reset */ +} +#endif + +static struct sock *subflow_syn_recv_sock(const struct sock *sk, + struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst, + struct request_sock *req_unhash, + bool *own_req) +{ + struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk); + struct sock *child; + + pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn); + + /* if the sk is MP_CAPABLE, we already received the client key */ + + child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst, + req_unhash, own_req); + + if (child && *own_req) { + if (!mptcp_subflow_ctx(child)) { + pr_debug("Closing child socket"); + inet_sk_set_state(child, TCP_CLOSE); + sock_set_flag(child, SOCK_DEAD); + inet_csk_destroy_sock(child); + child = NULL; + } + } + + return child; +} + +static struct inet_connection_sock_af_ops subflow_specific; + +static struct inet_connection_sock_af_ops * +subflow_default_af_ops(struct sock *sk) +{ +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (sk->sk_family == AF_INET6) + return 
&subflow_v6_specific; +#endif + return &subflow_specific; +} + +void mptcp_handle_ipv6_mapped(struct sock *sk, bool mapped) +{ +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_connection_sock_af_ops *target; + + target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk); + + pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d", + subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped); + + if (likely(icsk->icsk_af_ops == target)) + return; + + subflow->icsk_af_ops = icsk->icsk_af_ops; + icsk->icsk_af_ops = target; +#endif +} + int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) { struct mptcp_subflow_context *subflow; @@ -22,7 +201,8 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) struct socket *sf; int err; - err = sock_create_kern(net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sf); + err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP, + &sf); if (err) return err; @@ -60,6 +240,7 @@ static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk, return NULL; rcu_assign_pointer(icsk->icsk_ulp_data, ctx); + INIT_LIST_HEAD(&ctx->node); pr_debug("subflow=%p", ctx); @@ -70,6 +251,7 @@ static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk, static int subflow_ulp_init(struct sock *sk) { + struct inet_connection_sock *icsk = inet_csk(sk); struct mptcp_subflow_context *ctx; struct tcp_sock *tp = tcp_sk(sk); int err = 0; @@ -91,6 +273,8 @@ static int subflow_ulp_init(struct sock *sk) pr_debug("subflow=%p, family=%d", ctx, sk->sk_family); tp->is_mptcp = 1; + ctx->icsk_af_ops = icsk->icsk_af_ops; + icsk->icsk_af_ops = subflow_default_af_ops(sk); out: return err; } @@ -105,15 +289,97 @@ static void subflow_ulp_release(struct sock *sk) kfree_rcu(ctx, rcu); } +static void subflow_ulp_fallback(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + icsk->icsk_ulp_ops = NULL; + rcu_assign_pointer(icsk->icsk_ulp_data, NULL); + tcp_sk(sk)->is_mptcp = 0; +} + +static void subflow_ulp_clone(const struct request_sock *req, + struct sock *newsk, + const gfp_t priority) +{ + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk); + struct mptcp_subflow_context *new_ctx; + + if (!subflow_req->mp_capable) { + subflow_ulp_fallback(newsk); + return; + } + + new_ctx = subflow_create_ctx(newsk, priority); + if (new_ctx == NULL) { + subflow_ulp_fallback(newsk); + return; + } + + new_ctx->conn_finished = 1; + new_ctx->icsk_af_ops = old_ctx->icsk_af_ops; + new_ctx->mp_capable = 1; + new_ctx->fourth_ack = 1; + new_ctx->remote_key = subflow_req->remote_key; + new_ctx->local_key = subflow_req->local_key; +} + static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = { .name = "mptcp", .owner = THIS_MODULE, .init = subflow_ulp_init, .release = subflow_ulp_release, + .clone = subflow_ulp_clone, }; +static int subflow_ops_init(struct request_sock_ops *subflow_ops) +{ + subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock); + subflow_ops->slab_name = "request_sock_subflow"; + + subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name, + subflow_ops->obj_size, 0, + SLAB_ACCOUNT | + SLAB_TYPESAFE_BY_RCU, + NULL); + if (!subflow_ops->slab) + return -ENOMEM; + + return 0; +} + void mptcp_subflow_init(void) { + subflow_request_sock_ops = tcp_request_sock_ops; + if 
(subflow_ops_init(&subflow_request_sock_ops) != 0) + panic("MPTCP: failed to init subflow request sock ops\n"); + + subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops; + subflow_request_sock_ipv4_ops.init_req = subflow_v4_init_req; + + subflow_specific = ipv4_specific; + subflow_specific.conn_request = subflow_v4_conn_request; + subflow_specific.syn_recv_sock = subflow_syn_recv_sock; + subflow_specific.sk_rx_dst_set = subflow_finish_connect; + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; + subflow_request_sock_ipv6_ops.init_req = subflow_v6_init_req; + + subflow_v6_specific = ipv6_specific; + subflow_v6_specific.conn_request = subflow_v6_conn_request; + subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock; + subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect; + + subflow_v6m_specific = subflow_v6_specific; + subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit; + subflow_v6m_specific.send_check = ipv4_specific.send_check; + subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len; + subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced; + subflow_v6m_specific.net_frag_header_len = 0; +#endif + if (tcp_register_ulp(&subflow_ulp_ops) != 0) panic("MPTCP: failed to register subflows to ULP\n"); } -- cgit v1.2.3-70-g09d2 From 648ef4b88673dadb8463bf0d4b10fbf33d55def8 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Tue, 21 Jan 2020 16:56:24 -0800 Subject: mptcp: Implement MPTCP receive path Parses incoming DSS options and populates outgoing MPTCP ACK fields. MPTCP fields are parsed from the TCP option header and placed in an skb extension, allowing the upper MPTCP layer to access MPTCP options after the skb has gone through the TCP stack. The subflow implements its own data_ready() ops, which ensures that the pending data is in sequence - according to MPTCP seq number - dropping out-of-seq skbs. The DATA_READY bit flag is set if this is the case. This allows the MPTCP socket layer to determine if more data is available without having to consult the individual subflows. It additionally validates the current mapping and propagates EoF events to the connection socket. Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Co-developed-by: Peter Krystad Signed-off-by: Peter Krystad Co-developed-by: Davide Caratti Signed-off-by: Davide Caratti Co-developed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Co-developed-by: Florian Westphal Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: Christoph Paasch Signed-off-by: David S. 
Miller --- include/linux/tcp.h | 10 ++ include/net/mptcp.h | 8 ++ net/ipv4/tcp_input.c | 8 +- net/mptcp/options.c | 107 ++++++++++++++ net/mptcp/protocol.c | 34 +++++ net/mptcp/protocol.h | 64 ++++++++- net/mptcp/subflow.c | 383 ++++++++++++++++++++++++++++++++++++++++++++++++++- 7 files changed, 609 insertions(+), 5 deletions(-) (limited to 'include/linux/tcp.h') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index e9ee06d887fa..0d00dad4b85d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -82,9 +82,19 @@ struct tcp_sack_block { struct mptcp_options_received { u64 sndr_key; u64 rcvr_key; + u64 data_ack; + u64 data_seq; + u32 subflow_seq; + u16 data_len; u8 mp_capable : 1, mp_join : 1, dss : 1; + u8 use_map:1, + dsn64:1, + data_fin:1, + use_ack:1, + ack64:1, + __unused:3; }; #endif diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 06dcc665135e..8619c1fca741 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -60,6 +60,8 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts); +void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, + struct tcp_options_received *opt_rx); void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts); @@ -150,6 +152,12 @@ static inline bool mptcp_established_options(struct sock *sk, return false; } +static inline void mptcp_incoming_options(struct sock *sk, + struct sk_buff *skb, + struct tcp_options_received *opt_rx) +{ +} + static inline void mptcp_skb_ext_move(struct sk_buff *to, const struct sk_buff *from) { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5165c8de47ee..28d31f2c1422 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4770,6 +4770,9 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) bool fragstolen; int eaten; + if (sk_is_mptcp(sk)) + mptcp_incoming_options(sk, skb, &tp->rx_opt); + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { __kfree_skb(skb); return; @@ -6346,8 +6349,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) case TCP_CLOSE_WAIT: case TCP_CLOSING: case TCP_LAST_ACK: - if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) + if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + if (sk_is_mptcp(sk)) + mptcp_incoming_options(sk, skb, &tp->rx_opt); break; + } /* fall through */ case TCP_FIN_WAIT1: case TCP_FIN_WAIT2: diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 5ea127bc7f01..1fa8496f3551 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -14,6 +14,7 @@ void mptcp_parse_option(const unsigned char *ptr, int opsize, { struct mptcp_options_received *mp_opt = &opt_rx->mptcp; u8 subtype = *ptr >> 4; + int expected_opsize; u8 version; u8 flags; @@ -64,7 +65,79 @@ void mptcp_parse_option(const unsigned char *ptr, int opsize, case MPTCPOPT_DSS: pr_debug("DSS"); + ptr++; + + flags = (*ptr++) & MPTCP_DSS_FLAG_MASK; + mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0; + mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0; + mp_opt->use_map = (flags & MPTCP_DSS_HAS_MAP) != 0; + mp_opt->ack64 = (flags & MPTCP_DSS_ACK64) != 0; + mp_opt->use_ack = (flags & MPTCP_DSS_HAS_ACK); + + pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d", + mp_opt->data_fin, mp_opt->dsn64, + mp_opt->use_map, mp_opt->ack64, + mp_opt->use_ack); + + expected_opsize = TCPOLEN_MPTCP_DSS_BASE; + + if (mp_opt->use_ack) { + if (mp_opt->ack64) + 
expected_opsize += TCPOLEN_MPTCP_DSS_ACK64; + else + expected_opsize += TCPOLEN_MPTCP_DSS_ACK32; + } + + if (mp_opt->use_map) { + if (mp_opt->dsn64) + expected_opsize += TCPOLEN_MPTCP_DSS_MAP64; + else + expected_opsize += TCPOLEN_MPTCP_DSS_MAP32; + } + + /* RFC 6824, Section 3.3: + * If a checksum is present, but its use had + * not been negotiated in the MP_CAPABLE handshake, + * the checksum field MUST be ignored. + */ + if (opsize != expected_opsize && + opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) + break; + mp_opt->dss = 1; + + if (mp_opt->use_ack) { + if (mp_opt->ack64) { + mp_opt->data_ack = get_unaligned_be64(ptr); + ptr += 8; + } else { + mp_opt->data_ack = get_unaligned_be32(ptr); + ptr += 4; + } + + pr_debug("data_ack=%llu", mp_opt->data_ack); + } + + if (mp_opt->use_map) { + if (mp_opt->dsn64) { + mp_opt->data_seq = get_unaligned_be64(ptr); + ptr += 8; + } else { + mp_opt->data_seq = get_unaligned_be32(ptr); + ptr += 4; + } + + mp_opt->subflow_seq = get_unaligned_be32(ptr); + ptr += 4; + + mp_opt->data_len = get_unaligned_be16(ptr); + ptr += 2; + + pr_debug("data_seq=%llu subflow_seq=%u data_len=%u", + mp_opt->data_seq, mp_opt->subflow_seq, + mp_opt->data_len); + } + break; default: @@ -275,6 +348,40 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, return false; } +void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, + struct tcp_options_received *opt_rx) +{ + struct mptcp_options_received *mp_opt; + struct mptcp_ext *mpext; + + mp_opt = &opt_rx->mptcp; + + if (!mp_opt->dss) + return; + + mpext = skb_ext_add(skb, SKB_EXT_MPTCP); + if (!mpext) + return; + + memset(mpext, 0, sizeof(*mpext)); + + if (mp_opt->use_map) { + mpext->data_seq = mp_opt->data_seq; + mpext->subflow_seq = mp_opt->subflow_seq; + mpext->data_len = mp_opt->data_len; + mpext->use_map = 1; + mpext->dsn64 = mp_opt->dsn64; + } + + if (mp_opt->use_ack) { + mpext->data_ack = mp_opt->data_ack; + mpext->use_ack = 1; + mpext->ack64 = mp_opt->ack64; + } + + mpext->data_fin = mp_opt->data_fin; +} + void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) { if ((OPTION_MPTCP_MPC_SYN | diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 8cf49193b1c0..71250149180b 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -224,6 +224,33 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) return ret; } +int mptcp_read_actor(read_descriptor_t *desc, struct sk_buff *skb, + unsigned int offset, size_t len) +{ + struct mptcp_read_arg *arg = desc->arg.data; + size_t copy_len; + + copy_len = min(desc->count, len); + + if (likely(arg->msg)) { + int err; + + err = skb_copy_datagram_msg(skb, offset, arg->msg, copy_len); + if (err) { + pr_debug("error path"); + desc->error = err; + return err; + } + } else { + pr_debug("Flushing skb payload"); + } + + desc->count -= copy_len; + + pr_debug("consumed %zu bytes, %zu left", copy_len, desc->count); + return copy_len; +} + static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { @@ -414,7 +441,10 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, msk->write_seq = subflow->idsn + 1; ack_seq++; msk->ack_seq = ack_seq; + subflow->map_seq = ack_seq; + subflow->map_subflow_seq = 1; subflow->rel_write_seq = 1; + subflow->tcp_sock = ssk; newsk = new_mptcp_sock; mptcp_copy_inaddrs(newsk, ssk); list_add(&subflow->node, &msk->conn_list); @@ -519,8 +549,12 @@ void mptcp_finish_connect(struct sock *ssk) sk = 
subflow->conn; msk = mptcp_sk(sk); + pr_debug("msk=%p, token=%u", sk, subflow->token); + mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq); ack_seq++; + subflow->map_seq = ack_seq; + subflow->map_subflow_seq = 1; subflow->rel_write_seq = 1; /* the socket is not connected yet, no msk/subflow ops can access/race diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 384ec4804198..c6d8217e24d4 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -33,7 +33,9 @@ #define TCPOLEN_MPTCP_MPC_SYNACK 12 #define TCPOLEN_MPTCP_MPC_ACK 20 #define TCPOLEN_MPTCP_DSS_BASE 4 +#define TCPOLEN_MPTCP_DSS_ACK32 4 #define TCPOLEN_MPTCP_DSS_ACK64 8 +#define TCPOLEN_MPTCP_DSS_MAP32 10 #define TCPOLEN_MPTCP_DSS_MAP64 14 #define TCPOLEN_MPTCP_DSS_CHECKSUM 2 @@ -50,6 +52,10 @@ #define MPTCP_DSS_HAS_MAP BIT(2) #define MPTCP_DSS_ACK64 BIT(1) #define MPTCP_DSS_HAS_ACK BIT(0) +#define MPTCP_DSS_FLAG_MASK (0x1F) + +/* MPTCP socket flags */ +#define MPTCP_DATA_READY BIT(0) /* MPTCP connection sock */ struct mptcp_sock { @@ -60,6 +66,7 @@ struct mptcp_sock { u64 write_seq; u64 ack_seq; u32 token; + unsigned long flags; struct list_head conn_list; struct skb_ext *cached_ext; /* for the next sendmsg */ struct socket *subflow; /* outgoing connect/listener/!mp_capable */ @@ -82,6 +89,7 @@ struct mptcp_subflow_request_sock { u64 remote_key; u64 idsn; u32 token; + u32 ssn_offset; }; static inline struct mptcp_subflow_request_sock * @@ -96,15 +104,27 @@ struct mptcp_subflow_context { u64 local_key; u64 remote_key; u64 idsn; + u64 map_seq; u32 token; u32 rel_write_seq; + u32 map_subflow_seq; + u32 ssn_offset; + u32 map_data_len; u32 request_mptcp : 1, /* send MP_CAPABLE */ mp_capable : 1, /* remote is MPTCP capable */ fourth_ack : 1, /* send initial DSS */ - conn_finished : 1; + conn_finished : 1, + map_valid : 1, + data_avail : 1, + rx_eof : 1; + struct sock *tcp_sock; /* tcp sk backpointer */ struct sock *conn; /* parent mptcp_sock */ const struct inet_connection_sock_af_ops *icsk_af_ops; + void (*tcp_data_ready)(struct sock *sk); + void (*tcp_state_change)(struct sock *sk); + void (*tcp_write_space)(struct sock *sk); + struct rcu_head rcu; }; @@ -123,14 +143,49 @@ mptcp_subflow_tcp_sock(const struct mptcp_subflow_context *subflow) return subflow->tcp_sock; } +static inline u64 +mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow) +{ + return tcp_sk(mptcp_subflow_tcp_sock(subflow))->copied_seq - + subflow->ssn_offset - + subflow->map_subflow_seq; +} + +static inline u64 +mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow) +{ + return subflow->map_seq + mptcp_subflow_get_map_offset(subflow); +} + +int mptcp_is_enabled(struct net *net); +bool mptcp_subflow_data_available(struct sock *sk); void mptcp_subflow_init(void); int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock); +static inline void mptcp_subflow_tcp_fallback(struct sock *sk, + struct mptcp_subflow_context *ctx) +{ + sk->sk_data_ready = ctx->tcp_data_ready; + sk->sk_state_change = ctx->tcp_state_change; + sk->sk_write_space = ctx->tcp_write_space; + + inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops; +} + extern const struct inet_connection_sock_af_ops ipv4_specific; #if IS_ENABLED(CONFIG_MPTCP_IPV6) extern const struct inet_connection_sock_af_ops ipv6_specific; #endif +void mptcp_proto_init(void); + +struct mptcp_read_arg { + struct msghdr *msg; +}; + +int mptcp_read_actor(read_descriptor_t *desc, struct sk_buff *skb, + unsigned int offset, size_t len); + void mptcp_get_options(const 
struct sk_buff *skb, struct tcp_options_received *opt_rx); @@ -164,4 +219,11 @@ static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb) return (struct mptcp_ext *)skb_ext_find(skb, SKB_EXT_MPTCP); } +static inline bool before64(__u64 seq1, __u64 seq2) +{ + return (__s64)(seq1 - seq2) < 0; +} + +#define after64(seq2, seq1) before64(seq1, seq2) + #endif /* __MPTCP_PROTOCOL_H */ diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 89b91bc7a831..528351e26371 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -78,6 +78,7 @@ static void subflow_init_req(struct request_sock *req, subflow_req->mp_capable = 1; subflow_req->remote_key = rx_opt.mptcp.sndr_key; + subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; } } @@ -116,6 +117,11 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->remote_key); mptcp_finish_connect(sk); subflow->conn_finished = 1; + + if (skb) { + pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq); + subflow->ssn_offset = TCP_SKB_CB(skb)->seq; + } } } @@ -210,6 +216,323 @@ close_child: static struct inet_connection_sock_af_ops subflow_specific; +enum mapping_status { + MAPPING_OK, + MAPPING_INVALID, + MAPPING_EMPTY, + MAPPING_DATA_FIN +}; + +static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq) +{ + if ((u32)seq == (u32)old_seq) + return old_seq; + + /* Assume map covers data not mapped yet. */ + return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32)); +} + +static void warn_bad_map(struct mptcp_subflow_context *subflow, u32 ssn) +{ + WARN_ONCE(1, "Bad mapping: ssn=%d map_seq=%d map_data_len=%d", + ssn, subflow->map_subflow_seq, subflow->map_data_len); +} + +static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + unsigned int skb_consumed; + + skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq; + if (WARN_ON_ONCE(skb_consumed >= skb->len)) + return true; + + return skb->len - skb_consumed <= subflow->map_data_len - + mptcp_subflow_get_map_offset(subflow); +} + +static bool validate_mapping(struct sock *ssk, struct sk_buff *skb) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset; + + if (unlikely(before(ssn, subflow->map_subflow_seq))) { + /* Mapping covers data later in the subflow stream, + * currently unsupported. 
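
The two bail-out branches below are just the complement of a half-open interval check: a mapping covers relative subflow sequence numbers [map_subflow_seq, map_subflow_seq + map_data_len). Spelled out as a hypothetical helper (invented numbers: with map_subflow_seq = 1000 and map_data_len = 500, ssn = 900 trips the "data later in the stream" branch, ssn = 1500 the "past data" one):

	static bool ssn_in_mapping(const struct mptcp_subflow_context *subflow,
				   u32 ssn)
	{
		/* interval is half-open: [start, start + len) */
		return !before(ssn, subflow->map_subflow_seq) &&
		       before(ssn, subflow->map_subflow_seq +
				   subflow->map_data_len);
	}
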
+ */
+ warn_bad_map(subflow, ssn);
+ return false;
+ }
+ if (unlikely(!before(ssn, subflow->map_subflow_seq +
+ subflow->map_data_len))) {
+ /* Mapping covers past subflow data, invalid */
+ warn_bad_map(subflow, ssn + skb->len);
+ return false;
+ }
+ return true;
+}
+
+static enum mapping_status get_mapping_status(struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ struct mptcp_ext *mpext;
+ struct sk_buff *skb;
+ u16 data_len;
+ u64 map_seq;
+
+ skb = skb_peek(&ssk->sk_receive_queue);
+ if (!skb)
+ return MAPPING_EMPTY;
+
+ mpext = mptcp_get_ext(skb);
+ if (!mpext || !mpext->use_map) {
+ if (!subflow->map_valid && !skb->len) {
+ /* the TCP stack delivers 0-len FIN packets to the
+ * receive queue; those are the only 0-len packets ever
+ * expected here, and we can admit no mapping only for
+ * 0-len packets
+ */
+ if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
+ WARN_ONCE(1, "0len seq %d:%d flags %x",
+ TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->end_seq,
+ TCP_SKB_CB(skb)->tcp_flags);
+ sk_eat_skb(ssk, skb);
+ return MAPPING_EMPTY;
+ }
+
+ if (!subflow->map_valid)
+ return MAPPING_INVALID;
+
+ goto validate_seq;
+ }
+
+ pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d",
+ mpext->data_seq, mpext->dsn64, mpext->subflow_seq,
+ mpext->data_len, mpext->data_fin);
+
+ data_len = mpext->data_len;
+ if (data_len == 0) {
+ pr_err("Infinite mapping not handled");
+ return MAPPING_INVALID;
+ }
+
+ if (mpext->data_fin == 1) {
+ if (data_len == 1) {
+ pr_debug("DATA_FIN with no payload");
+ if (subflow->map_valid) {
+ /* A DATA_FIN might arrive in a DSS
+ * option before the previous mapping
+ * has been fully consumed. Continue
+ * handling the existing mapping.
+ */
+ skb_ext_del(skb, SKB_EXT_MPTCP);
+ return MAPPING_OK;
+ } else {
+ return MAPPING_DATA_FIN;
+ }
+ }
+
+ /* Adjust for DATA_FIN using 1 byte of sequence space */
+ data_len--;
+ }
+
+ if (!mpext->dsn64) {
+ map_seq = expand_seq(subflow->map_seq, subflow->map_data_len,
+ mpext->data_seq);
+ pr_debug("expanded seq=%llu", subflow->map_seq);
+ } else {
+ map_seq = mpext->data_seq;
+ }
+
+ if (subflow->map_valid) {
+ /* Allow replacing only with an identical map */
+ if (subflow->map_seq == map_seq &&
+ subflow->map_subflow_seq == mpext->subflow_seq &&
+ subflow->map_data_len == data_len) {
+ skb_ext_del(skb, SKB_EXT_MPTCP);
+ return MAPPING_OK;
+ }
+
+ /* If this skb's data is fully covered by the current mapping,
+ * the new map would need caching, which is not supported
+ */
+ if (skb_is_fully_mapped(ssk, skb))
+ return MAPPING_INVALID;
+
+ /* will validate the next map after consuming the current one */
+ return MAPPING_OK;
+ }
+
+ subflow->map_seq = map_seq;
+ subflow->map_subflow_seq = mpext->subflow_seq;
+ subflow->map_data_len = data_len;
+ subflow->map_valid = 1;
+ pr_debug("new map seq=%llu subflow_seq=%u data_len=%u",
+ subflow->map_seq, subflow->map_subflow_seq,
+ subflow->map_data_len);
+
+validate_seq:
+ /* we revalidate the valid mapping on each new skb, because we must
+ * ensure the current skb is completely covered by the available
+ * mapping
+ */
+ if (!validate_mapping(ssk, skb))
+ return MAPPING_INVALID;
+
+ skb_ext_del(skb, SKB_EXT_MPTCP);
+ return MAPPING_OK;
+}
+
+static bool subflow_check_data_avail(struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ enum mapping_status status;
+ struct mptcp_sock *msk;
+ struct sk_buff *skb;
+
+ pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk,
+ subflow->data_avail, skb_peek(&ssk->sk_receive_queue));
+ if (subflow->data_avail)
+ return true;
+
+ if (!subflow->conn)
+ return false;
+
+ msk = mptcp_sk(subflow->conn);
+ for (;;) {
+ u32 map_remaining;
+ size_t delta;
+ u64 ack_seq;
+ u64 old_ack;
+
+ status = get_mapping_status(ssk);
+ pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status);
+ if (status == MAPPING_INVALID) {
+ ssk->sk_err = EBADMSG;
+ goto fatal;
+ }
+
+ if (status != MAPPING_OK)
+ return false;
+
+ skb = skb_peek(&ssk->sk_receive_queue);
+ if (WARN_ON_ONCE(!skb))
+ return false;
+
+ old_ack = READ_ONCE(msk->ack_seq);
+ ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
+ pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
+ ack_seq);
+ if (ack_seq == old_ack)
+ break;
+
+ /* only accept in-sequence mappings. Old values are spurious
+ * retransmissions; we can hit "future" values on active backup
+ * subflow switch, we rely on retransmissions to get
+ * in-sequence data.
+ * Concurrent subflows support will require subflow data
+ * reordering
+ */
+ map_remaining = subflow->map_data_len -
+ mptcp_subflow_get_map_offset(subflow);
+ if (before64(ack_seq, old_ack))
+ delta = min_t(size_t, old_ack - ack_seq, map_remaining);
+ else
+ delta = min_t(size_t, ack_seq - old_ack, map_remaining);
+
+ /* discard mapped data */
+ pr_debug("discarding %zu bytes, current map len=%d", delta,
+ map_remaining);
+ if (delta) {
+ struct mptcp_read_arg arg = {
+ .msg = NULL,
+ };
+ read_descriptor_t desc = {
+ .count = delta,
+ .arg.data = &arg,
+ };
+ int ret;
+
+ ret = tcp_read_sock(ssk, &desc, mptcp_read_actor);
+ if (ret < 0) {
+ ssk->sk_err = -ret;
+ goto fatal;
+ }
+ if (ret < delta)
+ return false;
+ if (delta == map_remaining)
+ subflow->map_valid = 0;
+ }
+ }
+ return true;
+
+fatal:
+ /* fatal protocol error, close the socket */
+ /* This barrier is coupled with smp_rmb() in tcp_poll() */
+ smp_wmb();
+ ssk->sk_error_report(ssk);
+ tcp_set_state(ssk, TCP_CLOSE);
+ tcp_send_active_reset(ssk, GFP_ATOMIC);
+ return false;
+}
+
+bool mptcp_subflow_data_available(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct sk_buff *skb;
+
+ /* check if current mapping is still valid */
+ if (subflow->map_valid &&
+ mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
+ subflow->map_valid = 0;
+ subflow->data_avail = 0;
+
+ pr_debug("Done with mapping: seq=%u data_len=%u",
+ subflow->map_subflow_seq,
+ subflow->map_data_len);
+ }
+
+ if (!subflow_check_data_avail(sk)) {
+ subflow->data_avail = 0;
+ return false;
+ }
+
+ skb = skb_peek(&sk->sk_receive_queue);
+ subflow->data_avail = skb &&
+ before(tcp_sk(sk)->copied_seq, TCP_SKB_CB(skb)->end_seq);
+ return subflow->data_avail;
+}
+
+static void subflow_data_ready(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct sock *parent = subflow->conn;
+
+ if (!parent || !subflow->mp_capable) {
+ subflow->tcp_data_ready(sk);
+
+ if (parent)
+ parent->sk_data_ready(parent);
+ return;
+ }
+
+ if (mptcp_subflow_data_available(sk)) {
+ set_bit(MPTCP_DATA_READY, &mptcp_sk(parent)->flags);
+
+ parent->sk_data_ready(parent);
+ }
+}
+
+static void subflow_write_space(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct sock *parent = subflow->conn;
+
+ sk_stream_write_space(sk);
+ if (parent && sk_stream_is_writeable(sk)) {
+ sk_stream_write_space(parent);
+ }
+}
+
 static struct inet_connection_sock_af_ops *
 subflow_default_af_ops(struct sock *sk)
 {
@@ -296,6 +619,47 @@ static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
 return ctx;
 }
 
+static void __subflow_state_change(struct sock *sk)
+{
+ struct socket_wq *wq;
+
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (skwq_has_sleeper(wq))
+ wake_up_interruptible_all(&wq->wait);
+ rcu_read_unlock();
+}
+
+static bool subflow_is_done(const struct sock *sk)
+{
+ return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
+}
+
+static void subflow_state_change(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct sock *parent = READ_ONCE(subflow->conn);
+
+ __subflow_state_change(sk);
+
+ /* as recvmsg() does not acquire the subflow socket for ssk selection,
+ * a fin packet carrying a DSS can go unnoticed if we don't trigger
+ * the data available machinery here.
+ */
+ if (parent && subflow->mp_capable && mptcp_subflow_data_available(sk)) {
+ set_bit(MPTCP_DATA_READY, &mptcp_sk(parent)->flags);
+
+ parent->sk_data_ready(parent);
+ }
+
+ if (parent && !(parent->sk_shutdown & RCV_SHUTDOWN) &&
+ !subflow->rx_eof && subflow_is_done(sk)) {
+ subflow->rx_eof = 1;
+ parent->sk_shutdown |= RCV_SHUTDOWN;
+ __subflow_state_change(parent);
+ }
+}
+
 static int subflow_ulp_init(struct sock *sk)
 {
 struct inet_connection_sock *icsk = inet_csk(sk);
@@ -322,6 +686,12 @@ static int subflow_ulp_init(struct sock *sk)
 tp->is_mptcp = 1;
 ctx->icsk_af_ops = icsk->icsk_af_ops;
 icsk->icsk_af_ops = subflow_default_af_ops(sk);
+ ctx->tcp_data_ready = sk->sk_data_ready;
+ ctx->tcp_state_change = sk->sk_state_change;
+ ctx->tcp_write_space = sk->sk_write_space;
+ sk->sk_data_ready = subflow_data_ready;
+ sk->sk_write_space = subflow_write_space;
+ sk->sk_state_change = subflow_state_change;
 out:
 return err;
 }
@@ -339,10 +709,12 @@ static void subflow_ulp_release(struct sock *sk)
 kfree_rcu(ctx, rcu);
 }
 
-static void subflow_ulp_fallback(struct sock *sk)
+static void subflow_ulp_fallback(struct sock *sk,
+ struct mptcp_subflow_context *old_ctx)
 {
 struct inet_connection_sock *icsk = inet_csk(sk);
 
+ mptcp_subflow_tcp_fallback(sk, old_ctx);
 icsk->icsk_ulp_ops = NULL;
 rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
 tcp_sk(sk)->is_mptcp = 0;
@@ -357,23 +729,28 @@ static void subflow_ulp_clone(const struct request_sock *req,
 struct mptcp_subflow_context *new_ctx;
 
 if (!subflow_req->mp_capable) {
- subflow_ulp_fallback(newsk);
+ subflow_ulp_fallback(newsk, old_ctx);
 return;
 }
 
 new_ctx = subflow_create_ctx(newsk, priority);
 if (new_ctx == NULL) {
- subflow_ulp_fallback(newsk);
+ subflow_ulp_fallback(newsk, old_ctx);
 return;
 }
 
 new_ctx->conn_finished = 1;
 new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
+ new_ctx->tcp_data_ready = old_ctx->tcp_data_ready;
+ new_ctx->tcp_state_change = old_ctx->tcp_state_change;
+ new_ctx->tcp_write_space = old_ctx->tcp_write_space;
 new_ctx->mp_capable = 1;
 new_ctx->fourth_ack = 1;
 new_ctx->remote_key = subflow_req->remote_key;
 new_ctx->local_key = subflow_req->local_key;
 new_ctx->token = subflow_req->token;
+ new_ctx->ssn_offset = subflow_req->ssn_offset;
+ new_ctx->idsn = subflow_req->idsn;
 }
 
 static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
-- cgit v1.2.3-70-g09d2


From cc7972ea1932335e0a0ee00ac8a24b3e8304630d Mon Sep 17 00:00:00 2001
From: Christoph Paasch
Date: Tue, 21 Jan 2020 16:56:31 -0800
Subject: mptcp: parse and emit MP_CAPABLE option according to v1 spec

This implements MP_CAPABLE options parsing and writing according to RFC
6824 bis / RFC 8684: MPTCP v1.

Local key is sent on syn/ack, and both keys are sent on the 3rd ack.
MP_CAPABLE message lengths are updated accordingly.
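As an illustrative aside (an editorial sketch, not part of the patch):
the strict size check this change introduces picks an expected option
length from the TCP flags and the presence of payload. The helper name
below is ours; the constants mirror the TCPOLEN_MPTCP_MPC_* values
defined in net/mptcp/protocol.h:

#include <stdbool.h>

/* Sketch: expected v1 MP_CAPABLE option length per handshake packet */
static int mpc_expected_opsize(bool syn, bool ack, bool has_payload)
{
	if (!syn)				/* 3rd ack, or first data */
		return has_payload ? 22 : 20;	/* both keys (+ data_len) */
	if (ack)				/* syn/ack */
		return 12;			/* sender key only */
	return 4;				/* initial syn: no key in v1 */
}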
We need the skbuff to correctly emit the above, so we push the skbuff
struct as an argument all the way from tcp code to the relevant mptcp
callbacks.

When processing incoming MP_CAPABLE + data, build full-blown DSS-like
map info to simplify later processing. On child socket creation, we
need to record the remote key, if available.

Signed-off-by: Christoph Paasch
Signed-off-by: David S. Miller
---
 include/linux/tcp.h | 3 +-
 include/net/mptcp.h | 17 +++---
 net/ipv4/tcp_input.c | 2 +-
 net/ipv4/tcp_output.c | 2 +-
 net/mptcp/options.c | 162 ++++++++++++++++++++++++++++++++++++++++----------
 net/mptcp/protocol.h | 6 +-
 net/mptcp/subflow.c | 14 ++++-
 7 files changed, 160 insertions(+), 46 deletions(-)

(limited to 'include/linux/tcp.h')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 0d00dad4b85d..4e2124607d32 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -94,7 +94,8 @@ struct mptcp_options_received {
 data_fin:1,
 use_ack:1,
 ack64:1,
- __unused:3;
+ mpc_map:1,
+ __unused:2;
 };
 #endif
 
diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index 8619c1fca741..27627e2d1bc2 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -23,7 +23,8 @@ struct mptcp_ext {
 data_fin:1,
 use_ack:1,
 ack64:1,
- __unused:3;
+ mpc_map:1,
+ __unused:2;
 /* one byte hole */
 };
 
@@ -50,10 +51,10 @@ static inline bool rsk_is_mptcp(const struct request_sock *req)
 return tcp_rsk(req)->is_mptcp;
 }
 
-void mptcp_parse_option(const unsigned char *ptr, int opsize,
- struct tcp_options_received *opt_rx);
-bool mptcp_syn_options(struct sock *sk, unsigned int *size,
- struct mptcp_out_options *opts);
+void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr,
+ int opsize, struct tcp_options_received *opt_rx);
+bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
+ unsigned int *size, struct mptcp_out_options *opts);
 void mptcp_rcv_synsent(struct sock *sk);
 bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
 struct mptcp_out_options *opts);
@@ -121,12 +122,14 @@ static inline bool rsk_is_mptcp(const struct request_sock *req)
 return false;
 }
 
-static inline void mptcp_parse_option(const unsigned char *ptr, int opsize,
+static inline void mptcp_parse_option(const struct sk_buff *skb,
+ const unsigned char *ptr, int opsize,
 struct tcp_options_received *opt_rx)
 {
 }
 
-static inline bool mptcp_syn_options(struct sock *sk, unsigned int *size,
+static inline bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
+ unsigned int *size,
 struct mptcp_out_options *opts)
 {
 return false;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 28d31f2c1422..2f475b897c11 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3926,7 +3926,7 @@ void tcp_parse_options(const struct net *net,
 break;
 #endif
 case TCPOPT_MPTCP:
- mptcp_parse_option(ptr, opsize, opt_rx);
+ mptcp_parse_option(skb, ptr, opsize, opt_rx);
 break;
 
 case TCPOPT_FASTOPEN:
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 5456076166da..fec4b3a4b22d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -685,7 +685,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 if (sk_is_mptcp(sk)) {
 unsigned int size;
 
- if (mptcp_syn_options(sk, &size, &opts->mptcp)) {
+ if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) {
 opts->options |= OPTION_MPTCP;
 remaining -= size;
 }
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 1aec742ca8e1..8f82ff9a5a8e 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ 
-14,8 +14,8 @@ static bool mptcp_cap_flag_sha256(u8 flags) return (flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA256; } -void mptcp_parse_option(const unsigned char *ptr, int opsize, - struct tcp_options_received *opt_rx) +void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, + int opsize, struct tcp_options_received *opt_rx) { struct mptcp_options_received *mp_opt = &opt_rx->mptcp; u8 subtype = *ptr >> 4; @@ -25,13 +25,29 @@ void mptcp_parse_option(const unsigned char *ptr, int opsize, switch (subtype) { case MPTCPOPT_MP_CAPABLE: - if (opsize != TCPOLEN_MPTCP_MPC_SYN && - opsize != TCPOLEN_MPTCP_MPC_ACK) + /* strict size checking */ + if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { + if (skb->len > tcp_hdr(skb)->doff << 2) + expected_opsize = TCPOLEN_MPTCP_MPC_ACK_DATA; + else + expected_opsize = TCPOLEN_MPTCP_MPC_ACK; + } else { + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) + expected_opsize = TCPOLEN_MPTCP_MPC_SYNACK; + else + expected_opsize = TCPOLEN_MPTCP_MPC_SYN; + } + if (opsize != expected_opsize) break; + /* try to be gentle vs future versions on the initial syn */ version = *ptr++ & MPTCP_VERSION_MASK; - if (version != MPTCP_SUPPORTED_VERSION) + if (opsize != TCPOLEN_MPTCP_MPC_SYN) { + if (version != MPTCP_SUPPORTED_VERSION) + break; + } else if (version < MPTCP_SUPPORTED_VERSION) { break; + } flags = *ptr++; if (!mptcp_cap_flag_sha256(flags) || @@ -55,23 +71,40 @@ void mptcp_parse_option(const unsigned char *ptr, int opsize, break; mp_opt->mp_capable = 1; - mp_opt->sndr_key = get_unaligned_be64(ptr); - ptr += 8; - - if (opsize == TCPOLEN_MPTCP_MPC_ACK) { + if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) { + mp_opt->sndr_key = get_unaligned_be64(ptr); + ptr += 8; + } + if (opsize >= TCPOLEN_MPTCP_MPC_ACK) { mp_opt->rcvr_key = get_unaligned_be64(ptr); ptr += 8; - pr_debug("MP_CAPABLE sndr=%llu, rcvr=%llu", - mp_opt->sndr_key, mp_opt->rcvr_key); - } else { - pr_debug("MP_CAPABLE sndr=%llu", mp_opt->sndr_key); } + if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA) { + /* Section 3.1.: + * "the data parameters in a MP_CAPABLE are semantically + * equivalent to those in a DSS option and can be used + * interchangeably." 
+ */
+ mp_opt->dss = 1;
+ mp_opt->use_map = 1;
+ mp_opt->mpc_map = 1;
+ mp_opt->data_len = get_unaligned_be16(ptr);
+ ptr += 2;
+ }
+ pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d",
+ version, flags, opsize, mp_opt->sndr_key,
+ mp_opt->rcvr_key, mp_opt->data_len);
 break;
 
 case MPTCPOPT_DSS:
 pr_debug("DSS");
 ptr++;
 
+ /* we must clear 'mpc_map' to be able to detect MP_CAPABLE
+ * map vs DSS map in mptcp_incoming_options(), and reconstruct
+ * map info accordingly
+ */
+ mp_opt->mpc_map = 0;
 flags = (*ptr++) & MPTCP_DSS_FLAG_MASK;
 mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0;
 mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0;
@@ -176,18 +209,22 @@ void mptcp_get_options(const struct sk_buff *skb,
 if (opsize > length)
 return; /* don't parse partial options */
 if (opcode == TCPOPT_MPTCP)
- mptcp_parse_option(ptr, opsize, opt_rx);
+ mptcp_parse_option(skb, ptr, opsize, opt_rx);
 ptr += opsize - 2;
 length -= opsize;
 }
 }
 }
 
-bool mptcp_syn_options(struct sock *sk, unsigned int *size,
- struct mptcp_out_options *opts)
+bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
+ unsigned int *size, struct mptcp_out_options *opts)
 {
 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
 
+ /* we will use snd_isn to detect first pkt [re]transmission
+ * in mptcp_established_options_mp()
+ */
+ subflow->snd_isn = TCP_SKB_CB(skb)->end_seq;
 if (subflow->request_mptcp) {
 pr_debug("local_key=%llu", subflow->local_key);
 opts->suboptions = OPTION_MPTCP_MPC_SYN;
@@ -212,20 +249,52 @@ void mptcp_rcv_synsent(struct sock *sk)
 }
 }
 
-static bool mptcp_established_options_mp(struct sock *sk, unsigned int *size,
+static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
+ unsigned int *size,
 unsigned int remaining,
 struct mptcp_out_options *opts)
 {
 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_ext *mpext;
+ unsigned int data_len;
+
+ pr_debug("subflow=%p fourth_ack=%d seq=%x:%x remaining=%d", subflow,
+ subflow->fourth_ack, subflow->snd_isn,
+ skb ? TCP_SKB_CB(skb)->seq : 0, remaining);
+
+ if (subflow->mp_capable && !subflow->fourth_ack && skb &&
+ subflow->snd_isn == TCP_SKB_CB(skb)->seq) {
+ /* When skb is not available, we better over-estimate the
+ * emitted options len. A full DSS option is longer than
+ * TCPOLEN_MPTCP_MPC_ACK_DATA, so let the caller try to fit
+ * that.
+ */
+ mpext = mptcp_get_ext(skb);
+ data_len = mpext ? mpext->data_len : 0;
 
- if (!subflow->fourth_ack) {
+ /* we will check ext_copy.data_len in mptcp_write_options() to
+ * discriminate between TCPOLEN_MPTCP_MPC_ACK_DATA and
+ * TCPOLEN_MPTCP_MPC_ACK
+ */
+ opts->ext_copy.data_len = data_len;
 opts->suboptions = OPTION_MPTCP_MPC_ACK;
 opts->sndr_key = subflow->local_key;
 opts->rcvr_key = subflow->remote_key;
- *size = TCPOLEN_MPTCP_MPC_ACK;
- subflow->fourth_ack = 1;
- pr_debug("subflow=%p, local_key=%llu, remote_key=%llu",
- subflow, subflow->local_key, subflow->remote_key);
+
+ /* Section 3.1.
+ * The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK
+ * packets that start the first subflow of an MPTCP connection,
+ * as well as the first packet that carries data
+ */
+ if (data_len > 0)
+ *size = ALIGN(TCPOLEN_MPTCP_MPC_ACK_DATA, 4);
+ else
+ *size = TCPOLEN_MPTCP_MPC_ACK;
+
+ pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d",
+ subflow, subflow->local_key, subflow->remote_key,
+ data_len);
+
 return true;
 }
 
 return false;
@@ -319,7 +388,7 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
 unsigned int opt_size = 0;
 bool ret = false;
 
- if (mptcp_established_options_mp(sk, &opt_size, remaining, opts))
+ if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts))
 ret = true;
 else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining,
 opts))
@@ -371,11 +440,26 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
 memset(mpext, 0, sizeof(*mpext));
 
 if (mp_opt->use_map) {
- mpext->data_seq = mp_opt->data_seq;
- mpext->subflow_seq = mp_opt->subflow_seq;
+ if (mp_opt->mpc_map) {
+ struct mptcp_subflow_context *subflow =
+ mptcp_subflow_ctx(sk);
+
+ /* this is an MP_CAPABLE carrying MPTCP data;
+ * we know this maps the first chunk of data
+ */
+ mptcp_crypto_key_sha(subflow->remote_key, NULL,
+ &mpext->data_seq);
+ mpext->data_seq++;
+ mpext->subflow_seq = 1;
+ mpext->dsn64 = 1;
+ mpext->mpc_map = 1;
+ } else {
+ mpext->data_seq = mp_opt->data_seq;
+ mpext->subflow_seq = mp_opt->subflow_seq;
+ mpext->dsn64 = mp_opt->dsn64;
+ }
 mpext->data_len = mp_opt->data_len;
 mpext->use_map = 1;
- mpext->dsn64 = mp_opt->dsn64;
 }
 
 if (mp_opt->use_ack) {
@@ -389,8 +473,7 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
 
 void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
 {
- if ((OPTION_MPTCP_MPC_SYN |
- OPTION_MPTCP_MPC_SYNACK |
+ if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK |
 OPTION_MPTCP_MPC_ACK) & opts->suboptions) {
 u8 len;
 
@@ -398,6 +481,8 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
 len = TCPOLEN_MPTCP_MPC_SYN;
 else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions)
 len = TCPOLEN_MPTCP_MPC_SYNACK;
+ else if (opts->ext_copy.data_len)
+ len = TCPOLEN_MPTCP_MPC_ACK_DATA;
 else
 len = TCPOLEN_MPTCP_MPC_ACK;
 
@@ -405,14 +490,27 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
 (MPTCPOPT_MP_CAPABLE << 12) |
 (MPTCP_SUPPORTED_VERSION << 8) |
 MPTCP_CAP_HMAC_SHA256);
+
+ if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) &
+ opts->suboptions))
+ goto mp_capable_done;
+
 put_unaligned_be64(opts->sndr_key, ptr);
 ptr += 2;
- if (OPTION_MPTCP_MPC_ACK & opts->suboptions) {
- put_unaligned_be64(opts->rcvr_key, ptr);
- ptr += 2;
- }
+ if (!((OPTION_MPTCP_MPC_ACK) & opts->suboptions))
+ goto mp_capable_done;
+
+ put_unaligned_be64(opts->rcvr_key, ptr);
+ ptr += 2;
+ if (!opts->ext_copy.data_len)
+ goto mp_capable_done;
+
+ put_unaligned_be32(opts->ext_copy.data_len << 16 |
+ TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
+ ptr += 1;
 }
 
+mp_capable_done:
 if (opts->ext_copy.use_ack || opts->ext_copy.use_map) {
 struct mptcp_ext *mpext = &opts->ext_copy;
 u8 len = TCPOLEN_MPTCP_DSS_BASE;
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index a355bb1cf31b..36b90024d34d 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -11,7 +11,7 @@
 #include 
 #include 
 
-#define MPTCP_SUPPORTED_VERSION 0
+#define MPTCP_SUPPORTED_VERSION 1
 
 /* MPTCP option bits */
 #define OPTION_MPTCP_MPC_SYN BIT(0)
@@ -29,9 +29,10 @@
 #define MPTCPOPT_MP_FASTCLOSE 7
 
 /* MPTCP suboption lengths */
-#define TCPOLEN_MPTCP_MPC_SYN 12
+#define TCPOLEN_MPTCP_MPC_SYN 4
 #define TCPOLEN_MPTCP_MPC_SYNACK 12
 #define TCPOLEN_MPTCP_MPC_ACK 20
+#define TCPOLEN_MPTCP_MPC_ACK_DATA 22
 #define TCPOLEN_MPTCP_DSS_BASE 4
 #define TCPOLEN_MPTCP_DSS_ACK32 4
 #define TCPOLEN_MPTCP_DSS_ACK64 8
@@ -106,6 +107,7 @@ struct mptcp_subflow_context {
 u64 remote_key;
 u64 idsn;
 u64 map_seq;
+ u32 snd_isn;
 u32 token;
 u32 rel_write_seq;
 u32 map_subflow_seq;
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 9fb3eb87a20f..8892855f4f52 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -77,7 +77,6 @@ static void subflow_init_req(struct request_sock *req,
 
 if (err == 0)
 subflow_req->mp_capable = 1;
 
- subflow_req->remote_key = rx_opt.mptcp.sndr_key;
 subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
 }
 }
@@ -180,11 +179,22 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
 bool *own_req)
 {
 struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
+ struct mptcp_subflow_request_sock *subflow_req;
+ struct tcp_options_received opt_rx;
 struct sock *child;
 
 pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);
 
- /* if the sk is MP_CAPABLE, we already received the client key */
+ /* if the sk is MP_CAPABLE, we need to fetch the client key */
+ subflow_req = mptcp_subflow_rsk(req);
+ if (subflow_req->mp_capable) {
+ opt_rx.mptcp.mp_capable = 0;
+ mptcp_get_options(skb, &opt_rx);
+ if (!opt_rx.mptcp.mp_capable)
+ subflow_req->mp_capable = 0;
+ else
+ subflow_req->remote_key = opt_rx.mptcp.sndr_key;
+ }
 child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
 req_unhash, own_req);
-- cgit v1.2.3-70-g09d2


From 32efcc06d2a15fa87585614d12d6c2308cc2d3f3 Mon Sep 17 00:00:00 2001
From: Abdul Kabbani
Date: Fri, 24 Jan 2020 16:34:02 -0500
Subject: tcp: export count for rehash attempts

Using the IPv6 flow-label to swiftly route around congested or
disconnected network paths can greatly improve TCP reliability.

This patch adds SNMP counters and an OPT_STATS counter to track both
host-level and connection-level statistics. Network administrators can
use these counters to better evaluate the impact of this new ability.

Export the count of rehash attempts to
1) two SNMP counters: TcpTimeoutRehash (rehash due to timeouts),
and TcpDuplicateDataRehash (rehash due to receiving duplicate packets)
2) Timestamping API SOF_TIMESTAMPING_OPT_STATS.

Signed-off-by: Abdul Kabbani
Signed-off-by: Neal Cardwell
Signed-off-by: Yuchung Cheng
Signed-off-by: Kevin(Yudong) Yang
Signed-off-by: Eric Dumazet
Signed-off-by: David S. 
Miller --- include/linux/tcp.h | 2 ++ include/uapi/linux/snmp.h | 2 ++ include/uapi/linux/tcp.h | 1 + net/ipv4/proc.c | 2 ++ net/ipv4/tcp.c | 2 ++ net/ipv4/tcp_input.c | 4 +++- net/ipv4/tcp_timer.c | 6 ++++++ 7 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include/linux/tcp.h') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 4e2124607d32..1cf73e6f85ca 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -386,6 +386,8 @@ struct tcp_sock { #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0 #endif + u16 timeout_rehash; /* Timeout-triggered rehash attempts */ + u32 rcv_ooopack; /* Received out-of-order packets, for tcpinfo */ /* Receiver side RTT estimation */ diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 7eee233e78d2..7d91f4debc48 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -285,6 +285,8 @@ enum LINUX_MIB_TCPRCVQDROP, /* TCPRcvQDrop */ LINUX_MIB_TCPWQUEUETOOBIG, /* TCPWqueueTooBig */ LINUX_MIB_TCPFASTOPENPASSIVEALTKEY, /* TCPFastOpenPassiveAltKey */ + LINUX_MIB_TCPTIMEOUTREHASH, /* TCPTimeoutRehash */ + LINUX_MIB_TCPDUPLICATEDATAREHASH, /* TCPDuplicateDataRehash */ __LINUX_MIB_MAX }; diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index d87184e673ca..fd9eb8f6bcae 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -311,6 +311,7 @@ enum { TCP_NLA_DSACK_DUPS, /* DSACK blocks received */ TCP_NLA_REORD_SEEN, /* reordering events seen */ TCP_NLA_SRTT, /* smoothed RTT in usecs */ + TCP_NLA_TIMEOUT_REHASH, /* Timeout-triggered rehash attempts */ }; /* for TCP_MD5SIG socket option */ diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index cc90243ccf76..2580303249e2 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -289,6 +289,8 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP), SNMP_MIB_ITEM("TCPWqueueTooBig", LINUX_MIB_TCPWQUEUETOOBIG), SNMP_MIB_ITEM("TCPFastOpenPassiveAltKey", LINUX_MIB_TCPFASTOPENPASSIVEALTKEY), + SNMP_MIB_ITEM("TcpTimeoutRehash", LINUX_MIB_TCPTIMEOUTREHASH), + SNMP_MIB_ITEM("TcpDuplicateDataRehash", LINUX_MIB_TCPDUPLICATEDATAREHASH), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2857c853e2fd..484485ae74c2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3337,6 +3337,7 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */ nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */ nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */ + nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */ 0; } @@ -3391,6 +3392,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups); nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen); nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3); + nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash); return stats; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4915de65d278..e8b840a4767e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4271,8 +4271,10 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) * The receiver remembers and reflects via DSACKs. Leverage the * DSACK state and change the txhash to re-route speculatively. 
*/ - if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq) + if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq) { sk_rethink_txhash(sk); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH); + } } static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 1097b438befe..c3f26dcd6704 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -223,6 +223,9 @@ static int tcp_write_timeout(struct sock *sk) dst_negative_advice(sk); } else { sk_rethink_txhash(sk); + tp->timeout_rehash++; + __NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPTIMEOUTREHASH); } retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; expired = icsk->icsk_retransmits >= retry_until; @@ -234,6 +237,9 @@ static int tcp_write_timeout(struct sock *sk) dst_negative_advice(sk); } else { sk_rethink_txhash(sk); + tp->timeout_rehash++; + __NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPTIMEOUTREHASH); } retry_until = net->ipv4.sysctl_tcp_retries2; -- cgit v1.2.3-70-g09d2
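
Editorial note: the before64()/after64() helpers added to
net/mptcp/protocol.h in the first patch above compare 64-bit sequence
numbers in modular (serial-number) arithmetic, so they remain correct
across wrap-around. A minimal stand-alone demo of the same expression:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool before64(uint64_t seq1, uint64_t seq2)
{
	return (int64_t)(seq1 - seq2) < 0;
}

#define after64(seq2, seq1)	before64(seq1, seq2)

int main(void)
{
	assert(before64(1, 2));
	assert(before64(UINT64_MAX, 0));	/* wrap: max is "before" 0 */
	assert(after64(1, UINT64_MAX));
	return 0;
}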
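
Similarly, expand_seq() from net/mptcp/subflow.c widens a 32-bit DSS
sequence number to 64 bits by assuming the new map covers data just
past the previous one. A stand-alone sketch of the same logic, with
GENMASK_ULL reimplemented here for user space:

#include <stdint.h>
#include <stdio.h>

#define GENMASK_ULL(h, l) ((~0ULL << (l)) & (~0ULL >> (63 - (h))))

static uint64_t expand_seq(uint64_t old_seq, uint16_t old_data_len,
			   uint64_t seq)
{
	if ((uint32_t)seq == (uint32_t)old_seq)
		return old_seq;

	/* Assume map covers data not mapped yet. */
	return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32));
}

int main(void)
{
	/* old map ends near a 32-bit boundary; the small 32-bit seq 0x80
	 * lands in the next 4GB window: prints 100000080
	 */
	printf("%llx\n",
	       (unsigned long long)expand_seq(0xffffff00ULL, 512, 0x80));
	return 0;
}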