summaryrefslogtreecommitdiff
path: root/net/mptcp
diff options
context:
space:
mode:
Diffstat (limited to 'net/mptcp')
-rw-r--r--net/mptcp/mib.c9
-rw-r--r--net/mptcp/mib.h9
-rw-r--r--net/mptcp/options.c120
-rw-r--r--net/mptcp/pm.c94
-rw-r--r--net/mptcp/pm_netlink.c325
-rw-r--r--net/mptcp/protocol.c570
-rw-r--r--net/mptcp/protocol.h71
-rw-r--r--net/mptcp/subflow.c119
8 files changed, 1047 insertions, 270 deletions
diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c
index 0a6a15f3456d..84d119436b22 100644
--- a/net/mptcp/mib.c
+++ b/net/mptcp/mib.c
@@ -22,6 +22,15 @@ static const struct snmp_mib mptcp_snmp_list[] = {
SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC),
SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH),
SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX),
+ SNMP_MIB_ITEM("OFOQueueTail", MPTCP_MIB_OFOQUEUETAIL),
+ SNMP_MIB_ITEM("OFOQueue", MPTCP_MIB_OFOQUEUE),
+ SNMP_MIB_ITEM("OFOMerge", MPTCP_MIB_OFOMERGE),
+ SNMP_MIB_ITEM("NoDSSInWindow", MPTCP_MIB_NODSSWINDOW),
+ SNMP_MIB_ITEM("DuplicateData", MPTCP_MIB_DUPDATA),
+ SNMP_MIB_ITEM("AddAddr", MPTCP_MIB_ADDADDR),
+ SNMP_MIB_ITEM("EchoAdd", MPTCP_MIB_ECHOADD),
+ SNMP_MIB_ITEM("RmAddr", MPTCP_MIB_RMADDR),
+ SNMP_MIB_ITEM("RmSubflow", MPTCP_MIB_RMSUBFLOW),
SNMP_MIB_SENTINEL
};
diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h
index d7de340fc997..47bcecce1106 100644
--- a/net/mptcp/mib.h
+++ b/net/mptcp/mib.h
@@ -15,6 +15,15 @@ enum linux_mptcp_mib_field {
MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */
MPTCP_MIB_DSSNOMATCH, /* Received a new mapping that did not match the previous one */
MPTCP_MIB_INFINITEMAPRX, /* Received an infinite mapping */
+ MPTCP_MIB_OFOQUEUETAIL, /* Segments inserted into OoO queue tail */
+ MPTCP_MIB_OFOQUEUE, /* Segments inserted into OoO queue */
+ MPTCP_MIB_OFOMERGE, /* Segments merged in OoO queue */
+ MPTCP_MIB_NODSSWINDOW, /* Segments not in MPTCP windows */
+ MPTCP_MIB_DUPDATA, /* Segments discarded due to duplicate DSS */
+ MPTCP_MIB_ADDADDR, /* Received ADD_ADDR with echo-flag=0 */
+ MPTCP_MIB_ECHOADD, /* Received ADD_ADDR with echo-flag=1 */
+ MPTCP_MIB_RMADDR, /* Received RM_ADDR */
+ MPTCP_MIB_RMSUBFLOW, /* Remove a subflow */
__MPTCP_MIB_MAX
};
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 888bbbbb3e8a..092a2d48bfd3 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -11,6 +11,7 @@
#include <net/tcp.h>
#include <net/mptcp.h>
#include "protocol.h"
+#include "mib.h"
static bool mptcp_cap_flag_sha256(u8 flags)
{
@@ -242,7 +243,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
mp_opt->add_addr = 1;
mp_opt->port = 0;
mp_opt->addr_id = *ptr++;
- pr_debug("ADD_ADDR: id=%d", mp_opt->addr_id);
+ pr_debug("ADD_ADDR: id=%d, echo=%d", mp_opt->addr_id, mp_opt->echo);
if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) {
memcpy((u8 *)&mp_opt->addr.s_addr, (u8 *)ptr, 4);
ptr += 4;
@@ -516,7 +517,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
return ret;
}
- if (subflow->use_64bit_ack) {
+ if (READ_ONCE(msk->use_64bit_ack)) {
ack_size = TCPOLEN_MPTCP_DSS_ACK64;
opts->ext_copy.data_ack = READ_ONCE(msk->ack_seq);
opts->ext_copy.ack64 = 1;
@@ -571,21 +572,22 @@ static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id,
}
#endif
-static bool mptcp_established_options_addr(struct sock *sk,
- unsigned int *size,
- unsigned int remaining,
- struct mptcp_out_options *opts)
+static bool mptcp_established_options_add_addr(struct sock *sk,
+ unsigned int *size,
+ unsigned int remaining,
+ struct mptcp_out_options *opts)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
struct mptcp_addr_info saddr;
+ bool echo;
int len;
- if (!mptcp_pm_should_signal(msk) ||
- !(mptcp_pm_addr_signal(msk, remaining, &saddr)))
+ if (!mptcp_pm_should_add_signal(msk) ||
+ !(mptcp_pm_add_addr_signal(msk, remaining, &saddr, &echo)))
return false;
- len = mptcp_add_addr_len(saddr.family);
+ len = mptcp_add_addr_len(saddr.family, echo);
if (remaining < len)
return false;
@@ -594,22 +596,51 @@ static bool mptcp_established_options_addr(struct sock *sk,
if (saddr.family == AF_INET) {
opts->suboptions |= OPTION_MPTCP_ADD_ADDR;
opts->addr = saddr.addr;
- opts->ahmac = add_addr_generate_hmac(msk->local_key,
- msk->remote_key,
- opts->addr_id,
- &opts->addr);
+ if (!echo) {
+ opts->ahmac = add_addr_generate_hmac(msk->local_key,
+ msk->remote_key,
+ opts->addr_id,
+ &opts->addr);
+ }
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
else if (saddr.family == AF_INET6) {
opts->suboptions |= OPTION_MPTCP_ADD_ADDR6;
opts->addr6 = saddr.addr6;
- opts->ahmac = add_addr6_generate_hmac(msk->local_key,
- msk->remote_key,
- opts->addr_id,
- &opts->addr6);
+ if (!echo) {
+ opts->ahmac = add_addr6_generate_hmac(msk->local_key,
+ msk->remote_key,
+ opts->addr_id,
+ &opts->addr6);
+ }
}
#endif
- pr_debug("addr_id=%d, ahmac=%llu", opts->addr_id, opts->ahmac);
+ pr_debug("addr_id=%d, ahmac=%llu, echo=%d", opts->addr_id, opts->ahmac, echo);
+
+ return true;
+}
+
+static bool mptcp_established_options_rm_addr(struct sock *sk,
+ unsigned int *size,
+ unsigned int remaining,
+ struct mptcp_out_options *opts)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+ u8 rm_id;
+
+ if (!mptcp_pm_should_rm_signal(msk) ||
+ !(mptcp_pm_rm_addr_signal(msk, remaining, &rm_id)))
+ return false;
+
+ if (remaining < TCPOLEN_MPTCP_RM_ADDR_BASE)
+ return false;
+
+ *size = TCPOLEN_MPTCP_RM_ADDR_BASE;
+ opts->suboptions |= OPTION_MPTCP_RM_ADDR;
+ opts->rm_id = rm_id;
+
+ pr_debug("rm_id=%d", opts->rm_id);
return true;
}
@@ -626,6 +657,12 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
if (unlikely(mptcp_check_fallback(sk)))
return false;
+ /* prevent adding of any MPTCP related options on reset packet
+ * until we support MP_TCPRST/MP_FASTCLOSE
+ */
+ if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST))
+ return false;
+
if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts))
ret = true;
else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining,
@@ -640,7 +677,11 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
*size += opt_size;
remaining -= opt_size;
- if (mptcp_established_options_addr(sk, &opt_size, remaining, opts)) {
+ if (mptcp_established_options_add_addr(sk, &opt_size, remaining, opts)) {
+ *size += opt_size;
+ remaining -= opt_size;
+ ret = true;
+ } else if (mptcp_established_options_rm_addr(sk, &opt_size, remaining, opts)) {
*size += opt_size;
remaining -= opt_size;
ret = true;
@@ -676,7 +717,7 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
return false;
}
-static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk,
+static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk,
struct mptcp_subflow_context *subflow,
struct sk_buff *skb,
struct mptcp_options_received *mp_opt)
@@ -693,15 +734,20 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk,
TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq &&
subflow->mp_join && mp_opt->mp_join &&
READ_ONCE(msk->pm.server_side))
- tcp_send_ack(sk);
+ tcp_send_ack(ssk);
goto fully_established;
}
- /* we should process OoO packets before the first subflow is fully
- * established, but not expected for MP_JOIN subflows
+ /* we must process OoO packets before the first subflow is fully
+ * established. OoO packets are instead a protocol violation
+ * for MP_JOIN subflows as the peer must not send any data
+ * before receiving the forth ack - cfr. RFC 8684 section 3.2.
*/
- if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1)
+ if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1) {
+ if (subflow->mp_join)
+ goto reset;
return subflow->mp_capable;
+ }
if (mp_opt->dss && mp_opt->use_ack) {
/* subflows are fully established as soon as we get any
@@ -713,9 +759,12 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk,
}
/* If the first established packet does not contain MP_CAPABLE + data
- * then fallback to TCP
+ * then fallback to TCP. Fallback scenarios requires a reset for
+ * MP_JOIN subflows.
*/
if (!mp_opt->mp_capable) {
+ if (subflow->mp_join)
+ goto reset;
subflow->mp_capable = 0;
pr_fallback(msk);
__mptcp_do_fallback(msk);
@@ -732,12 +781,16 @@ fully_established:
subflow->pm_notified = 1;
if (subflow->mp_join) {
- clear_3rdack_retransmission(sk);
+ clear_3rdack_retransmission(ssk);
mptcp_pm_subflow_established(msk, subflow);
} else {
mptcp_pm_fully_established(msk);
}
return true;
+
+reset:
+ mptcp_subflow_reset(ssk);
+ return false;
}
static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit)
@@ -825,8 +878,7 @@ static bool add_addr_hmac_valid(struct mptcp_sock *msk,
return hmac == mp_opt->ahmac;
}
-void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
- struct tcp_options_received *opt_rx)
+void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
@@ -855,11 +907,21 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
addr.addr6 = mp_opt.addr6;
}
#endif
- if (!mp_opt.echo)
+ if (!mp_opt.echo) {
mptcp_pm_add_addr_received(msk, &addr);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDR);
+ } else {
+ mptcp_pm_del_add_timer(msk, &addr);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADD);
+ }
mp_opt.add_addr = 0;
}
+ if (mp_opt.rm_addr) {
+ mptcp_pm_rm_addr_received(msk, mp_opt.rm_id);
+ mp_opt.rm_addr = 0;
+ }
+
if (!mp_opt.dss)
return;
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index a8ad20559aaa..e19e1525ecbb 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -13,23 +13,34 @@
/* path manager command handlers */
int mptcp_pm_announce_addr(struct mptcp_sock *msk,
- const struct mptcp_addr_info *addr)
+ const struct mptcp_addr_info *addr,
+ bool echo)
{
pr_debug("msk=%p, local_id=%d", msk, addr->id);
msk->pm.local = *addr;
- WRITE_ONCE(msk->pm.addr_signal, true);
+ WRITE_ONCE(msk->pm.add_addr_echo, echo);
+ WRITE_ONCE(msk->pm.add_addr_signal, true);
return 0;
}
int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id)
{
- return -ENOTSUPP;
+ pr_debug("msk=%p, local_id=%d", msk, local_id);
+
+ msk->pm.rm_id = local_id;
+ WRITE_ONCE(msk->pm.rm_addr_signal, true);
+ return 0;
}
-int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 remote_id)
+int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 local_id)
{
- return -ENOTSUPP;
+ pr_debug("msk=%p, local_id=%d", msk, local_id);
+
+ spin_lock_bh(&msk->pm.lock);
+ mptcp_pm_nl_rm_subflow_received(msk, local_id);
+ spin_unlock_bh(&msk->pm.lock);
+ return 0;
}
/* path manager event handlers */
@@ -46,7 +57,7 @@ void mptcp_pm_new_connection(struct mptcp_sock *msk, int server_side)
bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk)
{
struct mptcp_pm_data *pm = &msk->pm;
- int ret;
+ int ret = 0;
pr_debug("msk=%p subflows=%d max=%d allow=%d", msk, pm->subflows,
pm->subflows_max, READ_ONCE(pm->accept_subflow));
@@ -56,9 +67,11 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk)
return false;
spin_lock_bh(&pm->lock);
- ret = pm->subflows < pm->subflows_max;
- if (ret && ++pm->subflows == pm->subflows_max)
- WRITE_ONCE(pm->accept_subflow, false);
+ if (READ_ONCE(pm->accept_subflow)) {
+ ret = pm->subflows < pm->subflows_max;
+ if (ret && ++pm->subflows == pm->subflows_max)
+ WRITE_ONCE(pm->accept_subflow, false);
+ }
spin_unlock_bh(&pm->lock);
return ret;
@@ -135,38 +148,71 @@ void mptcp_pm_add_addr_received(struct mptcp_sock *msk,
pr_debug("msk=%p remote_id=%d accept=%d", msk, addr->id,
READ_ONCE(pm->accept_addr));
- /* avoid acquiring the lock if there is no room for fouther addresses */
- if (!READ_ONCE(pm->accept_addr))
- return;
-
spin_lock_bh(&pm->lock);
- /* be sure there is something to signal re-checking under PM lock */
- if (READ_ONCE(pm->accept_addr) &&
- mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED))
+ if (!READ_ONCE(pm->accept_addr))
+ mptcp_pm_announce_addr(msk, addr, true);
+ else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED))
pm->remote = *addr;
spin_unlock_bh(&pm->lock);
}
+void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, u8 rm_id)
+{
+ struct mptcp_pm_data *pm = &msk->pm;
+
+ pr_debug("msk=%p remote_id=%d", msk, rm_id);
+
+ spin_lock_bh(&pm->lock);
+ mptcp_pm_schedule_work(msk, MPTCP_PM_RM_ADDR_RECEIVED);
+ pm->rm_id = rm_id;
+ spin_unlock_bh(&pm->lock);
+}
+
/* path manager helpers */
-bool mptcp_pm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
- struct mptcp_addr_info *saddr)
+bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
+ struct mptcp_addr_info *saddr, bool *echo)
{
int ret = false;
spin_lock_bh(&msk->pm.lock);
/* double check after the lock is acquired */
- if (!mptcp_pm_should_signal(msk))
+ if (!mptcp_pm_should_add_signal(msk))
goto out_unlock;
- if (remaining < mptcp_add_addr_len(msk->pm.local.family))
+ *echo = READ_ONCE(msk->pm.add_addr_echo);
+
+ if (remaining < mptcp_add_addr_len(msk->pm.local.family, *echo))
goto out_unlock;
*saddr = msk->pm.local;
- WRITE_ONCE(msk->pm.addr_signal, false);
+ WRITE_ONCE(msk->pm.add_addr_signal, false);
+ ret = true;
+
+out_unlock:
+ spin_unlock_bh(&msk->pm.lock);
+ return ret;
+}
+
+bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
+ u8 *rm_id)
+{
+ int ret = false;
+
+ spin_lock_bh(&msk->pm.lock);
+
+ /* double check after the lock is acquired */
+ if (!mptcp_pm_should_rm_signal(msk))
+ goto out_unlock;
+
+ if (remaining < TCPOLEN_MPTCP_RM_ADDR_BASE)
+ goto out_unlock;
+
+ *rm_id = msk->pm.rm_id;
+ WRITE_ONCE(msk->pm.rm_addr_signal, false);
ret = true;
out_unlock:
@@ -185,13 +231,17 @@ void mptcp_pm_data_init(struct mptcp_sock *msk)
msk->pm.add_addr_accepted = 0;
msk->pm.local_addr_used = 0;
msk->pm.subflows = 0;
+ msk->pm.rm_id = 0;
WRITE_ONCE(msk->pm.work_pending, false);
- WRITE_ONCE(msk->pm.addr_signal, false);
+ WRITE_ONCE(msk->pm.add_addr_signal, false);
+ WRITE_ONCE(msk->pm.rm_addr_signal, false);
WRITE_ONCE(msk->pm.accept_addr, false);
WRITE_ONCE(msk->pm.accept_subflow, false);
+ WRITE_ONCE(msk->pm.add_addr_echo, false);
msk->pm.status = 0;
spin_lock_init(&msk->pm.lock);
+ INIT_LIST_HEAD(&msk->pm.anno_list);
mptcp_pm_nl_data_init(msk);
}
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index 770da3627848..0d6f3d912891 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -15,6 +15,7 @@
#include <uapi/linux/mptcp.h>
#include "protocol.h"
+#include "mib.h"
/* forward declaration */
static struct genl_family mptcp_genl_family;
@@ -23,12 +24,18 @@ static int pm_nl_pernet_id;
struct mptcp_pm_addr_entry {
struct list_head list;
- unsigned int flags;
- int ifindex;
struct mptcp_addr_info addr;
struct rcu_head rcu;
};
+struct mptcp_pm_add_entry {
+ struct list_head list;
+ struct mptcp_addr_info addr;
+ struct timer_list add_timer;
+ struct mptcp_sock *sock;
+ u8 retrans_times;
+};
+
struct pm_nl_pernet {
/* protects pernet updates */
spinlock_t lock;
@@ -42,6 +49,7 @@ struct pm_nl_pernet {
};
#define MPTCP_PM_ADDR_MAX 8
+#define ADD_ADDR_RETRANS_MAX 3
static bool addresses_equal(const struct mptcp_addr_info *a,
struct mptcp_addr_info *b, bool use_port)
@@ -129,7 +137,7 @@ select_local_address(const struct pm_nl_pernet *pernet,
rcu_read_lock();
spin_lock_bh(&msk->join_list_lock);
list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
- if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW))
+ if (!(entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW))
continue;
/* avoid any address already in use by subflows and
@@ -160,7 +168,7 @@ select_signal_address(struct pm_nl_pernet *pernet, unsigned int pos)
* can lead to additional addresses not being announced.
*/
list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
- if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL))
+ if (!(entry->addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL))
continue;
if (i++ == pos) {
ret = entry;
@@ -179,6 +187,121 @@ static void check_work_pending(struct mptcp_sock *msk)
WRITE_ONCE(msk->pm.work_pending, false);
}
+static struct mptcp_pm_add_entry *
+lookup_anno_list_by_saddr(struct mptcp_sock *msk,
+ struct mptcp_addr_info *addr)
+{
+ struct mptcp_pm_add_entry *entry;
+
+ list_for_each_entry(entry, &msk->pm.anno_list, list) {
+ if (addresses_equal(&entry->addr, addr, false))
+ return entry;
+ }
+
+ return NULL;
+}
+
+static void mptcp_pm_add_timer(struct timer_list *timer)
+{
+ struct mptcp_pm_add_entry *entry = from_timer(entry, timer, add_timer);
+ struct mptcp_sock *msk = entry->sock;
+ struct sock *sk = (struct sock *)msk;
+
+ pr_debug("msk=%p", msk);
+
+ if (!msk)
+ return;
+
+ if (inet_sk_state_load(sk) == TCP_CLOSE)
+ return;
+
+ if (!entry->addr.id)
+ return;
+
+ if (mptcp_pm_should_add_signal(msk)) {
+ sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX / 8);
+ goto out;
+ }
+
+ spin_lock_bh(&msk->pm.lock);
+
+ if (!mptcp_pm_should_add_signal(msk)) {
+ pr_debug("retransmit ADD_ADDR id=%d", entry->addr.id);
+ mptcp_pm_announce_addr(msk, &entry->addr, false);
+ entry->retrans_times++;
+ }
+
+ if (entry->retrans_times < ADD_ADDR_RETRANS_MAX)
+ sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX);
+
+ spin_unlock_bh(&msk->pm.lock);
+
+out:
+ __sock_put(sk);
+}
+
+struct mptcp_pm_add_entry *
+mptcp_pm_del_add_timer(struct mptcp_sock *msk,
+ struct mptcp_addr_info *addr)
+{
+ struct mptcp_pm_add_entry *entry;
+ struct sock *sk = (struct sock *)msk;
+
+ spin_lock_bh(&msk->pm.lock);
+ entry = lookup_anno_list_by_saddr(msk, addr);
+ if (entry)
+ entry->retrans_times = ADD_ADDR_RETRANS_MAX;
+ spin_unlock_bh(&msk->pm.lock);
+
+ if (entry)
+ sk_stop_timer_sync(sk, &entry->add_timer);
+
+ return entry;
+}
+
+static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
+ struct mptcp_pm_addr_entry *entry)
+{
+ struct mptcp_pm_add_entry *add_entry = NULL;
+ struct sock *sk = (struct sock *)msk;
+
+ if (lookup_anno_list_by_saddr(msk, &entry->addr))
+ return false;
+
+ add_entry = kmalloc(sizeof(*add_entry), GFP_ATOMIC);
+ if (!add_entry)
+ return false;
+
+ list_add(&add_entry->list, &msk->pm.anno_list);
+
+ add_entry->addr = entry->addr;
+ add_entry->sock = msk;
+ add_entry->retrans_times = 0;
+
+ timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0);
+ sk_reset_timer(sk, &add_entry->add_timer, jiffies + TCP_RTO_MAX);
+
+ return true;
+}
+
+void mptcp_pm_free_anno_list(struct mptcp_sock *msk)
+{
+ struct mptcp_pm_add_entry *entry, *tmp;
+ struct sock *sk = (struct sock *)msk;
+ LIST_HEAD(free_list);
+
+ pr_debug("msk=%p", msk);
+
+ spin_lock_bh(&msk->pm.lock);
+ list_splice_init(&msk->pm.anno_list, &free_list);
+ spin_unlock_bh(&msk->pm.lock);
+
+ list_for_each_entry_safe(entry, tmp, &free_list, list) {
+ sk_stop_timer_sync(sk, &entry->add_timer);
+ kfree(entry);
+ }
+}
+
static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
{
struct mptcp_addr_info remote = { 0 };
@@ -199,8 +322,10 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
msk->pm.add_addr_signaled);
if (local) {
- msk->pm.add_addr_signaled++;
- mptcp_pm_announce_addr(msk, &local->addr);
+ if (mptcp_pm_alloc_anno_list(msk, local)) {
+ msk->pm.add_addr_signaled++;
+ mptcp_pm_announce_addr(msk, &local->addr, false);
+ }
} else {
/* pick failed, avoid fourther attempts later */
msk->pm.local_addr_used = msk->pm.add_addr_signal_max;
@@ -220,8 +345,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
msk->pm.subflows++;
check_work_pending(msk);
spin_unlock_bh(&msk->pm.lock);
- __mptcp_subflow_connect(sk, local->ifindex,
- &local->addr, &remote);
+ __mptcp_subflow_connect(sk, &local->addr, &remote);
spin_lock_bh(&msk->pm.lock);
return;
}
@@ -267,13 +391,86 @@ void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
local.family = remote.family;
spin_unlock_bh(&msk->pm.lock);
- __mptcp_subflow_connect((struct sock *)msk, 0, &local, &remote);
+ __mptcp_subflow_connect((struct sock *)msk, &local, &remote);
spin_lock_bh(&msk->pm.lock);
+
+ mptcp_pm_announce_addr(msk, &remote, true);
+}
+
+void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow, *tmp;
+ struct sock *sk = (struct sock *)msk;
+
+ pr_debug("address rm_id %d", msk->pm.rm_id);
+
+ if (!msk->pm.rm_id)
+ return;
+
+ if (list_empty(&msk->conn_list))
+ return;
+
+ list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ int how = RCV_SHUTDOWN | SEND_SHUTDOWN;
+ long timeout = 0;
+
+ if (msk->pm.rm_id != subflow->remote_id)
+ continue;
+
+ spin_unlock_bh(&msk->pm.lock);
+ mptcp_subflow_shutdown(sk, ssk, how);
+ __mptcp_close_ssk(sk, ssk, subflow, timeout);
+ spin_lock_bh(&msk->pm.lock);
+
+ msk->pm.add_addr_accepted--;
+ msk->pm.subflows--;
+ WRITE_ONCE(msk->pm.accept_addr, true);
+
+ __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RMADDR);
+
+ break;
+ }
+}
+
+void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, u8 rm_id)
+{
+ struct mptcp_subflow_context *subflow, *tmp;
+ struct sock *sk = (struct sock *)msk;
+
+ pr_debug("subflow rm_id %d", rm_id);
+
+ if (!rm_id)
+ return;
+
+ if (list_empty(&msk->conn_list))
+ return;
+
+ list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ int how = RCV_SHUTDOWN | SEND_SHUTDOWN;
+ long timeout = 0;
+
+ if (rm_id != subflow->local_id)
+ continue;
+
+ spin_unlock_bh(&msk->pm.lock);
+ mptcp_subflow_shutdown(sk, ssk, how);
+ __mptcp_close_ssk(sk, ssk, subflow, timeout);
+ spin_lock_bh(&msk->pm.lock);
+
+ msk->pm.local_addr_used--;
+ msk->pm.subflows--;
+
+ __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RMSUBFLOW);
+
+ break;
+ }
}
static bool address_use_port(struct mptcp_pm_addr_entry *entry)
{
- return (entry->flags &
+ return (entry->addr.flags &
(MPTCP_PM_ADDR_FLAG_SIGNAL | MPTCP_PM_ADDR_FLAG_SUBFLOW)) ==
MPTCP_PM_ADDR_FLAG_SIGNAL;
}
@@ -303,9 +500,9 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
goto out;
}
- if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)
+ if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)
pernet->add_addr_signal_max++;
- if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)
+ if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)
pernet->local_addr_max++;
entry->addr.id = pernet->next_id++;
@@ -358,8 +555,9 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
if (!entry)
return -ENOMEM;
- entry->flags = 0;
entry->addr = skc_local;
+ entry->addr.ifindex = 0;
+ entry->addr.flags = 0;
ret = mptcp_pm_nl_append_new_local_addr(pernet, entry);
if (ret < 0)
kfree(entry);
@@ -397,8 +595,8 @@ mptcp_pm_addr_policy[MPTCP_PM_ADDR_ATTR_MAX + 1] = {
[MPTCP_PM_ADDR_ATTR_FAMILY] = { .type = NLA_U16, },
[MPTCP_PM_ADDR_ATTR_ID] = { .type = NLA_U8, },
[MPTCP_PM_ADDR_ATTR_ADDR4] = { .type = NLA_U32, },
- [MPTCP_PM_ADDR_ATTR_ADDR6] = { .type = NLA_EXACT_LEN,
- .len = sizeof(struct in6_addr), },
+ [MPTCP_PM_ADDR_ATTR_ADDR6] =
+ NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
[MPTCP_PM_ADDR_ATTR_PORT] = { .type = NLA_U16 },
[MPTCP_PM_ADDR_ATTR_FLAGS] = { .type = NLA_U32 },
[MPTCP_PM_ADDR_ATTR_IF_IDX] = { .type = NLA_S32 },
@@ -473,14 +671,17 @@ static int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info,
entry->addr.addr.s_addr = nla_get_in_addr(tb[addr_addr]);
skip_family:
- if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX])
- entry->ifindex = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]);
+ if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX]) {
+ u32 val = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]);
+
+ entry->addr.ifindex = val;
+ }
if (tb[MPTCP_PM_ADDR_ATTR_ID])
entry->addr.id = nla_get_u8(tb[MPTCP_PM_ADDR_ATTR_ID]);
if (tb[MPTCP_PM_ADDR_ATTR_FLAGS])
- entry->flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]);
+ entry->addr.flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]);
return 0;
}
@@ -530,6 +731,68 @@ __lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id)
return NULL;
}
+static bool remove_anno_list_by_saddr(struct mptcp_sock *msk,
+ struct mptcp_addr_info *addr)
+{
+ struct mptcp_pm_add_entry *entry;
+
+ entry = mptcp_pm_del_add_timer(msk, addr);
+ if (entry) {
+ list_del(&entry->list);
+ kfree(entry);
+ return true;
+ }
+
+ return false;
+}
+
+static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk,
+ struct mptcp_addr_info *addr,
+ bool force)
+{
+ bool ret;
+
+ ret = remove_anno_list_by_saddr(msk, addr);
+ if (ret || force) {
+ spin_lock_bh(&msk->pm.lock);
+ mptcp_pm_remove_addr(msk, addr->id);
+ spin_unlock_bh(&msk->pm.lock);
+ }
+ return ret;
+}
+
+static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net,
+ struct mptcp_addr_info *addr)
+{
+ struct mptcp_sock *msk;
+ long s_slot = 0, s_num = 0;
+
+ pr_debug("remove_id=%d", addr->id);
+
+ while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
+ struct sock *sk = (struct sock *)msk;
+ bool remove_subflow;
+
+ if (list_empty(&msk->conn_list)) {
+ mptcp_pm_remove_anno_addr(msk, addr, false);
+ goto next;
+ }
+
+ lock_sock(sk);
+ remove_subflow = lookup_subflow_by_saddr(&msk->conn_list, addr);
+ mptcp_pm_remove_anno_addr(msk, addr, remove_subflow);
+ if (remove_subflow)
+ mptcp_pm_remove_subflow(msk, addr->id);
+ release_sock(sk);
+
+next:
+ sock_put(sk);
+ cond_resched();
+ }
+
+ return 0;
+}
+
static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
@@ -545,19 +808,21 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info)
entry = __lookup_addr_by_id(pernet, addr.addr.id);
if (!entry) {
GENL_SET_ERR_MSG(info, "address not found");
- ret = -EINVAL;
- goto out;
+ spin_unlock_bh(&pernet->lock);
+ return -EINVAL;
}
- if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)
+ if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)
pernet->add_addr_signal_max--;
- if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)
+ if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)
pernet->local_addr_max--;
pernet->addrs--;
list_del_rcu(&entry->list);
- kfree_rcu(entry, rcu);
-out:
spin_unlock_bh(&pernet->lock);
+
+ mptcp_nl_remove_subflow_and_signal_addr(sock_net(skb->sk), &entry->addr);
+ kfree_rcu(entry, rcu);
+
return ret;
}
@@ -606,10 +871,10 @@ static int mptcp_nl_fill_addr(struct sk_buff *skb,
goto nla_put_failure;
if (nla_put_u8(skb, MPTCP_PM_ADDR_ATTR_ID, addr->id))
goto nla_put_failure;
- if (nla_put_u32(skb, MPTCP_PM_ADDR_ATTR_FLAGS, entry->flags))
+ if (nla_put_u32(skb, MPTCP_PM_ADDR_ATTR_FLAGS, entry->addr.flags))
goto nla_put_failure;
- if (entry->ifindex &&
- nla_put_s32(skb, MPTCP_PM_ADDR_ATTR_IF_IDX, entry->ifindex))
+ if (entry->addr.ifindex &&
+ nla_put_s32(skb, MPTCP_PM_ADDR_ATTR_IF_IDX, entry->addr.ifindex))
goto nla_put_failure;
if (addr->family == AF_INET &&
@@ -789,7 +1054,7 @@ fail:
return -EMSGSIZE;
}
-static struct genl_ops mptcp_pm_ops[] = {
+static const struct genl_small_ops mptcp_pm_ops[] = {
{
.cmd = MPTCP_PM_CMD_ADD_ADDR,
.doit = mptcp_nl_cmd_add_addr,
@@ -828,8 +1093,8 @@ static struct genl_family mptcp_genl_family __ro_after_init = {
.policy = mptcp_pm_policy,
.netnsok = true,
.module = THIS_MODULE,
- .ops = mptcp_pm_ops,
- .n_ops = ARRAY_SIZE(mptcp_pm_ops),
+ .small_ops = mptcp_pm_ops,
+ .n_small_ops = ARRAY_SIZE(mptcp_pm_ops),
.mcgrps = mptcp_pm_mcgrps,
.n_mcgrps = ARRAY_SIZE(mptcp_pm_mcgrps),
};
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 5d747c6a610e..185dacb39781 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -24,8 +24,6 @@
#include "protocol.h"
#include "mib.h"
-#define MPTCP_SAME_STATE TCP_MAX_STATES
-
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
struct mptcp6_sock {
struct mptcp_sock msk;
@@ -34,6 +32,8 @@ struct mptcp6_sock {
#endif
struct mptcp_skb_cb {
+ u64 map_seq;
+ u64 end_seq;
u32 offset;
};
@@ -112,64 +112,205 @@ static int __mptcp_socket_create(struct mptcp_sock *msk)
return 0;
}
-static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
- struct sk_buff *skb,
- unsigned int offset, size_t copy_len)
+static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
+{
+ sk_drops_add(sk, skb);
+ __kfree_skb(skb);
+}
+
+static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
+ struct sk_buff *from)
+{
+ bool fragstolen;
+ int delta;
+
+ if (MPTCP_SKB_CB(from)->offset ||
+ !skb_try_coalesce(to, from, &fragstolen, &delta))
+ return false;
+
+ pr_debug("colesced seq %llx into %llx new len %d new end seq %llx",
+ MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq,
+ to->len, MPTCP_SKB_CB(from)->end_seq);
+ MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
+ kfree_skb_partial(from, fragstolen);
+ atomic_add(delta, &sk->sk_rmem_alloc);
+ sk_mem_charge(sk, delta);
+ return true;
+}
+
+static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
+ struct sk_buff *from)
+{
+ if (MPTCP_SKB_CB(from)->map_seq != MPTCP_SKB_CB(to)->end_seq)
+ return false;
+
+ return mptcp_try_coalesce((struct sock *)msk, to, from);
+}
+
+/* "inspired" by tcp_data_queue_ofo(), main differences:
+ * - use mptcp seqs
+ * - don't cope with sacks
+ */
+static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
{
struct sock *sk = (struct sock *)msk;
- struct sk_buff *tail;
+ struct rb_node **p, *parent;
+ u64 seq, end_seq, max_seq;
+ struct sk_buff *skb1;
+ int space;
+
+ seq = MPTCP_SKB_CB(skb)->map_seq;
+ end_seq = MPTCP_SKB_CB(skb)->end_seq;
+ space = tcp_space(sk);
+ max_seq = space > 0 ? space + msk->ack_seq : msk->ack_seq;
+
+ pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq,
+ RB_EMPTY_ROOT(&msk->out_of_order_queue));
+ if (after64(seq, max_seq)) {
+ /* out of window */
+ mptcp_drop(sk, skb);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW);
+ return;
+ }
- __skb_unlink(skb, &ssk->sk_receive_queue);
+ p = &msk->out_of_order_queue.rb_node;
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUE);
+ if (RB_EMPTY_ROOT(&msk->out_of_order_queue)) {
+ rb_link_node(&skb->rbnode, NULL, p);
+ rb_insert_color(&skb->rbnode, &msk->out_of_order_queue);
+ msk->ooo_last_skb = skb;
+ goto end;
+ }
- skb_ext_reset(skb);
- skb_orphan(skb);
- WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len);
+ /* with 2 subflows, adding at end of ooo queue is quite likely
+ * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
+ */
+ if (mptcp_ooo_try_coalesce(msk, msk->ooo_last_skb, skb)) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL);
+ return;
+ }
- tail = skb_peek_tail(&sk->sk_receive_queue);
- if (offset == 0 && tail) {
- bool fragstolen;
- int delta;
+ /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
+ if (!before64(seq, MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq)) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL);
+ parent = &msk->ooo_last_skb->rbnode;
+ p = &parent->rb_right;
+ goto insert;
+ }
- if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
- kfree_skb_partial(skb, fragstolen);
- atomic_add(delta, &sk->sk_rmem_alloc);
- sk_mem_charge(sk, delta);
+ /* Find place to insert this segment. Handle overlaps on the way. */
+ parent = NULL;
+ while (*p) {
+ parent = *p;
+ skb1 = rb_to_skb(parent);
+ if (before64(seq, MPTCP_SKB_CB(skb1)->map_seq)) {
+ p = &parent->rb_left;
+ continue;
+ }
+ if (before64(seq, MPTCP_SKB_CB(skb1)->end_seq)) {
+ if (!after64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) {
+ /* All the bits are present. Drop. */
+ mptcp_drop(sk, skb);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
+ return;
+ }
+ if (after64(seq, MPTCP_SKB_CB(skb1)->map_seq)) {
+ /* partial overlap:
+ * | skb |
+ * | skb1 |
+ * continue traversing
+ */
+ } else {
+ /* skb's seq == skb1's seq and skb covers skb1.
+ * Replace skb1 with skb.
+ */
+ rb_replace_node(&skb1->rbnode, &skb->rbnode,
+ &msk->out_of_order_queue);
+ mptcp_drop(sk, skb1);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
+ goto merge_right;
+ }
+ } else if (mptcp_ooo_try_coalesce(msk, skb1, skb)) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE);
return;
}
+ p = &parent->rb_right;
}
- skb_set_owner_r(skb, sk);
- __skb_queue_tail(&sk->sk_receive_queue, skb);
- MPTCP_SKB_CB(skb)->offset = offset;
-}
+insert:
+ /* Insert segment into RB tree. */
+ rb_link_node(&skb->rbnode, parent, p);
+ rb_insert_color(&skb->rbnode, &msk->out_of_order_queue);
-static void mptcp_stop_timer(struct sock *sk)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
+merge_right:
+ /* Remove other segments covered by skb. */
+ while ((skb1 = skb_rb_next(skb)) != NULL) {
+ if (before64(end_seq, MPTCP_SKB_CB(skb1)->end_seq))
+ break;
+ rb_erase(&skb1->rbnode, &msk->out_of_order_queue);
+ mptcp_drop(sk, skb1);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
+ }
+ /* If there is no skb after us, we are the last_skb ! */
+ if (!skb1)
+ msk->ooo_last_skb = skb;
- sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
- mptcp_sk(sk)->timer_ival = 0;
+end:
+ skb_condense(skb);
+ skb_set_owner_r(skb, sk);
}
-/* both sockets must be locked */
-static bool mptcp_subflow_dsn_valid(const struct mptcp_sock *msk,
- struct sock *ssk)
+static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
+ struct sk_buff *skb, unsigned int offset,
+ size_t copy_len)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
- u64 dsn = mptcp_subflow_get_mapped_dsn(subflow);
+ struct sock *sk = (struct sock *)msk;
+ struct sk_buff *tail;
- /* revalidate data sequence number.
- *
- * mptcp_subflow_data_available() is usually called
- * without msk lock. Its unlikely (but possible)
- * that msk->ack_seq has been advanced since the last
- * call found in-sequence data.
+ __skb_unlink(skb, &ssk->sk_receive_queue);
+
+ skb_ext_reset(skb);
+ skb_orphan(skb);
+
+ /* the skb map_seq accounts for the skb offset:
+ * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq
+ * value
*/
- if (likely(dsn == msk->ack_seq))
+ MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow);
+ MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len;
+ MPTCP_SKB_CB(skb)->offset = offset;
+
+ if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) {
+ /* in sequence */
+ WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len);
+ tail = skb_peek_tail(&sk->sk_receive_queue);
+ if (tail && mptcp_try_coalesce(sk, tail, skb))
+ return true;
+
+ skb_set_owner_r(skb, sk);
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
return true;
+ } else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) {
+ mptcp_data_queue_ofo(msk, skb);
+ return false;
+ }
- subflow->data_avail = 0;
- return mptcp_subflow_data_available(ssk);
+ /* old data, keep it simple and drop the whole pkt, sender
+ * will retransmit as needed, if needed.
+ */
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
+ mptcp_drop(sk, skb);
+ return false;
+}
+
+static void mptcp_stop_timer(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
+ mptcp_sk(sk)->timer_ival = 0;
}
static void mptcp_check_data_fin_ack(struct sock *sk)
@@ -313,14 +454,12 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
unsigned int moved = 0;
bool more_data_avail;
struct tcp_sock *tp;
+ u32 old_copied_seq;
bool done = false;
- if (!mptcp_subflow_dsn_valid(msk, ssk)) {
- *bytes = 0;
- return false;
- }
-
+ pr_debug("msk=%p ssk=%p", msk, ssk);
tp = tcp_sk(ssk);
+ old_copied_seq = tp->copied_seq;
do {
u32 map_remaining, offset;
u32 seq = tp->copied_seq;
@@ -332,8 +471,15 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
mptcp_subflow_get_map_offset(subflow);
skb = skb_peek(&ssk->sk_receive_queue);
- if (!skb)
+ if (!skb) {
+ /* if no data is found, a racing workqueue/recvmsg
+ * already processed the new data, stop here or we
+ * can enter an infinite loop
+ */
+ if (!moved)
+ done = true;
break;
+ }
if (__mptcp_check_fallback(msk)) {
/* if we are running under the workqueue, TCP could have
@@ -357,9 +503,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
if (tp->urg_data)
done = true;
- __mptcp_move_skb(msk, ssk, skb, offset, len);
+ if (__mptcp_move_skb(msk, ssk, skb, offset, len))
+ moved += len;
seq += len;
- moved += len;
if (WARN_ON_ONCE(map_remaining < len))
break;
@@ -378,20 +524,56 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
}
} while (more_data_avail);
- *bytes = moved;
-
- /* If the moves have caught up with the DATA_FIN sequence number
- * it's time to ack the DATA_FIN and change socket state, but
- * this is not a good place to change state. Let the workqueue
- * do it.
- */
- if (mptcp_pending_data_fin(sk, NULL) &&
- schedule_work(&msk->work))
- sock_hold(sk);
+ *bytes += moved;
+ if (tp->copied_seq != old_copied_seq)
+ tcp_cleanup_rbuf(ssk, 1);
return done;
}
+static bool mptcp_ofo_queue(struct mptcp_sock *msk)
+{
+ struct sock *sk = (struct sock *)msk;
+ struct sk_buff *skb, *tail;
+ bool moved = false;
+ struct rb_node *p;
+ u64 end_seq;
+
+ p = rb_first(&msk->out_of_order_queue);
+ pr_debug("msk=%p empty=%d", msk, RB_EMPTY_ROOT(&msk->out_of_order_queue));
+ while (p) {
+ skb = rb_to_skb(p);
+ if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq))
+ break;
+
+ p = rb_next(p);
+ rb_erase(&skb->rbnode, &msk->out_of_order_queue);
+
+ if (unlikely(!after64(MPTCP_SKB_CB(skb)->end_seq,
+ msk->ack_seq))) {
+ mptcp_drop(sk, skb);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
+ continue;
+ }
+
+ end_seq = MPTCP_SKB_CB(skb)->end_seq;
+ tail = skb_peek_tail(&sk->sk_receive_queue);
+ if (!tail || !mptcp_ooo_try_coalesce(msk, tail, skb)) {
+ int delta = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq;
+
+ /* skip overlapping data, if any */
+ pr_debug("uncoalesced seq=%llx ack seq=%llx delta=%d",
+ MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq,
+ delta);
+ MPTCP_SKB_CB(skb)->offset += delta;
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ }
+ msk->ack_seq = end_seq;
+ moved = true;
+ }
+ return moved;
+}
+
/* In most cases we will be able to lock the mptcp socket. If its already
* owned, we need to defer to the work queue to avoid ABBA deadlock.
*/
@@ -407,8 +589,19 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
return false;
/* must re-check after taking the lock */
- if (!READ_ONCE(sk->sk_lock.owned))
+ if (!READ_ONCE(sk->sk_lock.owned)) {
__mptcp_move_skbs_from_subflow(msk, ssk, &moved);
+ mptcp_ofo_queue(msk);
+
+ /* If the moves have caught up with the DATA_FIN sequence number
+ * it's time to ack the DATA_FIN and change socket state, but
+ * this is not a good place to change state. Let the workqueue
+ * do it.
+ */
+ if (mptcp_pending_data_fin(sk, NULL) &&
+ schedule_work(&msk->work))
+ sock_hold(sk);
+ }
spin_unlock_bh(&sk->sk_lock.slock);
@@ -417,9 +610,17 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
void mptcp_data_ready(struct sock *sk, struct sock *ssk)
{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
struct mptcp_sock *msk = mptcp_sk(sk);
+ bool wake;
- set_bit(MPTCP_DATA_READY, &msk->flags);
+ /* move_skbs_to_msk below can legitly clear the data_avail flag,
+ * but we will need later to properly woke the reader, cache its
+ * value
+ */
+ wake = subflow->data_avail == MPTCP_SUBFLOW_DATA_AVAIL;
+ if (wake)
+ set_bit(MPTCP_DATA_READY, &msk->flags);
if (atomic_read(&sk->sk_rmem_alloc) < READ_ONCE(sk->sk_rcvbuf) &&
move_skbs_to_msk(msk, ssk))
@@ -440,7 +641,8 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
move_skbs_to_msk(msk, ssk);
}
wake:
- sk->sk_data_ready(sk);
+ if (wake)
+ sk->sk_data_ready(sk);
}
static void __mptcp_flush_join_list(struct mptcp_sock *msk)
@@ -474,7 +676,7 @@ void mptcp_data_acked(struct sock *sk)
{
mptcp_reset_timer(sk);
- if ((!sk_stream_is_writeable(sk) ||
+ if ((!test_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags) ||
(inet_sk_state_load(sk) != TCP_ESTABLISHED)) &&
schedule_work(&mptcp_sk(sk)->work))
sock_hold(sk);
@@ -569,6 +771,20 @@ static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
put_page(dfrag->page);
}
+static bool mptcp_is_writeable(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+
+ if (!sk_stream_is_writeable((struct sock *)msk))
+ return false;
+
+ mptcp_for_each_subflow(msk, subflow) {
+ if (sk_stream_is_writeable(subflow->tcp_sock))
+ return true;
+ }
+ return false;
+}
+
static void mptcp_clean_una(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -611,8 +827,15 @@ out:
sk_mem_reclaim_partial(sk);
/* Only wake up writers if a subflow is ready */
- if (test_bit(MPTCP_SEND_SPACE, &msk->flags))
+ if (mptcp_is_writeable(msk)) {
+ set_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags);
+ smp_mb__after_atomic();
+
+ /* set SEND_SPACE before sk_stream_write_space clears
+ * NOSPACE
+ */
sk_stream_write_space(sk);
+ }
}
}
@@ -803,60 +1026,128 @@ out:
return ret;
}
-static void mptcp_nospace(struct mptcp_sock *msk, struct socket *sock)
+static void mptcp_nospace(struct mptcp_sock *msk)
{
+ struct mptcp_subflow_context *subflow;
+
clear_bit(MPTCP_SEND_SPACE, &msk->flags);
smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
- /* enables sk->write_space() callbacks */
- set_bit(SOCK_NOSPACE, &sock->flags);
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ struct socket *sock = READ_ONCE(ssk->sk_socket);
+
+ /* enables ssk->write_space() callbacks */
+ if (sock)
+ set_bit(SOCK_NOSPACE, &sock->flags);
+ }
+}
+
+static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
+{
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ /* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */
+ if (subflow->request_join && !subflow->fully_established)
+ return false;
+
+ /* only send if our side has not closed yet */
+ return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT));
}
-static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
+#define MPTCP_SEND_BURST_SIZE ((1 << 16) - \
+ sizeof(struct tcphdr) - \
+ MAX_TCP_OPTION_SPACE - \
+ sizeof(struct ipv6hdr) - \
+ sizeof(struct frag_hdr))
+
+struct subflow_send_info {
+ struct sock *ssk;
+ u64 ratio;
+};
+
+static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
+ u32 *sndbuf)
{
+ struct subflow_send_info send_info[2];
struct mptcp_subflow_context *subflow;
- struct sock *backup = NULL;
+ int i, nr_active = 0;
+ struct sock *ssk;
+ u64 ratio;
+ u32 pace;
- sock_owned_by_me((const struct sock *)msk);
+ sock_owned_by_me((struct sock *)msk);
+ *sndbuf = 0;
if (!mptcp_ext_cache_refill(msk))
return NULL;
+ if (__mptcp_check_fallback(msk)) {
+ if (!msk->first)
+ return NULL;
+ *sndbuf = msk->first->sk_sndbuf;
+ return sk_stream_memory_free(msk->first) ? msk->first : NULL;
+ }
+
+ /* re-use last subflow, if the burst allow that */
+ if (msk->last_snd && msk->snd_burst > 0 &&
+ sk_stream_memory_free(msk->last_snd) &&
+ mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
+ mptcp_for_each_subflow(msk, subflow) {
+ ssk = mptcp_subflow_tcp_sock(subflow);
+ *sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
+ }
+ return msk->last_snd;
+ }
+
+ /* pick the subflow with the lower wmem/wspace ratio */
+ for (i = 0; i < 2; ++i) {
+ send_info[i].ssk = NULL;
+ send_info[i].ratio = -1;
+ }
mptcp_for_each_subflow(msk, subflow) {
- struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ ssk = mptcp_subflow_tcp_sock(subflow);
+ if (!mptcp_subflow_active(subflow))
+ continue;
- if (!sk_stream_memory_free(ssk)) {
- struct socket *sock = ssk->sk_socket;
+ nr_active += !subflow->backup;
+ *sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
+ if (!sk_stream_memory_free(subflow->tcp_sock))
+ continue;
- if (sock)
- mptcp_nospace(msk, sock);
+ pace = READ_ONCE(ssk->sk_pacing_rate);
+ if (!pace)
+ continue;
- return NULL;
+ ratio = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32,
+ pace);
+ if (ratio < send_info[subflow->backup].ratio) {
+ send_info[subflow->backup].ssk = ssk;
+ send_info[subflow->backup].ratio = ratio;
}
+ }
- if (subflow->backup) {
- if (!backup)
- backup = ssk;
+ pr_debug("msk=%p nr_active=%d ssk=%p:%lld backup=%p:%lld",
+ msk, nr_active, send_info[0].ssk, send_info[0].ratio,
+ send_info[1].ssk, send_info[1].ratio);
- continue;
- }
+ /* pick the best backup if no other subflow is active */
+ if (!nr_active)
+ send_info[0].ssk = send_info[1].ssk;
- return ssk;
+ if (send_info[0].ssk) {
+ msk->last_snd = send_info[0].ssk;
+ msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE,
+ sk_stream_wspace(msk->last_snd));
+ return msk->last_snd;
}
-
- return backup;
+ return NULL;
}
-static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk)
+static void ssk_check_wmem(struct mptcp_sock *msk)
{
- struct socket *sock;
-
- if (likely(sk_stream_is_writeable(ssk)))
- return;
-
- sock = READ_ONCE(ssk->sk_socket);
- if (sock)
- mptcp_nospace(msk, sock);
+ if (unlikely(!mptcp_is_writeable(msk)))
+ mptcp_nospace(msk);
}
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
@@ -866,6 +1157,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct page_frag *pfrag;
size_t copied = 0;
struct sock *ssk;
+ u32 sndbuf;
bool tx_ok;
long timeo;
@@ -892,7 +1184,7 @@ restart:
}
__mptcp_flush_join_list(msk);
- ssk = mptcp_subflow_get_send(msk);
+ ssk = mptcp_subflow_get_send(msk, &sndbuf);
while (!sk_stream_memory_free(sk) ||
!ssk ||
!mptcp_page_frag_refill(ssk, pfrag)) {
@@ -909,19 +1201,25 @@ restart:
mptcp_reset_timer(sk);
}
+ mptcp_nospace(msk);
ret = sk_stream_wait_memory(sk, &timeo);
if (ret)
goto out;
mptcp_clean_una(sk);
- ssk = mptcp_subflow_get_send(msk);
+ ssk = mptcp_subflow_get_send(msk, &sndbuf);
if (list_empty(&msk->conn_list)) {
ret = -ENOTCONN;
goto out;
}
}
+ /* do auto tuning */
+ if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
+ sndbuf > READ_ONCE(sk->sk_sndbuf))
+ WRITE_ONCE(sk->sk_sndbuf, sndbuf);
+
pr_debug("conn_list->subflow=%p", ssk);
lock_sock(ssk);
@@ -938,6 +1236,10 @@ restart:
break;
}
+ /* burst can be negative, we will try move to the next subflow
+ * at selection time, if possible.
+ */
+ msk->snd_burst -= ret;
copied += ret;
tx_ok = msg_data_left(msg);
@@ -947,7 +1249,6 @@ restart:
if (!sk_stream_memory_free(ssk) ||
!mptcp_page_frag_refill(ssk, pfrag) ||
!mptcp_ext_cache_refill(msk)) {
- set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
tcp_push(ssk, msg->msg_flags, mss_now,
tcp_sk(ssk)->nonagle, size_goal);
mptcp_set_timeout(sk, ssk);
@@ -995,9 +1296,9 @@ restart:
mptcp_reset_timer(sk);
}
- ssk_check_wmem(msk, ssk);
release_sock(ssk);
out:
+ ssk_check_wmem(msk);
release_sock(sk);
return copied ? : ret;
}
@@ -1135,10 +1436,14 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
*/
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk;
+ bool slow;
ssk = mptcp_subflow_tcp_sock(subflow);
+ slow = lock_sock_fast(ssk);
WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf);
tcp_sk(ssk)->window_clamp = window_clamp;
+ tcp_cleanup_rbuf(ssk, 1);
+ unlock_sock_fast(ssk, slow);
}
}
}
@@ -1154,6 +1459,11 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
unsigned int moved = 0;
bool done;
+ /* avoid looping forever below on racing close */
+ if (((struct sock *)msk)->sk_state == TCP_CLOSE)
+ return false;
+
+ __mptcp_flush_join_list(msk);
do {
struct sock *ssk = mptcp_subflow_recv_lookup(msk);
@@ -1165,7 +1475,11 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
release_sock(ssk);
} while (!done);
- return moved > 0;
+ if (mptcp_ofo_queue(msk) || moved > 0) {
+ mptcp_check_data_fin((struct sock *)msk);
+ return true;
+ }
+ return false;
}
static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
@@ -1261,6 +1575,9 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
set_bit(MPTCP_DATA_READY, &msk->flags);
}
out_err:
+ pr_debug("msk=%p data_ready=%d rx queue empty=%d copied=%d",
+ msk, test_bit(MPTCP_DATA_READY, &msk->flags),
+ skb_queue_empty(&sk->sk_receive_queue), copied);
mptcp_rcv_space_adjust(msk, copied);
release_sock(sk);
@@ -1311,9 +1628,15 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
sock_owned_by_me((const struct sock *)msk);
+ if (__mptcp_check_fallback(msk))
+ return msk->first;
+
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ if (!mptcp_subflow_active(subflow))
+ continue;
+
/* still data outstanding at TCP level? Don't retransmit. */
if (!tcp_write_queue_empty(ssk))
return NULL;
@@ -1338,9 +1661,9 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
* so we need to use tcp_close() after detaching them from the mptcp
* parent socket.
*/
-static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
- struct mptcp_subflow_context *subflow,
- long timeout)
+void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
+ struct mptcp_subflow_context *subflow,
+ long timeout)
{
struct socket *sock = READ_ONCE(ssk->sk_socket);
@@ -1371,6 +1694,10 @@ static void pm_work(struct mptcp_sock *msk)
pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED);
mptcp_pm_nl_add_addr_received(msk);
}
+ if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) {
+ pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED);
+ mptcp_pm_nl_rm_addr_received(msk);
+ }
if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) {
pm->status &= ~BIT(MPTCP_PM_ESTABLISHED);
mptcp_pm_nl_fully_established(msk);
@@ -1383,6 +1710,20 @@ static void pm_work(struct mptcp_sock *msk)
spin_unlock_bh(&msk->pm.lock);
}
+static void __mptcp_close_subflow(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow, *tmp;
+
+ list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ if (inet_sk_state_load(ssk) != TCP_CLOSE)
+ continue;
+
+ __mptcp_close_ssk((struct sock *)msk, ssk, subflow, 0);
+ }
+}
+
static void mptcp_worker(struct work_struct *work)
{
struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
@@ -1400,6 +1741,9 @@ static void mptcp_worker(struct work_struct *work)
mptcp_clean_una(sk);
mptcp_check_data_fin_ack(sk);
__mptcp_flush_join_list(msk);
+ if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
+ __mptcp_close_subflow(msk);
+
__mptcp_move_skbs(msk);
if (msk->pm.status)
@@ -1474,6 +1818,7 @@ static int __mptcp_init_sock(struct sock *sk)
INIT_LIST_HEAD(&msk->rtx_queue);
__set_bit(MPTCP_SEND_SPACE, &msk->flags);
INIT_WORK(&msk->work, mptcp_worker);
+ msk->out_of_order_queue = RB_ROOT;
msk->first = NULL;
inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
@@ -1491,23 +1836,23 @@ static int mptcp_init_sock(struct sock *sk)
struct net *net = sock_net(sk);
int ret;
+ ret = __mptcp_init_sock(sk);
+ if (ret)
+ return ret;
+
if (!mptcp_is_enabled(net))
return -ENOPROTOOPT;
if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net))
return -ENOMEM;
- ret = __mptcp_init_sock(sk);
- if (ret)
- return ret;
-
ret = __mptcp_socket_create(mptcp_sk(sk));
if (ret)
return ret;
sk_sockets_allocated_inc(sk);
sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
- sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2];
+ sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1];
return 0;
}
@@ -1531,7 +1876,7 @@ static void mptcp_cancel_work(struct sock *sk)
sock_put(sk);
}
-static void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
+void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
{
lock_sock(ssk);
@@ -1809,14 +2154,21 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
return newsk;
}
+void mptcp_destroy_common(struct mptcp_sock *msk)
+{
+ skb_rbtree_purge(&msk->out_of_order_queue);
+ mptcp_token_destroy(msk);
+ mptcp_pm_free_anno_list(msk);
+}
+
static void mptcp_destroy(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- mptcp_token_destroy(msk);
if (msk->cached_ext)
__skb_ext_put(msk->cached_ext);
+ mptcp_destroy_common(msk);
sk_sockets_allocated_dec(sk);
}
@@ -2288,13 +2640,13 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
sock_poll_wait(file, sock, wait);
state = inet_sk_state_load(sk);
+ pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags);
if (state == TCP_LISTEN)
return mptcp_check_readable(msk);
if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
mask |= mptcp_check_readable(msk);
- if (sk_stream_is_writeable(sk) &&
- test_bit(MPTCP_SEND_SPACE, &msk->flags))
+ if (test_bit(MPTCP_SEND_SPACE, &msk->flags))
mask |= EPOLLOUT | EPOLLWRNORM;
}
if (sk->sk_shutdown & RCV_SHUTDOWN)
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 20f04ac85409..13ab89dc1914 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -90,6 +90,7 @@
#define MPTCP_WORK_RTX 2
#define MPTCP_WORK_EOF 3
#define MPTCP_FALLBACK_DONE 4
+#define MPTCP_WORK_CLOSE_SUBFLOW 5
struct mptcp_options_received {
u64 sndr_key;
@@ -140,6 +141,8 @@ struct mptcp_addr_info {
sa_family_t family;
__be16 port;
u8 id;
+ u8 flags;
+ int ifindex;
union {
struct in_addr addr;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
@@ -150,6 +153,7 @@ struct mptcp_addr_info {
enum mptcp_pm_status {
MPTCP_PM_ADD_ADDR_RECEIVED,
+ MPTCP_PM_RM_ADDR_RECEIVED,
MPTCP_PM_ESTABLISHED,
MPTCP_PM_SUBFLOW_ESTABLISHED,
};
@@ -157,14 +161,17 @@ enum mptcp_pm_status {
struct mptcp_pm_data {
struct mptcp_addr_info local;
struct mptcp_addr_info remote;
+ struct list_head anno_list;
spinlock_t lock; /*protects the whole PM data */
- bool addr_signal;
+ bool add_addr_signal;
+ bool rm_addr_signal;
bool server_side;
bool work_pending;
bool accept_addr;
bool accept_subflow;
+ bool add_addr_echo;
u8 add_addr_signaled;
u8 add_addr_accepted;
u8 local_addr_used;
@@ -174,6 +181,7 @@ struct mptcp_pm_data {
u8 local_addr_max;
u8 subflows_max;
u8 status;
+ u8 rm_id;
};
struct mptcp_data_frag {
@@ -194,6 +202,8 @@ struct mptcp_sock {
u64 write_seq;
u64 ack_seq;
u64 rcv_data_fin_seq;
+ struct sock *last_snd;
+ int snd_burst;
atomic64_t snd_una;
unsigned long timer_ival;
u32 token;
@@ -202,8 +212,11 @@ struct mptcp_sock {
bool fully_established;
bool rcv_data_fin;
bool snd_data_fin_enable;
+ bool use_64bit_ack; /* Set when we received a 64-bit DSN */
spinlock_t join_list_lock;
struct work_struct work;
+ struct sk_buff *ooo_last_skb;
+ struct rb_root out_of_order_queue;
struct list_head conn_list;
struct list_head rtx_queue;
struct list_head join_list;
@@ -268,6 +281,12 @@ mptcp_subflow_rsk(const struct request_sock *rsk)
return (struct mptcp_subflow_request_sock *)rsk;
}
+enum mptcp_data_avail {
+ MPTCP_SUBFLOW_NODATA,
+ MPTCP_SUBFLOW_DATA_AVAIL,
+ MPTCP_SUBFLOW_OOO_DATA
+};
+
/* MPTCP subflow context */
struct mptcp_subflow_context {
struct list_head node;/* conn_list of subflows */
@@ -292,10 +311,9 @@ struct mptcp_subflow_context {
map_valid : 1,
mpc_map : 1,
backup : 1,
- data_avail : 1,
rx_eof : 1,
- use_64bit_ack : 1, /* Set when we received a 64-bit DSN */
can_ack : 1; /* only after processing the remote a key */
+ enum mptcp_data_avail data_avail;
u32 remote_nonce;
u64 thmac;
u32 local_nonce;
@@ -348,10 +366,14 @@ void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
struct mptcp_options_received *mp_opt);
bool mptcp_subflow_data_available(struct sock *sk);
void __init mptcp_subflow_init(void);
+void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how);
+void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
+ struct mptcp_subflow_context *subflow,
+ long timeout);
+void mptcp_subflow_reset(struct sock *ssk);
/* called with sk socket lock held */
-int __mptcp_subflow_connect(struct sock *sk, int ifindex,
- const struct mptcp_addr_info *loc,
+int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
const struct mptcp_addr_info *remote);
int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock);
@@ -388,6 +410,7 @@ bool mptcp_finish_join(struct sock *sk);
void mptcp_data_acked(struct sock *sk);
void mptcp_subflow_eof(struct sock *sk);
bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit);
+void mptcp_destroy_common(struct mptcp_sock *msk);
void __init mptcp_token_init(void);
static inline void mptcp_token_init_request(struct request_sock *req)
@@ -421,26 +444,40 @@ void mptcp_pm_subflow_established(struct mptcp_sock *msk,
void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id);
void mptcp_pm_add_addr_received(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr);
+void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, u8 rm_id);
+void mptcp_pm_free_anno_list(struct mptcp_sock *msk);
+struct mptcp_pm_add_entry *
+mptcp_pm_del_add_timer(struct mptcp_sock *msk,
+ struct mptcp_addr_info *addr);
int mptcp_pm_announce_addr(struct mptcp_sock *msk,
- const struct mptcp_addr_info *addr);
+ const struct mptcp_addr_info *addr,
+ bool echo);
int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id);
-int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 remote_id);
+int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 local_id);
+
+static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk)
+{
+ return READ_ONCE(msk->pm.add_addr_signal);
+}
-static inline bool mptcp_pm_should_signal(struct mptcp_sock *msk)
+static inline bool mptcp_pm_should_rm_signal(struct mptcp_sock *msk)
{
- return READ_ONCE(msk->pm.addr_signal);
+ return READ_ONCE(msk->pm.rm_addr_signal);
}
-static inline unsigned int mptcp_add_addr_len(int family)
+static inline unsigned int mptcp_add_addr_len(int family, bool echo)
{
if (family == AF_INET)
- return TCPOLEN_MPTCP_ADD_ADDR;
- return TCPOLEN_MPTCP_ADD_ADDR6;
+ return echo ? TCPOLEN_MPTCP_ADD_ADDR_BASE
+ : TCPOLEN_MPTCP_ADD_ADDR;
+ return echo ? TCPOLEN_MPTCP_ADD_ADDR6_BASE : TCPOLEN_MPTCP_ADD_ADDR6;
}
-bool mptcp_pm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
- struct mptcp_addr_info *saddr);
+bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
+ struct mptcp_addr_info *saddr, bool *echo);
+bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
+ u8 *rm_id);
int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);
void __init mptcp_pm_nl_init(void);
@@ -448,6 +485,8 @@ void mptcp_pm_nl_data_init(struct mptcp_sock *msk);
void mptcp_pm_nl_fully_established(struct mptcp_sock *msk);
void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk);
void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk);
+void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk);
+void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, u8 rm_id);
int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);
static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb)
@@ -464,12 +503,12 @@ static inline bool before64(__u64 seq1, __u64 seq2)
void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops);
-static inline bool __mptcp_check_fallback(struct mptcp_sock *msk)
+static inline bool __mptcp_check_fallback(const struct mptcp_sock *msk)
{
return test_bit(MPTCP_FALLBACK_DONE, &msk->flags);
}
-static inline bool mptcp_check_fallback(struct sock *sk)
+static inline bool mptcp_check_fallback(const struct sock *sk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 6f035af1c9d2..ac4a1fe3550b 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -20,6 +20,7 @@
#include <net/ip6_route.h>
#endif
#include <net/mptcp.h>
+#include <uapi/linux/mptcp.h>
#include "protocol.h"
#include "mib.h"
@@ -270,6 +271,19 @@ static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
return thmac == subflow->thmac;
}
+void mptcp_subflow_reset(struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ struct sock *sk = subflow->conn;
+
+ tcp_set_state(ssk, TCP_CLOSE);
+ tcp_send_active_reset(ssk, GFP_ATOMIC);
+ tcp_done(ssk);
+ if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) &&
+ schedule_work(&mptcp_sk(sk)->work))
+ sock_hold(sk);
+}
+
static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
@@ -342,8 +356,7 @@ fallback:
return;
do_reset:
- tcp_send_active_reset(sk, GFP_ATOMIC);
- tcp_done(sk);
+ mptcp_subflow_reset(sk);
}
struct request_sock_ops mptcp_subflow_request_sock_ops;
@@ -434,7 +447,7 @@ static void mptcp_sock_destruct(struct sock *sk)
sock_orphan(sk);
}
- mptcp_token_destroy(mptcp_sk(sk));
+ mptcp_destroy_common(mptcp_sk(sk));
inet_sock_destruct(sk);
}
@@ -769,12 +782,11 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
if (!mpext->dsn64) {
map_seq = expand_seq(subflow->map_seq, subflow->map_data_len,
mpext->data_seq);
- subflow->use_64bit_ack = 0;
pr_debug("expanded seq=%llu", subflow->map_seq);
} else {
map_seq = mpext->data_seq;
- subflow->use_64bit_ack = 1;
}
+ WRITE_ONCE(mptcp_sk(subflow->conn)->use_64bit_ack, !!mpext->dsn64);
if (subflow->map_valid) {
/* Allow replacing only with an identical map */
@@ -817,16 +829,25 @@ validate_seq:
return MAPPING_OK;
}
-static int subflow_read_actor(read_descriptor_t *desc,
- struct sk_buff *skb,
- unsigned int offset, size_t len)
+static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb,
+ u64 limit)
{
- size_t copy_len = min(desc->count, len);
-
- desc->count -= copy_len;
-
- pr_debug("flushed %zu bytes, %zu left", copy_len, desc->count);
- return copy_len;
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
+ u32 incr;
+
+ incr = limit >= skb->len ? skb->len + fin : limit;
+
+ pr_debug("discarding=%d len=%d seq=%d", incr, skb->len,
+ subflow->map_subflow_seq);
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA);
+ tcp_sk(ssk)->copied_seq += incr;
+ if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq))
+ sk_eat_skb(ssk, skb);
+ if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len)
+ subflow->map_valid = 0;
+ if (incr)
+ tcp_cleanup_rbuf(ssk, incr);
}
static bool subflow_check_data_avail(struct sock *ssk)
@@ -838,13 +859,13 @@ static bool subflow_check_data_avail(struct sock *ssk)
pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk,
subflow->data_avail, skb_peek(&ssk->sk_receive_queue));
+ if (!skb_peek(&ssk->sk_receive_queue))
+ subflow->data_avail = 0;
if (subflow->data_avail)
return true;
msk = mptcp_sk(subflow->conn);
for (;;) {
- u32 map_remaining;
- size_t delta;
u64 ack_seq;
u64 old_ack;
@@ -862,6 +883,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
subflow->map_data_len = skb->len;
subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq -
subflow->ssn_offset;
+ subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
return true;
}
@@ -889,42 +911,18 @@ static bool subflow_check_data_avail(struct sock *ssk)
ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
ack_seq);
- if (ack_seq == old_ack)
+ if (ack_seq == old_ack) {
+ subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
break;
+ } else if (after64(ack_seq, old_ack)) {
+ subflow->data_avail = MPTCP_SUBFLOW_OOO_DATA;
+ break;
+ }
/* only accept in-sequence mapping. Old values are spurious
- * retransmission; we can hit "future" values on active backup
- * subflow switch, we relay on retransmissions to get
- * in-sequence data.
- * Cuncurrent subflows support will require subflow data
- * reordering
+ * retransmission
*/
- map_remaining = subflow->map_data_len -
- mptcp_subflow_get_map_offset(subflow);
- if (before64(ack_seq, old_ack))
- delta = min_t(size_t, old_ack - ack_seq, map_remaining);
- else
- delta = min_t(size_t, ack_seq - old_ack, map_remaining);
-
- /* discard mapped data */
- pr_debug("discarding %zu bytes, current map len=%d", delta,
- map_remaining);
- if (delta) {
- read_descriptor_t desc = {
- .count = delta,
- };
- int ret;
-
- ret = tcp_read_sock(ssk, &desc, subflow_read_actor);
- if (ret < 0) {
- ssk->sk_err = -ret;
- goto fatal;
- }
- if (ret < delta)
- return false;
- if (delta == map_remaining)
- subflow->map_valid = 0;
- }
+ mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq);
}
return true;
@@ -935,13 +933,13 @@ fatal:
ssk->sk_error_report(ssk);
tcp_set_state(ssk, TCP_CLOSE);
tcp_send_active_reset(ssk, GFP_ATOMIC);
+ subflow->data_avail = 0;
return false;
}
bool mptcp_subflow_data_available(struct sock *sk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
- struct sk_buff *skb;
/* check if current mapping is still valid */
if (subflow->map_valid &&
@@ -954,15 +952,7 @@ bool mptcp_subflow_data_available(struct sock *sk)
subflow->map_data_len);
}
- if (!subflow_check_data_avail(sk)) {
- subflow->data_avail = 0;
- return false;
- }
-
- skb = skb_peek(&sk->sk_receive_queue);
- subflow->data_avail = skb &&
- before(tcp_sk(sk)->copied_seq, TCP_SKB_CB(skb)->end_seq);
- return subflow->data_avail;
+ return subflow_check_data_avail(sk);
}
/* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy,
@@ -1009,8 +999,10 @@ static void subflow_write_space(struct sock *sk)
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct sock *parent = subflow->conn;
- sk_stream_write_space(sk);
- if (sk_stream_is_writeable(sk)) {
+ if (!sk_stream_is_writeable(sk))
+ return;
+
+ if (sk_stream_is_writeable(parent)) {
set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
smp_mb__after_atomic();
/* set SEND_SPACE before sk_stream_write_space clears NOSPACE */
@@ -1069,8 +1061,7 @@ static void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
#endif
}
-int __mptcp_subflow_connect(struct sock *sk, int ifindex,
- const struct mptcp_addr_info *loc,
+int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
const struct mptcp_addr_info *remote)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -1115,7 +1106,7 @@ int __mptcp_subflow_connect(struct sock *sk, int ifindex,
if (loc->family == AF_INET6)
addrlen = sizeof(struct sockaddr_in6);
#endif
- ssk->sk_bound_dev_if = ifindex;
+ ssk->sk_bound_dev_if = loc->ifindex;
err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
if (err)
goto failed;
@@ -1127,7 +1118,7 @@ int __mptcp_subflow_connect(struct sock *sk, int ifindex,
subflow->local_id = local_id;
subflow->remote_id = remote_id;
subflow->request_join = 1;
- subflow->request_bkup = 1;
+ subflow->request_bkup = !!(loc->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
mptcp_info2sockaddr(remote, &addr);
err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);