summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2014-09-29 00:13:17 -0400
committerDavid S. Miller <davem@davemloft.net>2014-09-29 00:13:17 -0400
commita11238ec28d40f56f8b939f6f125694dba3adb70 (patch)
tree3a13df46a74af91d928dc4ac5150c2815ee42207
parent53dfd501819a6e9c3a7d56cac1ddaf03fe90800d (diff)
parente3118e8359bb7c59555aca60c725106e6d78c5ce (diff)
Merge branch 'dctcp'
Daniel Borkmann says: ==================== net: tcp: DCTCP congestion control algorithm This patch series adds support for the DataCenter TCP (DCTCP) congestion control algorithm. Please see individual patches for the details. The last patch adds DCTCP as a congestion control module, and previous ones add needed infrastructure to extend the congestion control framework. Joint work between Florian Westphal, Daniel Borkmann and Glenn Judd. v3 -> v2: - No changes anywhere, just a resend as requested by Dave - Added Stephen's ACK v1 -> v2: - Rebased to latest net-next - Addressed Eric's feedback, thanks! - Update stale comment wrt. DCTCP ECN usage - Don't call INET_ECN_xmit for every packet - Add dctcp ss/inetdiag support to expose internal stats to userspace ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--Documentation/networking/dctcp.txt43
-rw-r--r--include/net/tcp.h78
-rw-r--r--include/uapi/linux/inet_diag.h13
-rw-r--r--net/ipv4/Kconfig26
-rw-r--r--net/ipv4/Makefile1
-rw-r--r--net/ipv4/tcp.c6
-rw-r--r--net/ipv4/tcp_cong.c46
-rw-r--r--net/ipv4/tcp_dctcp.c344
-rw-r--r--net/ipv4/tcp_input.c32
-rw-r--r--net/ipv4/tcp_minisocks.c5
-rw-r--r--net/ipv4/tcp_output.c30
-rw-r--r--net/ipv4/tcp_westwood.c28
12 files changed, 574 insertions, 78 deletions
diff --git a/Documentation/networking/dctcp.txt b/Documentation/networking/dctcp.txt
new file mode 100644
index 000000000000..0d5dfbc89ec9
--- /dev/null
+++ b/Documentation/networking/dctcp.txt
@@ -0,0 +1,43 @@
+DCTCP (DataCenter TCP)
+----------------------
+
+DCTCP is an enhancement to the TCP congestion control algorithm for data
+center networks and leverages Explicit Congestion Notification (ECN) in
+the data center network to provide multi-bit feedback to the end hosts.
+
+To enable it on end hosts:
+
+ sysctl -w net.ipv4.tcp_congestion_control=dctcp
+
+All switches in the data center network running DCTCP must support ECN
+marking and be configured for marking when reaching defined switch buffer
+thresholds. The default ECN marking threshold heuristic for DCTCP on
+switches is 20 packets (30KB) at 1Gbps, and 65 packets (~100KB) at 10Gbps,
+but might need further careful tweaking.
+
+For more details, see the documents below:
+
+Paper:
+
+The algorithm is further described in detail in the following two
+SIGCOMM/SIGMETRICS papers:
+
+ i) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye,
+ Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan:
+ "Data Center TCP (DCTCP)", Data Center Networks session
+ Proc. ACM SIGCOMM, New Delhi, 2010.
+ http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
+ http://www.sigcomm.org/ccr/papers/2010/October/1851275.1851192
+
+ii) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar:
+ "Analysis of DCTCP: Stability, Convergence, and Fairness"
+ Proc. ACM SIGMETRICS, San Jose, 2011.
+ http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf
+
+IETF informational draft:
+
+ http://tools.ietf.org/html/draft-bensley-tcpm-dctcp-00
+
+DCTCP site:
+
+ http://simula.stanford.edu/~alizade/Site/DCTCP.html
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 02a9a2c366bf..1f57c5363492 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -733,23 +733,6 @@ struct tcp_skb_cb {
#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
-/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
- *
- * If we receive a SYN packet with these bits set, it means a network is
- * playing bad games with TOS bits. In order to avoid possible false congestion
- * notifications, we disable TCP ECN negociation.
- */
-static inline void
-TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb,
- struct net *net)
-{
- const struct tcphdr *th = tcp_hdr(skb);
-
- if (net->ipv4.sysctl_tcp_ecn && th->ece && th->cwr &&
- INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield))
- inet_rsk(req)->ecn_ok = 1;
-}
-
/* Due to TSO, an SKB can be composed of multiple actual
* packets. To keep these tracked properly, we use this.
*/
@@ -780,8 +763,17 @@ enum tcp_ca_event {
CA_EVENT_CWND_RESTART, /* congestion window restart */
CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */
CA_EVENT_LOSS, /* loss timeout */
- CA_EVENT_FAST_ACK, /* in sequence ack */
- CA_EVENT_SLOW_ACK, /* other ack */
+ CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */
+ CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */
+ CA_EVENT_DELAYED_ACK, /* Delayed ack is sent */
+ CA_EVENT_NON_DELAYED_ACK,
+};
+
+/* Information about inbound ACK, passed to cong_ops->in_ack_event() */
+enum tcp_ca_ack_event_flags {
+ CA_ACK_SLOWPATH = (1 << 0), /* In slow path processing */
+ CA_ACK_WIN_UPDATE = (1 << 1), /* ACK updated window */
+ CA_ACK_ECE = (1 << 2), /* ECE bit is set on ack */
};
/*
@@ -791,7 +783,10 @@ enum tcp_ca_event {
#define TCP_CA_MAX 128
#define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX)
+/* Algorithm can be set on socket without CAP_NET_ADMIN privileges */
#define TCP_CONG_NON_RESTRICTED 0x1
+/* Requires ECN/ECT set on all packets */
+#define TCP_CONG_NEEDS_ECN 0x2
struct tcp_congestion_ops {
struct list_head list;
@@ -810,6 +805,8 @@ struct tcp_congestion_ops {
void (*set_state)(struct sock *sk, u8 new_state);
/* call when cwnd event occurs (optional) */
void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
+ /* call when ack arrives (optional) */
+ void (*in_ack_event)(struct sock *sk, u32 flags);
/* new value of cwnd after loss (optional) */
u32 (*undo_cwnd)(struct sock *sk);
/* hook for packet ack accounting (optional) */
@@ -824,6 +821,7 @@ struct tcp_congestion_ops {
int tcp_register_congestion_control(struct tcp_congestion_ops *type);
void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);
+void tcp_assign_congestion_control(struct sock *sk);
void tcp_init_congestion_control(struct sock *sk);
void tcp_cleanup_congestion_control(struct sock *sk);
int tcp_set_default_congestion_control(const char *name);
@@ -835,11 +833,17 @@ int tcp_set_congestion_control(struct sock *sk, const char *name);
int tcp_slow_start(struct tcp_sock *tp, u32 acked);
void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w);
-extern struct tcp_congestion_ops tcp_init_congestion_ops;
u32 tcp_reno_ssthresh(struct sock *sk);
void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
extern struct tcp_congestion_ops tcp_reno;
+static inline bool tcp_ca_needs_ecn(const struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN;
+}
+
static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state)
{
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -857,6 +861,40 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
icsk->icsk_ca_ops->cwnd_event(sk, event);
}
+/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
+ *
+ * If we receive a SYN packet with these bits set, it means a
+ * network is playing bad games with TOS bits. In order to
+ * avoid possible false congestion notifications, we disable
+ * TCP ECN negotiation.
+ *
+ * Exception: tcp_ca wants ECN. This is required for DCTCP
+ * congestion control; it requires setting ECT on all packets,
+ * including SYN. We invert the test in this case: if our
+ * local socket wants ECN, but the peer only set ece/cwr (but not
+ * ECT in the IP header), it's probably a non-DCTCP-aware sender.
+ */
+static inline void
+TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb,
+ const struct sock *listen_sk)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+ const struct net *net = sock_net(listen_sk);
+ bool th_ecn = th->ece && th->cwr;
+ bool ect, need_ecn;
+
+ if (!th_ecn)
+ return;
+
+ ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
+ need_ecn = tcp_ca_needs_ecn(listen_sk);
+
+ if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn)
+ inet_rsk(req)->ecn_ok = 1;
+ else if (ect && need_ecn)
+ inet_rsk(req)->ecn_ok = 1;
+}
+
/* These functions determine how the current flow behaves in respect of SACK
* handling. SACK is negotiated with the peer, and therefore it can vary
* between different flows.
diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index bbde90fa5838..d65c0a09efd3 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -110,10 +110,10 @@ enum {
INET_DIAG_TCLASS,
INET_DIAG_SKMEMINFO,
INET_DIAG_SHUTDOWN,
+ INET_DIAG_DCTCPINFO,
};
-#define INET_DIAG_MAX INET_DIAG_SHUTDOWN
-
+#define INET_DIAG_MAX INET_DIAG_DCTCPINFO
/* INET_DIAG_MEM */
@@ -133,5 +133,14 @@ struct tcpvegas_info {
__u32 tcpv_minrtt;
};
+/* INET_DIAG_DCTCPINFO */
+
+struct tcp_dctcp_info {
+ __u16 dctcp_enabled;
+ __u16 dctcp_ce_state;
+ __u32 dctcp_alpha;
+ __u32 dctcp_ab_ecn;
+ __u32 dctcp_ab_tot;
+};
#endif /* _UAPI_INET_DIAG_H_ */
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 84f710b7472a..69fb37854449 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -570,6 +570,27 @@ config TCP_CONG_ILLINOIS
For further details see:
http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
+config TCP_CONG_DCTCP
+ tristate "DataCenter TCP (DCTCP)"
+ default n
+ ---help---
+ DCTCP leverages Explicit Congestion Notification (ECN) in the network to
+ provide multi-bit feedback to the end hosts. It is designed to provide:
+
+ - High burst tolerance (incast due to partition/aggregate),
+ - Low latency (short flows, queries),
+ - High throughput (continuous data updates, large file transfers) with
+ commodity, shallow-buffered switches.
+
+ All switches in the data center network running DCTCP must support
+ ECN marking and be configured for marking when reaching defined switch
+ buffer thresholds. The default ECN marking threshold heuristic for
+ DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets
+ (~100KB) at 10Gbps, but might need further careful tweaking.
+
+ For further details see:
+ http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
+
choice
prompt "Default TCP congestion control"
default DEFAULT_CUBIC
@@ -598,9 +619,11 @@ choice
config DEFAULT_WESTWOOD
bool "Westwood" if TCP_CONG_WESTWOOD=y
+ config DEFAULT_DCTCP
+ bool "DCTCP" if TCP_CONG_DCTCP=y
+
config DEFAULT_RENO
bool "Reno"
-
endchoice
endif
@@ -620,6 +643,7 @@ config DEFAULT_TCP_CONG
default "westwood" if DEFAULT_WESTWOOD
default "veno" if DEFAULT_VENO
default "reno" if DEFAULT_RENO
+ default "dctcp" if DEFAULT_DCTCP
default "cubic"
config TCP_MD5SIG
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index d78d404c596f..d8105787c199 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -43,6 +43,7 @@ obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
+obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o
obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 87289e51be00..cf5e508e1ef5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -405,7 +405,7 @@ void tcp_init_sock(struct sock *sk)
tp->reordering = sysctl_tcp_reordering;
tcp_enable_early_retrans(tp);
- icsk->icsk_ca_ops = &tcp_init_congestion_ops;
+ tcp_assign_congestion_control(sk);
tp->tsoffset = 0;
@@ -3258,8 +3258,6 @@ void __init tcp_init(void)
tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
tcp_metrics_init();
-
- tcp_register_congestion_control(&tcp_reno);
-
+ BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
tcp_tasklet_init();
}
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 80248f56c89f..a6c8a5775624 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -74,24 +74,34 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
/* Assign choice of congestion control. */
-void tcp_init_congestion_control(struct sock *sk)
+void tcp_assign_congestion_control(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_congestion_ops *ca;
- /* if no choice made yet assign the current value set as default */
- if (icsk->icsk_ca_ops == &tcp_init_congestion_ops) {
- rcu_read_lock();
- list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
- if (try_module_get(ca->owner)) {
- icsk->icsk_ca_ops = ca;
- break;
- }
-
- /* fallback to next available */
+ rcu_read_lock();
+ list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
+ if (likely(try_module_get(ca->owner))) {
+ icsk->icsk_ca_ops = ca;
+ goto out;
}
- rcu_read_unlock();
+ /* Fallback to next available. The last really
+ * guaranteed fallback is Reno from this list.
+ */
}
+out:
+ rcu_read_unlock();
+
+ /* Clear out private data so that diag cannot read stale
+ * contents while the ca has not been initialized yet.
+ */
+ if (ca->get_info)
+ memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
+}
+
+void tcp_init_congestion_control(struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
if (icsk->icsk_ca_ops->init)
icsk->icsk_ca_ops->init(sk);
@@ -345,15 +355,3 @@ struct tcp_congestion_ops tcp_reno = {
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = tcp_reno_cong_avoid,
};
-
-/* Initial congestion control used (until SYN)
- * really reno under another name so we can tell difference
- * during tcp_set_default_congestion_control
- */
-struct tcp_congestion_ops tcp_init_congestion_ops = {
- .name = "",
- .owner = THIS_MODULE,
- .ssthresh = tcp_reno_ssthresh,
- .cong_avoid = tcp_reno_cong_avoid,
-};
-EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
new file mode 100644
index 000000000000..b504371af742
--- /dev/null
+++ b/net/ipv4/tcp_dctcp.c
@@ -0,0 +1,344 @@
+/* DataCenter TCP (DCTCP) congestion control.
+ *
+ * http://simula.stanford.edu/~alizade/Site/DCTCP.html
+ *
+ * This is an implementation of DCTCP over Reno, an enhancement to the
+ * TCP congestion control algorithm designed for data centers. DCTCP
+ * leverages Explicit Congestion Notification (ECN) in the network to
+ * provide multi-bit feedback to the end hosts. DCTCP's goal is to meet
+ * the following three data center transport requirements:
+ *
+ * - High burst tolerance (incast due to partition/aggregate)
+ * - Low latency (short flows, queries)
+ * - High throughput (continuous data updates, large file transfers)
+ * with commodity shallow buffered switches
+ *
+ * The algorithm is described in detail in the following two papers:
+ *
+ * 1) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye,
+ * Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan:
+ * "Data Center TCP (DCTCP)", Data Center Networks session
+ * Proc. ACM SIGCOMM, New Delhi, 2010.
+ * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
+ *
+ * 2) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar:
+ * "Analysis of DCTCP: Stability, Convergence, and Fairness"
+ * Proc. ACM SIGMETRICS, San Jose, 2011.
+ * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf
+ *
+ * Initial prototype from Abdul Kabbani, Masato Yasuda and Mohammad Alizadeh.
+ *
+ * Authors:
+ *
+ * Daniel Borkmann <dborkman@redhat.com>
+ * Florian Westphal <fw@strlen.de>
+ * Glenn Judd <glenn.judd@morganstanley.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <net/tcp.h>
+#include <linux/inet_diag.h>
+
+#define DCTCP_MAX_ALPHA 1024U
+
+struct dctcp {
+ u32 acked_bytes_ecn;
+ u32 acked_bytes_total;
+ u32 prior_snd_una;
+ u32 prior_rcv_nxt;
+ u32 dctcp_alpha;
+ u32 next_seq;
+ u32 ce_state;
+ u32 delayed_ack_reserved;
+};
+
+static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */
+module_param(dctcp_shift_g, uint, 0644);
+MODULE_PARM_DESC(dctcp_shift_g, "parameter g for updating dctcp_alpha");
+
+static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA;
+module_param(dctcp_alpha_on_init, uint, 0644);
+MODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value");
+
+static unsigned int dctcp_clamp_alpha_on_loss __read_mostly;
+module_param(dctcp_clamp_alpha_on_loss, uint, 0644);
+MODULE_PARM_DESC(dctcp_clamp_alpha_on_loss,
+ "parameter for clamping alpha on loss");
+
+static struct tcp_congestion_ops dctcp_reno;
+
+static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca)
+{
+ ca->next_seq = tp->snd_nxt;
+
+ ca->acked_bytes_ecn = 0;
+ ca->acked_bytes_total = 0;
+}
+
+static void dctcp_init(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ if ((tp->ecn_flags & TCP_ECN_OK) ||
+ (sk->sk_state == TCP_LISTEN ||
+ sk->sk_state == TCP_CLOSE)) {
+ struct dctcp *ca = inet_csk_ca(sk);
+
+ ca->prior_snd_una = tp->snd_una;
+ ca->prior_rcv_nxt = tp->rcv_nxt;
+
+ ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
+
+ ca->delayed_ack_reserved = 0;
+ ca->ce_state = 0;
+
+ dctcp_reset(tp, ca);
+ return;
+ }
+
+ /* No ECN support? Fall back to Reno. Also need to clear
+ * ECT from sk since it is set during 3WHS for DCTCP.
+ */
+ inet_csk(sk)->icsk_ca_ops = &dctcp_reno;
+ INET_ECN_dontxmit(sk);
+}
+
+static u32 dctcp_ssthresh(struct sock *sk)
+{
+ const struct dctcp *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U);
+}
+
+/* Minimal DCTCP CE state machine:
+ *
+ * S: 0 <- last pkt was non-CE
+ * 1 <- last pkt was CE
+ */
+
+static void dctcp_ce_state_0_to_1(struct sock *sk)
+{
+ struct dctcp *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* State has changed from CE=0 to CE=1 and the delayed
+ * ACK has not been sent yet.
+ */
+ if (!ca->ce_state && ca->delayed_ack_reserved) {
+ u32 tmp_rcv_nxt;
+
+ /* Save current rcv_nxt. */
+ tmp_rcv_nxt = tp->rcv_nxt;
+
+ /* Generate previous ack with CE=0. */
+ tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+ tp->rcv_nxt = ca->prior_rcv_nxt;
+
+ tcp_send_ack(sk);
+
+ /* Recover current rcv_nxt. */
+ tp->rcv_nxt = tmp_rcv_nxt;
+ }
+
+ ca->prior_rcv_nxt = tp->rcv_nxt;
+ ca->ce_state = 1;
+
+ tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+}
+
+static void dctcp_ce_state_1_to_0(struct sock *sk)
+{
+ struct dctcp *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* State has changed from CE=1 to CE=0 and the delayed
+ * ACK has not been sent yet.
+ */
+ if (ca->ce_state && ca->delayed_ack_reserved) {
+ u32 tmp_rcv_nxt;
+
+ /* Save current rcv_nxt. */
+ tmp_rcv_nxt = tp->rcv_nxt;
+
+ /* Generate previous ack with CE=1. */
+ tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+ tp->rcv_nxt = ca->prior_rcv_nxt;
+
+ tcp_send_ack(sk);
+
+ /* Recover current rcv_nxt. */
+ tp->rcv_nxt = tmp_rcv_nxt;
+ }
+
+ ca->prior_rcv_nxt = tp->rcv_nxt;
+ ca->ce_state = 0;
+
+ tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+}
+
+static void dctcp_update_alpha(struct sock *sk, u32 flags)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct dctcp *ca = inet_csk_ca(sk);
+ u32 acked_bytes = tp->snd_una - ca->prior_snd_una;
+
+ /* If ack did not advance snd_una, count dupack as MSS size.
+ * If ack did update window, do not count it at all.
+ */
+ if (acked_bytes == 0 && !(flags & CA_ACK_WIN_UPDATE))
+ acked_bytes = inet_csk(sk)->icsk_ack.rcv_mss;
+ if (acked_bytes) {
+ ca->acked_bytes_total += acked_bytes;
+ ca->prior_snd_una = tp->snd_una;
+
+ if (flags & CA_ACK_ECE)
+ ca->acked_bytes_ecn += acked_bytes;
+ }
+
+ /* Expired RTT */
+ if (!before(tp->snd_una, ca->next_seq)) {
+ /* Avoid a zero denominator in the division below. */
+ if (ca->acked_bytes_total == 0)
+ ca->acked_bytes_total = 1;
+
+ /* alpha = (1 - g) * alpha + g * F */
+ ca->dctcp_alpha = ca->dctcp_alpha -
+ (ca->dctcp_alpha >> dctcp_shift_g) +
+ (ca->acked_bytes_ecn << (10U - dctcp_shift_g)) /
+ ca->acked_bytes_total;
+
+ if (ca->dctcp_alpha > DCTCP_MAX_ALPHA)
+ /* Clamp dctcp_alpha to max. */
+ ca->dctcp_alpha = DCTCP_MAX_ALPHA;
+
+ dctcp_reset(tp, ca);
+ }
+}
+
+static void dctcp_state(struct sock *sk, u8 new_state)
+{
+ if (dctcp_clamp_alpha_on_loss && new_state == TCP_CA_Loss) {
+ struct dctcp *ca = inet_csk_ca(sk);
+
+ /* If this extension is enabled, we clamp dctcp_alpha to
+ * max on packet loss; the motivation is that dctcp_alpha
+ * is an indicator of the extent of congestion and packet
+ * loss is an indicator of extreme congestion; setting
+ * this in practice turned out to be beneficial, and
+ * effectively assumes total congestion which reduces the
+ * window by half.
+ */
+ ca->dctcp_alpha = DCTCP_MAX_ALPHA;
+ }
+}
+
+static void dctcp_update_ack_reserved(struct sock *sk, enum tcp_ca_event ev)
+{
+ struct dctcp *ca = inet_csk_ca(sk);
+
+ switch (ev) {
+ case CA_EVENT_DELAYED_ACK:
+ if (!ca->delayed_ack_reserved)
+ ca->delayed_ack_reserved = 1;
+ break;
+ case CA_EVENT_NON_DELAYED_ACK:
+ if (ca->delayed_ack_reserved)
+ ca->delayed_ack_reserved = 0;
+ break;
+ default:
+ /* Don't care for the rest. */
+ break;
+ }
+}
+
+static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
+{
+ switch (ev) {
+ case CA_EVENT_ECN_IS_CE:
+ dctcp_ce_state_0_to_1(sk);
+ break;
+ case CA_EVENT_ECN_NO_CE:
+ dctcp_ce_state_1_to_0(sk);
+ break;
+ case CA_EVENT_DELAYED_ACK:
+ case CA_EVENT_NON_DELAYED_ACK:
+ dctcp_update_ack_reserved(sk, ev);
+ break;
+ default:
+ /* Don't care for the rest. */
+ break;
+ }
+}
+
+static void dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
+{
+ const struct dctcp *ca = inet_csk_ca(sk);
+
+ /* Fill it also in case of VEGASINFO due to req struct limits.
+ * We can still correctly retrieve it later.
+ */
+ if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) ||
+ ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+ struct tcp_dctcp_info info;
+
+ memset(&info, 0, sizeof(info));
+ if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) {
+ info.dctcp_enabled = 1;
+ info.dctcp_ce_state = (u16) ca->ce_state;
+ info.dctcp_alpha = ca->dctcp_alpha;
+ info.dctcp_ab_ecn = ca->acked_bytes_ecn;
+ info.dctcp_ab_tot = ca->acked_bytes_total;
+ }
+
+ nla_put(skb, INET_DIAG_DCTCPINFO, sizeof(info), &info);
+ }
+}
+
+static struct tcp_congestion_ops dctcp __read_mostly = {
+ .init = dctcp_init,
+ .in_ack_event = dctcp_update_alpha,
+ .cwnd_event = dctcp_cwnd_event,
+ .ssthresh = dctcp_ssthresh,
+ .cong_avoid = tcp_reno_cong_avoid,
+ .set_state = dctcp_state,
+ .get_info = dctcp_get_info,
+ .flags = TCP_CONG_NEEDS_ECN,
+ .owner = THIS_MODULE,
+ .name = "dctcp",
+};
+
+static struct tcp_congestion_ops dctcp_reno __read_mostly = {
+ .ssthresh = tcp_reno_ssthresh,
+ .cong_avoid = tcp_reno_cong_avoid,
+ .get_info = dctcp_get_info,
+ .owner = THIS_MODULE,
+ .name = "dctcp-reno",
+};
+
+static int __init dctcp_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE);
+ return tcp_register_congestion_control(&dctcp);
+}
+
+static void __exit dctcp_unregister(void)
+{
+ tcp_unregister_congestion_control(&dctcp);
+}
+
+module_init(dctcp_register);
+module_exit(dctcp_unregister);
+
+MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_AUTHOR("Glenn Judd <glenn.judd@morganstanley.com>");
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("DataCenter TCP (DCTCP)");
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5073eefa6fae..fc133178c787 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -233,14 +233,21 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s
tcp_enter_quickack_mode((struct sock *)tp);
break;
case INET_ECN_CE:
+ if (tcp_ca_needs_ecn((struct sock *)tp))
+ tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE);
+
if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
/* Better not delay acks, sender can have a very low cwnd */
tcp_enter_quickack_mode((struct sock *)tp);
tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
}
- /* fallinto */
+ tp->ecn_flags |= TCP_ECN_SEEN;
+ break;
default:
+ if (tcp_ca_needs_ecn((struct sock *)tp))
+ tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE);
tp->ecn_flags |= TCP_ECN_SEEN;
+ break;
}
}
@@ -3362,6 +3369,14 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
}
}
+static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ca_ops->in_ack_event)
+ icsk->icsk_ca_ops->in_ack_event(sk, flags);
+}
+
/* This routine deals with incoming acks, but not outgoing ones. */
static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
{
@@ -3421,10 +3436,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
tp->snd_una = ack;
flag |= FLAG_WIN_UPDATE;
- tcp_ca_event(sk, CA_EVENT_FAST_ACK);
+ tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS);
} else {
+ u32 ack_ev_flags = CA_ACK_SLOWPATH;
+
if (ack_seq != TCP_SKB_CB(skb)->end_seq)
flag |= FLAG_DATA;
else
@@ -3436,10 +3453,15 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
&sack_rtt_us);
- if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
+ if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) {
flag |= FLAG_ECE;
+ ack_ev_flags |= CA_ACK_ECE;
+ }
+
+ if (flag & FLAG_WIN_UPDATE)
+ ack_ev_flags |= CA_ACK_WIN_UPDATE;
- tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
+ tcp_in_ack_event(sk, ack_ev_flags);
}
/* We passed data and got it acked, remove any soft error
@@ -5944,7 +5966,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
goto drop_and_free;
if (!want_cookie || tmp_opt.tstamp_ok)
- TCP_ECN_create_request(req, skb, sock_net(sk));
+ TCP_ECN_create_request(req, skb, sk);
if (want_cookie) {
isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index a058f411d3a6..47b73506b77e 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -451,9 +451,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->snd_cwnd = TCP_INIT_CWND;
newtp->snd_cwnd_cnt = 0;
- if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops &&
- !try_module_get(newicsk->icsk_ca_ops->owner))
- newicsk->icsk_ca_ops = &tcp_init_congestion_ops;
+ if (!try_module_get(newicsk->icsk_ca_ops->owner))
+ tcp_assign_congestion_control(newsk);
tcp_set_ca_state(newsk, TCP_CA_Open);
tcp_init_xmit_timers(newsk);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 4d92703df4c6..86a0216fcaa1 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -318,11 +318,15 @@ static u16 tcp_select_window(struct sock *sk)
}
/* Packet ECN state for a SYN-ACK */
-static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb)
+static inline void TCP_ECN_send_synack(struct sock *sk, struct sk_buff *skb)
{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
if (!(tp->ecn_flags & TCP_ECN_OK))
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
+ else if (tcp_ca_needs_ecn(sk))
+ INET_ECN_xmit(sk);
}
/* Packet ECN state for a SYN. */
@@ -331,17 +335,24 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
struct tcp_sock *tp = tcp_sk(sk);
tp->ecn_flags = 0;
- if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) {
+ if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
+ tcp_ca_needs_ecn(sk)) {
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
tp->ecn_flags = TCP_ECN_OK;
+ if (tcp_ca_needs_ecn(sk))
+ INET_ECN_xmit(sk);
}
}
static __inline__ void
-TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th)
+TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th,
+ struct sock *sk)
{
- if (inet_rsk(req)->ecn_ok)
+ if (inet_rsk(req)->ecn_ok) {
th->ece = 1;
+ if (tcp_ca_needs_ecn(sk))
+ INET_ECN_xmit(sk);
+ }
}
/* Set up ECN state for a packet on a ESTABLISHED socket that is about to
@@ -362,7 +373,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
tcp_hdr(skb)->cwr = 1;
skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
}
- } else {
+ } else if (!tcp_ca_needs_ecn(sk)) {
/* ACK or retransmitted segment: clear ECT|CE */
INET_ECN_dontxmit(sk);
}
@@ -2789,7 +2800,7 @@ int tcp_send_synack(struct sock *sk)
}
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
- TCP_ECN_send_synack(tcp_sk(sk), skb);
+ TCP_ECN_send_synack(sk, skb);
}
return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}
@@ -2848,7 +2859,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
memset(th, 0, sizeof(struct tcphdr));
th->syn = 1;
th->ack = 1;
- TCP_ECN_make_synack(req, th);
+ TCP_ECN_make_synack(req, th, sk);
th->source = htons(ireq->ir_num);
th->dest = ireq->ir_rmt_port;
/* Setting of flags are superfluous here for callers (and ECE is
@@ -3119,6 +3130,8 @@ void tcp_send_delayed_ack(struct sock *sk)
int ato = icsk->icsk_ack.ato;
unsigned long timeout;
+ tcp_ca_event(sk, CA_EVENT_DELAYED_ACK);
+
if (ato > TCP_DELACK_MIN) {
const struct tcp_sock *tp = tcp_sk(sk);
int max_ato = HZ / 2;
@@ -3175,6 +3188,8 @@ void tcp_send_ack(struct sock *sk)
if (sk->sk_state == TCP_CLOSE)
return;
+ tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK);
+
/* We are not putting this on the write queue, so
* tcp_transmit_skb() will set the ownership to this
* sock.
@@ -3196,6 +3211,7 @@ void tcp_send_ack(struct sock *sk)
skb_mstamp_get(&buff->skb_mstamp);
tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
}
+EXPORT_SYMBOL_GPL(tcp_send_ack);
/* This routine sends a packet with an out of date sequence
* number. It assumes the other end will try to ack it.
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 81911a92356c..bb63fba47d47 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -220,32 +220,35 @@ static u32 tcp_westwood_bw_rttmin(const struct sock *sk)
return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
}
+static void tcp_westwood_ack(struct sock *sk, u32 ack_flags)
+{
+ if (ack_flags & CA_ACK_SLOWPATH) {
+ struct westwood *w = inet_csk_ca(sk);
+
+ westwood_update_window(sk);
+ w->bk += westwood_acked_count(sk);
+
+ update_rtt_min(w);
+ return;
+ }
+
+ westwood_fast_bw(sk);
+}
+
static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
{
struct tcp_sock *tp = tcp_sk(sk);
struct westwood *w = inet_csk_ca(sk);
switch (event) {
- case CA_EVENT_FAST_ACK:
- westwood_fast_bw(sk);
- break;
-
case CA_EVENT_COMPLETE_CWR:
tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
break;
-
case CA_EVENT_LOSS:
tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
/* Update RTT_min when next ack arrives */
w->reset_rtt_min = 1;
break;
-
- case CA_EVENT_SLOW_ACK:
- westwood_update_window(sk);
- w->bk += westwood_acked_count(sk);
- update_rtt_min(w);
- break;
-
default:
/* don't care */
break;
@@ -274,6 +277,7 @@ static struct tcp_congestion_ops tcp_westwood __read_mostly = {
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = tcp_reno_cong_avoid,
.cwnd_event = tcp_westwood_event,
+ .in_ack_event = tcp_westwood_ack,
.get_info = tcp_westwood_info,
.pkts_acked = tcp_westwood_pkts_acked,