From c5c6a8ab45ec0f18733afb4aaade0d4a139d80b3 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 5 Jan 2015 23:57:46 +0100 Subject: net: tcp: add key management to congestion control This patch adds the necessary infrastructure to the congestion control framework for later per-route congestion control support. For a per-route congestion control possibility, our aim is to store a unique u32 key identifier into dst metrics, which can then be mapped into a tcp_congestion_ops struct. We argue that having an RTAX key entry is the simplest, most generic and easiest way to manage, and it also keeps the memory footprint of dst entries lower on 64 bit than storing a pointer directly, for example. Having a unique key id also allows for decoupling actual TCP congestion control module management from the FIB layer, i.e. we don't have to care about expensive module refcounting inside the FIB at this point. We first thought of using an IDR store for the realization, which handles dynamic assignment of unused key space and also performs the key-to-pointer mapping under RCU. While doing so, we stumbled upon the issue that, due to the nature of dynamic key distribution, excessive module loads and unloads can, on arguably very rare occasions, lead to reuse of previously used key space. Stale keys in the dst metric would then be reassigned to a different congestion control algorithm, which might lead to unexpected behaviour. One way to resolve this would have been to walk FIBs on the admittedly rare occasion of a module unload and reset the metric keys for each FIB in each netns, but that's just very costly. Therefore, we argue a better solution is to reuse the unique congestion control algorithm name member and map that into u32 key space through jhash. For that, we split the flags attribute (as it currently uses only 2 bits anyway) into two u32 attributes, flags and key, so that we can keep the cacheline boundary of 2 cachelines on x86_64 and cache the precalculated key at registration time for the fast path. On average we might expect 2-4 modules being loaded, worst case perhaps 15, so the possibility of a key collision is extremely low, and it is guaranteed collision-free on LE/BE for all in-tree modules. Overall this results in much simpler code, and all without the overhead of an IDR. Due to the deterministic nature of the key, modules can now be unloaded; the congestion control algorithm for a specific but unloaded key falls back to the default one, and on module reload it switches back to the expected algorithm transparently. Joint work with Florian Westphal. Signed-off-by: Florian Westphal Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller
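To make the name-to-key mapping concrete, here is a small userspace sketch; a Jenkins-style one-at-a-time hash stands in for the kernel's jhash(), the algorithm names are only examples, and the point is that hashing the fixed-size name field yields the same key before and after a module unload/reload:

/* Userspace sketch of the deterministic name -> u32 key mapping.
 * The kernel hashes the fixed-size ca->name with jhash() at registration
 * time; a Jenkins one-at-a-time hash stands in for jhash() here.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TCP_CA_NAME_MAX 16

struct ca_entry {
	char name[TCP_CA_NAME_MAX];
	uint32_t key;
};

static uint32_t name_to_key(const char *name)
{
	char buf[TCP_CA_NAME_MAX] = { 0 };
	uint32_t h = 0;
	size_t i;

	/* Hash the whole fixed-size field, as the patch does with jhash(),
	 * so the same name always yields the same key. */
	strncpy(buf, name, TCP_CA_NAME_MAX - 1);
	for (i = 0; i < sizeof(buf); i++) {
		h += (unsigned char)buf[i];
		h += h << 10;
		h ^= h >> 6;
	}
	h += h << 3;
	h ^= h >> 11;
	h += h << 15;
	return h;
}

int main(void)
{
	struct ca_entry cas[] = { { "reno" }, { "cubic" }, { "dctcp" } };
	size_t i;

	for (i = 0; i < sizeof(cas) / sizeof(cas[0]); i++) {
		cas[i].key = name_to_key(cas[i].name);
		/* A dst entry would carry this u32 in its RTAX metrics
		 * instead of a pointer; lookup is a plain key compare. */
		printf("%-8s -> key 0x%08" PRIx32 "\n", cas[i].name, cas[i].key);
	}
	return 0;
}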
--- include/net/tcp.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index f50f29faf76f..135b70c9a734 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -787,6 +787,8 @@ enum tcp_ca_ack_event_flags { #define TCP_CA_MAX 128 #define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX) +#define TCP_CA_UNSPEC 0 + /* Algorithm can be set on socket without CAP_NET_ADMIN privileges */ #define TCP_CONG_NON_RESTRICTED 0x1 /* Requires ECN/ECT set on all packets */ @@ -794,7 +796,8 @@ enum tcp_ca_ack_event_flags { struct tcp_congestion_ops { struct list_head list; - unsigned long flags; + u32 key; + u32 flags; /* initialize private data (optional) */ void (*init)(struct sock *sk); @@ -841,6 +844,10 @@ u32 tcp_reno_ssthresh(struct sock *sk); void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); extern struct tcp_congestion_ops tcp_reno; +struct tcp_congestion_ops *tcp_ca_find_key(u32 key); +u32 tcp_ca_get_key_by_name(const char *name); +char *tcp_ca_get_name_by_key(u32 key, char *buffer); + static inline bool tcp_ca_needs_ecn(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); -- cgit v1.2.3-70-g09d2 From ea697639992d96da98016b8934e68a73876a2264 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 5 Jan 2015 23:57:47 +0100 Subject: net: tcp: add RTAX_CC_ALGO fib handling This patch adds the minimum necessary for the RTAX_CC_ALGO congestion control metric to be set up and dumped back to user space. While the internal representation of RTAX_CC_ALGO is handled as a u32 key, we avoided exposing this implementation detail to user space; instead, the netlink attribute exchanged with user space carries the actual congestion control algorithm name, similar to the setsockopt(2) API, in order to allow maximum flexibility, even for 3rd party modules. It is a bit unfortunate that RTAX_QUICKACK used up a whole RTAX slot, as it should have been stored in RTAX_FEATURES instead; we first thought about reusing it for the congestion control key, but that would bring more complications and/or confusion than it is worth. Joint work with Florian Westphal. Signed-off-by: Florian Westphal Signed-off-by: Daniel Borkmann Signed-off-by: David S.
Miller --- include/net/tcp.h | 7 +++++++ include/uapi/linux/rtnetlink.h | 2 ++ net/core/rtnetlink.c | 15 +++++++++++++-- net/decnet/dn_fib.c | 3 ++- net/decnet/dn_table.c | 4 +++- net/ipv4/fib_semantics.c | 14 ++++++++++++-- net/ipv6/route.c | 17 +++++++++++++++-- 7 files changed, 54 insertions(+), 8 deletions(-) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index 135b70c9a734..95bb237152e0 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -846,7 +846,14 @@ extern struct tcp_congestion_ops tcp_reno; struct tcp_congestion_ops *tcp_ca_find_key(u32 key); u32 tcp_ca_get_key_by_name(const char *name); +#ifdef CONFIG_INET char *tcp_ca_get_name_by_key(u32 key, char *buffer); +#else +static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) +{ + return NULL; +} +#endif static inline bool tcp_ca_needs_ecn(const struct sock *sk) { diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 9c9b8b4480cd..d81f22d5b390 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -389,6 +389,8 @@ enum { #define RTAX_INITRWND RTAX_INITRWND RTAX_QUICKACK, #define RTAX_QUICKACK RTAX_QUICKACK + RTAX_CC_ALGO, +#define RTAX_CC_ALGO RTAX_CC_ALGO __RTAX_MAX }; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index da983d4bac02..6a6cdade1676 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -669,9 +670,19 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) for (i = 0; i < RTAX_MAX; i++) { if (metrics[i]) { + if (i == RTAX_CC_ALGO - 1) { + char tmp[TCP_CA_NAME_MAX], *name; + + name = tcp_ca_get_name_by_key(metrics[i], tmp); + if (!name) + continue; + if (nla_put_string(skb, i + 1, name)) + goto nla_put_failure; + } else { + if (nla_put_u32(skb, i + 1, metrics[i])) + goto nla_put_failure; + } valid++; - if (nla_put_u32(skb, i+1, metrics[i])) - goto nla_put_failure; } } diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c index d332aefb0846..df4803437888 100644 --- a/net/decnet/dn_fib.c +++ b/net/decnet/dn_fib.c @@ -298,7 +298,8 @@ struct dn_fib_info *dn_fib_create_info(const struct rtmsg *r, struct nlattr *att int type = nla_type(attr); if (type) { - if (type > RTAX_MAX || nla_len(attr) < 4) + if (type > RTAX_MAX || type == RTAX_CC_ALGO || + nla_len(attr) < 4) goto err_inval; fi->fib_metrics[type-1] = nla_get_u32(attr); diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c index 86e3807052e9..3f19fcbf126d 100644 --- a/net/decnet/dn_table.c +++ b/net/decnet/dn_table.c @@ -29,6 +29,7 @@ #include /* RTF_xxx */ #include #include +#include #include #include #include @@ -273,7 +274,8 @@ static inline size_t dn_fib_nlmsg_size(struct dn_fib_info *fi) size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(4) /* RTA_TABLE */ + nla_total_size(2) /* RTA_DST */ - + nla_total_size(4); /* RTA_PRIORITY */ + + nla_total_size(4) /* RTA_PRIORITY */ + + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */ /* space for nested metrics */ payload += nla_total_size((RTAX_MAX * nla_total_size(4))); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index f99f41bd15b8..d2b7b5521b1b 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -360,7 +360,8 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi) + nla_total_size(4) /* RTA_TABLE */ + nla_total_size(4) /* RTA_DST */ + nla_total_size(4) /* RTA_PRIORITY */ - + nla_total_size(4); /* RTA_PREFSRC */ + + nla_total_size(4) 
/* RTA_PREFSRC */ + + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */ /* space for nested metrics */ payload += nla_total_size((RTAX_MAX * nla_total_size(4))); @@ -859,7 +860,16 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (type > RTAX_MAX) goto err_inval; - val = nla_get_u32(nla); + if (type == RTAX_CC_ALGO) { + char tmp[TCP_CA_NAME_MAX]; + + nla_strlcpy(tmp, nla, sizeof(tmp)); + val = tcp_ca_get_key_by_name(tmp); + if (val == TCP_CA_UNSPEC) + goto err_inval; + } else { + val = nla_get_u32(nla); + } if (type == RTAX_ADVMSS && val > 65535 - 40) val = 65535 - 40; if (type == RTAX_MTU && val > 65535 - 15) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 454771d20b21..34dcbb59df75 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1488,10 +1488,22 @@ static int ip6_convert_metrics(struct mx6_config *mxc, int type = nla_type(nla); if (type) { + u32 val; + if (unlikely(type > RTAX_MAX)) goto err; + if (type == RTAX_CC_ALGO) { + char tmp[TCP_CA_NAME_MAX]; + + nla_strlcpy(tmp, nla, sizeof(tmp)); + val = tcp_ca_get_key_by_name(tmp); + if (val == TCP_CA_UNSPEC) + goto err; + } else { + val = nla_get_u32(nla); + } - mp[type - 1] = nla_get_u32(nla); + mp[type - 1] = val; __set_bit(type - 1, mxc->mx_valid); } } @@ -2571,7 +2583,8 @@ static inline size_t rt6_nlmsg_size(void) + nla_total_size(4) /* RTA_OIF */ + nla_total_size(4) /* RTA_PRIORITY */ + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ - + nla_total_size(sizeof(struct rta_cacheinfo)); + + nla_total_size(sizeof(struct rta_cacheinfo)) + + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */ } static int rt6_fill_node(struct net *net, -- cgit v1.2.3-70-g09d2 From 81164413ad096bafe8ad1068f3f095a7dd081d8b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 5 Jan 2015 23:57:48 +0100 Subject: net: tcp: add per route congestion control This work adds the possibility to define a per route/destination congestion control algorithm. Generally, this opens up the possibility for a machine with different links to enforce specific congestion control algorithms with optimal strategies for each of them based on their network characteristics, even transparently for a single application listening on all links. For our specific use case, this additionally facilitates deployment of DCTCP; for example, applications can easily serve internal traffic/dsts with DCTCP and external ones with CUBIC. Other scenarios would also allow for utilizing e.g. long-living, low-priority background flows for certain destinations/routes while normal traffic is still able to use the default congestion control algorithm. We also thought about a per-netns setting (where different defaults are possible), but given it's actually a link-specific property, we argue that a per route/destination setting is the most natural and flexible. The administrator can utilize this through ip-route(8) by appending "congctl [lock] <name>", where <name> denotes the name of a congestion control algorithm and the optional lock parameter allows enforcing the given algorithm, so that applications in user space would not be allowed to overwrite it for that destination. The dst metric lookups are done when a dst entry is already available, in order to avoid a costly lookup, and still before the algorithms are initialized; thus the overhead is very low when the feature is not being used. While the client side would need to drop the current reference on the module, on the server side this can actually even be avoided, as we just got a flat-copied socket clone.
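As a side note on the administrative interface, a route would be set up e.g. with "ip route add 10.1.0.0/16 dev eth0 congctl dctcp", or with "congctl lock dctcp" to enforce it (syntax as described above; the prefix and device are placeholders). The per-socket override that the lock guards is the long-standing TCP_CONGESTION socket option; a minimal userspace sketch of that override follows.

/* Minimal sketch of the per-socket override that the route-level
 * "lock" flag described above restricts: the standard TCP_CONGESTION
 * socket option. For destinations whose route carries "congctl lock",
 * the commit message says user space may not overwrite the algorithm;
 * this only demonstrates the plain, unlocked case.
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	const char ca[] = "cubic";	/* any loaded algorithm name */
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, ca, strlen(ca)) < 0)
		perror("setsockopt(TCP_CONGESTION)");
	else
		printf("socket set to %s (a locked route would still win)\n", ca);
	close(fd);
	return 0;
}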
Joint work with Florian Westphal. Suggested-by: Hannes Frederic Sowa Signed-off-by: Florian Westphal Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/tcp.h | 6 ++++++ net/ipv4/tcp_ipv4.c | 2 ++ net/ipv4/tcp_minisocks.c | 30 ++++++++++++++++++++++++++---- net/ipv4/tcp_output.c | 21 +++++++++++++++++++++ net/ipv6/tcp_ipv6.c | 2 ++ 5 files changed, 57 insertions(+), 4 deletions(-) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index 95bb237152e0..b8fdc6bab3f3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -448,6 +448,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb); struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb); +void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst); struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst); @@ -636,6 +637,11 @@ static inline u32 tcp_rto_min_us(struct sock *sk) return jiffies_to_usecs(tcp_rto_min(sk)); } +static inline bool tcp_ca_dst_locked(const struct dst_entry *dst) +{ + return dst_metric_locked(dst, RTAX_CC_ALGO); +} + /* Compute the actual receive window we are currently advertising. * Rcv_nxt can be after the window if our peer push more data * than the offered window. diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a3f72d7fc06c..ad3e65bdd368 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1340,6 +1340,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, } sk_setup_caps(newsk, dst); + tcp_ca_openreq_child(newsk, dst); + tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = dst_metric_advmss(dst); if (tcp_sk(sk)->rx_opt.user_mss && diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 63d2680b65db..bc9216dc9de1 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -399,6 +399,32 @@ static void tcp_ecn_openreq_child(struct tcp_sock *tp, tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0; } +void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); + bool ca_got_dst = false; + + if (ca_key != TCP_CA_UNSPEC) { + const struct tcp_congestion_ops *ca; + + rcu_read_lock(); + ca = tcp_ca_find_key(ca_key); + if (likely(ca && try_module_get(ca->owner))) { + icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); + icsk->icsk_ca_ops = ca; + ca_got_dst = true; + } + rcu_read_unlock(); + } + + if (!ca_got_dst && !try_module_get(icsk->icsk_ca_ops->owner)) + tcp_assign_congestion_control(sk); + + tcp_set_ca_state(sk, TCP_CA_Open); +} +EXPORT_SYMBOL_GPL(tcp_ca_openreq_child); + /* This is not only more efficient than what we used to do, it eliminates * a lot of code duplication between IPv4/IPv6 SYN recv processing. 
-DaveM * @@ -451,10 +477,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->snd_cwnd = TCP_INIT_CWND; newtp->snd_cwnd_cnt = 0; - if (!try_module_get(newicsk->icsk_ca_ops->owner)) - tcp_assign_congestion_control(newsk); - - tcp_set_ca_state(newsk, TCP_CA_Open); tcp_init_xmit_timers(newsk); __skb_queue_head_init(&newtp->out_of_order_queue); newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 7f18262e2326..dc30cb563e4f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2939,6 +2939,25 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, } EXPORT_SYMBOL(tcp_make_synack); +static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + const struct tcp_congestion_ops *ca; + u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); + + if (ca_key == TCP_CA_UNSPEC) + return; + + rcu_read_lock(); + ca = tcp_ca_find_key(ca_key); + if (likely(ca && try_module_get(ca->owner))) { + module_put(icsk->icsk_ca_ops->owner); + icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); + icsk->icsk_ca_ops = ca; + } + rcu_read_unlock(); +} + /* Do all connect socket setups that can be done AF independent. */ static void tcp_connect_init(struct sock *sk) { @@ -2964,6 +2983,8 @@ static void tcp_connect_init(struct sock *sk) tcp_mtup_init(sk); tcp_sync_mss(sk, dst_mtu(dst)); + tcp_ca_dst_init(sk, dst); + if (!tp->window_clamp) tp->window_clamp = dst_metric(dst, RTAX_WINDOW); tp->advmss = dst_metric_advmss(dst); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 9c0b54e87b47..5d46832c6f72 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1199,6 +1199,8 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen + newnp->opt->opt_flen); + tcp_ca_openreq_child(newsk, dst); + tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = dst_metric_advmss(dst); if (tcp_sk(sk)->rx_opt.user_mss && -- cgit v1.2.3-70-g09d2 From 9878196578286c5ed494778ada01da094377a686 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 3 Feb 2015 18:31:53 -0800 Subject: tcp: do not pace pure ack packets When we added pacing to TCP, we decided to let sch_fq take care of actual pacing. All TCP had to do was to compute sk->pacing_rate using simple formula: sk->pacing_rate = 2 * cwnd * mss / rtt It works well for senders (bulk flows), but not very well for receivers or even RPC : cwnd on the receiver can be less than 10, rtt can be around 100ms, so we can end up pacing ACK packets, slowing down the sender. Really, only the sender should pace, according to its own logic. Instead of adding a new bit in skb, or call yet another flow dissection, we tweak skb->truesize to a small value (2), and we instruct sch_fq to use new helper and not pace pure ack. Note this also helps TCP small queue, as ack packets present in qdisc/NIC do not prevent sending a data packet (RPC workload) This helps to reduce tx completion overhead, ack packets can use regular sock_wfree() instead of tcp_wfree() which is a bit more expensive. This has no impact in the case packets are sent to loopback interface, as we do not coalesce ack packets (were we would detect skb->truesize lie) In case netem (with a delay) is used, skb_orphan_partial() also sets skb->truesize to 1. This patch is a combination of two patches we used for about one year at Google. 
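To see why pacing on the receiver side hurts, plug the receiver-like numbers from the commit message into the quoted formula; a tiny illustrative sketch (values are examples only):

/* Worked example of sk->pacing_rate = 2 * cwnd * mss / rtt from the
 * commit message, with receiver-like numbers (cwnd ~10, rtt ~100ms).
 * The resulting rate is tiny, so pacing pure ACKs at it would delay
 * them and slow the sender down. Values are illustrative.
 */
#include <stdio.h>

int main(void)
{
	unsigned int cwnd = 10;		/* small cwnd, typical on a receiver */
	unsigned int mss = 1460;	/* bytes */
	double rtt = 0.100;		/* 100 ms expressed in seconds */
	double rate = 2.0 * cwnd * mss / rtt;

	printf("pacing_rate = %.0f bytes/sec (~%.0f KB/s)\n", rate, rate / 1024.0);
	return 0;
}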
Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 15 +++++++++++++++ net/ipv4/tcp_output.c | 10 +++++++++- net/sched/sch_fq.c | 10 ++++++++-- 3 files changed, 32 insertions(+), 3 deletions(-) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index b8fdc6bab3f3..637ee490ec81 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1713,4 +1713,19 @@ static inline struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) return dopt; } +/* locally generated TCP pure ACKs have skb->truesize == 2 + * (check tcp_send_ack() in net/ipv4/tcp_output.c ) + * This is much faster than dissecting the packet to find out. + * (Think of GRE encapsulations, IPv4, IPv6, ...) + */ +static inline bool skb_is_tcp_pure_ack(const struct sk_buff *skb) +{ + return skb->truesize == 2; +} + +static inline void skb_set_tcp_pure_ack(struct sk_buff *skb) +{ + skb->truesize = 2; +} + #endif /* _TCP_H */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 20ab06b228ac..1b326ed46f7b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -948,7 +948,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, skb_orphan(skb); skb->sk = sk; - skb->destructor = tcp_wfree; + skb->destructor = skb_is_tcp_pure_ack(skb) ? sock_wfree : tcp_wfree; skb_set_hash_from_sk(skb, sk); atomic_add(skb->truesize, &sk->sk_wmem_alloc); @@ -3265,6 +3265,14 @@ void tcp_send_ack(struct sock *sk) skb_reserve(buff, MAX_TCP_HEADER); tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK); + /* We do not want pure acks influencing TCP Small Queues or fq/pacing + * too much. + * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784 + * We also avoid tcp_wfree() overhead (cache line miss accessing + * tp->tsq_flags) by using regular sock_wfree() + */ + skb_set_tcp_pure_ack(buff); + /* Send it off, this clears delayed acks for us. */ skb_mstamp_get(&buff->skb_mstamp); tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 2a50f5c62070..69a3dbf55c60 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -52,6 +52,7 @@ #include #include #include +#include /* * Per flow structure, dynamically allocated @@ -445,7 +446,9 @@ begin: goto begin; } - if (unlikely(f->head && now < f->time_next_packet)) { + skb = f->head; + if (unlikely(skb && now < f->time_next_packet && + !skb_is_tcp_pure_ack(skb))) { head->first = f->next; fq_flow_set_throttled(q, f); goto begin; @@ -464,12 +467,15 @@ begin: goto begin; } prefetch(&skb->end); - f->time_next_packet = now; f->credit -= qdisc_pkt_len(skb); if (f->credit > 0 || !q->rate_enable) goto out; + /* Do not pace locally generated ack packets */ + if (skb_is_tcp_pure_ack(skb)) + goto out; + rate = q->flow_max_rate; if (skb->sk) rate = min(skb->sk->sk_pacing_rate, rate); -- cgit v1.2.3-70-g09d2 From 032ee4236954eb214651cb9bfc1b38ffa8fd7a01 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 6 Feb 2015 16:04:38 -0500 Subject: tcp: helpers to mitigate ACK loops by rate-limiting out-of-window dupacks Helpers for mitigating ACK loops by rate-limiting dupacks sent in response to incoming out-of-window packets. 
This patch includes: - rate-limiting logic - sysctl to control how often we allow dupacks to out-of-window packets - SNMP counter for cases where we rate-limited our dupack sending The rate-limiting logic in this patch decides to not send dupacks in response to out-of-window segments if (a) they are SYNs or pure ACKs and (b) the remote endpoint is sending them faster than the configured rate limit. We rate-limit our responses rather than blocking them entirely or resetting the connection, because legitimate connections can rely on dupacks in response to some out-of-window segments. For example, zero window probes are typically sent with a sequence number that is below the current window, and ZWPs thus expect to thus elicit a dupack in response. We allow dupacks in response to TCP segments with data, because these may be spurious retransmissions for which the remote endpoint wants to receive DSACKs. This is safe because segments with data can't realistically be part of ACK loops, which by their nature consist of each side sending pure/data-less ACKs to each other. The dupack interval is controlled by a new sysctl knob, tcp_invalid_ratelimit, given in milliseconds, in case an administrator needs to dial this upward in the face of a high-rate DoS attack. The name and units are chosen to be analogous to the existing analogous knob for ICMP, icmp_ratelimit. The default value for tcp_invalid_ratelimit is 500ms, which allows at most one such dupack per 500ms. This is chosen to be 2x faster than the 1-second minimum RTO interval allowed by RFC 6298 (section 2, rule 2.4). We allow the extra 2x factor because network delay variations can cause packets sent at 1 second intervals to be compressed and arrive much closer. Reported-by: Avery Fay Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 22 ++++++++++++++++++++++ include/net/tcp.h | 32 ++++++++++++++++++++++++++++++++ include/uapi/linux/snmp.h | 6 ++++++ net/ipv4/proc.c | 6 ++++++ net/ipv4/sysctl_net_ipv4.c | 7 +++++++ net/ipv4/tcp_input.c | 1 + 6 files changed, 74 insertions(+) (limited to 'include/net/tcp.h') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index a5e4c813f17f..1b8c964b0d17 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -290,6 +290,28 @@ tcp_frto - INTEGER By default it's enabled with a non-zero value. 0 disables F-RTO. +tcp_invalid_ratelimit - INTEGER + Limit the maximal rate for sending duplicate acknowledgments + in response to incoming TCP packets that are for an existing + connection but that are invalid due to any of these reasons: + + (a) out-of-window sequence number, + (b) out-of-window acknowledgment number, or + (c) PAWS (Protection Against Wrapped Sequence numbers) check failure + + This can help mitigate simple "ack loop" DoS attacks, wherein + a buggy or malicious middlebox or man-in-the-middle can + rewrite TCP header fields in manner that causes each endpoint + to think that the other is sending invalid TCP segments, thus + causing each side to send an unterminating stream of duplicate + acknowledgments for invalid segments. + + Using 0 disables rate-limiting of dupacks in response to + invalid segments; otherwise this value specifies the minimal + space between sending such dupacks, in milliseconds. + + Default: 500 (milliseconds). 
+ tcp_keepalive_time - INTEGER How often TCP sends out keepalive messages when keepalive is enabled. Default: 2hours. diff --git a/include/net/tcp.h b/include/net/tcp.h index 28e9bd3abceb..b81f45c67b2e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -274,6 +274,7 @@ extern int sysctl_tcp_challenge_ack_limit; extern unsigned int sysctl_tcp_notsent_lowat; extern int sysctl_tcp_min_tso_segs; extern int sysctl_tcp_autocorking; +extern int sysctl_tcp_invalid_ratelimit; extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; @@ -1236,6 +1237,37 @@ static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt, return true; } +/* Return true if we're currently rate-limiting out-of-window ACKs and + * thus shouldn't send a dupack right now. We rate-limit dupacks in + * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS + * attacks that send repeated SYNs or ACKs for the same connection. To + * do this, we do not send a duplicate SYNACK or ACK if the remote + * endpoint is sending out-of-window SYNs or pure ACKs at a high rate. + */ +static inline bool tcp_oow_rate_limited(struct net *net, + const struct sk_buff *skb, + int mib_idx, u32 *last_oow_ack_time) +{ + /* Data packets without SYNs are not likely part of an ACK loop. */ + if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) && + !tcp_hdr(skb)->syn) + goto not_rate_limited; + + if (*last_oow_ack_time) { + s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time); + + if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { + NET_INC_STATS_BH(net, mib_idx); + return true; /* rate-limited: don't send yet! */ + } + } + + *last_oow_ack_time = tcp_time_stamp; + +not_rate_limited: + return false; /* not rate-limited: go ahead, send dupack now! 
*/ +} + static inline void tcp_mib_init(struct net *net) { /* See RFC 2012 */ diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index b22224100011..6a6fb747c78d 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -270,6 +270,12 @@ enum LINUX_MIB_TCPHYSTARTTRAINCWND, /* TCPHystartTrainCwnd */ LINUX_MIB_TCPHYSTARTDELAYDETECT, /* TCPHystartDelayDetect */ LINUX_MIB_TCPHYSTARTDELAYCWND, /* TCPHystartDelayCwnd */ + LINUX_MIB_TCPACKSKIPPEDSYNRECV, /* TCPACKSkippedSynRecv */ + LINUX_MIB_TCPACKSKIPPEDPAWS, /* TCPACKSkippedPAWS */ + LINUX_MIB_TCPACKSKIPPEDSEQ, /* TCPACKSkippedSeq */ + LINUX_MIB_TCPACKSKIPPEDFINWAIT2, /* TCPACKSkippedFinWait2 */ + LINUX_MIB_TCPACKSKIPPEDTIMEWAIT, /* TCPACKSkippedTimeWait */ + LINUX_MIB_TCPACKSKIPPEDCHALLENGE, /* TCPACKSkippedChallenge */ __LINUX_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 8f9cd200ce20..d8953ef0770c 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -292,6 +292,12 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPHystartTrainCwnd", LINUX_MIB_TCPHYSTARTTRAINCWND), SNMP_MIB_ITEM("TCPHystartDelayDetect", LINUX_MIB_TCPHYSTARTDELAYDETECT), SNMP_MIB_ITEM("TCPHystartDelayCwnd", LINUX_MIB_TCPHYSTARTDELAYCWND), + SNMP_MIB_ITEM("TCPACKSkippedSynRecv", LINUX_MIB_TCPACKSKIPPEDSYNRECV), + SNMP_MIB_ITEM("TCPACKSkippedPAWS", LINUX_MIB_TCPACKSKIPPEDPAWS), + SNMP_MIB_ITEM("TCPACKSkippedSeq", LINUX_MIB_TCPACKSKIPPEDSEQ), + SNMP_MIB_ITEM("TCPACKSkippedFinWait2", LINUX_MIB_TCPACKSKIPPEDFINWAIT2), + SNMP_MIB_ITEM("TCPACKSkippedTimeWait", LINUX_MIB_TCPACKSKIPPEDTIMEWAIT), + SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index e0ee384a448f..82601a68cf90 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -728,6 +728,13 @@ static struct ctl_table ipv4_table[] = { .extra1 = &zero, .extra2 = &one, }, + { + .procname = "tcp_invalid_ratelimit", + .data = &sysctl_tcp_invalid_ratelimit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, { .procname = "icmp_msgs_per_sec", .data = &sysctl_icmp_msgs_per_sec, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d3dfff78fa19..9401aa43b814 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -100,6 +100,7 @@ int sysctl_tcp_thin_dupack __read_mostly; int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; int sysctl_tcp_early_retrans __read_mostly = 3; +int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ -- cgit v1.2.3-70-g09d2 From a9b2c06dbef48ed31cff1764c5ce824829106f4f Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 6 Feb 2015 16:04:39 -0500 Subject: tcp: mitigate ACK loops for connections as tcp_request_sock In the SYN_RECV state, where the TCP connection is represented by tcp_request_sock, we now rate-limit SYNACKs in response to a client's retransmitted SYNs: we do not send a SYNACK in response to client SYN if it has been less than sysctl_tcp_invalid_ratelimit (default 500ms) since we last sent a SYNACK in response to a client's retransmitted SYN. This allows the vast majority of legitimate client connections to proceed unimpeded, even for the most aggressive platforms, iOS and MacOS, which actually retransmit SYNs 1-second intervals for several times in a row. 
They use SYN RTO timeouts following the progression: 1,1,1,1,1,2,4,8,16,32. Reported-by: Avery Fay Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + include/net/tcp.h | 1 + net/ipv4/tcp_minisocks.c | 6 +++++- 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/net/tcp.h') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 67309ece0772..bcc828d3b9b9 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -115,6 +115,7 @@ struct tcp_request_sock { u32 rcv_isn; u32 snt_isn; u32 snt_synack; /* synack sent time */ + u32 last_oow_ack_time; /* last SYNACK */ u32 rcv_nxt; /* the ack # by SYNACK. For * FastOpen it's the seq# * after data-in-SYN. diff --git a/include/net/tcp.h b/include/net/tcp.h index b81f45c67b2e..da4196fb78db 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1145,6 +1145,7 @@ static inline void tcp_openreq_init(struct request_sock *req, tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; tcp_rsk(req)->snt_synack = tcp_time_stamp; + tcp_rsk(req)->last_oow_ack_time = 0; req->mss = rx_opt->mss_clamp; req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; ireq->tstamp_ok = rx_opt->tstamp_ok; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index bc9216dc9de1..131aa4950d1c 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -605,7 +605,11 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * Reset timer after retransmitting SYNACK, similar to * the idea of fast retransmit in recovery. */ - if (!inet_rtx_syn_ack(sk, req)) + if (!tcp_oow_rate_limited(sock_net(sk), skb, + LINUX_MIB_TCPACKSKIPPEDSYNRECV, + &tcp_rsk(req)->last_oow_ack_time) && + + !inet_rtx_syn_ack(sk, req)) req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX) + jiffies; return NULL; -- cgit v1.2.3-70-g09d2 From b0f9ca53cbb103e9240a29a974e0b6085e58f9f7 Mon Sep 17 00:00:00 2001 From: Fan Du Date: Tue, 10 Feb 2015 09:53:16 +0800 Subject: ipv4: Namespecify TCP PMTU mechanism Packetization Layer Path MTU Discovery works separately alongside Path MTU Discovery at the IP level, and different net namespaces have different requirements on which one to choose; e.g., a virtualized container instance would require TCP PMTU to probe a usable effective MTU for the underlying tunnel, while the host would employ classical ICMP-based PMTU to function. Hence, make the TCP PMTU mechanism per net namespace to decouple the two functionalities. Furthermore, the probe base MSS should also be configured separately for each namespace. Signed-off-by: Fan Du Signed-off-by: David S.
Miller --- include/net/netns/ipv4.h | 2 ++ include/net/tcp.h | 2 -- net/ipv4/sysctl_net_ipv4.c | 28 ++++++++++++++-------------- net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_output.c | 8 +++----- net/ipv4/tcp_timer.c | 7 +++++-- 6 files changed, 25 insertions(+), 23 deletions(-) (limited to 'include/net/tcp.h') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index e0bdcb147326..dbe225478adb 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -82,6 +82,8 @@ struct netns_ipv4 { int sysctl_fwmark_reflect; int sysctl_tcp_fwmark_accept; + int sysctl_tcp_mtu_probing; + int sysctl_tcp_base_mss; struct ping_group_range ping_group_range; diff --git a/include/net/tcp.h b/include/net/tcp.h index da4196fb78db..8d6b983d5099 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -262,8 +262,6 @@ extern int sysctl_tcp_low_latency; extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; -extern int sysctl_tcp_mtu_probing; -extern int sysctl_tcp_base_mss; extern int sysctl_tcp_workaround_signed_windows; extern int sysctl_tcp_slow_start_after_idle; extern int sysctl_tcp_thin_linear_timeouts; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 82601a68cf90..d151539da8e6 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -603,20 +603,6 @@ static struct ctl_table ipv4_table[] = { .maxlen = TCP_CA_NAME_MAX, .proc_handler = proc_tcp_congestion_control, }, - { - .procname = "tcp_mtu_probing", - .data = &sysctl_tcp_mtu_probing, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "tcp_base_mss", - .data = &sysctl_tcp_base_mss, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "tcp_workaround_signed_windows", .data = &sysctl_tcp_workaround_signed_windows, @@ -883,6 +869,20 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "tcp_mtu_probing", + .data = &init_net.ipv4.sysctl_tcp_mtu_probing, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_base_mss", + .data = &init_net.ipv4.sysctl_tcp_base_mss, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 67bc95fb5d9e..5a2dfed4783b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2459,6 +2459,7 @@ static int __net_init tcp_sk_init(struct net *net) *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; } net->ipv4.sysctl_tcp_ecn = 2; + net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; return 0; fail: diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 4fcc9a768849..a2a796c5536b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -59,9 +59,6 @@ int sysctl_tcp_limit_output_bytes __read_mostly = 131072; */ int sysctl_tcp_tso_win_divisor __read_mostly = 3; -int sysctl_tcp_mtu_probing __read_mostly = 0; -int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS; - /* By default, RFC2861 behavior. 
*/ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; @@ -1350,11 +1347,12 @@ void tcp_mtup_init(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct net *net = sock_net(sk); - icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1; + icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1; icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) + icsk->icsk_af_ops->net_header_len; - icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss); + icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss); icsk->icsk_mtup.probe_size = 0; } EXPORT_SYMBOL(tcp_mtup_init); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 1829c7fbc77e..0732b787904e 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -101,17 +101,20 @@ static int tcp_orphan_retries(struct sock *sk, int alive) static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) { + struct net *net = sock_net(sk); + /* Black hole detection */ - if (sysctl_tcp_mtu_probing) { + if (net->ipv4.sysctl_tcp_mtu_probing) { if (!icsk->icsk_mtup.enabled) { icsk->icsk_mtup.enabled = 1; tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } else { + struct net *net = sock_net(sk); struct tcp_sock *tp = tcp_sk(sk); int mss; mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1; - mss = min(sysctl_tcp_base_mss, mss); + mss = min(net->ipv4.sysctl_tcp_base_mss, mss); mss = max(mss, 68 - tp->tcp_header_len); icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); -- cgit v1.2.3-70-g09d2
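Since tcp_mtu_probing and tcp_base_mss are now evaluated per network namespace, a quick way to check what the current namespace sees is to read the usual sysctl files; a small userspace sketch, assuming the standard /proc/sys paths (running it in different namespaces, e.g. via ip netns exec, should show the per-namespace values):

/* Sketch: read the (now per-netns) TCP PMTU probing knobs.
 * After the patch above, the values reported depend on the network
 * namespace the process runs in; the /proc/sys paths are the standard
 * sysctl files for these knobs.
 */
#include <stdio.h>

static void show(const char *path)
{
	char buf[64];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%s = %s", path, buf);
	fclose(f);
}

int main(void)
{
	show("/proc/sys/net/ipv4/tcp_mtu_probing");
	show("/proc/sys/net/ipv4/tcp_base_mss");
	return 0;
}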