summaryrefslogtreecommitdiff
path: root/net/ipv4/tcp.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-11-15 11:56:19 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2017-11-15 11:56:19 -0800
commit5bbcc0f595fadb4cac0eddc4401035ec0bd95b09 (patch)
tree3b65e490cc36a6c6fecac1fa24d9e0ac9ced4455 /net/ipv4/tcp.c
parent892204e06cb9e89fbc4b299a678f9ca358e97cac (diff)
parent50895b9de1d3e0258e015e8e55128d835d9a9f19 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller: "Highlights: 1) Maintain the TCP retransmit queue using an rbtree, with 1GB windows at 100Gb this really has become necessary. From Eric Dumazet. 2) Multi-program support for cgroup+bpf, from Alexei Starovoitov. 3) Perform broadcast flooding in hardware in mv88e6xxx, from Andrew Lunn. 4) Add meter action support to openvswitch, from Andy Zhou. 5) Add a data meta pointer for BPF accessible packets, from Daniel Borkmann. 6) Namespace-ify almost all TCP sysctl knobs, from Eric Dumazet. 7) Turn on Broadcom Tags in b53 driver, from Florian Fainelli. 8) More work to move the RTNL mutex down, from Florian Westphal. 9) Add 'bpftool' utility, to help with bpf program introspection. From Jakub Kicinski. 10) Add new 'cpumap' type for XDP_REDIRECT action, from Jesper Dangaard Brouer. 11) Support 'blocks' of transformations in the packet scheduler which can span multiple network devices, from Jiri Pirko. 12) TC flower offload support in cxgb4, from Kumar Sanghvi. 13) Priority based stream scheduler for SCTP, from Marcelo Ricardo Leitner. 14) Thunderbolt networking driver, from Amir Levy and Mika Westerberg. 15) Add RED qdisc offloadability, and use it in mlxsw driver. From Nogah Frankel. 16) eBPF based device controller for cgroup v2, from Roman Gushchin. 17) Add some fundamental tracepoints for TCP, from Song Liu. 18) Remove garbage collection from ipv6 route layer, this is a significant accomplishment. From Wei Wang. 19) Add multicast route offload support to mlxsw, from Yotam Gigi" * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (2177 commits) tcp: highest_sack fix geneve: fix fill_info when link down bpf: fix lockdep splat net: cdc_ncm: GetNtbFormat endian fix openvswitch: meter: fix NULL pointer dereference in ovs_meter_cmd_reply_start netem: remove unnecessary 64 bit modulus netem: use 64 bit divide by rate tcp: Namespace-ify sysctl_tcp_default_congestion_control net: Protect iterations over net::fib_notifier_ops in fib_seq_sum() ipv6: set all.accept_dad to 0 by default uapi: fix linux/tls.h userspace compilation error usbnet: ipheth: prevent TX queue timeouts when device not ready vhost_net: conditionally enable tx polling uapi: fix linux/rxrpc.h userspace compilation errors net: stmmac: fix LPI transitioning for dwmac4 atm: horizon: Fix irq release error net-sysfs: trigger netlink notification on ifalias change via sysfs openvswitch: Using kfree_rcu() to simplify the code openvswitch: Make local function ovs_nsh_key_attr_size() static openvswitch: Fix return value check in ovs_meter_cmd_features() ...
Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r--net/ipv4/tcp.c167
1 files changed, 132 insertions, 35 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5091402720ab..bf97317e6c97 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -270,6 +270,7 @@
#include <linux/time.h>
#include <linux/slab.h>
#include <linux/errqueue.h>
+#include <linux/static_key.h>
#include <net/icmp.h>
#include <net/inet_common.h>
@@ -282,24 +283,22 @@
#include <asm/ioctls.h>
#include <net/busy_poll.h>
-int sysctl_tcp_min_tso_segs __read_mostly = 2;
-
-int sysctl_tcp_autocorking __read_mostly = 1;
+#include <trace/events/tcp.h>
struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);
long sysctl_tcp_mem[3] __read_mostly;
-int sysctl_tcp_wmem[3] __read_mostly;
-int sysctl_tcp_rmem[3] __read_mostly;
-
EXPORT_SYMBOL(sysctl_tcp_mem);
-EXPORT_SYMBOL(sysctl_tcp_rmem);
-EXPORT_SYMBOL(sysctl_tcp_wmem);
atomic_long_t tcp_memory_allocated; /* Current allocated memory. */
EXPORT_SYMBOL(tcp_memory_allocated);
+#if IS_ENABLED(CONFIG_SMC)
+DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
+EXPORT_SYMBOL(tcp_have_smc);
+#endif
+
/*
* Current number of TCP sockets.
*/
@@ -413,8 +412,10 @@ void tcp_init_sock(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
tp->out_of_order_queue = RB_ROOT;
+ sk->tcp_rtx_queue = RB_ROOT;
tcp_init_xmit_timers(sk);
INIT_LIST_HEAD(&tp->tsq_node);
+ INIT_LIST_HEAD(&tp->tsorted_sent_queue);
icsk->icsk_rto = TCP_TIMEOUT_INIT;
tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
@@ -441,6 +442,7 @@ void tcp_init_sock(struct sock *sk)
tcp_assign_congestion_control(sk);
tp->tsoffset = 0;
+ tp->rack.reo_wnd_steps = 1;
sk->sk_state = TCP_CLOSE;
@@ -449,15 +451,29 @@ void tcp_init_sock(struct sock *sk)
icsk->icsk_sync_mss = tcp_sync_mss;
- sk->sk_sndbuf = sysctl_tcp_wmem[1];
- sk->sk_rcvbuf = sysctl_tcp_rmem[1];
+ sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1];
+ sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
sk_sockets_allocated_inc(sk);
}
EXPORT_SYMBOL(tcp_init_sock);
-static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb)
+void tcp_init_transfer(struct sock *sk, int bpf_op)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ tcp_mtup_init(sk);
+ icsk->icsk_af_ops->rebuild_header(sk);
+ tcp_init_metrics(sk);
+ tcp_call_bpf(sk, bpf_op);
+ tcp_init_congestion_control(sk);
+ tcp_init_buffer_space(sk);
+}
+
+static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
+{
+ struct sk_buff *skb = tcp_write_queue_tail(sk);
+
if (tsflags && skb) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
@@ -675,7 +691,7 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
int size_goal)
{
return skb->len < size_goal &&
- sysctl_tcp_autocorking &&
+ sock_net(sk)->ipv4.sysctl_tcp_autocorking &&
skb != tcp_write_queue_head(sk) &&
refcount_read(&sk->sk_wmem_alloc) > skb->truesize;
}
@@ -686,10 +702,9 @@ static void tcp_push(struct sock *sk, int flags, int mss_now,
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
- if (!tcp_send_head(sk))
- return;
-
skb = tcp_write_queue_tail(sk);
+ if (!skb)
+ return;
if (!(flags & MSG_MORE) || forced_push(tp))
tcp_mark_push(tp, skb);
@@ -869,6 +884,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
* available to the caller, no more, no less.
*/
skb->reserved_tailroom = skb->end - skb->tail - size;
+ INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
return skb;
}
__kfree_skb(skb);
@@ -948,14 +964,14 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
int copy, i;
bool can_coalesce;
- if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0 ||
+ if (!skb || (copy = size_goal - skb->len) <= 0 ||
!tcp_skb_can_collapse_to(skb)) {
new_segment:
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
- skb_queue_empty(&sk->sk_write_queue));
+ tcp_rtx_and_write_queues_empty(sk));
if (!skb)
goto wait_for_memory;
@@ -1027,7 +1043,7 @@ wait_for_memory:
out:
if (copied) {
- tcp_tx_timestamp(sk, sk->sk_tsflags, tcp_write_queue_tail(sk));
+ tcp_tx_timestamp(sk, sk->sk_tsflags);
if (!(flags & MSG_SENDPAGE_NOTLAST))
tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
}
@@ -1126,7 +1142,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
struct sockaddr *uaddr = msg->msg_name;
int err, flags;
- if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) ||
+ if (!(sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) ||
(uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
uaddr->sa_family == AF_UNSPEC))
return -EOPNOTSUPP;
@@ -1183,7 +1199,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
goto out_err;
}
- skb = tcp_send_head(sk) ? tcp_write_queue_tail(sk) : NULL;
+ skb = tcp_write_queue_tail(sk);
uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
if (!uarg) {
err = -ENOBUFS;
@@ -1259,7 +1275,7 @@ restart:
int max = size_goal;
skb = tcp_write_queue_tail(sk);
- if (tcp_send_head(sk)) {
+ if (skb) {
if (skb->ip_summed == CHECKSUM_NONE)
max = mss_now;
copy = max - skb->len;
@@ -1279,7 +1295,7 @@ new_segment:
process_backlog = false;
goto restart;
}
- first_skb = skb_queue_empty(&sk->sk_write_queue);
+ first_skb = tcp_rtx_and_write_queues_empty(sk);
skb = sk_stream_alloc_skb(sk,
select_size(sk, sg, first_skb),
sk->sk_allocation,
@@ -1404,7 +1420,7 @@ wait_for_memory:
out:
if (copied) {
- tcp_tx_timestamp(sk, sockc.tsflags, tcp_write_queue_tail(sk));
+ tcp_tx_timestamp(sk, sockc.tsflags);
tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
}
out_nopush:
@@ -1505,6 +1521,13 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
/* XXX -- need to support SO_PEEK_OFF */
+ skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
+ err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
+ if (err)
+ return err;
+ copied += skb->len;
+ }
+
skb_queue_walk(&sk->sk_write_queue, skb) {
err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
if (err)
@@ -2017,6 +2040,8 @@ void tcp_set_state(struct sock *sk, int state)
{
int oldstate = sk->sk_state;
+ trace_tcp_set_state(sk, oldstate, state);
+
switch (state) {
case TCP_ESTABLISHED:
if (oldstate != TCP_ESTABLISHED)
@@ -2304,6 +2329,37 @@ static inline bool tcp_need_reset(int state)
TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
}
+static void tcp_rtx_queue_purge(struct sock *sk)
+{
+ struct rb_node *p = rb_first(&sk->tcp_rtx_queue);
+
+ while (p) {
+ struct sk_buff *skb = rb_to_skb(p);
+
+ p = rb_next(p);
+ /* Since we are deleting whole queue, no need to
+ * list_del(&skb->tcp_tsorted_anchor)
+ */
+ tcp_rtx_queue_unlink(skb, sk);
+ sk_wmem_free_skb(sk, skb);
+ }
+}
+
+void tcp_write_queue_purge(struct sock *sk)
+{
+ struct sk_buff *skb;
+
+ tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
+ while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
+ tcp_skb_tsorted_anchor_cleanup(skb);
+ sk_wmem_free_skb(sk, skb);
+ }
+ tcp_rtx_queue_purge(sk);
+ INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
+ sk_mem_reclaim(sk);
+ tcp_clear_all_retrans_hints(tcp_sk(sk));
+}
+
int tcp_disconnect(struct sock *sk, int flags)
{
struct inet_sock *inet = inet_sk(sk);
@@ -2362,7 +2418,6 @@ int tcp_disconnect(struct sock *sk, int flags)
* issue in __tcp_select_window()
*/
icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
- tcp_init_send_head(sk);
memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
__sk_dst_reset(sk);
dst_release(sk->sk_rx_dst);
@@ -2454,8 +2509,6 @@ static int tcp_repair_options_est(struct sock *sk,
return -EINVAL;
tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
- if (sysctl_tcp_fack)
- tcp_enable_fack(tp);
break;
case TCPOPT_TIMESTAMP:
if (opt.opt_val != 0)
@@ -2518,6 +2571,17 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
release_sock(sk);
return err;
}
+ case TCP_FASTOPEN_KEY: {
+ __u8 key[TCP_FASTOPEN_KEY_LENGTH];
+
+ if (optlen != sizeof(key))
+ return -EINVAL;
+
+ if (copy_from_user(key, optval, optlen))
+ return -EFAULT;
+
+ return tcp_fastopen_reset_cipher(net, sk, key, sizeof(key));
+ }
default:
/* fallthru */
break;
@@ -2749,7 +2813,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
case TCP_FASTOPEN:
if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
TCPF_LISTEN))) {
- tcp_fastopen_init_key_once(true);
+ tcp_fastopen_init_key_once(net);
fastopen_queue_tune(sk, val);
} else {
@@ -2759,7 +2823,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
case TCP_FASTOPEN_CONNECT:
if (val > 1 || val < 0) {
err = -EINVAL;
- } else if (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
+ } else if (net->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
if (sk->sk_state == TCP_CLOSE)
tp->fastopen_connect = val;
else
@@ -2768,6 +2832,14 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
err = -EOPNOTSUPP;
}
break;
+ case TCP_FASTOPEN_NO_COOKIE:
+ if (val > 1 || val < 0)
+ err = -EINVAL;
+ else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
+ err = -EINVAL;
+ else
+ tp->fastopen_no_cookie = val;
+ break;
case TCP_TIMESTAMP:
if (!tp->repair)
err = -EPERM;
@@ -2905,7 +2977,6 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_lost = tp->lost_out;
info->tcpi_retrans = tp->retrans_out;
- info->tcpi_fackets = tp->fackets_out;
now = tcp_jiffies32;
info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
@@ -3104,6 +3175,28 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
return -EFAULT;
return 0;
+ case TCP_FASTOPEN_KEY: {
+ __u8 key[TCP_FASTOPEN_KEY_LENGTH];
+ struct tcp_fastopen_context *ctx;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ rcu_read_lock();
+ ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx);
+ if (ctx)
+ memcpy(key, ctx->key, sizeof(key));
+ else
+ len = 0;
+ rcu_read_unlock();
+
+ len = min_t(unsigned int, len, sizeof(key));
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, key, len))
+ return -EFAULT;
+ return 0;
+ }
case TCP_THIN_LINEAR_TIMEOUTS:
val = tp->thin_lto;
break;
@@ -3166,6 +3259,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = tp->fastopen_connect;
break;
+ case TCP_FASTOPEN_NO_COOKIE:
+ val = tp->fastopen_no_cookie;
+ break;
+
case TCP_TIMESTAMP:
val = tcp_time_stamp_raw() + tp->tsoffset;
break;
@@ -3531,13 +3628,13 @@ void __init tcp_init(void)
max_wshare = min(4UL*1024*1024, limit);
max_rshare = min(6UL*1024*1024, limit);
- sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
- sysctl_tcp_wmem[1] = 16*1024;
- sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
+ init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
+ init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
+ init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
- sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
- sysctl_tcp_rmem[1] = 87380;
- sysctl_tcp_rmem[2] = max(87380, max_rshare);
+ init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
+ init_net.ipv4.sysctl_tcp_rmem[1] = 87380;
+ init_net.ipv4.sysctl_tcp_rmem[2] = max(87380, max_rshare);
pr_info("Hash tables configured (established %u bind %u)\n",
tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);