summaryrefslogtreecommitdiff
path: root/include/net/tcp.h
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-11-15 11:56:19 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2017-11-15 11:56:19 -0800
commit5bbcc0f595fadb4cac0eddc4401035ec0bd95b09 (patch)
tree3b65e490cc36a6c6fecac1fa24d9e0ac9ced4455 /include/net/tcp.h
parent892204e06cb9e89fbc4b299a678f9ca358e97cac (diff)
parent50895b9de1d3e0258e015e8e55128d835d9a9f19 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller: "Highlights: 1) Maintain the TCP retransmit queue using an rbtree, with 1GB windows at 100Gb this really has become necessary. From Eric Dumazet. 2) Multi-program support for cgroup+bpf, from Alexei Starovoitov. 3) Perform broadcast flooding in hardware in mv88e6xxx, from Andrew Lunn. 4) Add meter action support to openvswitch, from Andy Zhou. 5) Add a data meta pointer for BPF accessible packets, from Daniel Borkmann. 6) Namespace-ify almost all TCP sysctl knobs, from Eric Dumazet. 7) Turn on Broadcom Tags in b53 driver, from Florian Fainelli. 8) More work to move the RTNL mutex down, from Florian Westphal. 9) Add 'bpftool' utility, to help with bpf program introspection. From Jakub Kicinski. 10) Add new 'cpumap' type for XDP_REDIRECT action, from Jesper Dangaard Brouer. 11) Support 'blocks' of transformations in the packet scheduler which can span multiple network devices, from Jiri Pirko. 12) TC flower offload support in cxgb4, from Kumar Sanghvi. 13) Priority based stream scheduler for SCTP, from Marcelo Ricardo Leitner. 14) Thunderbolt networking driver, from Amir Levy and Mika Westerberg. 15) Add RED qdisc offloadability, and use it in mlxsw driver. From Nogah Frankel. 16) eBPF based device controller for cgroup v2, from Roman Gushchin. 17) Add some fundamental tracepoints for TCP, from Song Liu. 18) Remove garbage collection from ipv6 route layer, this is a significant accomplishment. From Wei Wang. 19) Add multicast route offload support to mlxsw, from Yotam Gigi" * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (2177 commits) tcp: highest_sack fix geneve: fix fill_info when link down bpf: fix lockdep splat net: cdc_ncm: GetNtbFormat endian fix openvswitch: meter: fix NULL pointer dereference in ovs_meter_cmd_reply_start netem: remove unnecessary 64 bit modulus netem: use 64 bit divide by rate tcp: Namespace-ify sysctl_tcp_default_congestion_control net: Protect iterations over net::fib_notifier_ops in fib_seq_sum() ipv6: set all.accept_dad to 0 by default uapi: fix linux/tls.h userspace compilation error usbnet: ipheth: prevent TX queue timeouts when device not ready vhost_net: conditionally enable tx polling uapi: fix linux/rxrpc.h userspace compilation errors net: stmmac: fix LPI transitioning for dwmac4 atm: horizon: Fix irq release error net-sysfs: trigger netlink notification on ifalias change via sysfs openvswitch: Using kfree_rcu() to simplify the code openvswitch: Make local function ovs_nsh_key_attr_size() static openvswitch: Fix return value check in ovs_meter_cmd_features() ...
Diffstat (limited to 'include/net/tcp.h')
-rw-r--r--include/net/tcp.h224
1 files changed, 86 insertions, 138 deletions
diff --git a/include/net/tcp.h b/include/net/tcp.h
index e6d0002a1b0b..85ea578195d4 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -45,9 +45,6 @@
#include <linux/seq_file.h>
#include <linux/memcontrol.h>
-
-#include <linux/bpf.h>
-#include <linux/filter.h>
#include <linux/bpf-cgroup.h>
extern struct inet_hashinfo tcp_hashinfo;
@@ -191,6 +188,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
* experimental options. See draft-ietf-tcpm-experimental-options-00.txt
*/
#define TCPOPT_FASTOPEN_MAGIC 0xF989
+#define TCPOPT_SMC_MAGIC 0xE2D4C3D9
/*
* TCP option lengths
@@ -203,6 +201,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
#define TCPOLEN_MD5SIG 18
#define TCPOLEN_FASTOPEN_BASE 2
#define TCPOLEN_EXP_FASTOPEN_BASE 4
+#define TCPOLEN_EXP_SMC_BASE 6
/* But this is what stacks really send out. */
#define TCPOLEN_TSTAMP_ALIGNED 12
@@ -213,6 +212,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
#define TCPOLEN_SACK_PERBLOCK 8
#define TCPOLEN_MD5SIG_ALIGNED 20
#define TCPOLEN_MSS_ALIGNED 4
+#define TCPOLEN_EXP_SMC_BASE_ALIGNED 8
/* Flags in tp->nonagle */
#define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */
@@ -240,41 +240,11 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
/* sysctl variables for tcp */
-extern int sysctl_tcp_fastopen;
-extern int sysctl_tcp_retrans_collapse;
-extern int sysctl_tcp_stdurg;
-extern int sysctl_tcp_rfc1337;
-extern int sysctl_tcp_abort_on_overflow;
extern int sysctl_tcp_max_orphans;
-extern int sysctl_tcp_fack;
-extern int sysctl_tcp_reordering;
-extern int sysctl_tcp_max_reordering;
-extern int sysctl_tcp_dsack;
extern long sysctl_tcp_mem[3];
-extern int sysctl_tcp_wmem[3];
-extern int sysctl_tcp_rmem[3];
-extern int sysctl_tcp_app_win;
-extern int sysctl_tcp_adv_win_scale;
-extern int sysctl_tcp_frto;
-extern int sysctl_tcp_nometrics_save;
-extern int sysctl_tcp_moderate_rcvbuf;
-extern int sysctl_tcp_tso_win_divisor;
-extern int sysctl_tcp_workaround_signed_windows;
-extern int sysctl_tcp_slow_start_after_idle;
-extern int sysctl_tcp_thin_linear_timeouts;
-extern int sysctl_tcp_thin_dupack;
-extern int sysctl_tcp_early_retrans;
-extern int sysctl_tcp_recovery;
-#define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */
-extern int sysctl_tcp_limit_output_bytes;
-extern int sysctl_tcp_challenge_ack_limit;
-extern int sysctl_tcp_min_tso_segs;
-extern int sysctl_tcp_min_rtt_wlen;
-extern int sysctl_tcp_autocorking;
-extern int sysctl_tcp_invalid_ratelimit;
-extern int sysctl_tcp_pacing_ss_ratio;
-extern int sysctl_tcp_pacing_ca_ratio;
+#define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */
+#define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */
extern atomic_long_t tcp_memory_allocated;
extern struct percpu_counter tcp_sockets_allocated;
@@ -414,9 +384,9 @@ void tcp_update_metrics(struct sock *sk);
void tcp_init_metrics(struct sock *sk);
void tcp_metrics_init(void);
bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst);
-void tcp_disable_fack(struct tcp_sock *tp);
void tcp_close(struct sock *sk, long timeout);
void tcp_init_sock(struct sock *sk);
+void tcp_init_transfer(struct sock *sk, int bpf_op);
unsigned int tcp_poll(struct file *file, struct socket *sock,
struct poll_table_struct *wait);
int tcp_getsockopt(struct sock *sk, int level, int optname,
@@ -551,7 +521,13 @@ void tcp_xmit_retransmit_queue(struct sock *);
void tcp_simple_retransmit(struct sock *);
void tcp_enter_recovery(struct sock *sk, bool ece_ack);
int tcp_trim_head(struct sock *, struct sk_buff *, u32);
-int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t);
+enum tcp_queue {
+ TCP_FRAG_IN_WRITE_QUEUE,
+ TCP_FRAG_IN_RTX_QUEUE,
+};
+int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
+ struct sk_buff *skb, u32 len,
+ unsigned int mss_now, gfp_t gfp);
void tcp_send_probe0(struct sock *);
void tcp_send_partial(struct sock *);
@@ -796,16 +772,10 @@ struct tcp_skb_cb {
u16 tcp_gso_segs;
u16 tcp_gso_size;
};
-
- /* Used to stash the receive timestamp while this skb is in the
- * out of order queue, as skb->tstamp is overwritten by the
- * rbnode.
- */
- ktime_t swtstamp;
};
__u8 tcp_flags; /* TCP header flags. (tcp[13]) */
- __u8 sacked; /* State flags for SACK/FACK. */
+ __u8 sacked; /* State flags for SACK. */
#define TCPCB_SACKED_ACKED 0x01 /* SKB ACK'd by a SACK block */
#define TCPCB_SACKED_RETRANS 0x02 /* SKB retransmitted */
#define TCPCB_LOST 0x04 /* SKB is lost */
@@ -1032,8 +1002,8 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);
void tcp_assign_congestion_control(struct sock *sk);
void tcp_init_congestion_control(struct sock *sk);
void tcp_cleanup_congestion_control(struct sock *sk);
-int tcp_set_default_congestion_control(const char *name);
-void tcp_get_default_congestion_control(char *name);
+int tcp_set_default_congestion_control(struct net *net, const char *name);
+void tcp_get_default_congestion_control(struct net *net, char *name);
void tcp_get_available_congestion_control(char *buf, size_t len);
void tcp_get_allowed_congestion_control(char *buf, size_t len);
int tcp_set_allowed_congestion_control(char *allowed);
@@ -1047,7 +1017,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
extern struct tcp_congestion_ops tcp_reno;
struct tcp_congestion_ops *tcp_ca_find_key(u32 key);
-u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca);
+u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca);
#ifdef CONFIG_INET
char *tcp_ca_get_name_by_key(u32 key, char *buffer);
#else
@@ -1095,7 +1065,6 @@ void tcp_rate_check_app_limited(struct sock *sk);
*
* tcp_is_sack - SACK enabled
* tcp_is_reno - No SACK
- * tcp_is_fack - FACK enabled, implies SACK enabled
*/
static inline int tcp_is_sack(const struct tcp_sock *tp)
{
@@ -1107,16 +1076,6 @@ static inline bool tcp_is_reno(const struct tcp_sock *tp)
return !tcp_is_sack(tp);
}
-static inline bool tcp_is_fack(const struct tcp_sock *tp)
-{
- return tp->rx_opt.sack_ok & TCP_FACK_ENABLED;
-}
-
-static inline void tcp_enable_fack(struct tcp_sock *tp)
-{
- tp->rx_opt.sack_ok |= TCP_FACK_ENABLED;
-}
-
static inline unsigned int tcp_left_out(const struct tcp_sock *tp)
{
return tp->sacked_out + tp->lost_out;
@@ -1309,7 +1268,7 @@ static inline void tcp_slow_start_after_idle_check(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
s32 delta;
- if (!sysctl_tcp_slow_start_after_idle || tp->packets_out ||
+ if (!sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle || tp->packets_out ||
ca_ops->cong_control)
return;
delta = tcp_jiffies32 - tp->lsndtime;
@@ -1318,13 +1277,14 @@ static inline void tcp_slow_start_after_idle_check(struct sock *sk)
}
/* Determine a window scaling and initial window to offer. */
-void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd,
+void tcp_select_initial_window(const struct sock *sk, int __space,
+ __u32 mss, __u32 *rcv_wnd,
__u32 *window_clamp, int wscale_ok,
__u8 *rcv_wscale, __u32 init_rcv_wnd);
-static inline int tcp_win_from_space(int space)
+static inline int tcp_win_from_space(const struct sock *sk, int space)
{
- int tcp_adv_win_scale = sysctl_tcp_adv_win_scale;
+ int tcp_adv_win_scale = sock_net(sk)->ipv4.sysctl_tcp_adv_win_scale;
return tcp_adv_win_scale <= 0 ?
(space>>(-tcp_adv_win_scale)) :
@@ -1334,13 +1294,13 @@ static inline int tcp_win_from_space(int space)
/* Note: caller must be prepared to deal with negative returns */
static inline int tcp_space(const struct sock *sk)
{
- return tcp_win_from_space(sk->sk_rcvbuf -
+ return tcp_win_from_space(sk, sk->sk_rcvbuf -
atomic_read(&sk->sk_rmem_alloc));
}
static inline int tcp_full_space(const struct sock *sk)
{
- return tcp_win_from_space(sk->sk_rcvbuf);
+ return tcp_win_from_space(sk, sk->sk_rcvbuf);
}
extern void tcp_openreq_init_rwin(struct request_sock *req,
@@ -1561,14 +1521,16 @@ struct tcp_fastopen_request {
int copied; /* queued in tcp_connect() */
};
void tcp_free_fastopen_req(struct tcp_sock *tp);
-
-extern struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
-int tcp_fastopen_reset_cipher(void *key, unsigned int len);
+void tcp_fastopen_destroy_cipher(struct sock *sk);
+void tcp_fastopen_ctx_destroy(struct net *net);
+int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk,
+ void *key, unsigned int len);
void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb);
struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
- struct tcp_fastopen_cookie *foc);
-void tcp_fastopen_init_key_once(bool publish);
+ struct tcp_fastopen_cookie *foc,
+ const struct dst_entry *dst);
+void tcp_fastopen_init_key_once(struct net *net);
bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
struct tcp_fastopen_cookie *cookie);
bool tcp_fastopen_defer_connect(struct sock *sk, int *err);
@@ -1601,52 +1563,46 @@ enum tcp_chrono {
void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type);
void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type);
-/* write queue abstraction */
-static inline void tcp_write_queue_purge(struct sock *sk)
+/* This helper is needed, because skb->tcp_tsorted_anchor uses
+ * the same memory storage than skb->destructor/_skb_refdst
+ */
+static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb)
{
- struct sk_buff *skb;
-
- tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
- while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL)
- sk_wmem_free_skb(sk, skb);
- sk_mem_reclaim(sk);
- tcp_clear_all_retrans_hints(tcp_sk(sk));
+ skb->destructor = NULL;
+ skb->_skb_refdst = 0UL;
}
-static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk)
-{
- return skb_peek(&sk->sk_write_queue);
+#define tcp_skb_tsorted_save(skb) { \
+ unsigned long _save = skb->_skb_refdst; \
+ skb->_skb_refdst = 0UL;
+
+#define tcp_skb_tsorted_restore(skb) \
+ skb->_skb_refdst = _save; \
}
-static inline struct sk_buff *tcp_write_queue_tail(const struct sock *sk)
+void tcp_write_queue_purge(struct sock *sk);
+
+static inline struct sk_buff *tcp_rtx_queue_head(const struct sock *sk)
{
- return skb_peek_tail(&sk->sk_write_queue);
+ return skb_rb_first(&sk->tcp_rtx_queue);
}
-static inline struct sk_buff *tcp_write_queue_next(const struct sock *sk,
- const struct sk_buff *skb)
+static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk)
{
- return skb_queue_next(&sk->sk_write_queue, skb);
+ return skb_peek(&sk->sk_write_queue);
}
-static inline struct sk_buff *tcp_write_queue_prev(const struct sock *sk,
- const struct sk_buff *skb)
+static inline struct sk_buff *tcp_write_queue_tail(const struct sock *sk)
{
- return skb_queue_prev(&sk->sk_write_queue, skb);
+ return skb_peek_tail(&sk->sk_write_queue);
}
-#define tcp_for_write_queue(skb, sk) \
- skb_queue_walk(&(sk)->sk_write_queue, skb)
-
-#define tcp_for_write_queue_from(skb, sk) \
- skb_queue_walk_from(&(sk)->sk_write_queue, skb)
-
#define tcp_for_write_queue_from_safe(skb, tmp, sk) \
skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp)
static inline struct sk_buff *tcp_send_head(const struct sock *sk)
{
- return sk->sk_send_head;
+ return skb_peek(&sk->sk_write_queue);
}
static inline bool tcp_skb_is_last(const struct sock *sk,
@@ -1655,27 +1611,25 @@ static inline bool tcp_skb_is_last(const struct sock *sk,
return skb_queue_is_last(&sk->sk_write_queue, skb);
}
-static inline void tcp_advance_send_head(struct sock *sk, const struct sk_buff *skb)
+static inline bool tcp_write_queue_empty(const struct sock *sk)
{
- if (tcp_skb_is_last(sk, skb))
- sk->sk_send_head = NULL;
- else
- sk->sk_send_head = tcp_write_queue_next(sk, skb);
+ return skb_queue_empty(&sk->sk_write_queue);
}
-static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unlinked)
+static inline bool tcp_rtx_queue_empty(const struct sock *sk)
{
- if (sk->sk_send_head == skb_unlinked) {
- sk->sk_send_head = NULL;
- tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
- }
- if (tcp_sk(sk)->highest_sack == skb_unlinked)
- tcp_sk(sk)->highest_sack = NULL;
+ return RB_EMPTY_ROOT(&sk->tcp_rtx_queue);
+}
+
+static inline bool tcp_rtx_and_write_queues_empty(const struct sock *sk)
+{
+ return tcp_rtx_queue_empty(sk) && tcp_write_queue_empty(sk);
}
-static inline void tcp_init_send_head(struct sock *sk)
+static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unlinked)
{
- sk->sk_send_head = NULL;
+ if (tcp_write_queue_empty(sk))
+ tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
}
static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
@@ -1688,26 +1642,8 @@ static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb
__tcp_add_write_queue_tail(sk, skb);
/* Queue it, remembering where we must start sending. */
- if (sk->sk_send_head == NULL) {
- sk->sk_send_head = skb;
+ if (sk->sk_write_queue.next == skb)
tcp_chrono_start(sk, TCP_CHRONO_BUSY);
-
- if (tcp_sk(sk)->highest_sack == NULL)
- tcp_sk(sk)->highest_sack = skb;
- }
-}
-
-static inline void __tcp_add_write_queue_head(struct sock *sk, struct sk_buff *skb)
-{
- __skb_queue_head(&sk->sk_write_queue, skb);
-}
-
-/* Insert buff after skb on the write queue of sk. */
-static inline void tcp_insert_write_queue_after(struct sk_buff *skb,
- struct sk_buff *buff,
- struct sock *sk)
-{
- __skb_queue_after(&sk->sk_write_queue, skb, buff);
}
/* Insert new before skb on the write queue of sk. */
@@ -1716,19 +1652,27 @@ static inline void tcp_insert_write_queue_before(struct sk_buff *new,
struct sock *sk)
{
__skb_queue_before(&sk->sk_write_queue, skb, new);
-
- if (sk->sk_send_head == skb)
- sk->sk_send_head = new;
}
static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk)
{
+ tcp_skb_tsorted_anchor_cleanup(skb);
__skb_unlink(skb, &sk->sk_write_queue);
}
-static inline bool tcp_write_queue_empty(struct sock *sk)
+void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb);
+
+static inline void tcp_rtx_queue_unlink(struct sk_buff *skb, struct sock *sk)
{
- return skb_queue_empty(&sk->sk_write_queue);
+ tcp_skb_tsorted_anchor_cleanup(skb);
+ rb_erase(&skb->rbnode, &sk->tcp_rtx_queue);
+}
+
+static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct sock *sk)
+{
+ list_del(&skb->tcp_tsorted_anchor);
+ tcp_rtx_queue_unlink(skb, sk);
+ sk_wmem_free_skb(sk, skb);
}
static inline void tcp_push_pending_frames(struct sock *sk)
@@ -1757,8 +1701,7 @@ static inline u32 tcp_highest_sack_seq(struct tcp_sock *tp)
static inline void tcp_advance_highest_sack(struct sock *sk, struct sk_buff *skb)
{
- tcp_sk(sk)->highest_sack = tcp_skb_is_last(sk, skb) ? NULL :
- tcp_write_queue_next(sk, skb);
+ tcp_sk(sk)->highest_sack = skb_rb_next(skb);
}
static inline struct sk_buff *tcp_highest_sack(struct sock *sk)
@@ -1768,7 +1711,7 @@ static inline struct sk_buff *tcp_highest_sack(struct sock *sk)
static inline void tcp_highest_sack_reset(struct sock *sk)
{
- tcp_sk(sk)->highest_sack = tcp_write_queue_head(sk);
+ tcp_sk(sk)->highest_sack = tcp_rtx_queue_head(sk);
}
/* Called when old skb is about to be deleted and replaced by new skb */
@@ -1934,11 +1877,12 @@ extern void tcp_rack_mark_lost(struct sock *sk);
extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
u64 xmit_time);
extern void tcp_rack_reo_timeout(struct sock *sk);
+extern void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs);
/* At how many usecs into the future should the RTO fire? */
static inline s64 tcp_rto_delta_us(const struct sock *sk)
{
- const struct sk_buff *skb = tcp_write_queue_head(sk);
+ const struct sk_buff *skb = tcp_rtx_queue_head(sk);
u32 rto = inet_csk(sk)->icsk_rto;
u64 rto_time_stamp_us = skb->skb_mstamp + jiffies_to_usecs(rto);
@@ -2115,4 +2059,8 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
{
return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1);
}
+
+#if IS_ENABLED(CONFIG_SMC)
+extern struct static_key_false tcp_have_smc;
+#endif
#endif /* _TCP_H */