From 864b656f82ccd433d3e38149c3673d295ad64bf6 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Wed, 7 Sep 2022 10:40:40 -0600 Subject: bpf: Add support for writing to nf_conn:mark Support direct writes to nf_conn:mark from TC and XDP prog types. This is useful when applications want to store per-connection metadata. This is also particularly useful for applications that run both bpf and iptables/nftables because the latter can trivially access this metadata. One example use case would be if a bpf prog is responsible for advanced packet classification and iptables/nftables is later used for routing due to pre-existing/legacy code. Signed-off-by: Daniel Xu Link: https://lore.kernel.org/r/ebca06dea366e3e7e861c12f375a548cc4c61108.1662568410.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index e872f45399b0..4b2be211bcbe 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -18,6 +18,7 @@ */ #include +#include #include #include #include @@ -8604,6 +8605,36 @@ static bool tc_cls_act_is_valid_access(int off, int size, return bpf_skb_is_valid_access(off, size, type, prog, info); } +DEFINE_MUTEX(nf_conn_btf_access_lock); +EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock); + +int (*nfct_bsa)(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, int off, int size, + enum bpf_access_type atype, u32 *next_btf_id, + enum bpf_type_flag *flag); +EXPORT_SYMBOL_GPL(nfct_bsa); + +static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log, + const struct btf *btf, + const struct btf_type *t, int off, + int size, enum bpf_access_type atype, + u32 *next_btf_id, + enum bpf_type_flag *flag) +{ + int ret = -EACCES; + + if (atype == BPF_READ) + return btf_struct_access(log, btf, t, off, size, atype, next_btf_id, + flag); + + mutex_lock(&nf_conn_btf_access_lock); + if (nfct_bsa) + ret = nfct_bsa(log, btf, t, off, size, atype, next_btf_id, flag); + mutex_unlock(&nf_conn_btf_access_lock); + + return ret; +} + static bool __is_valid_xdp_access(int off, int size) { if (off < 0 || off >= sizeof(struct xdp_md)) @@ -8663,6 +8694,27 @@ void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); +static int xdp_btf_struct_access(struct bpf_verifier_log *log, + const struct btf *btf, + const struct btf_type *t, int off, + int size, enum bpf_access_type atype, + u32 *next_btf_id, + enum bpf_type_flag *flag) +{ + int ret = -EACCES; + + if (atype == BPF_READ) + return btf_struct_access(log, btf, t, off, size, atype, next_btf_id, + flag); + + mutex_lock(&nf_conn_btf_access_lock); + if (nfct_bsa) + ret = nfct_bsa(log, btf, t, off, size, atype, next_btf_id, flag); + mutex_unlock(&nf_conn_btf_access_lock); + + return ret; +} + static bool sock_addr_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, @@ -10557,6 +10609,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, .gen_ld_abs = bpf_gen_ld_abs, + .btf_struct_access = tc_cls_act_btf_struct_access, }; const struct bpf_prog_ops tc_cls_act_prog_ops = { @@ -10568,6 +10621,7 @@ const struct bpf_verifier_ops xdp_verifier_ops = { .is_valid_access = xdp_is_valid_access, .convert_ctx_access = xdp_convert_ctx_access, .gen_prologue = bpf_noop_prologue, + .btf_struct_access = xdp_btf_struct_access, }; const struct bpf_prog_ops xdp_prog_ops = { -- cgit v1.2.3-70-g09d2 From 5a090aa35038e3dad1ee334e3c509c39e7599bb4 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Tue, 20 Sep 2022 08:15:23 -0600 Subject: bpf: Rename nfct_bsa to nfct_btf_struct_access The former name was a little hard to guess. Signed-off-by: Daniel Xu Link: https://lore.kernel.org/r/73adc72385c8b162391fbfb404f0b6d4c5cc55d7.1663683114.git.dxu@dxuuu.xyz Signed-off-by: Martin KaFai Lau --- include/net/netfilter/nf_conntrack_bpf.h | 8 ++++---- net/core/filter.c | 18 +++++++++--------- net/netfilter/nf_conntrack_bpf.c | 4 ++-- 3 files changed, 15 insertions(+), 15 deletions(-) (limited to 'net/core') diff --git a/include/net/netfilter/nf_conntrack_bpf.h b/include/net/netfilter/nf_conntrack_bpf.h index 9c07d2d59da5..1199d4f8e019 100644 --- a/include/net/netfilter/nf_conntrack_bpf.h +++ b/include/net/netfilter/nf_conntrack_bpf.h @@ -13,10 +13,10 @@ extern int register_nf_conntrack_bpf(void); extern void cleanup_nf_conntrack_bpf(void); extern struct mutex nf_conn_btf_access_lock; -extern int (*nfct_bsa)(struct bpf_verifier_log *log, const struct btf *btf, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype, u32 *next_btf_id, - enum bpf_type_flag *flag); +extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, int off, int size, + enum bpf_access_type atype, u32 *next_btf_id, + enum bpf_type_flag *flag); #else diff --git a/net/core/filter.c b/net/core/filter.c index 4b2be211bcbe..2fd9449026aa 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8608,11 +8608,11 @@ static bool tc_cls_act_is_valid_access(int off, int size, DEFINE_MUTEX(nf_conn_btf_access_lock); EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock); -int (*nfct_bsa)(struct bpf_verifier_log *log, const struct btf *btf, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype, u32 *next_btf_id, - enum bpf_type_flag *flag); -EXPORT_SYMBOL_GPL(nfct_bsa); +int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, int off, int size, + enum bpf_access_type atype, u32 *next_btf_id, + enum bpf_type_flag *flag); +EXPORT_SYMBOL_GPL(nfct_btf_struct_access); static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, @@ -8628,8 +8628,8 @@ static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log, flag); mutex_lock(&nf_conn_btf_access_lock); - if (nfct_bsa) - ret = nfct_bsa(log, btf, t, off, size, atype, next_btf_id, flag); + if (nfct_btf_struct_access) + ret = nfct_btf_struct_access(log, btf, t, off, size, atype, next_btf_id, flag); mutex_unlock(&nf_conn_btf_access_lock); return ret; @@ -8708,8 +8708,8 @@ static int xdp_btf_struct_access(struct bpf_verifier_log *log, flag); mutex_lock(&nf_conn_btf_access_lock); - if (nfct_bsa) - ret = nfct_bsa(log, btf, t, off, size, atype, next_btf_id, flag); + if (nfct_btf_struct_access) + ret = nfct_btf_struct_access(log, btf, t, off, size, atype, next_btf_id, flag); mutex_unlock(&nf_conn_btf_access_lock); return ret; diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index 77eb8e959f61..29c4efb3da5e 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -502,7 +502,7 @@ int register_nf_conntrack_bpf(void) ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &nf_conntrack_kfunc_set); if (!ret) { mutex_lock(&nf_conn_btf_access_lock); - nfct_bsa = _nf_conntrack_btf_struct_access; + nfct_btf_struct_access = _nf_conntrack_btf_struct_access; mutex_unlock(&nf_conn_btf_access_lock); } @@ -512,6 +512,6 @@ int register_nf_conntrack_bpf(void) void cleanup_nf_conntrack_bpf(void) { mutex_lock(&nf_conn_btf_access_lock); - nfct_bsa = NULL; + nfct_btf_struct_access = NULL; mutex_unlock(&nf_conn_btf_access_lock); } -- cgit v1.2.3-70-g09d2 From 3f8ef65af927db247418d4e1db49164d7a158fc5 Mon Sep 17 00:00:00 2001 From: Liu Jian Date: Tue, 23 Aug 2022 21:37:54 +0800 Subject: net: If sock is dead don't access sock's sk_wq in sk_stream_wait_memory Fixes the below NULL pointer dereference: [...] [ 14.471200] Call Trace: [ 14.471562] [ 14.471882] lock_acquire+0x245/0x2e0 [ 14.472416] ? remove_wait_queue+0x12/0x50 [ 14.473014] ? _raw_spin_lock_irqsave+0x17/0x50 [ 14.473681] _raw_spin_lock_irqsave+0x3d/0x50 [ 14.474318] ? remove_wait_queue+0x12/0x50 [ 14.474907] remove_wait_queue+0x12/0x50 [ 14.475480] sk_stream_wait_memory+0x20d/0x340 [ 14.476127] ? do_wait_intr_irq+0x80/0x80 [ 14.476704] do_tcp_sendpages+0x287/0x600 [ 14.477283] tcp_bpf_push+0xab/0x260 [ 14.477817] tcp_bpf_sendmsg_redir+0x297/0x500 [ 14.478461] ? __local_bh_enable_ip+0x77/0xe0 [ 14.479096] tcp_bpf_send_verdict+0x105/0x470 [ 14.479729] tcp_bpf_sendmsg+0x318/0x4f0 [ 14.480311] sock_sendmsg+0x2d/0x40 [ 14.480822] ____sys_sendmsg+0x1b4/0x1c0 [ 14.481390] ? copy_msghdr_from_user+0x62/0x80 [ 14.482048] ___sys_sendmsg+0x78/0xb0 [ 14.482580] ? vmf_insert_pfn_prot+0x91/0x150 [ 14.483215] ? __do_fault+0x2a/0x1a0 [ 14.483738] ? do_fault+0x15e/0x5d0 [ 14.484246] ? __handle_mm_fault+0x56b/0x1040 [ 14.484874] ? lock_is_held_type+0xdf/0x130 [ 14.485474] ? find_held_lock+0x2d/0x90 [ 14.486046] ? __sys_sendmsg+0x41/0x70 [ 14.486587] __sys_sendmsg+0x41/0x70 [ 14.487105] ? intel_pmu_drain_pebs_core+0x350/0x350 [ 14.487822] do_syscall_64+0x34/0x80 [ 14.488345] entry_SYSCALL_64_after_hwframe+0x63/0xcd [...] The test scenario has the following flow: thread1 thread2 ----------- --------------- tcp_bpf_sendmsg tcp_bpf_send_verdict tcp_bpf_sendmsg_redir sock_close tcp_bpf_push_locked __sock_release tcp_bpf_push //inet_release do_tcp_sendpages sock->ops->release sk_stream_wait_memory // tcp_close sk_wait_event sk->sk_prot->close release_sock(__sk); *** lock_sock(sk); __tcp_close sock_orphan(sk) sk->sk_wq = NULL release_sock **** lock_sock(__sk); remove_wait_queue(sk_sleep(sk), &wait); sk_sleep(sk) //NULL pointer dereference &rcu_dereference_raw(sk->sk_wq)->wait While waiting for memory in thread1, the socket is released with its wait queue because thread2 has closed it. This caused by tcp_bpf_send_verdict didn't increase the f_count of psock->sk_redir->sk_socket->file in thread1. We should check if SOCK_DEAD flag is set on wakeup in sk_stream_wait_memory before accessing the wait queue. Suggested-by: Jakub Sitnicki Signed-off-by: Liu Jian Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Cc: Eric Dumazet Link: https://lore.kernel.org/bpf/20220823133755.314697-2-liujian56@huawei.com --- net/core/stream.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/stream.c b/net/core/stream.c index ccc083cdef23..1105057ce00a 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -159,7 +159,8 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) *timeo_p = current_timeo; } out: - remove_wait_queue(sk_sleep(sk), &wait); + if (!sock_flag(sk, SOCK_DEAD)) + remove_wait_queue(sk_sleep(sk), &wait); return err; do_error: -- cgit v1.2.3-70-g09d2 From bec217197b412d74168c6a42fc0f76d0cc9cad00 Mon Sep 17 00:00:00 2001 From: Liu Jian Date: Wed, 7 Sep 2022 15:13:11 +0800 Subject: skmsg: Schedule psock work if the cached skb exists on the psock In sk_psock_backlog function, for ingress direction skb, if no new data packet arrives after the skb is cached, the cached skb does not have a chance to be added to the receive queue of psock. As a result, the cached skb cannot be received by the upper-layer application. Fix this by reschedule the psock work to dispose the cached skb in sk_msg_recvmsg function. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Liu Jian Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20220907071311.60534-1-liujian56@huawei.com --- net/core/skmsg.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 188f8558d27d..ca70525621c7 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -434,8 +434,10 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, if (copied + copy > len) copy = len - copied; copy = copy_page_to_iter(page, sge->offset, copy, iter); - if (!copy) - return copied ? copied : -EFAULT; + if (!copy) { + copied = copied ? copied : -EFAULT; + goto out; + } copied += copy; if (likely(!peek)) { @@ -455,7 +457,7 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, * didn't copy the entire length lets just break. */ if (copy != sge->length) - return copied; + goto out; sk_msg_iter_var_next(i); } @@ -477,7 +479,9 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, } msg_rx = sk_psock_peek_msg(psock); } - +out: + if (psock->work_state.skb && copied > 0) + schedule_work(&psock->work); return copied; } EXPORT_SYMBOL_GPL(sk_msg_recvmsg); -- cgit v1.2.3-70-g09d2 From 37cfbe0bf2e85287350a6b0ca9521f5a4c7389ce Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Sep 2022 00:04:04 -0700 Subject: bpf: Move the "cdg" tcp-cc check to the common sol_tcp_sockopt() The check on the tcp-cc, "cdg", is done in the bpf_sk_setsockopt which is used by the bpf_tcp_ca, bpf_lsm, cg_sockopt, and tcp_iter hooks. However, it is not done for cg sock_ddr, cg sockops, and some of the bpf_lsm_cgroup hooks. The tcp-cc "cdg" should have very limited usage. This patch is to move the "cdg" check to the common sol_tcp_sockopt() so that all hooks have a consistent behavior. The motivation to make this check consistent now is because the latter patch will refactor the bpf_setsockopt(TCP_CONGESTION) into another function, so it is better to take this chance to refactor this piece also. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220929070407.965581-3-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 2fd9449026aa..f4cea3ff994a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5127,6 +5127,13 @@ static int sol_tcp_sockopt(struct sock *sk, int optname, case TCP_CONGESTION: if (*optlen < 2) return -EINVAL; + /* "cdg" is the only cc that alloc a ptr + * in inet_csk_ca area. The bpf-tcp-cc may + * overwrite this ptr after switching to cdg. + */ + if (!getopt && *optlen >= sizeof("cdg") - 1 && + !strncmp("cdg", optval, *optlen)) + return -ENOTSUPP; break; case TCP_SAVED_SYN: if (*optlen < 1) @@ -5285,12 +5292,6 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname, BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { - if (level == SOL_TCP && optname == TCP_CONGESTION) { - if (optlen >= sizeof("cdg") - 1 && - !strncmp("cdg", optval, optlen)) - return -ENOTSUPP; - } - return _bpf_setsockopt(sk, level, optname, optval, optlen); } -- cgit v1.2.3-70-g09d2 From 1e7d217faa11ac027f622124a3842aafbd0c4a42 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Sep 2022 00:04:05 -0700 Subject: bpf: Refactor bpf_setsockopt(TCP_CONGESTION) handling into another function This patch moves the bpf_setsockopt(TCP_CONGESTION) logic into another function. The next patch will add extra logic to avoid recursion and this will make the latter patch easier to follow. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220929070407.965581-4-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 45 ++++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 17 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index f4cea3ff994a..96f2f7a65e65 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5102,6 +5102,33 @@ static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname, return 0; } +static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval, + int *optlen, bool getopt) +{ + if (*optlen < 2) + return -EINVAL; + + if (getopt) { + if (!inet_csk(sk)->icsk_ca_ops) + return -EINVAL; + /* BPF expects NULL-terminated tcp-cc string */ + optval[--(*optlen)] = '\0'; + return do_tcp_getsockopt(sk, SOL_TCP, TCP_CONGESTION, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); + } + + /* "cdg" is the only cc that alloc a ptr + * in inet_csk_ca area. The bpf-tcp-cc may + * overwrite this ptr after switching to cdg. + */ + if (*optlen >= sizeof("cdg") - 1 && !strncmp("cdg", optval, *optlen)) + return -ENOTSUPP; + + return do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION, + KERNEL_SOCKPTR(optval), *optlen); +} + static int sol_tcp_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) @@ -5125,16 +5152,7 @@ static int sol_tcp_sockopt(struct sock *sk, int optname, return -EINVAL; break; case TCP_CONGESTION: - if (*optlen < 2) - return -EINVAL; - /* "cdg" is the only cc that alloc a ptr - * in inet_csk_ca area. The bpf-tcp-cc may - * overwrite this ptr after switching to cdg. - */ - if (!getopt && *optlen >= sizeof("cdg") - 1 && - !strncmp("cdg", optval, *optlen)) - return -ENOTSUPP; - break; + return sol_tcp_sockopt_congestion(sk, optval, optlen, getopt); case TCP_SAVED_SYN: if (*optlen < 1) return -EINVAL; @@ -5159,13 +5177,6 @@ static int sol_tcp_sockopt(struct sock *sk, int optname, return 0; } - if (optname == TCP_CONGESTION) { - if (!inet_csk(sk)->icsk_ca_ops) - return -EINVAL; - /* BPF expects NULL-terminated tcp-cc string */ - optval[--(*optlen)] = '\0'; - } - return do_tcp_getsockopt(sk, SOL_TCP, optname, KERNEL_SOCKPTR(optval), KERNEL_SOCKPTR(optlen)); -- cgit v1.2.3-70-g09d2 From 061ff040710e9f6f043d1fa80b1b362d2845b17a Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Sep 2022 00:04:06 -0700 Subject: bpf: tcp: Stop bpf_setsockopt(TCP_CONGESTION) in init ops to recur itself When a bad bpf prog '.init' calls bpf_setsockopt(TCP_CONGESTION, "itself"), it will trigger this loop: .init => bpf_setsockopt(tcp_cc) => .init => bpf_setsockopt(tcp_cc) ... ... => .init => bpf_setsockopt(tcp_cc). It was prevented by the prog->active counter before but the prog->active detection cannot be used in struct_ops as explained in the earlier patch of the set. In this patch, the second bpf_setsockopt(tcp_cc) is not allowed in order to break the loop. This is done by using a bit of an existing 1 byte hole in tcp_sock to check if there is on-going bpf_setsockopt(TCP_CONGESTION) in this tcp_sock. Note that this essentially limits only the first '.init' can call bpf_setsockopt(TCP_CONGESTION) to pick a fallback cc (eg. peer does not support ECN) and the second '.init' cannot fallback to another cc. This applies even the second bpf_setsockopt(TCP_CONGESTION) will not cause a loop. Signed-off-by: Martin KaFai Lau Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20220929070407.965581-5-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/tcp.h | 6 ++++++ net/core/filter.c | 28 +++++++++++++++++++++++++++- net/ipv4/tcp_minisocks.c | 1 + 3 files changed, 34 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index a9fbe22732c3..3bdf687e2fb3 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -388,6 +388,12 @@ struct tcp_sock { u8 bpf_sock_ops_cb_flags; /* Control calling BPF programs * values defined in uapi/linux/tcp.h */ + u8 bpf_chg_cc_inprogress:1; /* In the middle of + * bpf_setsockopt(TCP_CONGESTION), + * it is to avoid the bpf_tcp_cc->init() + * to recur itself by calling + * bpf_setsockopt(TCP_CONGESTION, "itself"). + */ #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG) #else #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0 diff --git a/net/core/filter.c b/net/core/filter.c index 96f2f7a65e65..ac4c45c02da5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5105,6 +5105,9 @@ static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname, static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval, int *optlen, bool getopt) { + struct tcp_sock *tp; + int ret; + if (*optlen < 2) return -EINVAL; @@ -5125,8 +5128,31 @@ static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval, if (*optlen >= sizeof("cdg") - 1 && !strncmp("cdg", optval, *optlen)) return -ENOTSUPP; - return do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION, + /* It stops this looping + * + * .init => bpf_setsockopt(tcp_cc) => .init => + * bpf_setsockopt(tcp_cc)" => .init => .... + * + * The second bpf_setsockopt(tcp_cc) is not allowed + * in order to break the loop when both .init + * are the same bpf prog. + * + * This applies even the second bpf_setsockopt(tcp_cc) + * does not cause a loop. This limits only the first + * '.init' can call bpf_setsockopt(TCP_CONGESTION) to + * pick a fallback cc (eg. peer does not support ECN) + * and the second '.init' cannot fallback to + * another. + */ + tp = tcp_sk(sk); + if (tp->bpf_chg_cc_inprogress) + return -EBUSY; + + tp->bpf_chg_cc_inprogress = 1; + ret = do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION, KERNEL_SOCKPTR(optval), *optlen); + tp->bpf_chg_cc_inprogress = 0; + return ret; } static int sol_tcp_sockopt(struct sock *sk, int optname, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index cb95d88497ae..ddcdc2bc4c04 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -541,6 +541,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->fastopen_req = NULL; RCU_INIT_POINTER(newtp->fastopen_rsk, NULL); + newtp->bpf_chg_cc_inprogress = 0; tcp_bpf_clone(sk, newsk); __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); -- cgit v1.2.3-70-g09d2