From 72bf4f1767f0386970dc04726dc5bc2e3991dc19 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 19 Oct 2023 11:24:57 +0000 Subject: net: do not leave an empty skb in write queue Under memory stress conditions, tcp_sendmsg_locked() might call sk_stream_wait_memory(), thus releasing the socket lock. If a fresh skb has been allocated prior to this, we should not leave it in the write queue otherwise tcp_write_xmit() could panic. This apparently does not happen often, but a future change in __sk_mem_raise_allocated() that Shakeel and others are considering would increase chances of being hurt. Under discussion is to remove this controversial part: /* Fail only if socket is _under_ its sndbuf. * In this case we cannot block, so that we have to fail. */ if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) { /* Force charge with __GFP_NOFAIL */ if (memcg_charge && !charged) { mem_cgroup_charge_skmem(sk->sk_memcg, amt, gfp_memcg_charge() | __GFP_NOFAIL); } return 1; } Fixes: fdfc5c8594c2 ("tcp: remove empty skb from write queue in error cases") Signed-off-by: Eric Dumazet Reviewed-by: Shakeel Butt Link: https://lore.kernel.org/r/20231019112457.1190114-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d3456cf840de..3d3a24f79573 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -927,10 +927,11 @@ int tcp_send_mss(struct sock *sk, int *size_goal, int flags) return mss_now; } -/* In some cases, both sendmsg() could have added an skb to the write queue, - * but failed adding payload on it. We need to remove it to consume less +/* In some cases, sendmsg() could have added an skb to the write queue, + * but failed adding payload on it. We need to remove it to consume less * memory, but more importantly be able to generate EPOLLOUT for Edge Trigger - * epoll() users. + * epoll() users. Another reason is that tcp_write_xmit() does not like + * finding an empty skb in the write queue. */ void tcp_remove_empty_skb(struct sock *sk) { @@ -1289,6 +1290,7 @@ new_segment: wait_for_space: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + tcp_remove_empty_skb(sk); if (copied) tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH, size_goal); -- cgit v1.3.1 From a9beb7e81bcb876615e1fbb3c07f3f9dba69831f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 19 Oct 2023 12:21:04 +0000 Subject: neighbour: fix various data-races 1) tbl->gc_thresh1, tbl->gc_thresh2, tbl->gc_thresh3 and tbl->gc_interval can be written from sysfs. 2) tbl->last_flush is read locklessly from neigh_alloc() 3) tbl->proxy_queue.qlen is read locklessly from neightbl_fill_info() 4) neightbl_fill_info() reads cpu stats that can be changed concurrently. Fixes: c7fb64db001f ("[NETLINK]: Neighbour table configuration and statistics via rtnetlink") Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20231019122104.1448310-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/neighbour.c | 67 +++++++++++++++++++++++++++------------------------- 1 file changed, 35 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 9c09f091cbff..df81c1f0a570 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -251,7 +251,8 @@ bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl) static int neigh_forced_gc(struct neigh_table *tbl) { - int max_clean = atomic_read(&tbl->gc_entries) - tbl->gc_thresh2; + int max_clean = atomic_read(&tbl->gc_entries) - + READ_ONCE(tbl->gc_thresh2); unsigned long tref = jiffies - 5 * HZ; struct neighbour *n, *tmp; int shrunk = 0; @@ -280,7 +281,7 @@ static int neigh_forced_gc(struct neigh_table *tbl) } } - tbl->last_flush = jiffies; + WRITE_ONCE(tbl->last_flush, jiffies); write_unlock_bh(&tbl->lock); @@ -464,17 +465,17 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, { struct neighbour *n = NULL; unsigned long now = jiffies; - int entries; + int entries, gc_thresh3; if (exempt_from_gc) goto do_alloc; entries = atomic_inc_return(&tbl->gc_entries) - 1; - if (entries >= tbl->gc_thresh3 || - (entries >= tbl->gc_thresh2 && - time_after(now, tbl->last_flush + 5 * HZ))) { - if (!neigh_forced_gc(tbl) && - entries >= tbl->gc_thresh3) { + gc_thresh3 = READ_ONCE(tbl->gc_thresh3); + if (entries >= gc_thresh3 || + (entries >= READ_ONCE(tbl->gc_thresh2) && + time_after(now, READ_ONCE(tbl->last_flush) + 5 * HZ))) { + if (!neigh_forced_gc(tbl) && entries >= gc_thresh3) { net_info_ratelimited("%s: neighbor table overflow!\n", tbl->id); NEIGH_CACHE_STAT_INC(tbl, table_fulls); @@ -955,13 +956,14 @@ static void neigh_periodic_work(struct work_struct *work) if (time_after(jiffies, tbl->last_rand + 300 * HZ)) { struct neigh_parms *p; - tbl->last_rand = jiffies; + + WRITE_ONCE(tbl->last_rand, jiffies); list_for_each_entry(p, &tbl->parms_list, list) p->reachable_time = neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); } - if (atomic_read(&tbl->entries) < tbl->gc_thresh1) + if (atomic_read(&tbl->entries) < READ_ONCE(tbl->gc_thresh1)) goto out; for (i = 0 ; i < (1 << nht->hash_shift); i++) { @@ -2167,15 +2169,16 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, ndtmsg->ndtm_pad2 = 0; if (nla_put_string(skb, NDTA_NAME, tbl->id) || - nla_put_msecs(skb, NDTA_GC_INTERVAL, tbl->gc_interval, NDTA_PAD) || - nla_put_u32(skb, NDTA_THRESH1, tbl->gc_thresh1) || - nla_put_u32(skb, NDTA_THRESH2, tbl->gc_thresh2) || - nla_put_u32(skb, NDTA_THRESH3, tbl->gc_thresh3)) + nla_put_msecs(skb, NDTA_GC_INTERVAL, READ_ONCE(tbl->gc_interval), + NDTA_PAD) || + nla_put_u32(skb, NDTA_THRESH1, READ_ONCE(tbl->gc_thresh1)) || + nla_put_u32(skb, NDTA_THRESH2, READ_ONCE(tbl->gc_thresh2)) || + nla_put_u32(skb, NDTA_THRESH3, READ_ONCE(tbl->gc_thresh3))) goto nla_put_failure; { unsigned long now = jiffies; - long flush_delta = now - tbl->last_flush; - long rand_delta = now - tbl->last_rand; + long flush_delta = now - READ_ONCE(tbl->last_flush); + long rand_delta = now - READ_ONCE(tbl->last_rand); struct neigh_hash_table *nht; struct ndt_config ndc = { .ndtc_key_len = tbl->key_len, @@ -2183,7 +2186,7 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, .ndtc_entries = atomic_read(&tbl->entries), .ndtc_last_flush = jiffies_to_msecs(flush_delta), .ndtc_last_rand = jiffies_to_msecs(rand_delta), - .ndtc_proxy_qlen = tbl->proxy_queue.qlen, + .ndtc_proxy_qlen = READ_ONCE(tbl->proxy_queue.qlen), }; rcu_read_lock(); @@ -2206,17 +2209,17 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, struct neigh_statistics *st; st = per_cpu_ptr(tbl->stats, cpu); - ndst.ndts_allocs += st->allocs; - ndst.ndts_destroys += st->destroys; - ndst.ndts_hash_grows += st->hash_grows; - ndst.ndts_res_failed += st->res_failed; - ndst.ndts_lookups += st->lookups; - ndst.ndts_hits += st->hits; - ndst.ndts_rcv_probes_mcast += st->rcv_probes_mcast; - ndst.ndts_rcv_probes_ucast += st->rcv_probes_ucast; - ndst.ndts_periodic_gc_runs += st->periodic_gc_runs; - ndst.ndts_forced_gc_runs += st->forced_gc_runs; - ndst.ndts_table_fulls += st->table_fulls; + ndst.ndts_allocs += READ_ONCE(st->allocs); + ndst.ndts_destroys += READ_ONCE(st->destroys); + ndst.ndts_hash_grows += READ_ONCE(st->hash_grows); + ndst.ndts_res_failed += READ_ONCE(st->res_failed); + ndst.ndts_lookups += READ_ONCE(st->lookups); + ndst.ndts_hits += READ_ONCE(st->hits); + ndst.ndts_rcv_probes_mcast += READ_ONCE(st->rcv_probes_mcast); + ndst.ndts_rcv_probes_ucast += READ_ONCE(st->rcv_probes_ucast); + ndst.ndts_periodic_gc_runs += READ_ONCE(st->periodic_gc_runs); + ndst.ndts_forced_gc_runs += READ_ONCE(st->forced_gc_runs); + ndst.ndts_table_fulls += READ_ONCE(st->table_fulls); } if (nla_put_64bit(skb, NDTA_STATS, sizeof(ndst), &ndst, @@ -2445,16 +2448,16 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout_tbl_lock; if (tb[NDTA_THRESH1]) - tbl->gc_thresh1 = nla_get_u32(tb[NDTA_THRESH1]); + WRITE_ONCE(tbl->gc_thresh1, nla_get_u32(tb[NDTA_THRESH1])); if (tb[NDTA_THRESH2]) - tbl->gc_thresh2 = nla_get_u32(tb[NDTA_THRESH2]); + WRITE_ONCE(tbl->gc_thresh2, nla_get_u32(tb[NDTA_THRESH2])); if (tb[NDTA_THRESH3]) - tbl->gc_thresh3 = nla_get_u32(tb[NDTA_THRESH3]); + WRITE_ONCE(tbl->gc_thresh3, nla_get_u32(tb[NDTA_THRESH3])); if (tb[NDTA_GC_INTERVAL]) - tbl->gc_interval = nla_get_msecs(tb[NDTA_GC_INTERVAL]); + WRITE_ONCE(tbl->gc_interval, nla_get_msecs(tb[NDTA_GC_INTERVAL])); err = 0; -- cgit v1.3.1 From d2a0fc372aca561556e765d0a9ec365c7c12f0ad Mon Sep 17 00:00:00 2001 From: Fred Chen Date: Sat, 21 Oct 2023 08:19:47 +0800 Subject: tcp: fix wrong RTO timeout when received SACK reneging This commit fix wrong RTO timeout when received SACK reneging. When an ACK arrived pointing to a SACK reneging, tcp_check_sack_reneging() will rearm the RTO timer for min(1/2*srtt, 10ms) into to the future. But since the commit 62d9f1a6945b ("tcp: fix TLP timer not set when CA_STATE changes from DISORDER to OPEN") merged, the tcp_set_xmit_timer() is moved after tcp_fastretrans_alert()(which do the SACK reneging check), so the RTO timeout will be overwrited by tcp_set_xmit_timer() with icsk_rto instead of 1/2*srtt. Here is a packetdrill script to check this bug: 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 +0 bind(3, ..., ...) = 0 +0 listen(3, 1) = 0 // simulate srtt to 100ms +0 < S 0:0(0) win 32792 +0 > S. 0:0(0) ack 1 +.1 < . 1:1(0) ack 1 win 1024 +0 accept(3, ..., ...) = 4 +0 write(4, ..., 10000) = 10000 +0 > P. 1:10001(10000) ack 1 // inject sack +.1 < . 1:1(0) ack 1 win 257 +0 > . 1:1001(1000) ack 1 // inject sack reneging +.1 < . 1:1(0) ack 1001 win 257 // we expect rto fired in 1/2*srtt (50ms) +.05 > . 1001:2001(1000) ack 1 This fix remove the FLAG_SET_XMIT_TIMER from ack_flag when tcp_check_sack_reneging() set RTO timer with 1/2*srtt to avoid being overwrited later. Fixes: 62d9f1a6945b ("tcp: fix TLP timer not set when CA_STATE changes from DISORDER to OPEN") Signed-off-by: Fred Chen Reviewed-by: Neal Cardwell Tested-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8afb0950a697..804821d6bd4d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2207,16 +2207,17 @@ void tcp_enter_loss(struct sock *sk) * restore sanity to the SACK scoreboard. If the apparent reneging * persists until this RTO then we'll clear the SACK scoreboard. */ -static bool tcp_check_sack_reneging(struct sock *sk, int flag) +static bool tcp_check_sack_reneging(struct sock *sk, int *ack_flag) { - if (flag & FLAG_SACK_RENEGING && - flag & FLAG_SND_UNA_ADVANCED) { + if (*ack_flag & FLAG_SACK_RENEGING && + *ack_flag & FLAG_SND_UNA_ADVANCED) { struct tcp_sock *tp = tcp_sk(sk); unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4), msecs_to_jiffies(10)); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, TCP_RTO_MAX); + *ack_flag &= ~FLAG_SET_XMIT_TIMER; return true; } return false; @@ -2986,7 +2987,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, tp->prior_ssthresh = 0; /* B. In all the states check for reneging SACKs. */ - if (tcp_check_sack_reneging(sk, flag)) + if (tcp_check_sack_reneging(sk, ack_flag)) return; /* C. Check consistency of the current state. */ -- cgit v1.3.1 From 3e3929ef889e650dd585dc0f4f7f18240688811a Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Sat, 21 Oct 2023 08:48:27 -0700 Subject: wifi: cfg80211: pass correct pointer to rdev_inform_bss() Confusing struct member names here resulted in passing the wrong pointer, causing crashes. Pass the correct one. Fixes: eb142608e2c4 ("wifi: cfg80211: use a struct for inform_single_bss data") Signed-off-by: Ben Greear Link: https://lore.kernel.org/r/20231021154827.1142734-1-greearb@candelatech.com [rewrite commit message, add fixes] Signed-off-by: Johannes Berg --- net/wireless/scan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 939deecf0bbe..8210a6090ac1 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -2125,7 +2125,7 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy, if (!res) goto drop; - rdev_inform_bss(rdev, &res->pub, ies, data->drv_data); + rdev_inform_bss(rdev, &res->pub, ies, drv_data->drv_data); if (data->bss_source == BSS_SOURCE_MBSSID) { /* this is a nontransmitting bss, we need to add it to -- cgit v1.3.1 From c434b2be2d80d236bb090fdb493d4bd5ed589238 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 18 Oct 2023 11:42:51 +0200 Subject: wifi: cfg80211: fix assoc response warning on failed links The warning here shouldn't be done before we even set the bss field (or should've used the input data). Move the assignment before the warning to fix it. We noticed this now because of Wen's bugfix, where the bug fixed there had previously hidden this other bug. Fixes: 53ad07e9823b ("wifi: cfg80211: support reporting failed links") Signed-off-by: Johannes Berg --- net/wireless/mlme.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 3e2c398abddc..55a1d3633853 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -43,10 +43,11 @@ void cfg80211_rx_assoc_resp(struct net_device *dev, for (link_id = 0; link_id < ARRAY_SIZE(data->links); link_id++) { cr.links[link_id].status = data->links[link_id].status; + cr.links[link_id].bss = data->links[link_id].bss; + WARN_ON_ONCE(cr.links[link_id].status != WLAN_STATUS_SUCCESS && (!cr.ap_mld_addr || !cr.links[link_id].bss)); - cr.links[link_id].bss = data->links[link_id].bss; if (!cr.links[link_id].bss) continue; cr.links[link_id].bssid = data->links[link_id].bss->bssid; -- cgit v1.3.1 From 91535613b6090fc968c601d11d4e2f16b333713c Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Mon, 16 Oct 2023 14:52:48 +0300 Subject: wifi: mac80211: don't drop all unprotected public action frames Not all public action frames have a protected variant. When MFP is enabled drop only public action frames that have a dual protected variant. Fixes: 76a3059cf124 ("wifi: mac80211: drop some unprotected action frames") Signed-off-by: Avraham Stern Signed-off-by: Gregory Greenman Link: https://lore.kernel.org/r/20231016145213.2973e3c8d3bb.I6198b8d3b04cf4a97b06660d346caec3032f232a@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 29 +++++++++++++++++++++++++++++ net/mac80211/rx.c | 3 +-- 2 files changed, 30 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index bd2f6e19c357..b24fb80782c5 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -4355,6 +4355,35 @@ static inline bool ieee80211_is_public_action(struct ieee80211_hdr *hdr, return mgmt->u.action.category == WLAN_CATEGORY_PUBLIC; } +/** + * ieee80211_is_protected_dual_of_public_action - check if skb contains a + * protected dual of public action management frame + * @skb: the skb containing the frame, length will be checked + * + * Return: true if the skb contains a protected dual of public action + * management frame, false otherwise. + */ +static inline bool +ieee80211_is_protected_dual_of_public_action(struct sk_buff *skb) +{ + u8 action; + + if (!ieee80211_is_public_action((void *)skb->data, skb->len) || + skb->len < IEEE80211_MIN_ACTION_SIZE + 1) + return false; + + action = *(u8 *)(skb->data + IEEE80211_MIN_ACTION_SIZE); + + return action != WLAN_PUB_ACTION_20_40_BSS_COEX && + action != WLAN_PUB_ACTION_DSE_REG_LOC_ANN && + action != WLAN_PUB_ACTION_MSMT_PILOT && + action != WLAN_PUB_ACTION_TDLS_DISCOVER_RES && + action != WLAN_PUB_ACTION_LOC_TRACK_NOTI && + action != WLAN_PUB_ACTION_FTM_REQUEST && + action != WLAN_PUB_ACTION_FTM_RESPONSE && + action != WLAN_PUB_ACTION_FILS_DISCOVERY; +} + /** * _ieee80211_is_group_privacy_action - check if frame is a group addressed * privacy action frame diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index e751cda5eef6..8f6b6f56b65b 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2468,8 +2468,7 @@ static int ieee80211_drop_unencrypted_mgmt(struct ieee80211_rx_data *rx) /* drop unicast public action frames when using MPF */ if (is_unicast_ether_addr(mgmt->da) && - ieee80211_is_public_action((void *)rx->skb->data, - rx->skb->len)) + ieee80211_is_protected_dual_of_public_action(rx->skb)) return -EACCES; } -- cgit v1.3.1 From 7798b59409c345d4a6034a4326bceb9f7e2e8b58 Mon Sep 17 00:00:00 2001 From: Moritz Wanzenböck Date: Thu, 19 Oct 2023 14:58:47 +0200 Subject: net/handshake: fix file ref count in handshake_nl_accept_doit() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If req->hr_proto->hp_accept() fail, we call fput() twice: Once in the error path, but also a second time because sock->file is at that point already associated with the file descriptor. Once the task exits, as it would probably do after receiving an error reading from netlink, the fd is closed, calling fput() a second time. To fix, we move installing the file after the error path for the hp_accept() call. In the case of errors we simply put the unused fd. In case of success we can use fd_install() to link the sock->file to the reserved fd. Fixes: 7ea9c1ec66bc ("net/handshake: Fix handshake_dup() ref counting") Signed-off-by: Moritz Wanzenböck Reviewed-by: Chuck Lever Link: https://lore.kernel.org/r/20231019125847.276443-1-moritz.wanzenboeck@linbit.com Signed-off-by: Jakub Kicinski --- net/handshake/netlink.c | 30 +++++------------------------- 1 file changed, 5 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/handshake/netlink.c b/net/handshake/netlink.c index d0bc1dd8e65a..80c7302692c7 100644 --- a/net/handshake/netlink.c +++ b/net/handshake/netlink.c @@ -87,29 +87,6 @@ struct nlmsghdr *handshake_genl_put(struct sk_buff *msg, } EXPORT_SYMBOL(handshake_genl_put); -/* - * dup() a kernel socket for use as a user space file descriptor - * in the current process. The kernel socket must have an - * instatiated struct file. - * - * Implicit argument: "current()" - */ -static int handshake_dup(struct socket *sock) -{ - struct file *file; - int newfd; - - file = get_file(sock->file); - newfd = get_unused_fd_flags(O_CLOEXEC); - if (newfd < 0) { - fput(file); - return newfd; - } - - fd_install(newfd, file); - return newfd; -} - int handshake_nl_accept_doit(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); @@ -133,17 +110,20 @@ int handshake_nl_accept_doit(struct sk_buff *skb, struct genl_info *info) goto out_status; sock = req->hr_sk->sk_socket; - fd = handshake_dup(sock); + fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) { err = fd; goto out_complete; } + err = req->hr_proto->hp_accept(req, info, fd); if (err) { - fput(sock->file); + put_unused_fd(fd); goto out_complete; } + fd_install(fd, get_file(sock->file)); + trace_handshake_cmd_accept(net, req, req->hr_sk, fd); return 0; -- cgit v1.3.1 From 735795f68b37e9bb49f642407a0d49b1631ea1c7 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 24 Oct 2023 21:09:47 +0200 Subject: netfilter: flowtable: GC pushes back packets to classic path Since 41f2c7c342d3 ("net/sched: act_ct: Fix promotion of offloaded unreplied tuple"), flowtable GC pushes back flows with IPS_SEEN_REPLY back to classic path in every run, ie. every second. This is because of a new check for NF_FLOW_HW_ESTABLISHED which is specific of sched/act_ct. In Netfilter's flowtable case, NF_FLOW_HW_ESTABLISHED never gets set on and IPS_SEEN_REPLY is unreliable since users decide when to offload the flow before, such bit might be set on at a later stage. Fix it by adding a custom .gc handler that sched/act_ct can use to deal with its NF_FLOW_HW_ESTABLISHED bit. Fixes: 41f2c7c342d3 ("net/sched: act_ct: Fix promotion of offloaded unreplied tuple") Reported-by: Vladimir Smelhaus Reviewed-by: Paul Blakey Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 1 + net/netfilter/nf_flow_table_core.c | 14 +++++++------- net/sched/act_ct.c | 7 +++++++ 3 files changed, 15 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index d466e1a3b0b1..fe1507c1db82 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -53,6 +53,7 @@ struct nf_flowtable_type { struct list_head list; int family; int (*init)(struct nf_flowtable *ft); + bool (*gc)(const struct flow_offload *flow); int (*setup)(struct nf_flowtable *ft, struct net_device *dev, enum flow_block_command cmd); diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 1d34d700bd09..920a5a29ae1d 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -316,12 +316,6 @@ void flow_offload_refresh(struct nf_flowtable *flow_table, } EXPORT_SYMBOL_GPL(flow_offload_refresh); -static bool nf_flow_is_outdated(const struct flow_offload *flow) -{ - return test_bit(IPS_SEEN_REPLY_BIT, &flow->ct->status) && - !test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags); -} - static inline bool nf_flow_has_expired(const struct flow_offload *flow) { return nf_flow_timeout_delta(flow->timeout) <= 0; @@ -407,12 +401,18 @@ nf_flow_table_iterate(struct nf_flowtable *flow_table, return err; } +static bool nf_flow_custom_gc(struct nf_flowtable *flow_table, + const struct flow_offload *flow) +{ + return flow_table->type->gc && flow_table->type->gc(flow); +} + static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table, struct flow_offload *flow, void *data) { if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct) || - nf_flow_is_outdated(flow)) + nf_flow_custom_gc(flow_table, flow)) flow_offload_teardown(flow); if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) { diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 7c652d14528b..0d44da4e8c8e 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -278,7 +278,14 @@ err_nat: return err; } +static bool tcf_ct_flow_is_outdated(const struct flow_offload *flow) +{ + return test_bit(IPS_SEEN_REPLY_BIT, &flow->ct->status) && + !test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags); +} + static struct nf_flowtable_type flowtable_ct = { + .gc = tcf_ct_flow_is_outdated, .action = tcf_ct_flow_table_fill_actions, .owner = THIS_MODULE, }; -- cgit v1.3.1 From a63b6622120cd03a304796dbccb80655b3a21798 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Tue, 24 Oct 2023 21:58:57 +0200 Subject: net/sched: act_ct: additional checks for outdated flows Current nf_flow_is_outdated() implementation considers any flow table flow which state diverged from its underlying CT connection status for teardown which can be problematic in the following cases: - Flow has never been offloaded to hardware in the first place either because flow table has hardware offload disabled (flag NF_FLOWTABLE_HW_OFFLOAD is not set) or because it is still pending on 'add' workqueue to be offloaded for the first time. The former is incorrect, the later generates excessive deletions and additions of flows. - Flow is already pending to be updated on the workqueue. Tearing down such flows will also generate excessive removals from the flow table, especially on highly loaded system where the latency to re-offload a flow via 'add' workqueue can be quite high. When considering a flow for teardown as outdated verify that it is both offloaded to hardware and doesn't have any pending updates. Fixes: 41f2c7c342d3 ("net/sched: act_ct: Fix promotion of offloaded unreplied tuple") Reviewed-by: Paul Blakey Signed-off-by: Vlad Buslov Signed-off-by: Pablo Neira Ayuso --- net/sched/act_ct.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 0d44da4e8c8e..fb52d6f9aff9 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -281,6 +281,8 @@ err_nat: static bool tcf_ct_flow_is_outdated(const struct flow_offload *flow) { return test_bit(IPS_SEEN_REPLY_BIT, &flow->ct->status) && + test_bit(IPS_HW_OFFLOAD_BIT, &flow->ct->status) && + !test_bit(NF_FLOW_HW_PENDING, &flow->flags) && !test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags); } -- cgit v1.3.1 From 197f9fba9663e765f8f3ae3b2375c6cc32b2e2b3 Mon Sep 17 00:00:00 2001 From: Deming Wang Date: Wed, 25 Oct 2023 02:14:34 -0400 Subject: net: ipv4: fix typo in comments The word "advertize" should be replaced by "advertise". Signed-off-by: Deming Wang Signed-off-by: David S. Miller --- net/ipv4/esp4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index d18f0f092fe7..4ccfc104f13a 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -786,7 +786,7 @@ int esp_input_done2(struct sk_buff *skb, int err) /* * 1) if the NAT-T peer's IP or port changed then - * advertize the change to the keying daemon. + * advertise the change to the keying daemon. * This is an inbound SA, so just compare * SRC ports. */ -- cgit v1.3.1 From 1711435e3e67e079d6a2bce54d96d1af21c7ef2c Mon Sep 17 00:00:00 2001 From: Deming Wang Date: Wed, 25 Oct 2023 02:16:56 -0400 Subject: net: ipv6: fix typo in comments The word "advertize" should be replaced by "advertise". Signed-off-by: Deming Wang Signed-off-by: David S. Miller --- net/ipv6/esp6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index e023d29e919c..2cc1a45742d8 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -833,7 +833,7 @@ int esp6_input_done2(struct sk_buff *skb, int err) /* * 1) if the NAT-T peer's IP or port changed then - * advertize the change to the keying daemon. + * advertise the change to the keying daemon. * This is an inbound SA, so just compare * SRC ports. */ -- cgit v1.3.1 From 53b08c4985158430fd6d035fb49443bada535210 Mon Sep 17 00:00:00 2001 From: Alexandru Matei Date: Tue, 24 Oct 2023 22:17:42 +0300 Subject: vsock/virtio: initialize the_virtio_vsock before using VQs Once VQs are filled with empty buffers and we kick the host, it can send connection requests. If the_virtio_vsock is not initialized before, replies are silently dropped and do not reach the host. virtio_transport_send_pkt() can queue packets once the_virtio_vsock is set, but they won't be processed until vsock->tx_run is set to true. We queue vsock->send_pkt_work when initialization finishes to send those packets queued earlier. Fixes: 0deab087b16a ("vsock/virtio: use RCU to avoid use-after-free on the_virtio_vsock") Signed-off-by: Alexandru Matei Reviewed-by: Stefano Garzarella Link: https://lore.kernel.org/r/20231024191742.14259-1-alexandru.matei@uipath.com Signed-off-by: Jakub Kicinski --- net/vmw_vsock/virtio_transport.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index e95df847176b..b80bf681327b 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -555,6 +555,11 @@ static int virtio_vsock_vqs_init(struct virtio_vsock *vsock) virtio_device_ready(vdev); + return 0; +} + +static void virtio_vsock_vqs_start(struct virtio_vsock *vsock) +{ mutex_lock(&vsock->tx_lock); vsock->tx_run = true; mutex_unlock(&vsock->tx_lock); @@ -569,7 +574,16 @@ static int virtio_vsock_vqs_init(struct virtio_vsock *vsock) vsock->event_run = true; mutex_unlock(&vsock->event_lock); - return 0; + /* virtio_transport_send_pkt() can queue packets once + * the_virtio_vsock is set, but they won't be processed until + * vsock->tx_run is set to true. We queue vsock->send_pkt_work + * when initialization finishes to send those packets queued + * earlier. + * We don't need to queue the other workers (rx, event) because + * as long as we don't fill the queues with empty buffers, the + * host can't send us any notification. + */ + queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work); } static void virtio_vsock_vqs_del(struct virtio_vsock *vsock) @@ -664,6 +678,7 @@ static int virtio_vsock_probe(struct virtio_device *vdev) goto out; rcu_assign_pointer(the_virtio_vsock, vsock); + virtio_vsock_vqs_start(vsock); mutex_unlock(&the_virtio_vsock_mutex); @@ -736,6 +751,7 @@ static int virtio_vsock_restore(struct virtio_device *vdev) goto out; rcu_assign_pointer(the_virtio_vsock, vsock); + virtio_vsock_vqs_start(vsock); out: mutex_unlock(&the_virtio_vsock_mutex); -- cgit v1.3.1