diff options
author | Jakub Kicinski <kuba@kernel.org> | 2022-06-30 16:31:00 -0700 |
---|---|---|
committer | Jakub Kicinski <kuba@kernel.org> | 2022-06-30 16:31:00 -0700 |
commit | 0d8730f07c822a351a624462918c7109cdc7f402 (patch) | |
tree | 3539d4c7c098894f3b2d5f49134c8cfccc06aaca /net | |
parent | b7d78b46d5e8dc77c656c13885d31e931923b915 (diff) | |
parent | 5e8379351dbde61ea383e514f0f9ecb2c047cf4e (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
drivers/net/ethernet/microchip/sparx5/sparx5_switchdev.c
9c5de246c1db ("net: sparx5: mdb add/del handle non-sparx5 devices")
fbb89d02e33a ("net: sparx5: Allow mdb entries to both CPU and ports")
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'net')
-rw-r--r-- | net/bridge/br_netfilter_hooks.c | 21 | ||||
-rw-r--r-- | net/ipv4/ip_tunnel_core.c | 2 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 6 | ||||
-rw-r--r-- | net/ipv6/addrconf.c | 8 | ||||
-rw-r--r-- | net/ipv6/route.c | 9 | ||||
-rw-r--r-- | net/ipv6/seg6_hmac.c | 1 | ||||
-rw-r--r-- | net/ipv6/sit.c | 8 | ||||
-rw-r--r-- | net/mptcp/options.c | 7 | ||||
-rw-r--r-- | net/mptcp/pm.c | 10 | ||||
-rw-r--r-- | net/mptcp/protocol.c | 84 | ||||
-rw-r--r-- | net/mptcp/protocol.h | 24 | ||||
-rw-r--r-- | net/mptcp/subflow.c | 127 | ||||
-rw-r--r-- | net/ncsi/ncsi-manage.c | 3 | ||||
-rw-r--r-- | net/netfilter/nf_tables_core.c | 24 | ||||
-rw-r--r-- | net/netfilter/nf_tables_trace.c | 44 | ||||
-rw-r--r-- | net/netfilter/nft_set_hash.c | 2 | ||||
-rw-r--r-- | net/rose/rose_timer.c | 34 | ||||
-rw-r--r-- | net/sched/act_api.c | 22 | ||||
-rw-r--r-- | net/socket.c | 16 | ||||
-rw-r--r-- | net/tipc/node.c | 41 | ||||
-rw-r--r-- | net/tipc/socket.c | 1 |
21 files changed, 327 insertions, 167 deletions
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 4fd882686b04..ff4779036649 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -1012,9 +1012,24 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net, return okfn(net, sk, skb); ops = nf_hook_entries_get_hook_ops(e); - for (i = 0; i < e->num_hook_entries && - ops[i]->priority <= NF_BR_PRI_BRNF; i++) - ; + for (i = 0; i < e->num_hook_entries; i++) { + /* These hooks have already been called */ + if (ops[i]->priority < NF_BR_PRI_BRNF) + continue; + + /* These hooks have not been called yet, run them. */ + if (ops[i]->priority > NF_BR_PRI_BRNF) + break; + + /* take a closer look at NF_BR_PRI_BRNF. */ + if (ops[i]->hook == br_nf_pre_routing) { + /* This hook diverted the skb to this function, + * hooks after this have not been run yet. + */ + i++; + break; + } + } nf_hook_state_init(&state, hook, NFPROTO_BRIDGE, indev, outdev, sk, net, okfn); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 6b2dc7b2b612..cc1caab4a654 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -410,7 +410,7 @@ int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst, u32 mtu = dst_mtu(encap_dst) - headroom; if ((skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) || - (!skb_is_gso(skb) && (skb->len - skb_mac_header_len(skb)) <= mtu)) + (!skb_is_gso(skb) && (skb->len - skb_network_offset(skb)) <= mtu)) return 0; skb_dst_update_pmtu_no_confirm(skb, mtu); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fda811a5251f..68d0d8a008e2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1964,7 +1964,10 @@ process: struct sock *nsk; sk = req->rsk_listener; - drop_reason = tcp_inbound_md5_hash(sk, skb, + if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) + drop_reason = SKB_DROP_REASON_XFRM_POLICY; + else + drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr, &iph->daddr, AF_INET, dif, sdif); if (unlikely(drop_reason)) { @@ -2016,6 +2019,7 @@ process: } goto discard_and_relse; } + nf_reset_ct(skb); if (nsk == sk) { reqsk_put(req); tcp_v4_restore_cb(skb); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index fa9f72895a63..88becb037eb6 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1109,10 +1109,6 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg, goto out; } - if (net->ipv6.devconf_all->disable_policy || - idev->cnf.disable_policy) - f6i->dst_nopolicy = true; - neigh_parms_data_state_setall(idev->nd_parms); ifa->addr = *cfg->pfx; @@ -5176,9 +5172,9 @@ next: fillargs->event = RTM_GETMULTICAST; /* multicast address */ - for (ifmca = rcu_dereference(idev->mc_list); + for (ifmca = rtnl_dereference(idev->mc_list); ifmca; - ifmca = rcu_dereference(ifmca->next), ip_idx++) { + ifmca = rtnl_dereference(ifmca->next), ip_idx++) { if (ip_idx < s_ip_idx) continue; err = inet6_fill_ifmcaddr(skb, ifmca, fillargs); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 1d6f75eef4bd..70cd50c1fa6f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4569,8 +4569,15 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net, } f6i = ip6_route_info_create(&cfg, gfp_flags, NULL); - if (!IS_ERR(f6i)) + if (!IS_ERR(f6i)) { f6i->dst_nocount = true; + + if (!anycast && + (net->ipv6.devconf_all->disable_policy || + idev->cnf.disable_policy)) + f6i->dst_nopolicy = true; + } + return f6i; } diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index 6de01185cc68..d43c50a7310d 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -406,7 +406,6 @@ int __net_init seg6_hmac_net_init(struct net *net) return rhashtable_init(&sdata->hmac_infos, &rht_params); } -EXPORT_SYMBOL(seg6_hmac_net_init); void seg6_hmac_exit(void) { diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index fab89fd978f0..6b73b7a5f175 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -323,8 +323,6 @@ static int ipip6_tunnel_get_prl(struct net_device *dev, struct ip_tunnel_prl __u kcalloc(cmax, sizeof(*kp), GFP_KERNEL_ACCOUNT | __GFP_NOWARN) : NULL; - rcu_read_lock(); - ca = min(t->prl_count, cmax); if (!kp) { @@ -341,7 +339,7 @@ static int ipip6_tunnel_get_prl(struct net_device *dev, struct ip_tunnel_prl __u } } - c = 0; + rcu_read_lock(); for_each_prl_rcu(t->prl) { if (c >= cmax) break; @@ -353,7 +351,7 @@ static int ipip6_tunnel_get_prl(struct net_device *dev, struct ip_tunnel_prl __u if (kprl.addr != htonl(INADDR_ANY)) break; } -out: + rcu_read_unlock(); len = sizeof(*kp) * c; @@ -362,7 +360,7 @@ out: ret = -EFAULT; kfree(kp); - +out: return ret; } diff --git a/net/mptcp/options.c b/net/mptcp/options.c index be3b918a6d15..aead331866a0 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -765,6 +765,7 @@ static noinline bool mptcp_established_options_rst(struct sock *sk, struct sk_bu opts->suboptions |= OPTION_MPTCP_RST; opts->reset_transient = subflow->reset_transient; opts->reset_reason = subflow->reset_reason; + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPRSTTX); return true; } @@ -788,6 +789,7 @@ static bool mptcp_established_options_fastclose(struct sock *sk, opts->rcvr_key = msk->remote_key; pr_debug("FASTCLOSE key=%llu", opts->rcvr_key); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSETX); return true; } @@ -809,6 +811,7 @@ static bool mptcp_established_options_mp_fail(struct sock *sk, opts->fail_seq = subflow->map_seq; pr_debug("MP_FAIL fail_seq=%llu", opts->fail_seq); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILTX); return true; } @@ -833,13 +836,11 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, mptcp_established_options_mp_fail(sk, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSETX); } /* MP_RST can be used with MP_FASTCLOSE and MP_FAIL if there is room */ if (mptcp_established_options_rst(sk, skb, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPRSTTX); } return true; } @@ -966,7 +967,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, goto reset; subflow->mp_capable = 0; pr_fallback(msk); - __mptcp_do_fallback(msk); + mptcp_do_fallback(ssk); return false; } diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 59a85220edc9..45e2a48397b9 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -299,23 +299,21 @@ void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); - struct sock *s = (struct sock *)msk; pr_debug("fail_seq=%llu", fail_seq); if (!READ_ONCE(msk->allow_infinite_fallback)) return; - if (!READ_ONCE(subflow->mp_fail_response_expect)) { + if (!subflow->fail_tout) { pr_debug("send MP_FAIL response and infinite map"); subflow->send_mp_fail = 1; - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILTX); subflow->send_infinite_map = 1; - } else if (!sock_flag(sk, SOCK_DEAD)) { + tcp_send_ack(sk); + } else { pr_debug("MP_FAIL response received"); - - sk_stop_timer(s, &s->sk_timer); + WRITE_ONCE(subflow->fail_tout, 0); } } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e0fb9f96c45c..883bea93c2ae 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -500,7 +500,7 @@ static void mptcp_set_timeout(struct sock *sk) __mptcp_set_timeout(sk, tout); } -static bool tcp_can_send_ack(const struct sock *ssk) +static inline bool tcp_can_send_ack(const struct sock *ssk) { return !((1 << inet_sk_state_load(ssk)) & (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE | TCPF_LISTEN)); @@ -1245,7 +1245,7 @@ static void mptcp_update_infinite_map(struct mptcp_sock *msk, MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPTX); mptcp_subflow_ctx(ssk)->send_infinite_map = 0; pr_fallback(msk); - __mptcp_do_fallback(msk); + mptcp_do_fallback(ssk); } static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, @@ -2175,21 +2175,6 @@ static void mptcp_retransmit_timer(struct timer_list *t) sock_put(sk); } -static struct mptcp_subflow_context * -mp_fail_response_expect_subflow(struct mptcp_sock *msk) -{ - struct mptcp_subflow_context *subflow, *ret = NULL; - - mptcp_for_each_subflow(msk, subflow) { - if (READ_ONCE(subflow->mp_fail_response_expect)) { - ret = subflow; - break; - } - } - - return ret; -} - static void mptcp_timeout_timer(struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); @@ -2346,6 +2331,11 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, kfree_rcu(subflow, rcu); } else { /* otherwise tcp will dispose of the ssk and subflow ctx */ + if (ssk->sk_state == TCP_LISTEN) { + tcp_set_state(ssk, TCP_CLOSE); + mptcp_subflow_queue_clean(ssk); + inet_csk_listen_stop(ssk); + } __tcp_close(ssk, 0); /* close acquired an extra ref */ @@ -2518,27 +2508,50 @@ reset_timer: mptcp_reset_timer(sk); } +/* schedule the timeout timer for the relevant event: either close timeout + * or mp_fail timeout. The close timeout takes precedence on the mp_fail one + */ +void mptcp_reset_timeout(struct mptcp_sock *msk, unsigned long fail_tout) +{ + struct sock *sk = (struct sock *)msk; + unsigned long timeout, close_timeout; + + if (!fail_tout && !sock_flag(sk, SOCK_DEAD)) + return; + + close_timeout = inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies + TCP_TIMEWAIT_LEN; + + /* the close timeout takes precedence on the fail one, and here at least one of + * them is active + */ + timeout = sock_flag(sk, SOCK_DEAD) ? close_timeout : fail_tout; + + sk_reset_timer(sk, &sk->sk_timer, timeout); +} + static void mptcp_mp_fail_no_response(struct mptcp_sock *msk) { - struct mptcp_subflow_context *subflow; - struct sock *ssk; + struct sock *ssk = msk->first; bool slow; - subflow = mp_fail_response_expect_subflow(msk); - if (subflow) { - pr_debug("MP_FAIL doesn't respond, reset the subflow"); + if (!ssk) + return; - ssk = mptcp_subflow_tcp_sock(subflow); - slow = lock_sock_fast(ssk); - mptcp_subflow_reset(ssk); - unlock_sock_fast(ssk, slow); - } + pr_debug("MP_FAIL doesn't respond, reset the subflow"); + + slow = lock_sock_fast(ssk); + mptcp_subflow_reset(ssk); + WRITE_ONCE(mptcp_subflow_ctx(ssk)->fail_tout, 0); + unlock_sock_fast(ssk, slow); + + mptcp_reset_timeout(msk, 0); } static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); struct sock *sk = &msk->sk.icsk_inet.sk; + unsigned long fail_tout; int state; lock_sock(sk); @@ -2575,7 +2588,9 @@ static void mptcp_worker(struct work_struct *work) if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) __mptcp_retrans(sk); - mptcp_mp_fail_no_response(msk); + fail_tout = msk->first ? READ_ONCE(mptcp_subflow_ctx(msk->first)->fail_tout) : 0; + if (fail_tout && time_after(jiffies, fail_tout)) + mptcp_mp_fail_no_response(msk); unlock: release_sock(sk); @@ -2822,6 +2837,7 @@ static void __mptcp_destroy_sock(struct sock *sk) static void mptcp_close(struct sock *sk, long timeout) { struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk = mptcp_sk(sk); bool do_cancel_work = false; lock_sock(sk); @@ -2840,10 +2856,16 @@ static void mptcp_close(struct sock *sk, long timeout) cleanup: /* orphan all the subflows */ inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32; - mptcp_for_each_subflow(mptcp_sk(sk), subflow) { + mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast_nested(ssk); + /* since the close timeout takes precedence on the fail one, + * cancel the latter + */ + if (ssk == msk->first) + subflow->fail_tout = 0; + sock_orphan(ssk); unlock_sock_fast(ssk, slow); } @@ -2852,13 +2874,13 @@ cleanup: sock_hold(sk); pr_debug("msk=%p state=%d", sk, sk->sk_state); if (mptcp_sk(sk)->token) - mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL); + mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL); if (sk->sk_state == TCP_CLOSE) { __mptcp_destroy_sock(sk); do_cancel_work = true; } else { - sk_reset_timer(sk, &sk->sk_timer, jiffies + TCP_TIMEWAIT_LEN); + mptcp_reset_timeout(msk, 0); } release_sock(sk); if (do_cancel_work) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 200f89f6d62f..c14d70c036d0 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -306,6 +306,7 @@ struct mptcp_sock { u32 setsockopt_seq; char ca_name[TCP_CA_NAME_MAX]; + struct mptcp_sock *dl_next; }; #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock) @@ -468,7 +469,6 @@ struct mptcp_subflow_context { local_id_valid : 1, /* local_id is correctly initialized */ valid_csum_seen : 1; /* at least one csum validated */ enum mptcp_data_avail data_avail; - bool mp_fail_response_expect; u32 remote_nonce; u64 thmac; u32 local_nonce; @@ -482,6 +482,7 @@ struct mptcp_subflow_context { u8 stale_count; long delegated_status; + unsigned long fail_tout; ); @@ -608,6 +609,7 @@ void mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow); void mptcp_subflow_send_ack(struct sock *ssk); void mptcp_subflow_reset(struct sock *ssk); +void mptcp_subflow_queue_clean(struct sock *ssk); void mptcp_sock_graft(struct sock *sk, struct socket *parent); struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk); @@ -662,6 +664,7 @@ void mptcp_get_options(const struct sk_buff *skb, void mptcp_finish_connect(struct sock *sk); void __mptcp_set_connected(struct sock *sk); +void mptcp_reset_timeout(struct mptcp_sock *msk, unsigned long fail_tout); static inline bool mptcp_is_fully_established(struct sock *sk) { return inet_sk_state_load(sk) == TCP_ESTABLISHED && @@ -926,12 +929,25 @@ static inline void __mptcp_do_fallback(struct mptcp_sock *msk) set_bit(MPTCP_FALLBACK_DONE, &msk->flags); } -static inline void mptcp_do_fallback(struct sock *sk) +static inline void mptcp_do_fallback(struct sock *ssk) { - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - struct mptcp_sock *msk = mptcp_sk(subflow->conn); + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct sock *sk = subflow->conn; + struct mptcp_sock *msk; + msk = mptcp_sk(sk); __mptcp_do_fallback(msk); + if (READ_ONCE(msk->snd_data_fin_enable) && !(ssk->sk_shutdown & SEND_SHUTDOWN)) { + gfp_t saved_allocation = ssk->sk_allocation; + + /* we are in a atomic (BH) scope, override ssk default for data + * fin allocation + */ + ssk->sk_allocation = GFP_ATOMIC; + ssk->sk_shutdown |= SEND_SHUTDOWN; + tcp_shutdown(ssk, SEND_SHUTDOWN); + ssk->sk_allocation = saved_allocation; + } } #define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)", __func__, a) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index ad565ff9ebc3..d4b16d033978 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -843,7 +843,8 @@ enum mapping_status { MAPPING_INVALID, MAPPING_EMPTY, MAPPING_DATA_FIN, - MAPPING_DUMMY + MAPPING_DUMMY, + MAPPING_BAD_CSUM }; static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn) @@ -958,11 +959,7 @@ static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff * subflow->map_data_csum); if (unlikely(csum)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DATACSUMERR); - if (subflow->mp_join || subflow->valid_csum_seen) { - subflow->send_mp_fail = 1; - MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPFAILTX); - } - return subflow->mp_join ? MAPPING_INVALID : MAPPING_DUMMY; + return MAPPING_BAD_CSUM; } subflow->valid_csum_seen = 1; @@ -974,7 +971,6 @@ static enum mapping_status get_mapping_status(struct sock *ssk, { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); bool csum_reqd = READ_ONCE(msk->csum_enabled); - struct sock *sk = (struct sock *)msk; struct mptcp_ext *mpext; struct sk_buff *skb; u16 data_len; @@ -1016,9 +1012,6 @@ static enum mapping_status get_mapping_status(struct sock *ssk, pr_debug("infinite mapping received"); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX); subflow->map_data_len = 0; - if (!sock_flag(ssk, SOCK_DEAD)) - sk_stop_timer(sk, &sk->sk_timer); - return MAPPING_INVALID; } @@ -1165,6 +1158,33 @@ static bool subflow_can_fallback(struct mptcp_subflow_context *subflow) return !subflow->fully_established; } +static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + unsigned long fail_tout; + + /* greceful failure can happen only on the MPC subflow */ + if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first))) + return; + + /* since the close timeout take precedence on the fail one, + * no need to start the latter when the first is already set + */ + if (sock_flag((struct sock *)msk, SOCK_DEAD)) + return; + + /* we don't need extreme accuracy here, use a zero fail_tout as special + * value meaning no fail timeout at all; + */ + fail_tout = jiffies + TCP_RTO_MAX; + if (!fail_tout) + fail_tout = 1; + WRITE_ONCE(subflow->fail_tout, fail_tout); + tcp_send_ack(ssk); + + mptcp_reset_timeout(msk, subflow->fail_tout); +} + static bool subflow_check_data_avail(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); @@ -1184,10 +1204,8 @@ static bool subflow_check_data_avail(struct sock *ssk) status = get_mapping_status(ssk, msk); trace_subflow_check_data_avail(status, skb_peek(&ssk->sk_receive_queue)); - if (unlikely(status == MAPPING_INVALID)) - goto fallback; - - if (unlikely(status == MAPPING_DUMMY)) + if (unlikely(status == MAPPING_INVALID || status == MAPPING_DUMMY || + status == MAPPING_BAD_CSUM)) goto fallback; if (status != MAPPING_OK) @@ -1229,22 +1247,17 @@ no_data: fallback: if (!__mptcp_check_fallback(msk)) { /* RFC 8684 section 3.7. */ - if (subflow->send_mp_fail) { + if (status == MAPPING_BAD_CSUM && + (subflow->mp_join || subflow->valid_csum_seen)) { + subflow->send_mp_fail = 1; + if (!READ_ONCE(msk->allow_infinite_fallback)) { - ssk->sk_err = EBADMSG; - tcp_set_state(ssk, TCP_CLOSE); subflow->reset_transient = 0; subflow->reset_reason = MPTCP_RST_EMIDDLEBOX; - tcp_send_active_reset(ssk, GFP_ATOMIC); - while ((skb = skb_peek(&ssk->sk_receive_queue))) - sk_eat_skb(ssk, skb); - } else if (!sock_flag(ssk, SOCK_DEAD)) { - WRITE_ONCE(subflow->mp_fail_response_expect, true); - sk_reset_timer((struct sock *)msk, - &((struct sock *)msk)->sk_timer, - jiffies + TCP_RTO_MAX); + goto reset; } - WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); + mptcp_subflow_fail(msk, ssk); + WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL); return true; } @@ -1252,16 +1265,20 @@ fallback: /* fatal protocol error, close the socket. * subflow_error_report() will introduce the appropriate barriers */ - ssk->sk_err = EBADMSG; - tcp_set_state(ssk, TCP_CLOSE); subflow->reset_transient = 0; subflow->reset_reason = MPTCP_RST_EMPTCP; + +reset: + ssk->sk_err = EBADMSG; + tcp_set_state(ssk, TCP_CLOSE); + while ((skb = skb_peek(&ssk->sk_receive_queue))) + sk_eat_skb(ssk, skb); tcp_send_active_reset(ssk, GFP_ATOMIC); WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); return false; } - __mptcp_do_fallback(msk); + mptcp_do_fallback(ssk); } skb = skb_peek(&ssk->sk_receive_queue); @@ -1706,6 +1723,58 @@ static void subflow_state_change(struct sock *sk) } } +void mptcp_subflow_queue_clean(struct sock *listener_ssk) +{ + struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue; + struct mptcp_sock *msk, *next, *head = NULL; + struct request_sock *req; + + /* build a list of all unaccepted mptcp sockets */ + spin_lock_bh(&queue->rskq_lock); + for (req = queue->rskq_accept_head; req; req = req->dl_next) { + struct mptcp_subflow_context *subflow; + struct sock *ssk = req->sk; + struct mptcp_sock *msk; + + if (!sk_is_mptcp(ssk)) + continue; + + subflow = mptcp_subflow_ctx(ssk); + if (!subflow || !subflow->conn) + continue; + + /* skip if already in list */ + msk = mptcp_sk(subflow->conn); + if (msk->dl_next || msk == head) + continue; + + msk->dl_next = head; + head = msk; + } + spin_unlock_bh(&queue->rskq_lock); + if (!head) + return; + + /* can't acquire the msk socket lock under the subflow one, + * or will cause ABBA deadlock + */ + release_sock(listener_ssk); + + for (msk = head; msk; msk = next) { + struct sock *sk = (struct sock *)msk; + bool slow; + + slow = lock_sock_fast_nested(sk); + next = msk->dl_next; + msk->first = NULL; + msk->dl_next = NULL; + unlock_sock_fast(sk, slow); + } + + /* we are still under the listener msk socket lock */ + lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING); +} + static int subflow_ulp_init(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 78814417d753..80713febfac6 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -1803,7 +1803,8 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev, pdev = to_platform_device(dev->dev.parent); if (pdev) { np = pdev->dev.of_node; - if (np && of_get_property(np, "mlx,multi-host", NULL)) + if (np && (of_get_property(np, "mellanox,multi-host", NULL) || + of_get_property(np, "mlx,multi-host", NULL))) ndp->mlx_multi_host = true; } diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 53f40e473855..3ddce24ac76d 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -25,9 +25,7 @@ static noinline void __nft_trace_packet(struct nft_traceinfo *info, const struct nft_chain *chain, enum nft_trace_types type) { - const struct nft_pktinfo *pkt = info->pkt; - - if (!info->trace || !pkt->skb->nf_trace) + if (!info->trace || !info->nf_trace) return; info->chain = chain; @@ -42,11 +40,24 @@ static inline void nft_trace_packet(struct nft_traceinfo *info, enum nft_trace_types type) { if (static_branch_unlikely(&nft_trace_enabled)) { + const struct nft_pktinfo *pkt = info->pkt; + + info->nf_trace = pkt->skb->nf_trace; info->rule = rule; __nft_trace_packet(info, chain, type); } } +static inline void nft_trace_copy_nftrace(struct nft_traceinfo *info) +{ + if (static_branch_unlikely(&nft_trace_enabled)) { + const struct nft_pktinfo *pkt = info->pkt; + + if (info->trace) + info->nf_trace = pkt->skb->nf_trace; + } +} + static void nft_bitwise_fast_eval(const struct nft_expr *expr, struct nft_regs *regs) { @@ -85,6 +96,7 @@ static noinline void __nft_trace_verdict(struct nft_traceinfo *info, const struct nft_chain *chain, const struct nft_regs *regs) { + const struct nft_pktinfo *pkt = info->pkt; enum nft_trace_types type; switch (regs->verdict.code) { @@ -92,8 +104,13 @@ static noinline void __nft_trace_verdict(struct nft_traceinfo *info, case NFT_RETURN: type = NFT_TRACETYPE_RETURN; break; + case NF_STOLEN: + type = NFT_TRACETYPE_RULE; + /* can't access skb->nf_trace; use copy */ + break; default: type = NFT_TRACETYPE_RULE; + info->nf_trace = pkt->skb->nf_trace; break; } @@ -254,6 +271,7 @@ next_rule: switch (regs.verdict.code) { case NFT_BREAK: regs.verdict.code = NFT_CONTINUE; + nft_trace_copy_nftrace(&info); continue; case NFT_CONTINUE: nft_trace_packet(&info, chain, rule, diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index 5041725423c2..1163ba9c1401 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -7,7 +7,7 @@ #include <linux/module.h> #include <linux/static_key.h> #include <linux/hash.h> -#include <linux/jhash.h> +#include <linux/siphash.h> #include <linux/if_vlan.h> #include <linux/init.h> #include <linux/skbuff.h> @@ -25,22 +25,6 @@ DEFINE_STATIC_KEY_FALSE(nft_trace_enabled); EXPORT_SYMBOL_GPL(nft_trace_enabled); -static int trace_fill_id(struct sk_buff *nlskb, struct sk_buff *skb) -{ - __be32 id; - - /* using skb address as ID results in a limited number of - * values (and quick reuse). - * - * So we attempt to use as many skb members that will not - * change while skb is with netfilter. - */ - id = (__be32)jhash_2words(hash32_ptr(skb), skb_get_hash(skb), - skb->skb_iif); - - return nla_put_be32(nlskb, NFTA_TRACE_ID, id); -} - static int trace_fill_header(struct sk_buff *nlskb, u16 type, const struct sk_buff *skb, int off, unsigned int len) @@ -186,6 +170,7 @@ void nft_trace_notify(struct nft_traceinfo *info) struct nlmsghdr *nlh; struct sk_buff *skb; unsigned int size; + u32 mark = 0; u16 event; if (!nfnetlink_has_listeners(nft_net(pkt), NFNLGRP_NFTRACE)) @@ -229,7 +214,7 @@ void nft_trace_notify(struct nft_traceinfo *info) if (nla_put_be32(skb, NFTA_TRACE_TYPE, htonl(info->type))) goto nla_put_failure; - if (trace_fill_id(skb, pkt->skb)) + if (nla_put_u32(skb, NFTA_TRACE_ID, info->skbid)) goto nla_put_failure; if (nla_put_string(skb, NFTA_TRACE_CHAIN, info->chain->name)) @@ -249,16 +234,24 @@ void nft_trace_notify(struct nft_traceinfo *info) case NFT_TRACETYPE_RULE: if (nft_verdict_dump(skb, NFTA_TRACE_VERDICT, info->verdict)) goto nla_put_failure; + + /* pkt->skb undefined iff NF_STOLEN, disable dump */ + if (info->verdict->code == NF_STOLEN) + info->packet_dumped = true; + else + mark = pkt->skb->mark; + break; case NFT_TRACETYPE_POLICY: + mark = pkt->skb->mark; + if (nla_put_be32(skb, NFTA_TRACE_POLICY, htonl(info->basechain->policy))) goto nla_put_failure; break; } - if (pkt->skb->mark && - nla_put_be32(skb, NFTA_TRACE_MARK, htonl(pkt->skb->mark))) + if (mark && nla_put_be32(skb, NFTA_TRACE_MARK, htonl(mark))) goto nla_put_failure; if (!info->packet_dumped) { @@ -283,9 +276,20 @@ void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt, const struct nft_verdict *verdict, const struct nft_chain *chain) { + static siphash_key_t trace_key __read_mostly; + struct sk_buff *skb = pkt->skb; + info->basechain = nft_base_chain(chain); info->trace = true; + info->nf_trace = pkt->skb->nf_trace; info->packet_dumped = false; info->pkt = pkt; info->verdict = verdict; + + net_get_random_once(&trace_key, sizeof(trace_key)); + + info->skbid = (u32)siphash_3u32(hash32_ptr(skb), + skb_get_hash(skb), + skb->skb_iif, + &trace_key); } diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index df40314de21f..76de6c8d9865 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -143,6 +143,7 @@ static bool nft_rhash_update(struct nft_set *set, const u32 *key, /* Another cpu may race to insert the element with the same key */ if (prev) { nft_set_elem_destroy(set, he, true); + atomic_dec(&set->nelems); he = prev; } @@ -152,6 +153,7 @@ out: err2: nft_set_elem_destroy(set, he, true); + atomic_dec(&set->nelems); err1: return false; } diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c index b3138fc2e552..f06ddbed3fed 100644 --- a/net/rose/rose_timer.c +++ b/net/rose/rose_timer.c @@ -31,89 +31,89 @@ static void rose_idletimer_expiry(struct timer_list *); void rose_start_heartbeat(struct sock *sk) { - del_timer(&sk->sk_timer); + sk_stop_timer(sk, &sk->sk_timer); sk->sk_timer.function = rose_heartbeat_expiry; sk->sk_timer.expires = jiffies + 5 * HZ; - add_timer(&sk->sk_timer); + sk_reset_timer(sk, &sk->sk_timer, sk->sk_timer.expires); } void rose_start_t1timer(struct sock *sk) { struct rose_sock *rose = rose_sk(sk); - del_timer(&rose->timer); + sk_stop_timer(sk, &rose->timer); rose->timer.function = rose_timer_expiry; rose->timer.expires = jiffies + rose->t1; - add_timer(&rose->timer); + sk_reset_timer(sk, &rose->timer, rose->timer.expires); } void rose_start_t2timer(struct sock *sk) { struct rose_sock *rose = rose_sk(sk); - del_timer(&rose->timer); + sk_stop_timer(sk, &rose->timer); rose->timer.function = rose_timer_expiry; rose->timer.expires = jiffies + rose->t2; - add_timer(&rose->timer); + sk_reset_timer(sk, &rose->timer, rose->timer.expires); } void rose_start_t3timer(struct sock *sk) { struct rose_sock *rose = rose_sk(sk); - del_timer(&rose->timer); + sk_stop_timer(sk, &rose->timer); rose->timer.function = rose_timer_expiry; rose->timer.expires = jiffies + rose->t3; - add_timer(&rose->timer); + sk_reset_timer(sk, &rose->timer, rose->timer.expires); } void rose_start_hbtimer(struct sock *sk) { struct rose_sock *rose = rose_sk(sk); - del_timer(&rose->timer); + sk_stop_timer(sk, &rose->timer); rose->timer.function = rose_timer_expiry; rose->timer.expires = jiffies + rose->hb; - add_timer(&rose->timer); + sk_reset_timer(sk, &rose->timer, rose->timer.expires); } void rose_start_idletimer(struct sock *sk) { struct rose_sock *rose = rose_sk(sk); - del_timer(&rose->idletimer); + sk_stop_timer(sk, &rose->idletimer); if (rose->idle > 0) { rose->idletimer.function = rose_idletimer_expiry; rose->idletimer.expires = jiffies + rose->idle; - add_timer(&rose->idletimer); + sk_reset_timer(sk, &rose->idletimer, rose->idletimer.expires); } } void rose_stop_heartbeat(struct sock *sk) { - del_timer(&sk->sk_timer); + sk_stop_timer(sk, &sk->sk_timer); } void rose_stop_timer(struct sock *sk) { - del_timer(&rose_sk(sk)->timer); + sk_stop_timer(sk, &rose_sk(sk)->timer); } void rose_stop_idletimer(struct sock *sk) { - del_timer(&rose_sk(sk)->idletimer); + sk_stop_timer(sk, &rose_sk(sk)->idletimer); } static void rose_heartbeat_expiry(struct timer_list *t) @@ -130,6 +130,7 @@ static void rose_heartbeat_expiry(struct timer_list *t) (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD))) { bh_unlock_sock(sk); rose_destroy_socket(sk); + sock_put(sk); return; } break; @@ -152,6 +153,7 @@ static void rose_heartbeat_expiry(struct timer_list *t) rose_start_heartbeat(sk); bh_unlock_sock(sk); + sock_put(sk); } static void rose_timer_expiry(struct timer_list *t) @@ -181,6 +183,7 @@ static void rose_timer_expiry(struct timer_list *t) break; } bh_unlock_sock(sk); + sock_put(sk); } static void rose_idletimer_expiry(struct timer_list *t) @@ -205,4 +208,5 @@ static void rose_idletimer_expiry(struct timer_list *t) sock_set_flag(sk, SOCK_DEAD); } bh_unlock_sock(sk); + sock_put(sk); } diff --git a/net/sched/act_api.c b/net/sched/act_api.c index da9733da9868..817065aa2833 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -588,7 +588,8 @@ static int tcf_idr_release_unsafe(struct tc_action *p) } static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct nlattr *nest; int n_i = 0; @@ -604,20 +605,25 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, if (nla_put_string(skb, TCA_KIND, ops->kind)) goto nla_put_failure; + ret = 0; mutex_lock(&idrinfo->lock); idr_for_each_entry_ul(idr, p, tmp, id) { if (IS_ERR(p)) continue; ret = tcf_idr_release_unsafe(p); - if (ret == ACT_P_DELETED) { + if (ret == ACT_P_DELETED) module_put(ops->owner); - n_i++; - } else if (ret < 0) { - mutex_unlock(&idrinfo->lock); - goto nla_put_failure; - } + else if (ret < 0) + break; + n_i++; } mutex_unlock(&idrinfo->lock); + if (ret < 0) { + if (n_i) + NL_SET_ERR_MSG(extack, "Unable to flush all TC actions"); + else + goto nla_put_failure; + } ret = nla_put_u32(skb, TCA_FCNT, n_i); if (ret) @@ -638,7 +644,7 @@ int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, struct tcf_idrinfo *idrinfo = tn->idrinfo; if (type == RTM_DELACTION) { - return tcf_del_walker(idrinfo, skb, ops); + return tcf_del_walker(idrinfo, skb, ops, extack); } else if (type == RTM_GETACTION) { return tcf_dump_walker(idrinfo, skb, cb); } else { diff --git a/net/socket.c b/net/socket.c index 1b6f5e2ebef5..3d7eb2a79e82 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2146,10 +2146,13 @@ SYSCALL_DEFINE4(send, int, fd, void __user *, buff, size_t, len, int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags, struct sockaddr __user *addr, int __user *addr_len) { + struct sockaddr_storage address; + struct msghdr msg = { + /* Save some cycles and don't copy the address if not needed */ + .msg_name = addr ? (struct sockaddr *)&address : NULL, + }; struct socket *sock; struct iovec iov; - struct msghdr msg; - struct sockaddr_storage address; int err, err2; int fput_needed; @@ -2160,14 +2163,6 @@ int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags, if (!sock) goto out; - msg.msg_control = NULL; - msg.msg_controllen = 0; - /* Save some cycles and don't copy the address if not needed */ - msg.msg_name = addr ? (struct sockaddr *)&address : NULL; - /* We assume all kernel code knows the size of sockaddr_storage */ - msg.msg_namelen = 0; - msg.msg_iocb = NULL; - msg.msg_flags = 0; if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; err = sock_recvmsg(sock, &msg, flags); @@ -2372,6 +2367,7 @@ int __copy_msghdr_from_user(struct msghdr *kmsg, return -EFAULT; kmsg->msg_control_is_user = true; + kmsg->msg_get_inq = 0; kmsg->msg_control_user = msg.msg_control; kmsg->msg_controllen = msg.msg_controllen; kmsg->msg_flags = msg.msg_flags; diff --git a/net/tipc/node.c b/net/tipc/node.c index 6ef95ce565bd..b48d97cbbe29 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -472,8 +472,8 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id, bool preliminary) { struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_link *l, *snd_l = tipc_bc_sndlink(net); struct tipc_node *n, *temp_node; - struct tipc_link *l; unsigned long intv; int bearer_id; int i; @@ -488,6 +488,16 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id, goto exit; /* A preliminary node becomes "real" now, refresh its data */ tipc_node_write_lock(n); + if (!tipc_link_bc_create(net, tipc_own_addr(net), addr, peer_id, U16_MAX, + tipc_link_min_win(snd_l), tipc_link_max_win(snd_l), + n->capabilities, &n->bc_entry.inputq1, + &n->bc_entry.namedq, snd_l, &n->bc_entry.link)) { + pr_warn("Broadcast rcv link refresh failed, no memory\n"); + tipc_node_write_unlock_fast(n); + tipc_node_put(n); + n = NULL; + goto exit; + } n->preliminary = false; n->addr = addr; hlist_del_rcu(&n->hash); @@ -567,7 +577,16 @@ update: n->signature = INVALID_NODE_SIG; n->active_links[0] = INVALID_BEARER_ID; n->active_links[1] = INVALID_BEARER_ID; - n->bc_entry.link = NULL; + if (!preliminary && + !tipc_link_bc_create(net, tipc_own_addr(net), addr, peer_id, U16_MAX, + tipc_link_min_win(snd_l), tipc_link_max_win(snd_l), + n->capabilities, &n->bc_entry.inputq1, + &n->bc_entry.namedq, snd_l, &n->bc_entry.link)) { + pr_warn("Broadcast rcv link creation failed, no memory\n"); + kfree(n); + n = NULL; + goto exit; + } tipc_node_get(n); timer_setup(&n->timer, tipc_node_timeout, 0); /* Start a slow timer anyway, crypto needs it */ @@ -1155,7 +1174,7 @@ void tipc_node_check_dest(struct net *net, u32 addr, bool *respond, bool *dupl_addr) { struct tipc_node *n; - struct tipc_link *l, *snd_l; + struct tipc_link *l; struct tipc_link_entry *le; bool addr_match = false; bool sign_match = false; @@ -1175,22 +1194,6 @@ void tipc_node_check_dest(struct net *net, u32 addr, return; tipc_node_write_lock(n); - if (unlikely(!n->bc_entry.link)) { - snd_l = tipc_bc_sndlink(net); - if (!tipc_link_bc_create(net, tipc_own_addr(net), - addr, peer_id, U16_MAX, - tipc_link_min_win(snd_l), - tipc_link_max_win(snd_l), - n->capabilities, - &n->bc_entry.inputq1, - &n->bc_entry.namedq, snd_l, - &n->bc_entry.link)) { - pr_warn("Broadcast rcv link creation failed, no mem\n"); - tipc_node_write_unlock_fast(n); - tipc_node_put(n); - return; - } - } le = &n->links[b->identity]; diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 17f8c523e33b..43509c7e90fc 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -502,6 +502,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, sock_init_data(sock, sk); tipc_set_sk_state(sk, TIPC_OPEN); if (tipc_sk_insert(tsk)) { + sk_free(sk); pr_warn("Socket create failed; port number exhausted\n"); return -EINVAL; } |