From 7e9be1124dbe7888907e82cab20164578e3f9ab7 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 29 Aug 2023 19:51:57 +0200 Subject: netfilter: nf_tables: Audit log setelem reset Since set element reset is not integrated into nf_tables' transaction logic, an explicit log call is needed, similar to NFT_MSG_GETOBJ_RESET handling. For the sake of simplicity, catchall element reset will always generate a dedicated log entry. This relieves nf_tables_dump_set() from having to adjust the logged element count depending on whether a catchall element was found or not. Fixes: 079cd633219d7 ("netfilter: nf_tables: Introduce NFT_MSG_GETSETELEM_RESET") Signed-off-by: Phil Sutter Reviewed-by: Richard Guy Briggs Signed-off-by: Pablo Neira Ayuso --- include/linux/audit.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/audit.h b/include/linux/audit.h index 6a3a9e122bb5..192bf03aacc5 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -117,6 +117,7 @@ enum audit_nfcfgop { AUDIT_NFT_OP_OBJ_RESET, AUDIT_NFT_OP_FLOWTABLE_REGISTER, AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, + AUDIT_NFT_OP_SETELEM_RESET, AUDIT_NFT_OP_INVALID, }; -- cgit v1.3.1 From ea078ae9108e25fc881c84369f7c03931d22e555 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 29 Aug 2023 19:51:58 +0200 Subject: netfilter: nf_tables: Audit log rule reset Resetting rules' stateful data happens outside of the transaction logic, so 'get' and 'dump' handlers have to emit audit log entries themselves. Fixes: 8daa8fde3fc3f ("netfilter: nf_tables: Introduce NFT_MSG_GETRULE_RESET") Signed-off-by: Phil Sutter Reviewed-by: Richard Guy Briggs Signed-off-by: Pablo Neira Ayuso --- include/linux/audit.h | 1 + kernel/auditsc.c | 1 + net/netfilter/nf_tables_api.c | 18 ++++++++++++++++++ 3 files changed, 20 insertions(+) (limited to 'include') diff --git a/include/linux/audit.h b/include/linux/audit.h index 192bf03aacc5..51b1b7054a23 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -118,6 +118,7 @@ enum audit_nfcfgop { AUDIT_NFT_OP_FLOWTABLE_REGISTER, AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, AUDIT_NFT_OP_SETELEM_RESET, + AUDIT_NFT_OP_RULE_RESET, AUDIT_NFT_OP_INVALID, }; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 38481e318197..fc0c7c03eeab 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -144,6 +144,7 @@ static const struct audit_nfcfgop_tab audit_nfcfgs[] = { { AUDIT_NFT_OP_FLOWTABLE_REGISTER, "nft_register_flowtable" }, { AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, "nft_unregister_flowtable" }, { AUDIT_NFT_OP_SETELEM_RESET, "nft_reset_setelem" }, + { AUDIT_NFT_OP_RULE_RESET, "nft_reset_rule" }, { AUDIT_NFT_OP_INVALID, "nft_invalid" }, }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 361e98e71692..2c81cee858d6 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3422,6 +3422,18 @@ err: nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); } +static void audit_log_rule_reset(const struct nft_table *table, + unsigned int base_seq, + unsigned int nentries) +{ + char *buf = kasprintf(GFP_ATOMIC, "%s:%u", + table->name, base_seq); + + audit_log_nfcfg(buf, table->family, nentries, + AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC); + kfree(buf); +} + struct nft_rule_dump_ctx { char *table; char *chain; @@ -3528,6 +3540,9 @@ static int nf_tables_dump_rules(struct sk_buff *skb, done: rcu_read_unlock(); + if (reset && idx > cb->args[0]) + audit_log_rule_reset(table, cb->seq, idx - cb->args[0]); + cb->args[0] = idx; return skb->len; } @@ -3635,6 +3650,9 @@ static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info, if (err < 0) goto err_fill_rule_info; + if (reset) + audit_log_rule_reset(table, nft_pernet(net)->base_seq, 1); + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); err_fill_rule_info: -- cgit v1.3.1 From 8aae7625ff3f0bd5484d01f1b8d5af82e44bec2d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 30 Aug 2023 13:00:37 +0200 Subject: net: fib: avoid warn splat in flow dissector New skbs allocated via nf_send_reset() have skb->dev == NULL. fib*_rules_early_flow_dissect helpers already have a 'struct net' argument but its not passed down to the flow dissector core, which will then WARN as it can't derive a net namespace to use: WARNING: CPU: 0 PID: 0 at net/core/flow_dissector.c:1016 __skb_flow_dissect+0xa91/0x1cd0 [..] ip_route_me_harder+0x143/0x330 nf_send_reset+0x17c/0x2d0 [nf_reject_ipv4] nft_reject_inet_eval+0xa9/0xf2 [nft_reject_inet] nft_do_chain+0x198/0x5d0 [nf_tables] nft_do_chain_inet+0xa4/0x110 [nf_tables] nf_hook_slow+0x41/0xc0 ip_local_deliver+0xce/0x110 .. Cc: Stanislav Fomichev Cc: David Ahern Cc: Ido Schimmel Fixes: 812fa71f0d96 ("netfilter: Dissect flow after packet mangling") Link: https://bugzilla.kernel.org/show_bug.cgi?id=217826 Signed-off-by: Florian Westphal Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20230830110043.30497-1-fw@strlen.de Signed-off-by: Paolo Abeni --- include/net/ip6_fib.h | 5 ++++- include/net/ip_fib.h | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index c9ff23cf313e..1ba9f4ddf2f6 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -642,7 +642,10 @@ static inline bool fib6_rules_early_flow_dissect(struct net *net, if (!net->ipv6.fib6_rules_require_fldissect) return false; - skb_flow_dissect_flow_keys(skb, flkeys, flag); + memset(flkeys, 0, sizeof(*flkeys)); + __skb_flow_dissect(net, skb, &flow_keys_dissector, + flkeys, NULL, 0, 0, 0, flag); + fl6->fl6_sport = flkeys->ports.src; fl6->fl6_dport = flkeys->ports.dst; fl6->flowi6_proto = flkeys->basic.ip_proto; diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index a378eff827c7..f0c13864180e 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -418,7 +418,10 @@ static inline bool fib4_rules_early_flow_dissect(struct net *net, if (!net->ipv4.fib_rules_require_fldissect) return false; - skb_flow_dissect_flow_keys(skb, flkeys, flag); + memset(flkeys, 0, sizeof(*flkeys)); + __skb_flow_dissect(net, skb, &flow_keys_dissector, + flkeys, NULL, 0, 0, 0, flag); + fl4->fl4_sport = flkeys->ports.src; fl4->fl4_dport = flkeys->ports.dst; fl4->flowi4_proto = flkeys->basic.ip_proto; -- cgit v1.3.1 From 6a86b5b5cd76d2734304a0173f5f01aa8aa2025e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 29 Aug 2023 22:53:52 +0200 Subject: bpf: Annotate bpf_long_memcpy with data_race syzbot reported a data race splat between two processes trying to update the same BPF map value via syscall on different CPUs: BUG: KCSAN: data-race in bpf_percpu_array_update / bpf_percpu_array_update write to 0xffffe8fffe7425d8 of 8 bytes by task 8257 on cpu 1: bpf_long_memcpy include/linux/bpf.h:428 [inline] bpf_obj_memcpy include/linux/bpf.h:441 [inline] copy_map_value_long include/linux/bpf.h:464 [inline] bpf_percpu_array_update+0x3bb/0x500 kernel/bpf/arraymap.c:380 bpf_map_update_value+0x190/0x370 kernel/bpf/syscall.c:175 generic_map_update_batch+0x3ae/0x4f0 kernel/bpf/syscall.c:1749 bpf_map_do_batch+0x2df/0x3d0 kernel/bpf/syscall.c:4648 __sys_bpf+0x28a/0x780 __do_sys_bpf kernel/bpf/syscall.c:5241 [inline] __se_sys_bpf kernel/bpf/syscall.c:5239 [inline] __x64_sys_bpf+0x43/0x50 kernel/bpf/syscall.c:5239 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd write to 0xffffe8fffe7425d8 of 8 bytes by task 8268 on cpu 0: bpf_long_memcpy include/linux/bpf.h:428 [inline] bpf_obj_memcpy include/linux/bpf.h:441 [inline] copy_map_value_long include/linux/bpf.h:464 [inline] bpf_percpu_array_update+0x3bb/0x500 kernel/bpf/arraymap.c:380 bpf_map_update_value+0x190/0x370 kernel/bpf/syscall.c:175 generic_map_update_batch+0x3ae/0x4f0 kernel/bpf/syscall.c:1749 bpf_map_do_batch+0x2df/0x3d0 kernel/bpf/syscall.c:4648 __sys_bpf+0x28a/0x780 __do_sys_bpf kernel/bpf/syscall.c:5241 [inline] __se_sys_bpf kernel/bpf/syscall.c:5239 [inline] __x64_sys_bpf+0x43/0x50 kernel/bpf/syscall.c:5239 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd value changed: 0x0000000000000000 -> 0xfffffff000002788 The bpf_long_memcpy is used with 8-byte aligned pointers, power-of-8 size and forced to use long read/writes to try to atomically copy long counters. It is best-effort only and no barriers are here since it _will_ race with concurrent updates from BPF programs. The bpf_long_memcpy() is called from bpf(2) syscall. Marco suggested that the best way to make this known to KCSAN would be to use data_race() annotation. Reported-by: syzbot+97522333291430dd277f@syzkaller.appspotmail.com Suggested-by: Marco Elver Signed-off-by: Daniel Borkmann Acked-by: Marco Elver Link: https://lore.kernel.org/bpf/000000000000d87a7f06040c970c@google.com Link: https://lore.kernel.org/bpf/57628f7a15e20d502247c3b55fceb1cb2b31f266.1693342186.git.daniel@iogearbox.net --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 12596af59c00..024e8b28c34b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -438,7 +438,7 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) size /= sizeof(long); while (size--) - *ldst++ = *lsrc++; + data_race(*ldst++ = *lsrc++); } /* copy everything but bpf_spin_lock, bpf_timer, and kptrs. There could be one of each. */ -- cgit v1.3.1 From 5e6300e7b3a4ab5b72a82079753868e91fbf9efc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 31 Aug 2023 13:52:09 +0000 Subject: net: annotate data-races around sk->sk_forward_alloc Every time sk->sk_forward_alloc is read locklessly, add a READ_ONCE(). Add sk_forward_alloc_add() helper to centralize updates, to reduce number of WRITE_ONCE(). Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 12 +++++++++--- net/core/sock.c | 8 ++++---- net/ipv4/tcp_output.c | 2 +- net/ipv4/udp.c | 6 +++--- net/mptcp/protocol.c | 6 +++--- 5 files changed, 20 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 11d503417591..f04869ac1d92 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1053,6 +1053,12 @@ static inline void sk_wmem_queued_add(struct sock *sk, int val) WRITE_ONCE(sk->sk_wmem_queued, sk->sk_wmem_queued + val); } +static inline void sk_forward_alloc_add(struct sock *sk, int val) +{ + /* Paired with lockless reads of sk->sk_forward_alloc */ + WRITE_ONCE(sk->sk_forward_alloc, sk->sk_forward_alloc + val); +} + void sk_stream_write_space(struct sock *sk); /* OOB backlog add */ @@ -1377,7 +1383,7 @@ static inline int sk_forward_alloc_get(const struct sock *sk) if (sk->sk_prot->forward_alloc_get) return sk->sk_prot->forward_alloc_get(sk); #endif - return sk->sk_forward_alloc; + return READ_ONCE(sk->sk_forward_alloc); } static inline bool __sk_stream_memory_free(const struct sock *sk, int wake) @@ -1673,14 +1679,14 @@ static inline void sk_mem_charge(struct sock *sk, int size) { if (!sk_has_account(sk)) return; - sk->sk_forward_alloc -= size; + sk_forward_alloc_add(sk, -size); } static inline void sk_mem_uncharge(struct sock *sk, int size) { if (!sk_has_account(sk)) return; - sk->sk_forward_alloc += size; + sk_forward_alloc_add(sk, size); sk_mem_reclaim(sk); } diff --git a/net/core/sock.c b/net/core/sock.c index a61ec97098ad..40e1bda4bde0 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1045,7 +1045,7 @@ static int sock_reserve_memory(struct sock *sk, int bytes) mem_cgroup_uncharge_skmem(sk->sk_memcg, pages); return -ENOMEM; } - sk->sk_forward_alloc += pages << PAGE_SHIFT; + sk_forward_alloc_add(sk, pages << PAGE_SHIFT); WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem + (pages << PAGE_SHIFT)); @@ -3139,10 +3139,10 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind) { int ret, amt = sk_mem_pages(size); - sk->sk_forward_alloc += amt << PAGE_SHIFT; + sk_forward_alloc_add(sk, amt << PAGE_SHIFT); ret = __sk_mem_raise_allocated(sk, size, amt, kind); if (!ret) - sk->sk_forward_alloc -= amt << PAGE_SHIFT; + sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT)); return ret; } EXPORT_SYMBOL(__sk_mem_schedule); @@ -3174,7 +3174,7 @@ void __sk_mem_reduce_allocated(struct sock *sk, int amount) void __sk_mem_reclaim(struct sock *sk, int amount) { amount >>= PAGE_SHIFT; - sk->sk_forward_alloc -= amount << PAGE_SHIFT; + sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT)); __sk_mem_reduce_allocated(sk, amount); } EXPORT_SYMBOL(__sk_mem_reclaim); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e6b4fbd642f7..ccfc8bbf7455 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3474,7 +3474,7 @@ void sk_forced_mem_schedule(struct sock *sk, int size) if (delta <= 0) return; amt = sk_mem_pages(delta); - sk->sk_forward_alloc += amt << PAGE_SHIFT; + sk_forward_alloc_add(sk, amt << PAGE_SHIFT); sk_memory_allocated_add(sk, amt); if (mem_cgroup_sockets_enabled && sk->sk_memcg) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 0794a2c46a56..f39b9c844580 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1414,9 +1414,9 @@ static void udp_rmem_release(struct sock *sk, int size, int partial, spin_lock(&sk_queue->lock); - sk->sk_forward_alloc += size; + sk_forward_alloc_add(sk, size); amt = (sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1); - sk->sk_forward_alloc -= amt; + sk_forward_alloc_add(sk, -amt); if (amt) __sk_mem_reduce_allocated(sk, amt >> PAGE_SHIFT); @@ -1527,7 +1527,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) goto uncharge_drop; } - sk->sk_forward_alloc -= size; + sk_forward_alloc_add(sk, -size); /* no need to setup a destructor, we will explicitly release the * forward allocated memory on dequeue diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 933b257eee02..625df3a36c46 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1800,7 +1800,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } /* data successfully copied into the write queue */ - sk->sk_forward_alloc -= total_ts; + sk_forward_alloc_add(sk, -total_ts); copied += psize; dfrag->data_len += psize; frag_truesize += psize; @@ -3257,7 +3257,7 @@ void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags) /* move all the rx fwd alloc into the sk_mem_reclaim_final in * inet_sock_destruct() will dispose it */ - sk->sk_forward_alloc += msk->rmem_fwd_alloc; + sk_forward_alloc_add(sk, msk->rmem_fwd_alloc); msk->rmem_fwd_alloc = 0; mptcp_token_destroy(msk); mptcp_pm_free_anno_list(msk); @@ -3522,7 +3522,7 @@ static void mptcp_shutdown(struct sock *sk, int how) static int mptcp_forward_alloc_get(const struct sock *sk) { - return sk->sk_forward_alloc + mptcp_sk(sk)->rmem_fwd_alloc; + return READ_ONCE(sk->sk_forward_alloc) + mptcp_sk(sk)->rmem_fwd_alloc; } static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v) -- cgit v1.3.1 From e3390b30a5dfb112e8e802a59c0f68f947b638b2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 31 Aug 2023 13:52:11 +0000 Subject: net: annotate data-races around sk->sk_tsflags sk->sk_tsflags can be read locklessly, add corresponding annotations. Fixes: b9f40e21ef42 ("net-timestamp: move timestamp flags out of sk_flags") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/ip.h | 2 +- include/net/sock.h | 17 ++++++++++------- net/can/j1939/socket.c | 10 ++++++---- net/core/skbuff.c | 10 ++++++---- net/core/sock.c | 4 ++-- net/ipv4/ip_output.c | 2 +- net/ipv4/ip_sockglue.c | 2 +- net/ipv4/tcp.c | 4 ++-- net/ipv6/ip6_output.c | 2 +- net/ipv6/ping.c | 2 +- net/ipv6/raw.c | 2 +- net/ipv6/udp.c | 2 +- net/socket.c | 13 +++++++------ 13 files changed, 40 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index 19adacd5ece0..9276cea775cc 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -94,7 +94,7 @@ static inline void ipcm_init_sk(struct ipcm_cookie *ipcm, ipcm_init(ipcm); ipcm->sockc.mark = READ_ONCE(inet->sk.sk_mark); - ipcm->sockc.tsflags = inet->sk.sk_tsflags; + ipcm->sockc.tsflags = READ_ONCE(inet->sk.sk_tsflags); ipcm->oif = READ_ONCE(inet->sk.sk_bound_dev_if); ipcm->addr = inet->inet_saddr; ipcm->protocol = inet->inet_num; diff --git a/include/net/sock.h b/include/net/sock.h index f04869ac1d92..b770261fbdaf 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1906,7 +1906,9 @@ struct sockcm_cookie { static inline void sockcm_init(struct sockcm_cookie *sockc, const struct sock *sk) { - *sockc = (struct sockcm_cookie) { .tsflags = sk->sk_tsflags }; + *sockc = (struct sockcm_cookie) { + .tsflags = READ_ONCE(sk->sk_tsflags) + }; } int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, @@ -2701,9 +2703,9 @@ void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk, static inline void sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { - ktime_t kt = skb->tstamp; struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb); - + u32 tsflags = READ_ONCE(sk->sk_tsflags); + ktime_t kt = skb->tstamp; /* * generate control messages if * - receive time stamping in software requested @@ -2711,10 +2713,10 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) * - hardware time stamps available and wanted */ if (sock_flag(sk, SOCK_RCVTSTAMP) || - (sk->sk_tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) || - (kt && sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) || + (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) || + (kt && tsflags & SOF_TIMESTAMPING_SOFTWARE) || (hwtstamps->hwtstamp && - (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE))) + (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE))) __sock_recv_timestamp(msg, sk, skb); else sock_write_timestamp(sk, kt); @@ -2736,7 +2738,8 @@ static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, #define TSFLAGS_ANY (SOF_TIMESTAMPING_SOFTWARE | \ SOF_TIMESTAMPING_RAW_HARDWARE) - if (sk->sk_flags & FLAGS_RECV_CMSGS || sk->sk_tsflags & TSFLAGS_ANY) + if (sk->sk_flags & FLAGS_RECV_CMSGS || + READ_ONCE(sk->sk_tsflags) & TSFLAGS_ANY) __sock_recv_cmsgs(msg, sk, skb); else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP))) sock_write_timestamp(sk, skb->tstamp); diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index feaec4ad6d16..b28c976f52a0 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -974,6 +974,7 @@ static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk, struct sock_exterr_skb *serr; struct sk_buff *skb; char *state = "UNK"; + u32 tsflags; int err; jsk = j1939_sk(sk); @@ -981,13 +982,14 @@ static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk, if (!(jsk->state & J1939_SOCK_ERRQUEUE)) return; + tsflags = READ_ONCE(sk->sk_tsflags); switch (type) { case J1939_ERRQUEUE_TX_ACK: - if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)) + if (!(tsflags & SOF_TIMESTAMPING_TX_ACK)) return; break; case J1939_ERRQUEUE_TX_SCHED: - if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_SCHED)) + if (!(tsflags & SOF_TIMESTAMPING_TX_SCHED)) return; break; case J1939_ERRQUEUE_TX_ABORT: @@ -997,7 +999,7 @@ static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk, case J1939_ERRQUEUE_RX_DPO: fallthrough; case J1939_ERRQUEUE_RX_ABORT: - if (!(sk->sk_tsflags & SOF_TIMESTAMPING_RX_SOFTWARE)) + if (!(tsflags & SOF_TIMESTAMPING_RX_SOFTWARE)) return; break; default: @@ -1054,7 +1056,7 @@ static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk, } serr->opt_stats = true; - if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) + if (tsflags & SOF_TIMESTAMPING_OPT_ID) serr->ee.ee_data = session->tskey; netdev_dbg(session->priv->ndev, "%s: 0x%p tskey: %i, state: %s\n", diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 45707059082f..24f26e816184 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5207,7 +5207,7 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb, serr->ee.ee_info = tstype; serr->opt_stats = opt_stats; serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0; - if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { + if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) { serr->ee.ee_data = skb_shinfo(skb)->tskey; if (sk_is_tcp(sk)) serr->ee.ee_data -= atomic_read(&sk->sk_tskey); @@ -5263,21 +5263,23 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, { struct sk_buff *skb; bool tsonly, opt_stats = false; + u32 tsflags; if (!sk) return; - if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && + tsflags = READ_ONCE(sk->sk_tsflags); + if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) return; - tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY; + tsonly = tsflags & SOF_TIMESTAMPING_OPT_TSONLY; if (!skb_may_tx_timestamp(sk, tsonly)) return; if (tsonly) { #ifdef CONFIG_INET - if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && + if ((tsflags & SOF_TIMESTAMPING_OPT_STATS) && sk_is_tcp(sk)) { skb = tcp_get_timestamping_opt_stats(sk, orig_skb, ack_skb); diff --git a/net/core/sock.c b/net/core/sock.c index 40e1bda4bde0..d05a290300b6 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -937,7 +937,7 @@ int sock_set_timestamping(struct sock *sk, int optname, return ret; } - sk->sk_tsflags = val; + WRITE_ONCE(sk->sk_tsflags, val); sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); if (val & SOF_TIMESTAMPING_RX_SOFTWARE) @@ -1719,7 +1719,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, case SO_TIMESTAMPING_OLD: lv = sizeof(v.timestamping); - v.timestamping.flags = sk->sk_tsflags; + v.timestamping.flags = READ_ONCE(sk->sk_tsflags); v.timestamping.bind_phc = sk->sk_bind_phc; break; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index b2e0ad312028..4ab877cf6d35 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -981,7 +981,7 @@ static int __ip_append_data(struct sock *sk, paged = !!cork->gso_size; if (cork->tx_flags & SKBTX_ANY_TSTAMP && - sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) + READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) tskey = atomic_inc_return(&sk->sk_tskey) - 1; hh_len = LL_RESERVED_SPACE(rt->dst.dev); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index d1c73660b844..cce9cb25f3b3 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -511,7 +511,7 @@ static bool ipv4_datagram_support_cmsg(const struct sock *sk, * or without payload (SOF_TIMESTAMPING_OPT_TSONLY). */ info = PKTINFO_SKB_CB(skb); - if (!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG) || + if (!(READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_CMSG) || !info->ipi_ifindex) return false; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index cee1e548660c..cc4b250262c1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2259,14 +2259,14 @@ void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, } } - if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) + if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_SOFTWARE) has_timestamping = true; else tss->ts[0] = (struct timespec64) {0}; } if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) { - if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) + if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_RAW_HARDWARE) has_timestamping = true; else tss->ts[2] = (struct timespec64) {0}; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 4ab50169a5a9..54fc4c711f2c 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1501,7 +1501,7 @@ static int __ip6_append_data(struct sock *sk, orig_mtu = mtu; if (cork->tx_flags & SKBTX_ANY_TSTAMP && - sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) + READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) tskey = atomic_inc_return(&sk->sk_tskey) - 1; hh_len = LL_RESERVED_SPACE(rt->dst.dev); diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 1b2772834972..5831aaa53d75 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -119,7 +119,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) return -EINVAL; ipcm6_init_sk(&ipc6, np); - ipc6.sockc.tsflags = sk->sk_tsflags; + ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); ipc6.sockc.mark = READ_ONCE(sk->sk_mark); fl6.flowi6_oif = oif; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 0eae7661a85c..42fcec3ecf5e 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -772,7 +772,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_uid = sk->sk_uid; ipcm6_init(&ipc6); - ipc6.sockc.tsflags = sk->sk_tsflags; + ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); ipc6.sockc.mark = fl6.flowi6_mark; if (sin6) { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index ebc6ae47cfea..86b5d509a468 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1339,7 +1339,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipcm6_init(&ipc6); ipc6.gso_size = READ_ONCE(up->gso_size); - ipc6.sockc.tsflags = sk->sk_tsflags; + ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); ipc6.sockc.mark = READ_ONCE(sk->sk_mark); /* destination address check */ diff --git a/net/socket.c b/net/socket.c index 848116d06b51..98ffffab949e 100644 --- a/net/socket.c +++ b/net/socket.c @@ -825,7 +825,7 @@ static bool skb_is_swtx_tstamp(const struct sk_buff *skb, int false_tstamp) static ktime_t get_timestamp(struct sock *sk, struct sk_buff *skb, int *if_index) { - bool cycles = sk->sk_tsflags & SOF_TIMESTAMPING_BIND_PHC; + bool cycles = READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_BIND_PHC; struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); struct net_device *orig_dev; ktime_t hwtstamp; @@ -877,12 +877,12 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP); int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW); struct scm_timestamping_internal tss; - int empty = 1, false_tstamp = 0; struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); int if_index; ktime_t hwtstamp; + u32 tsflags; /* Race occurred between timestamp enabling and packet receiving. Fill in the current time for now. */ @@ -924,11 +924,12 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, } memset(&tss, 0, sizeof(tss)); - if ((sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) && + tsflags = READ_ONCE(sk->sk_tsflags); + if ((tsflags & SOF_TIMESTAMPING_SOFTWARE) && ktime_to_timespec64_cond(skb->tstamp, tss.ts + 0)) empty = 0; if (shhwtstamps && - (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) && + (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) && !skb_is_swtx_tstamp(skb, false_tstamp)) { if_index = 0; if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NETDEV) @@ -936,14 +937,14 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, else hwtstamp = shhwtstamps->hwtstamp; - if (sk->sk_tsflags & SOF_TIMESTAMPING_BIND_PHC) + if (tsflags & SOF_TIMESTAMPING_BIND_PHC) hwtstamp = ptp_convert_timestamp(&hwtstamp, sk->sk_bind_phc); if (ktime_to_timespec64_cond(hwtstamp, tss.ts + 2)) { empty = 0; - if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) && + if ((tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) && !skb_is_err_queue(skb)) put_ts_pktinfo(msg, skb, if_index); } -- cgit v1.3.1 From 6ac66cb03ae306c2e288a9be18226310529f5b25 Mon Sep 17 00:00:00 2001 From: Sriram Yagnaraman Date: Thu, 31 Aug 2023 10:03:30 +0200 Subject: ipv4: ignore dst hint for multipath routes Route hints when the nexthop is part of a multipath group causes packets in the same receive batch to be sent to the same nexthop irrespective of the multipath hash of the packet. So, do not extract route hint for packets whose destination is part of a multipath group. A new SKB flag IPSKB_MULTIPATH is introduced for this purpose, set the flag when route is looked up in ip_mkroute_input() and use it in ip_extract_route_hint() to check for the existence of the flag. Fixes: 02b24941619f ("ipv4: use dst hint for ipv4 list receive") Signed-off-by: Sriram Yagnaraman Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip.h | 1 + net/ipv4/ip_input.c | 3 ++- net/ipv4/route.c | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index 9276cea775cc..3489a1cca5e7 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -57,6 +57,7 @@ struct inet_skb_parm { #define IPSKB_FRAG_PMTU BIT(6) #define IPSKB_L3SLAVE BIT(7) #define IPSKB_NOPOLICY BIT(8) +#define IPSKB_MULTIPATH BIT(9) u16 frag_max_size; }; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index fe9ead9ee863..5e9c8156656a 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -584,7 +584,8 @@ static void ip_sublist_rcv_finish(struct list_head *head) static struct sk_buff *ip_extract_route_hint(const struct net *net, struct sk_buff *skb, int rt_type) { - if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST) + if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST || + IPCB(skb)->flags & IPSKB_MULTIPATH) return NULL; return skb; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a4e153dd615b..6a3f57a3fa41 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2144,6 +2144,7 @@ static int ip_mkroute_input(struct sk_buff *skb, int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys); fib_select_multipath(res, h); + IPCB(skb)->flags |= IPSKB_MULTIPATH; } #endif -- cgit v1.3.1 From 8423be8926aa82cd2e28bba5cc96ccb72c7ce6be Mon Sep 17 00:00:00 2001 From: Sriram Yagnaraman Date: Thu, 31 Aug 2023 10:03:31 +0200 Subject: ipv6: ignore dst hint for multipath routes Route hints when the nexthop is part of a multipath group causes packets in the same receive batch to be sent to the same nexthop irrespective of the multipath hash of the packet. So, do not extract route hint for packets whose destination is part of a multipath group. A new SKB flag IP6SKB_MULTIPATH is introduced for this purpose, set the flag when route is looked up in fib6_select_path() and use it in ip6_can_use_hint() to check for the existence of the flag. Fixes: 197dbf24e360 ("ipv6: introduce and uses route look hints for list input.") Signed-off-by: Sriram Yagnaraman Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/ipv6.h | 1 + net/ipv6/ip6_input.c | 3 ++- net/ipv6/route.c | 3 +++ 3 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 5883551b1ee8..af8a771a053c 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -147,6 +147,7 @@ struct inet6_skb_parm { #define IP6SKB_JUMBOGRAM 128 #define IP6SKB_SEG6 256 #define IP6SKB_FAKEJUMBO 512 +#define IP6SKB_MULTIPATH 1024 }; #if defined(CONFIG_NET_L3_MASTER_DEV) diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index d94041bb4287..b8378814532c 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -99,7 +99,8 @@ static bool ip6_can_use_hint(const struct sk_buff *skb, static struct sk_buff *ip6_extract_route_hint(const struct net *net, struct sk_buff *skb) { - if (fib6_routes_require_src(net) || fib6_has_custom_rules(net)) + if (fib6_routes_require_src(net) || fib6_has_custom_rules(net) || + IP6CB(skb)->flags & IP6SKB_MULTIPATH) return NULL; return skb; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 846aec8e0093..01d6d352850a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -423,6 +423,9 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, if (match->nh && have_oif_match && res->nh) return; + if (skb) + IP6CB(skb)->flags |= IP6SKB_MULTIPATH; + /* We might have already computed the hash for ICMPv6 errors. In such * case it will always be non-zero. Otherwise now is the time to do it. */ -- cgit v1.3.1 From 719c5e37e99d2fd588d1c994284d17650a66354c Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 1 Sep 2023 06:53:23 +0200 Subject: net: phy: micrel: Correct bit assignments for phy_device flags Previously, the defines for phy_device flags in the Micrel driver were ambiguous in their representation. They were intended to be bit masks but were mistakenly defined as bit positions. This led to the following issues: - MICREL_KSZ8_P1_ERRATA, designated for KSZ88xx switches, overlapped with MICREL_PHY_FXEN and MICREL_PHY_50MHZ_CLK. - Due to this overlap, the code path for MICREL_PHY_FXEN, tailored for the KSZ8041 PHY, was not executed for KSZ88xx PHYs. - Similarly, the code associated with MICREL_PHY_50MHZ_CLK wasn't triggered for KSZ88xx. To rectify this, all three flags have now been explicitly converted to use the `BIT()` macro, ensuring they are defined as bit masks and preventing potential overlaps in the future. Fixes: 49011e0c1555 ("net: phy: micrel: ksz886x/ksz8081: add cabletest support") Signed-off-by: Oleksij Rempel Reviewed-by: Russell King (Oracle) Signed-off-by: David S. Miller --- include/linux/micrel_phy.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/micrel_phy.h b/include/linux/micrel_phy.h index 8bef1ab62bba..322d87255984 100644 --- a/include/linux/micrel_phy.h +++ b/include/linux/micrel_phy.h @@ -41,9 +41,9 @@ #define PHY_ID_KSZ9477 0x00221631 /* struct phy_device dev_flags definitions */ -#define MICREL_PHY_50MHZ_CLK 0x00000001 -#define MICREL_PHY_FXEN 0x00000002 -#define MICREL_KSZ8_P1_ERRATA 0x00000003 +#define MICREL_PHY_50MHZ_CLK BIT(0) +#define MICREL_PHY_FXEN BIT(1) +#define MICREL_KSZ8_P1_ERRATA BIT(2) #define MICREL_KSZ9021_EXTREG_CTRL 0xB #define MICREL_KSZ9021_EXTREG_DATA_WRITE 0xC -- cgit v1.3.1 From 718e6b51298e0f254baca0d40ab52a00e004e014 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 1 Sep 2023 16:46:04 -0700 Subject: af_unix: Fix msg_controllen test in scm_pidfd_recv() for MSG_CMSG_COMPAT. Heiko Carstens reported that SCM_PIDFD does not work with MSG_CMSG_COMPAT because scm_pidfd_recv() always checks msg_controllen against sizeof(struct cmsghdr). We need to use sizeof(struct compat_cmsghdr) for the compat case. Fixes: 5e2ff6704a27 ("scm: add SO_PASSPIDFD and SCM_PIDFD") Reported-by: Heiko Carstens Closes: https://lore.kernel.org/netdev/20230901200517.8742-A-hca@linux.ibm.com/ Signed-off-by: Kuniyuki Iwashima Tested-by: Heiko Carstens Reviewed-by: Alexander Mikhalitsyn Reviewed-by: Michal Swiatkowski Acked-by: Christian Brauner Signed-off-by: David S. Miller --- include/net/scm.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/scm.h b/include/net/scm.h index c5bcdf65f55c..e8c76b4be2fe 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -9,6 +9,7 @@ #include #include #include +#include /* Well, we should have at least one descriptor open * to accept passed FDs 8) @@ -123,14 +124,17 @@ static inline bool scm_has_secdata(struct socket *sock) static __inline__ void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm) { struct file *pidfd_file = NULL; - int pidfd; + int len, pidfd; - /* - * put_cmsg() doesn't return an error if CMSG is truncated, + /* put_cmsg() doesn't return an error if CMSG is truncated, * that's why we need to opencode these checks here. */ - if ((msg->msg_controllen <= sizeof(struct cmsghdr)) || - (msg->msg_controllen - sizeof(struct cmsghdr)) < sizeof(int)) { + if (msg->msg_flags & MSG_CMSG_COMPAT) + len = sizeof(struct compat_cmsghdr) + sizeof(int); + else + len = sizeof(struct cmsghdr) + sizeof(int); + + if (msg->msg_controllen < len) { msg->msg_flags |= MSG_CTRUNC; return; } -- cgit v1.3.1 From 39285e124edbc752331e98ace37cc141a6a3747a Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Tue, 5 Sep 2023 08:46:10 +0000 Subject: net: team: do not use dynamic lockdep key team interface has used a dynamic lockdep key to avoid false-positive lockdep deadlock detection. Virtual interfaces such as team usually have their own lock for protecting private data. These interfaces can be nested. team0 | team1 Each interface's lock is actually different(team0->lock and team1->lock). So, mutex_lock(&team0->lock); mutex_lock(&team1->lock); mutex_unlock(&team1->lock); mutex_unlock(&team0->lock); The above case is absolutely safe. But lockdep warns about deadlock. Because the lockdep understands these two locks are same. This is a false-positive lockdep warning. So, in order to avoid this problem, the team interfaces started to use dynamic lockdep key. The false-positive problem was fixed, but it introduced a new problem. When the new team virtual interface is created, it registers a dynamic lockdep key(creates dynamic lockdep key) and uses it. But there is the limitation of the number of lockdep keys. So, If so many team interfaces are created, it consumes all lockdep keys. Then, the lockdep stops to work and warns about it. In order to fix this problem, team interfaces use the subclass instead of the dynamic key. So, when a new team interface is created, it doesn't register(create) a new lockdep, but uses existed subclass key instead. It is already used by the bonding interface for a similar case. As the bonding interface does, the subclass variable is the same as the 'dev->nested_level'. This variable indicates the depth in the stacked interface graph. The 'dev->nested_level' is protected by RTNL and RCU. So, 'mutex_lock_nested()' for 'team->lock' requires RTNL or RCU. In the current code, 'team->lock' is usually acquired under RTNL, there is no problem with using 'dev->nested_level'. The 'team_nl_team_get()' and The 'lb_stats_refresh()' functions acquire 'team->lock' without RTNL. But these don't iterate their own ports nested so they don't need nested lock. Reproducer: for i in {0..1000} do ip link add team$i type team ip link add dummy$i master team$i type dummy ip link set dummy$i up ip link set team$i up done Splat looks like: BUG: MAX_LOCKDEP_ENTRIES too low! turning off the locking correctness validator. Please attach the output of /proc/lock_stat to the bug report CPU: 0 PID: 4104 Comm: ip Not tainted 6.5.0-rc7+ #45 Call Trace: dump_stack_lvl+0x64/0xb0 add_lock_to_list+0x30d/0x5e0 check_prev_add+0x73a/0x23a0 ... sock_def_readable+0xfe/0x4f0 netlink_broadcast+0x76b/0xac0 nlmsg_notify+0x69/0x1d0 dev_open+0xed/0x130 ... Reported-by: syzbot+9bbbacfbf1e04d5221f7@syzkaller.appspotmail.com Fixes: 369f61bee0f5 ("team: fix nested locking lockdep warning") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- drivers/net/team/team.c | 111 +++++++++++++++---------------- drivers/net/team/team_mode_loadbalance.c | 4 +- include/linux/if_team.h | 30 ++++++++- 3 files changed, 85 insertions(+), 60 deletions(-) (limited to 'include') diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index e8b94580194e..ad29122a5468 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1135,8 +1135,8 @@ static int team_port_add(struct team *team, struct net_device *port_dev, struct netlink_ext_ack *extack) { struct net_device *dev = team->dev; - struct team_port *port; char *portname = port_dev->name; + struct team_port *port; int err; if (port_dev->flags & IFF_LOOPBACK) { @@ -1203,18 +1203,31 @@ static int team_port_add(struct team *team, struct net_device *port_dev, memcpy(port->orig.dev_addr, port_dev->dev_addr, port_dev->addr_len); - err = team_port_enter(team, port); + err = dev_open(port_dev, extack); if (err) { - netdev_err(dev, "Device %s failed to enter team mode\n", + netdev_dbg(dev, "Device %s opening failed\n", portname); - goto err_port_enter; + goto err_dev_open; } - err = dev_open(port_dev, extack); + err = team_upper_dev_link(team, port, extack); if (err) { - netdev_dbg(dev, "Device %s opening failed\n", + netdev_err(dev, "Device %s failed to set upper link\n", portname); - goto err_dev_open; + goto err_set_upper_link; + } + + /* lockdep subclass variable(dev->nested_level) was updated by + * team_upper_dev_link(). + */ + team_unlock(team); + team_lock(team); + + err = team_port_enter(team, port); + if (err) { + netdev_err(dev, "Device %s failed to enter team mode\n", + portname); + goto err_port_enter; } err = vlan_vids_add_by_dev(port_dev, dev); @@ -1242,13 +1255,6 @@ static int team_port_add(struct team *team, struct net_device *port_dev, goto err_handler_register; } - err = team_upper_dev_link(team, port, extack); - if (err) { - netdev_err(dev, "Device %s failed to set upper link\n", - portname); - goto err_set_upper_link; - } - err = __team_option_inst_add_port(team, port); if (err) { netdev_err(dev, "Device %s failed to add per-port options\n", @@ -1295,9 +1301,6 @@ err_set_slave_promisc: __team_option_inst_del_port(team, port); err_option_port_add: - team_upper_dev_unlink(team, port); - -err_set_upper_link: netdev_rx_handler_unregister(port_dev); err_handler_register: @@ -1307,13 +1310,16 @@ err_enable_netpoll: vlan_vids_del_by_dev(port_dev, dev); err_vids_add: + team_port_leave(team, port); + +err_port_enter: + team_upper_dev_unlink(team, port); + +err_set_upper_link: dev_close(port_dev); err_dev_open: - team_port_leave(team, port); team_port_set_orig_dev_addr(port); - -err_port_enter: dev_set_mtu(port_dev, port->orig.mtu); err_set_mtu: @@ -1616,6 +1622,7 @@ static int team_init(struct net_device *dev) int err; team->dev = dev; + mutex_init(&team->lock); team_set_no_mode(team); team->notifier_ctx = false; @@ -1643,8 +1650,6 @@ static int team_init(struct net_device *dev) goto err_options_register; netif_carrier_off(dev); - lockdep_register_key(&team->team_lock_key); - __mutex_init(&team->lock, "team->team_lock_key", &team->team_lock_key); netdev_lockdep_set_classes(dev); return 0; @@ -1665,7 +1670,7 @@ static void team_uninit(struct net_device *dev) struct team_port *port; struct team_port *tmp; - mutex_lock(&team->lock); + team_lock(team); list_for_each_entry_safe(port, tmp, &team->port_list, list) team_port_del(team, port->dev); @@ -1674,9 +1679,8 @@ static void team_uninit(struct net_device *dev) team_mcast_rejoin_fini(team); team_notify_peers_fini(team); team_queue_override_fini(team); - mutex_unlock(&team->lock); + team_unlock(team); netdev_change_features(dev); - lockdep_unregister_key(&team->team_lock_key); } static void team_destructor(struct net_device *dev) @@ -1790,18 +1794,18 @@ static void team_set_rx_mode(struct net_device *dev) static int team_set_mac_address(struct net_device *dev, void *p) { - struct sockaddr *addr = p; struct team *team = netdev_priv(dev); + struct sockaddr *addr = p; struct team_port *port; if (dev->type == ARPHRD_ETHER && !is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; dev_addr_set(dev, addr->sa_data); - mutex_lock(&team->lock); + team_lock(team); list_for_each_entry(port, &team->port_list, list) if (team->ops.port_change_dev_addr) team->ops.port_change_dev_addr(team, port); - mutex_unlock(&team->lock); + team_unlock(team); return 0; } @@ -1815,7 +1819,7 @@ static int team_change_mtu(struct net_device *dev, int new_mtu) * Alhough this is reader, it's guarded by team lock. It's not possible * to traverse list in reverse under rcu_read_lock */ - mutex_lock(&team->lock); + team_lock(team); team->port_mtu_change_allowed = true; list_for_each_entry(port, &team->port_list, list) { err = dev_set_mtu(port->dev, new_mtu); @@ -1826,7 +1830,7 @@ static int team_change_mtu(struct net_device *dev, int new_mtu) } } team->port_mtu_change_allowed = false; - mutex_unlock(&team->lock); + team_unlock(team); dev->mtu = new_mtu; @@ -1836,7 +1840,7 @@ unwind: list_for_each_entry_continue_reverse(port, &team->port_list, list) dev_set_mtu(port->dev, dev->mtu); team->port_mtu_change_allowed = false; - mutex_unlock(&team->lock); + team_unlock(team); return err; } @@ -1890,20 +1894,20 @@ static int team_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) * Alhough this is reader, it's guarded by team lock. It's not possible * to traverse list in reverse under rcu_read_lock */ - mutex_lock(&team->lock); + team_lock(team); list_for_each_entry(port, &team->port_list, list) { err = vlan_vid_add(port->dev, proto, vid); if (err) goto unwind; } - mutex_unlock(&team->lock); + team_unlock(team); return 0; unwind: list_for_each_entry_continue_reverse(port, &team->port_list, list) vlan_vid_del(port->dev, proto, vid); - mutex_unlock(&team->lock); + team_unlock(team); return err; } @@ -1913,10 +1917,10 @@ static int team_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) struct team *team = netdev_priv(dev); struct team_port *port; - mutex_lock(&team->lock); + team_lock(team); list_for_each_entry(port, &team->port_list, list) vlan_vid_del(port->dev, proto, vid); - mutex_unlock(&team->lock); + team_unlock(team); return 0; } @@ -1938,9 +1942,9 @@ static void team_netpoll_cleanup(struct net_device *dev) { struct team *team = netdev_priv(dev); - mutex_lock(&team->lock); + team_lock(team); __team_netpoll_cleanup(team); - mutex_unlock(&team->lock); + team_unlock(team); } static int team_netpoll_setup(struct net_device *dev, @@ -1950,7 +1954,7 @@ static int team_netpoll_setup(struct net_device *dev, struct team_port *port; int err = 0; - mutex_lock(&team->lock); + team_lock(team); list_for_each_entry(port, &team->port_list, list) { err = __team_port_enable_netpoll(port); if (err) { @@ -1958,7 +1962,7 @@ static int team_netpoll_setup(struct net_device *dev, break; } } - mutex_unlock(&team->lock); + team_unlock(team); return err; } #endif @@ -1969,9 +1973,9 @@ static int team_add_slave(struct net_device *dev, struct net_device *port_dev, struct team *team = netdev_priv(dev); int err; - mutex_lock(&team->lock); + team_lock(team); err = team_port_add(team, port_dev, extack); - mutex_unlock(&team->lock); + team_unlock(team); if (!err) netdev_change_features(dev); @@ -1984,19 +1988,12 @@ static int team_del_slave(struct net_device *dev, struct net_device *port_dev) struct team *team = netdev_priv(dev); int err; - mutex_lock(&team->lock); + team_lock(team); err = team_port_del(team, port_dev); - mutex_unlock(&team->lock); - - if (err) - return err; + team_unlock(team); - if (netif_is_team_master(port_dev)) { - lockdep_unregister_key(&team->team_lock_key); - lockdep_register_key(&team->team_lock_key); - lockdep_set_class(&team->lock, &team->team_lock_key); - } - netdev_change_features(dev); + if (!err) + netdev_change_features(dev); return err; } @@ -2316,13 +2313,13 @@ static struct team *team_nl_team_get(struct genl_info *info) } team = netdev_priv(dev); - mutex_lock(&team->lock); + __team_lock(team); return team; } static void team_nl_team_put(struct team *team) { - mutex_unlock(&team->lock); + team_unlock(team); dev_put(team->dev); } @@ -2984,9 +2981,9 @@ static void team_port_change_check(struct team_port *port, bool linkup) { struct team *team = port->team; - mutex_lock(&team->lock); + team_lock(team); __team_port_change_check(port, linkup); - mutex_unlock(&team->lock); + team_unlock(team); } diff --git a/drivers/net/team/team_mode_loadbalance.c b/drivers/net/team/team_mode_loadbalance.c index 00f8989c29c0..7bcc9d37447a 100644 --- a/drivers/net/team/team_mode_loadbalance.c +++ b/drivers/net/team/team_mode_loadbalance.c @@ -478,7 +478,7 @@ static void lb_stats_refresh(struct work_struct *work) team = lb_priv_ex->team; lb_priv = get_lb_priv(team); - if (!mutex_trylock(&team->lock)) { + if (!team_trylock(team)) { schedule_delayed_work(&lb_priv_ex->stats.refresh_dw, 0); return; } @@ -515,7 +515,7 @@ static void lb_stats_refresh(struct work_struct *work) schedule_delayed_work(&lb_priv_ex->stats.refresh_dw, (lb_priv_ex->stats.refresh_interval * HZ) / 10); - mutex_unlock(&team->lock); + team_unlock(team); } static void lb_stats_refresh_interval_get(struct team *team, diff --git a/include/linux/if_team.h b/include/linux/if_team.h index 1b9b15a492fa..12d4447fc8ab 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -221,10 +221,38 @@ struct team { atomic_t count_pending; struct delayed_work dw; } mcast_rejoin; - struct lock_class_key team_lock_key; long mode_priv[TEAM_MODE_PRIV_LONGS]; }; +static inline void __team_lock(struct team *team) +{ + mutex_lock(&team->lock); +} + +static inline int team_trylock(struct team *team) +{ + return mutex_trylock(&team->lock); +} + +#ifdef CONFIG_LOCKDEP +static inline void team_lock(struct team *team) +{ + ASSERT_RTNL(); + mutex_lock_nested(&team->lock, team->dev->nested_level); +} + +#else +static inline void team_lock(struct team *team) +{ + __team_lock(team); +} +#endif + +static inline void team_unlock(struct team *team) +{ + mutex_unlock(&team->lock); +} + static inline int team_dev_queue_xmit(struct team *team, struct team_port *port, struct sk_buff *skb) { -- cgit v1.3.1 From 9b271ebaf9a2c5c566a54bc6cd915962e8241130 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Sep 2023 13:40:46 +0000 Subject: ip_tunnels: use DEV_STATS_INC() syzbot/KCSAN reported data-races in iptunnel_xmit_stats() [1] This can run from multiple cpus without mutual exclusion. Adopt SMP safe DEV_STATS_INC() to update dev->stats fields. [1] BUG: KCSAN: data-race in iptunnel_xmit / iptunnel_xmit read-write to 0xffff8881353df170 of 8 bytes by task 30263 on cpu 1: iptunnel_xmit_stats include/net/ip_tunnels.h:493 [inline] iptunnel_xmit+0x432/0x4a0 net/ipv4/ip_tunnel_core.c:87 ip_tunnel_xmit+0x1477/0x1750 net/ipv4/ip_tunnel.c:831 __gre_xmit net/ipv4/ip_gre.c:469 [inline] ipgre_xmit+0x516/0x570 net/ipv4/ip_gre.c:662 __netdev_start_xmit include/linux/netdevice.h:4889 [inline] netdev_start_xmit include/linux/netdevice.h:4903 [inline] xmit_one net/core/dev.c:3544 [inline] dev_hard_start_xmit+0x11b/0x3f0 net/core/dev.c:3560 __dev_queue_xmit+0xeee/0x1de0 net/core/dev.c:4340 dev_queue_xmit include/linux/netdevice.h:3082 [inline] __bpf_tx_skb net/core/filter.c:2129 [inline] __bpf_redirect_no_mac net/core/filter.c:2159 [inline] __bpf_redirect+0x723/0x9c0 net/core/filter.c:2182 ____bpf_clone_redirect net/core/filter.c:2453 [inline] bpf_clone_redirect+0x16c/0x1d0 net/core/filter.c:2425 ___bpf_prog_run+0xd7d/0x41e0 kernel/bpf/core.c:1954 __bpf_prog_run512+0x74/0xa0 kernel/bpf/core.c:2195 bpf_dispatcher_nop_func include/linux/bpf.h:1181 [inline] __bpf_prog_run include/linux/filter.h:609 [inline] bpf_prog_run include/linux/filter.h:616 [inline] bpf_test_run+0x15d/0x3d0 net/bpf/test_run.c:423 bpf_prog_test_run_skb+0x77b/0xa00 net/bpf/test_run.c:1045 bpf_prog_test_run+0x265/0x3d0 kernel/bpf/syscall.c:3996 __sys_bpf+0x3af/0x780 kernel/bpf/syscall.c:5353 __do_sys_bpf kernel/bpf/syscall.c:5439 [inline] __se_sys_bpf kernel/bpf/syscall.c:5437 [inline] __x64_sys_bpf+0x43/0x50 kernel/bpf/syscall.c:5437 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd read-write to 0xffff8881353df170 of 8 bytes by task 30249 on cpu 0: iptunnel_xmit_stats include/net/ip_tunnels.h:493 [inline] iptunnel_xmit+0x432/0x4a0 net/ipv4/ip_tunnel_core.c:87 ip_tunnel_xmit+0x1477/0x1750 net/ipv4/ip_tunnel.c:831 __gre_xmit net/ipv4/ip_gre.c:469 [inline] ipgre_xmit+0x516/0x570 net/ipv4/ip_gre.c:662 __netdev_start_xmit include/linux/netdevice.h:4889 [inline] netdev_start_xmit include/linux/netdevice.h:4903 [inline] xmit_one net/core/dev.c:3544 [inline] dev_hard_start_xmit+0x11b/0x3f0 net/core/dev.c:3560 __dev_queue_xmit+0xeee/0x1de0 net/core/dev.c:4340 dev_queue_xmit include/linux/netdevice.h:3082 [inline] __bpf_tx_skb net/core/filter.c:2129 [inline] __bpf_redirect_no_mac net/core/filter.c:2159 [inline] __bpf_redirect+0x723/0x9c0 net/core/filter.c:2182 ____bpf_clone_redirect net/core/filter.c:2453 [inline] bpf_clone_redirect+0x16c/0x1d0 net/core/filter.c:2425 ___bpf_prog_run+0xd7d/0x41e0 kernel/bpf/core.c:1954 __bpf_prog_run512+0x74/0xa0 kernel/bpf/core.c:2195 bpf_dispatcher_nop_func include/linux/bpf.h:1181 [inline] __bpf_prog_run include/linux/filter.h:609 [inline] bpf_prog_run include/linux/filter.h:616 [inline] bpf_test_run+0x15d/0x3d0 net/bpf/test_run.c:423 bpf_prog_test_run_skb+0x77b/0xa00 net/bpf/test_run.c:1045 bpf_prog_test_run+0x265/0x3d0 kernel/bpf/syscall.c:3996 __sys_bpf+0x3af/0x780 kernel/bpf/syscall.c:5353 __do_sys_bpf kernel/bpf/syscall.c:5439 [inline] __se_sys_bpf kernel/bpf/syscall.c:5437 [inline] __x64_sys_bpf+0x43/0x50 kernel/bpf/syscall.c:5437 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd value changed: 0x0000000000018830 -> 0x0000000000018831 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 30249 Comm: syz-executor.4 Not tainted 6.5.0-syzkaller-11704-g3f86ed6ec0b3 #0 Fixes: 039f50629b7f ("ip_tunnel: Move stats update to iptunnel_xmit()") Reported-by: syzbot Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index e8750b4ef7e1..f346b4efbc30 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -483,15 +483,14 @@ static inline void iptunnel_xmit_stats(struct net_device *dev, int pkt_len) u64_stats_inc(&tstats->tx_packets); u64_stats_update_end(&tstats->syncp); put_cpu_ptr(tstats); + return; + } + + if (pkt_len < 0) { + DEV_STATS_INC(dev, tx_errors); + DEV_STATS_INC(dev, tx_aborted_errors); } else { - struct net_device_stats *err_stats = &dev->stats; - - if (pkt_len < 0) { - err_stats->tx_errors++; - err_stats->tx_aborted_errors++; - } else { - err_stats->tx_dropped++; - } + DEV_STATS_INC(dev, tx_dropped); } } -- cgit v1.3.1 From 1a961e74d5abbea049588a3d74b759955b4ed9d5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 5 Sep 2023 16:42:02 -0700 Subject: net: phylink: fix sphinx complaint about invalid literal sphinx complains about the use of "%PHYLINK_PCS_NEG_*": Documentation/networking/kapi:144: ./include/linux/phylink.h:601: WARNING: Inline literal start-string without end-string. Documentation/networking/kapi:144: ./include/linux/phylink.h:633: WARNING: Inline literal start-string without end-string. These are not valid symbols so drop the '%' prefix. Alternatively we could use %PHYLINK_PCS_NEG_\* (escape the *) or use normal literal ``PHYLINK_PCS_NEG_*`` but there is already a handful of un-adorned DEFINE_* in this file. Fixes: f99d471afa03 ("net: phylink: add PCS negotiation mode") Reported-by: Stephen Rothwell Link: https://lore.kernel.org/all/20230626162908.2f149f98@canb.auug.org.au/ Signed-off-by: Jakub Kicinski Reviewed-by: Bagas Sanjaya Signed-off-by: David S. Miller --- include/linux/phylink.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 7d07f8736431..2b886ea654bb 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -600,7 +600,7 @@ void pcs_get_state(struct phylink_pcs *pcs, * * The %neg_mode argument should be tested via the phylink_mode_*() family of * functions, or for PCS that set pcs->neg_mode true, should be tested - * against the %PHYLINK_PCS_NEG_* definitions. + * against the PHYLINK_PCS_NEG_* definitions. */ int pcs_config(struct phylink_pcs *pcs, unsigned int neg_mode, phy_interface_t interface, const unsigned long *advertising, @@ -630,7 +630,7 @@ void pcs_an_restart(struct phylink_pcs *pcs); * * The %mode argument should be tested via the phylink_mode_*() family of * functions, or for PCS that set pcs->neg_mode true, should be tested - * against the %PHYLINK_PCS_NEG_* definitions. + * against the PHYLINK_PCS_NEG_* definitions. */ void pcs_link_up(struct phylink_pcs *pcs, unsigned int neg_mode, phy_interface_t interface, int speed, int duplex); -- cgit v1.3.1 From fdc04cc2d5fd0bb9c17f36d0a895cf3e151109e6 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 1 Sep 2023 14:15:16 +0200 Subject: netfilter: nf_tables: uapi: Describe NFTA_RULE_CHAIN_ID Add a brief description to the enum's comment. Fixes: 837830a4b439 ("netfilter: nf_tables: add NFTA_RULE_CHAIN_ID attribute") Signed-off-by: Phil Sutter Signed-off-by: Florian Westphal --- include/uapi/linux/netfilter/nf_tables.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 8466c2a9938f..ca30232b7bc8 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -263,6 +263,7 @@ enum nft_chain_attributes { * @NFTA_RULE_USERDATA: user data (NLA_BINARY, NFT_USERDATA_MAXLEN) * @NFTA_RULE_ID: uniquely identifies a rule in a transaction (NLA_U32) * @NFTA_RULE_POSITION_ID: transaction unique identifier of the previous rule (NLA_U32) + * @NFTA_RULE_CHAIN_ID: add the rule to chain by ID, alternative to @NFTA_RULE_CHAIN (NLA_U32) */ enum nft_rule_attributes { NFTA_RULE_UNSPEC, -- cgit v1.3.1 From 08c6d8bae48c2c28f7017d7b61b5d5a1518ceb39 Mon Sep 17 00:00:00 2001 From: Lukasz Majewski Date: Tue, 5 Sep 2023 11:33:15 +0200 Subject: net: phy: Provide Module 4 KSZ9477 errata (DS80000754C) The KSZ9477 errata points out (in 'Module 4') the link up/down problems when EEE (Energy Efficient Ethernet) is enabled in the device to which the KSZ9477 tries to auto negotiate. The suggested workaround is to clear advertisement of EEE for PHYs in this chip driver. To avoid regressions with other switch ICs the new MICREL_NO_EEE flag has been introduced. Moreover, the in-register disablement of MMD_DEVICE_ID_EEE_ADV.MMD_EEE_ADV MMD register is removed, as this code is both; now executed too late (after previous rework of the PHY and DSA for KSZ switches) and not required as setting all members of eee_broken_modes bit field prevents the KSZ9477 from advertising EEE. Fixes: 69d3b36ca045 ("net: dsa: microchip: enable EEE support") # for KSZ9477 Signed-off-by: Lukasz Majewski Tested-by: Oleksij Rempel # Confirmed disabled EEE with oscilloscope. Reviewed-by: Oleksij Rempel Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/20230905093315.784052-1-lukma@denx.de Signed-off-by: Jakub Kicinski --- drivers/net/dsa/microchip/ksz_common.c | 16 +++++++++++++++- drivers/net/phy/micrel.c | 9 ++++++--- include/linux/micrel_phy.h | 1 + 3 files changed, 22 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 6673122266b7..42db7679c360 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -2335,13 +2335,27 @@ static u32 ksz_get_phy_flags(struct dsa_switch *ds, int port) { struct ksz_device *dev = ds->priv; - if (dev->chip_id == KSZ8830_CHIP_ID) { + switch (dev->chip_id) { + case KSZ8830_CHIP_ID: /* Silicon Errata Sheet (DS80000830A): * Port 1 does not work with LinkMD Cable-Testing. * Port 1 does not respond to received PAUSE control frames. */ if (!port) return MICREL_KSZ8_P1_ERRATA; + break; + case KSZ9477_CHIP_ID: + /* KSZ9477 Errata DS80000754C + * + * Module 4: Energy Efficient Ethernet (EEE) feature select must + * be manually disabled + * The EEE feature is enabled by default, but it is not fully + * operational. It must be manually disabled through register + * controls. If not disabled, the PHY ports can auto-negotiate + * to enable EEE, and this feature can cause link drops when + * linked to another device supporting EEE. + */ + return MICREL_NO_EEE; } return 0; diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index b6d7981b2d1e..927d3d54658e 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -1800,9 +1800,6 @@ static const struct ksz9477_errata_write ksz9477_errata_writes[] = { /* Transmit waveform amplitude can be improved (1000BASE-T, 100BASE-TX, 10BASE-Te) */ {0x1c, 0x04, 0x00d0}, - /* Energy Efficient Ethernet (EEE) feature select must be manually disabled */ - {0x07, 0x3c, 0x0000}, - /* Register settings are required to meet data sheet supply current specifications */ {0x1c, 0x13, 0x6eff}, {0x1c, 0x14, 0xe6ff}, @@ -1847,6 +1844,12 @@ static int ksz9477_config_init(struct phy_device *phydev) return err; } + /* According to KSZ9477 Errata DS80000754C (Module 4) all EEE modes + * in this switch shall be regarded as broken. + */ + if (phydev->dev_flags & MICREL_NO_EEE) + phydev->eee_broken_modes = -1; + err = genphy_restart_aneg(phydev); if (err) return err; diff --git a/include/linux/micrel_phy.h b/include/linux/micrel_phy.h index 322d87255984..4e27ca7c49de 100644 --- a/include/linux/micrel_phy.h +++ b/include/linux/micrel_phy.h @@ -44,6 +44,7 @@ #define MICREL_PHY_50MHZ_CLK BIT(0) #define MICREL_PHY_FXEN BIT(1) #define MICREL_KSZ8_P1_ERRATA BIT(2) +#define MICREL_NO_EEE BIT(3) #define MICREL_KSZ9021_EXTREG_CTRL 0xB #define MICREL_KSZ9021_EXTREG_DATA_WRITE 0xC -- cgit v1.3.1 From 6afcf0fb92701487421aa73c692855aa70fbc796 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 7 Sep 2023 11:01:04 -0700 Subject: Revert "net: team: do not use dynamic lockdep key" This reverts commit 39285e124edbc752331e98ace37cc141a6a3747a. Looks like the change has unintended consequences in exposing objects before they are initialized. Let's drop this patch and try again in net-next. Reported-by: syzbot+44ae022028805f4600fc@syzkaller.appspotmail.com Fixes: 39285e124edb ("net: team: do not use dynamic lockdep key") Link: https://lore.kernel.org/all/20230907103124.6adb7256@kernel.org/ Signed-off-by: Jakub Kicinski --- drivers/net/team/team.c | 111 ++++++++++++++++--------------- drivers/net/team/team_mode_loadbalance.c | 4 +- include/linux/if_team.h | 30 +-------- 3 files changed, 60 insertions(+), 85 deletions(-) (limited to 'include') diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index ad29122a5468..e8b94580194e 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1135,8 +1135,8 @@ static int team_port_add(struct team *team, struct net_device *port_dev, struct netlink_ext_ack *extack) { struct net_device *dev = team->dev; - char *portname = port_dev->name; struct team_port *port; + char *portname = port_dev->name; int err; if (port_dev->flags & IFF_LOOPBACK) { @@ -1203,31 +1203,18 @@ static int team_port_add(struct team *team, struct net_device *port_dev, memcpy(port->orig.dev_addr, port_dev->dev_addr, port_dev->addr_len); - err = dev_open(port_dev, extack); - if (err) { - netdev_dbg(dev, "Device %s opening failed\n", - portname); - goto err_dev_open; - } - - err = team_upper_dev_link(team, port, extack); + err = team_port_enter(team, port); if (err) { - netdev_err(dev, "Device %s failed to set upper link\n", + netdev_err(dev, "Device %s failed to enter team mode\n", portname); - goto err_set_upper_link; + goto err_port_enter; } - /* lockdep subclass variable(dev->nested_level) was updated by - * team_upper_dev_link(). - */ - team_unlock(team); - team_lock(team); - - err = team_port_enter(team, port); + err = dev_open(port_dev, extack); if (err) { - netdev_err(dev, "Device %s failed to enter team mode\n", + netdev_dbg(dev, "Device %s opening failed\n", portname); - goto err_port_enter; + goto err_dev_open; } err = vlan_vids_add_by_dev(port_dev, dev); @@ -1255,6 +1242,13 @@ static int team_port_add(struct team *team, struct net_device *port_dev, goto err_handler_register; } + err = team_upper_dev_link(team, port, extack); + if (err) { + netdev_err(dev, "Device %s failed to set upper link\n", + portname); + goto err_set_upper_link; + } + err = __team_option_inst_add_port(team, port); if (err) { netdev_err(dev, "Device %s failed to add per-port options\n", @@ -1301,6 +1295,9 @@ err_set_slave_promisc: __team_option_inst_del_port(team, port); err_option_port_add: + team_upper_dev_unlink(team, port); + +err_set_upper_link: netdev_rx_handler_unregister(port_dev); err_handler_register: @@ -1310,16 +1307,13 @@ err_enable_netpoll: vlan_vids_del_by_dev(port_dev, dev); err_vids_add: - team_port_leave(team, port); - -err_port_enter: - team_upper_dev_unlink(team, port); - -err_set_upper_link: dev_close(port_dev); err_dev_open: + team_port_leave(team, port); team_port_set_orig_dev_addr(port); + +err_port_enter: dev_set_mtu(port_dev, port->orig.mtu); err_set_mtu: @@ -1622,7 +1616,6 @@ static int team_init(struct net_device *dev) int err; team->dev = dev; - mutex_init(&team->lock); team_set_no_mode(team); team->notifier_ctx = false; @@ -1650,6 +1643,8 @@ static int team_init(struct net_device *dev) goto err_options_register; netif_carrier_off(dev); + lockdep_register_key(&team->team_lock_key); + __mutex_init(&team->lock, "team->team_lock_key", &team->team_lock_key); netdev_lockdep_set_classes(dev); return 0; @@ -1670,7 +1665,7 @@ static void team_uninit(struct net_device *dev) struct team_port *port; struct team_port *tmp; - team_lock(team); + mutex_lock(&team->lock); list_for_each_entry_safe(port, tmp, &team->port_list, list) team_port_del(team, port->dev); @@ -1679,8 +1674,9 @@ static void team_uninit(struct net_device *dev) team_mcast_rejoin_fini(team); team_notify_peers_fini(team); team_queue_override_fini(team); - team_unlock(team); + mutex_unlock(&team->lock); netdev_change_features(dev); + lockdep_unregister_key(&team->team_lock_key); } static void team_destructor(struct net_device *dev) @@ -1794,18 +1790,18 @@ static void team_set_rx_mode(struct net_device *dev) static int team_set_mac_address(struct net_device *dev, void *p) { - struct team *team = netdev_priv(dev); struct sockaddr *addr = p; + struct team *team = netdev_priv(dev); struct team_port *port; if (dev->type == ARPHRD_ETHER && !is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; dev_addr_set(dev, addr->sa_data); - team_lock(team); + mutex_lock(&team->lock); list_for_each_entry(port, &team->port_list, list) if (team->ops.port_change_dev_addr) team->ops.port_change_dev_addr(team, port); - team_unlock(team); + mutex_unlock(&team->lock); return 0; } @@ -1819,7 +1815,7 @@ static int team_change_mtu(struct net_device *dev, int new_mtu) * Alhough this is reader, it's guarded by team lock. It's not possible * to traverse list in reverse under rcu_read_lock */ - team_lock(team); + mutex_lock(&team->lock); team->port_mtu_change_allowed = true; list_for_each_entry(port, &team->port_list, list) { err = dev_set_mtu(port->dev, new_mtu); @@ -1830,7 +1826,7 @@ static int team_change_mtu(struct net_device *dev, int new_mtu) } } team->port_mtu_change_allowed = false; - team_unlock(team); + mutex_unlock(&team->lock); dev->mtu = new_mtu; @@ -1840,7 +1836,7 @@ unwind: list_for_each_entry_continue_reverse(port, &team->port_list, list) dev_set_mtu(port->dev, dev->mtu); team->port_mtu_change_allowed = false; - team_unlock(team); + mutex_unlock(&team->lock); return err; } @@ -1894,20 +1890,20 @@ static int team_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) * Alhough this is reader, it's guarded by team lock. It's not possible * to traverse list in reverse under rcu_read_lock */ - team_lock(team); + mutex_lock(&team->lock); list_for_each_entry(port, &team->port_list, list) { err = vlan_vid_add(port->dev, proto, vid); if (err) goto unwind; } - team_unlock(team); + mutex_unlock(&team->lock); return 0; unwind: list_for_each_entry_continue_reverse(port, &team->port_list, list) vlan_vid_del(port->dev, proto, vid); - team_unlock(team); + mutex_unlock(&team->lock); return err; } @@ -1917,10 +1913,10 @@ static int team_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) struct team *team = netdev_priv(dev); struct team_port *port; - team_lock(team); + mutex_lock(&team->lock); list_for_each_entry(port, &team->port_list, list) vlan_vid_del(port->dev, proto, vid); - team_unlock(team); + mutex_unlock(&team->lock); return 0; } @@ -1942,9 +1938,9 @@ static void team_netpoll_cleanup(struct net_device *dev) { struct team *team = netdev_priv(dev); - team_lock(team); + mutex_lock(&team->lock); __team_netpoll_cleanup(team); - team_unlock(team); + mutex_unlock(&team->lock); } static int team_netpoll_setup(struct net_device *dev, @@ -1954,7 +1950,7 @@ static int team_netpoll_setup(struct net_device *dev, struct team_port *port; int err = 0; - team_lock(team); + mutex_lock(&team->lock); list_for_each_entry(port, &team->port_list, list) { err = __team_port_enable_netpoll(port); if (err) { @@ -1962,7 +1958,7 @@ static int team_netpoll_setup(struct net_device *dev, break; } } - team_unlock(team); + mutex_unlock(&team->lock); return err; } #endif @@ -1973,9 +1969,9 @@ static int team_add_slave(struct net_device *dev, struct net_device *port_dev, struct team *team = netdev_priv(dev); int err; - team_lock(team); + mutex_lock(&team->lock); err = team_port_add(team, port_dev, extack); - team_unlock(team); + mutex_unlock(&team->lock); if (!err) netdev_change_features(dev); @@ -1988,12 +1984,19 @@ static int team_del_slave(struct net_device *dev, struct net_device *port_dev) struct team *team = netdev_priv(dev); int err; - team_lock(team); + mutex_lock(&team->lock); err = team_port_del(team, port_dev); - team_unlock(team); + mutex_unlock(&team->lock); - if (!err) - netdev_change_features(dev); + if (err) + return err; + + if (netif_is_team_master(port_dev)) { + lockdep_unregister_key(&team->team_lock_key); + lockdep_register_key(&team->team_lock_key); + lockdep_set_class(&team->lock, &team->team_lock_key); + } + netdev_change_features(dev); return err; } @@ -2313,13 +2316,13 @@ static struct team *team_nl_team_get(struct genl_info *info) } team = netdev_priv(dev); - __team_lock(team); + mutex_lock(&team->lock); return team; } static void team_nl_team_put(struct team *team) { - team_unlock(team); + mutex_unlock(&team->lock); dev_put(team->dev); } @@ -2981,9 +2984,9 @@ static void team_port_change_check(struct team_port *port, bool linkup) { struct team *team = port->team; - team_lock(team); + mutex_lock(&team->lock); __team_port_change_check(port, linkup); - team_unlock(team); + mutex_unlock(&team->lock); } diff --git a/drivers/net/team/team_mode_loadbalance.c b/drivers/net/team/team_mode_loadbalance.c index 7bcc9d37447a..00f8989c29c0 100644 --- a/drivers/net/team/team_mode_loadbalance.c +++ b/drivers/net/team/team_mode_loadbalance.c @@ -478,7 +478,7 @@ static void lb_stats_refresh(struct work_struct *work) team = lb_priv_ex->team; lb_priv = get_lb_priv(team); - if (!team_trylock(team)) { + if (!mutex_trylock(&team->lock)) { schedule_delayed_work(&lb_priv_ex->stats.refresh_dw, 0); return; } @@ -515,7 +515,7 @@ static void lb_stats_refresh(struct work_struct *work) schedule_delayed_work(&lb_priv_ex->stats.refresh_dw, (lb_priv_ex->stats.refresh_interval * HZ) / 10); - team_unlock(team); + mutex_unlock(&team->lock); } static void lb_stats_refresh_interval_get(struct team *team, diff --git a/include/linux/if_team.h b/include/linux/if_team.h index 12d4447fc8ab..1b9b15a492fa 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -221,38 +221,10 @@ struct team { atomic_t count_pending; struct delayed_work dw; } mcast_rejoin; + struct lock_class_key team_lock_key; long mode_priv[TEAM_MODE_PRIV_LONGS]; }; -static inline void __team_lock(struct team *team) -{ - mutex_lock(&team->lock); -} - -static inline int team_trylock(struct team *team) -{ - return mutex_trylock(&team->lock); -} - -#ifdef CONFIG_LOCKDEP -static inline void team_lock(struct team *team) -{ - ASSERT_RTNL(); - mutex_lock_nested(&team->lock, team->dev->nested_level); -} - -#else -static inline void team_lock(struct team *team) -{ - __team_lock(team); -} -#endif - -static inline void team_unlock(struct team *team) -{ - mutex_unlock(&team->lock); -} - static inline int team_dev_queue_xmit(struct team *team, struct team_port *port, struct sk_buff *skb) { -- cgit v1.3.1