From 41a5db8d8161457b121a03fde999ff6e00090ee2 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Sun, 27 Aug 2023 08:27:34 -0700
Subject: bpf: Add support for non-fix-size percpu mem allocation

This is needed for later percpu mem allocation when the
allocation is done by bpf program. For such cases, a global
bpf_global_percpu_ma is added where a flexible allocation
size is needed.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20230827152734.1995725-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 024e8b28c34b..440dd1f59a1c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -55,8 +55,8 @@ struct cgroup;
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
 extern struct kobject *btf_kobj;
-extern struct bpf_mem_alloc bpf_global_ma;
-extern bool bpf_global_ma_set;
+extern struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
+extern bool bpf_global_ma_set, bpf_global_percpu_ma_set;
 
 typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64);
 typedef int (*bpf_iter_init_seq_priv_t)(void *private_data,
-- 
cgit v1.2.3-70-g09d2


From 55db92f42fe4a4ef7b4c2b4960c6212c8512dd53 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Sun, 27 Aug 2023 08:27:39 -0700
Subject: bpf: Add BPF_KPTR_PERCPU as a field type

BPF_KPTR_PERCPU represents a percpu field type like below

  struct val_t {
    ... fields ...
  };
  struct t {
    ...
    struct val_t __percpu_kptr *percpu_data_ptr;
    ...
  };

where
  #define __percpu_kptr __attribute__((btf_type_tag("percpu_kptr")))

While BPF_KPTR_REF points to a trusted kernel object or a trusted
local object, BPF_KPTR_PERCPU points to a trusted local
percpu object.

This patch added basic support for BPF_KPTR_PERCPU
related to percpu_kptr field parsing, recording and free operations.
BPF_KPTR_PERCPU also supports the same map types
as BPF_KPTR_REF does.

Note that unlike a local kptr, it is possible that
a BPF_KTPR_PERCPU struct may not contain any
special fields like other kptr, bpf_spin_lock, bpf_list_head, etc.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20230827152739.1996391-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  | 18 ++++++++++++------
 kernel/bpf/btf.c     |  5 +++++
 kernel/bpf/syscall.c |  4 ++++
 3 files changed, 21 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 440dd1f59a1c..87eeb3a46a1d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -180,14 +180,15 @@ enum btf_field_type {
 	BPF_TIMER      = (1 << 1),
 	BPF_KPTR_UNREF = (1 << 2),
 	BPF_KPTR_REF   = (1 << 3),
-	BPF_KPTR       = BPF_KPTR_UNREF | BPF_KPTR_REF,
-	BPF_LIST_HEAD  = (1 << 4),
-	BPF_LIST_NODE  = (1 << 5),
-	BPF_RB_ROOT    = (1 << 6),
-	BPF_RB_NODE    = (1 << 7),
+	BPF_KPTR_PERCPU = (1 << 4),
+	BPF_KPTR       = BPF_KPTR_UNREF | BPF_KPTR_REF | BPF_KPTR_PERCPU,
+	BPF_LIST_HEAD  = (1 << 5),
+	BPF_LIST_NODE  = (1 << 6),
+	BPF_RB_ROOT    = (1 << 7),
+	BPF_RB_NODE    = (1 << 8),
 	BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD |
 				 BPF_RB_NODE | BPF_RB_ROOT,
-	BPF_REFCOUNT   = (1 << 8),
+	BPF_REFCOUNT   = (1 << 9),
 };
 
 typedef void (*btf_dtor_kfunc_t)(void *);
@@ -300,6 +301,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type)
 	case BPF_KPTR_UNREF:
 	case BPF_KPTR_REF:
 		return "kptr";
+	case BPF_KPTR_PERCPU:
+		return "percpu_kptr";
 	case BPF_LIST_HEAD:
 		return "bpf_list_head";
 	case BPF_LIST_NODE:
@@ -325,6 +328,7 @@ static inline u32 btf_field_type_size(enum btf_field_type type)
 		return sizeof(struct bpf_timer);
 	case BPF_KPTR_UNREF:
 	case BPF_KPTR_REF:
+	case BPF_KPTR_PERCPU:
 		return sizeof(u64);
 	case BPF_LIST_HEAD:
 		return sizeof(struct bpf_list_head);
@@ -351,6 +355,7 @@ static inline u32 btf_field_type_align(enum btf_field_type type)
 		return __alignof__(struct bpf_timer);
 	case BPF_KPTR_UNREF:
 	case BPF_KPTR_REF:
+	case BPF_KPTR_PERCPU:
 		return __alignof__(u64);
 	case BPF_LIST_HEAD:
 		return __alignof__(struct bpf_list_head);
@@ -389,6 +394,7 @@ static inline void bpf_obj_init_field(const struct btf_field *field, void *addr)
 	case BPF_TIMER:
 	case BPF_KPTR_UNREF:
 	case BPF_KPTR_REF:
+	case BPF_KPTR_PERCPU:
 		break;
 	default:
 		WARN_ON_ONCE(1);
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 1095bbe29859..187b57276fec 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3293,6 +3293,8 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
 		type = BPF_KPTR_UNREF;
 	else if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off)))
 		type = BPF_KPTR_REF;
+	else if (!strcmp("percpu_kptr", __btf_name_by_offset(btf, t->name_off)))
+		type = BPF_KPTR_PERCPU;
 	else
 		return -EINVAL;
 
@@ -3457,6 +3459,7 @@ static int btf_find_struct_field(const struct btf *btf,
 			break;
 		case BPF_KPTR_UNREF:
 		case BPF_KPTR_REF:
+		case BPF_KPTR_PERCPU:
 			ret = btf_find_kptr(btf, member_type, off, sz,
 					    idx < info_cnt ? &info[idx] : &tmp);
 			if (ret < 0)
@@ -3523,6 +3526,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
 			break;
 		case BPF_KPTR_UNREF:
 		case BPF_KPTR_REF:
+		case BPF_KPTR_PERCPU:
 			ret = btf_find_kptr(btf, var_type, off, sz,
 					    idx < info_cnt ? &info[idx] : &tmp);
 			if (ret < 0)
@@ -3783,6 +3787,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
 			break;
 		case BPF_KPTR_UNREF:
 		case BPF_KPTR_REF:
+		case BPF_KPTR_PERCPU:
 			ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]);
 			if (ret < 0)
 				goto end;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index eb01c31ed591..6a692f3bea15 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -514,6 +514,7 @@ void btf_record_free(struct btf_record *rec)
 		switch (rec->fields[i].type) {
 		case BPF_KPTR_UNREF:
 		case BPF_KPTR_REF:
+		case BPF_KPTR_PERCPU:
 			if (rec->fields[i].kptr.module)
 				module_put(rec->fields[i].kptr.module);
 			btf_put(rec->fields[i].kptr.btf);
@@ -560,6 +561,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
 		switch (fields[i].type) {
 		case BPF_KPTR_UNREF:
 		case BPF_KPTR_REF:
+		case BPF_KPTR_PERCPU:
 			btf_get(fields[i].kptr.btf);
 			if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) {
 				ret = -ENXIO;
@@ -650,6 +652,7 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 			WRITE_ONCE(*(u64 *)field_ptr, 0);
 			break;
 		case BPF_KPTR_REF:
+		case BPF_KPTR_PERCPU:
 			xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0);
 			if (!xchgd_field)
 				break;
@@ -1045,6 +1048,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 				break;
 			case BPF_KPTR_UNREF:
 			case BPF_KPTR_REF:
+			case BPF_KPTR_PERCPU:
 			case BPF_REFCOUNT:
 				if (map->map_type != BPF_MAP_TYPE_HASH &&
 				    map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
-- 
cgit v1.2.3-70-g09d2


From 01cc55af93884f1ff5a883426e1924378dfcc62a Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Sun, 27 Aug 2023 08:27:49 -0700
Subject: bpf: Add bpf_this_cpu_ptr/bpf_per_cpu_ptr support for allocated
 percpu obj

The bpf helpers bpf_this_cpu_ptr() and bpf_per_cpu_ptr() are re-purposed
for allocated percpu objects. For an allocated percpu obj,
the reg type is 'PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU'.

The return type for these two re-purposed helpera is
'PTR_TO_MEM | MEM_RCU | MEM_ALLOC'.
The MEM_ALLOC allows that the per-cpu data can be read and written.

Since the memory allocator bpf_mem_alloc() returns
a ptr to a percpu ptr for percpu data, the first argument
of bpf_this_cpu_ptr() and bpf_per_cpu_ptr() is patched
with a dereference before passing to the helper func.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20230827152749.1997202-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  1 +
 kernel/bpf/verifier.c        | 59 ++++++++++++++++++++++++++++++++++++++------
 2 files changed, 52 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index b6e58dab8e27..a3236651ec64 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -480,6 +480,7 @@ struct bpf_insn_aux_data {
 	bool zext_dst; /* this insn zero extends dst reg */
 	bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */
 	bool is_iter_next; /* bpf_iter_<type>_next() kfunc call */
+	bool call_with_percpu_alloc_ptr; /* {this,per}_cpu_ptr() with prog percpu alloc */
 	u8 alu_state; /* used in combination with alu_limit */
 
 	/* below fields are initialized once */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6c886ead18f6..6b7e7ca611f3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6221,7 +6221,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 		}
 
 		if (type_is_alloc(reg->type) && !type_is_non_owning_ref(reg->type) &&
-		    !reg->ref_obj_id) {
+		    !(reg->type & MEM_RCU) && !reg->ref_obj_id) {
 			verbose(env, "verifier internal error: ref_obj_id for allocated object must be non-zero\n");
 			return -EFAULT;
 		}
@@ -7765,6 +7765,7 @@ static const struct bpf_reg_types btf_ptr_types = {
 static const struct bpf_reg_types percpu_btf_ptr_types = {
 	.types = {
 		PTR_TO_BTF_ID | MEM_PERCPU,
+		PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU,
 		PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED,
 	}
 };
@@ -7941,6 +7942,7 @@ found:
 		}
 		break;
 	case PTR_TO_BTF_ID | MEM_PERCPU:
+	case PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU:
 	case PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED:
 		/* Handled by helper specific checks */
 		break;
@@ -9547,6 +9549,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 			     int *insn_idx_p)
 {
 	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
+	bool returns_cpu_specific_alloc_ptr = false;
 	const struct bpf_func_proto *fn = NULL;
 	enum bpf_return_type ret_type;
 	enum bpf_type_flag ret_flag;
@@ -9785,6 +9788,23 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 
 		break;
 	}
+	case BPF_FUNC_per_cpu_ptr:
+	case BPF_FUNC_this_cpu_ptr:
+	{
+		struct bpf_reg_state *reg = &regs[BPF_REG_1];
+		const struct btf_type *type;
+
+		if (reg->type & MEM_RCU) {
+			type = btf_type_by_id(reg->btf, reg->btf_id);
+			if (!type || !btf_type_is_struct(type)) {
+				verbose(env, "Helper has invalid btf/btf_id in R1\n");
+				return -EFAULT;
+			}
+			returns_cpu_specific_alloc_ptr = true;
+			env->insn_aux_data[insn_idx].call_with_percpu_alloc_ptr = true;
+		}
+		break;
+	}
 	case BPF_FUNC_user_ringbuf_drain:
 		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
 					set_user_ringbuf_callback_state);
@@ -9874,14 +9894,18 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 			regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
 			regs[BPF_REG_0].mem_size = tsize;
 		} else {
-			/* MEM_RDONLY may be carried from ret_flag, but it
-			 * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise
-			 * it will confuse the check of PTR_TO_BTF_ID in
-			 * check_mem_access().
-			 */
-			ret_flag &= ~MEM_RDONLY;
+			if (returns_cpu_specific_alloc_ptr) {
+				regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC | MEM_RCU;
+			} else {
+				/* MEM_RDONLY may be carried from ret_flag, but it
+				 * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise
+				 * it will confuse the check of PTR_TO_BTF_ID in
+				 * check_mem_access().
+				 */
+				ret_flag &= ~MEM_RDONLY;
+				regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
+			}
 
-			regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
 			regs[BPF_REG_0].btf = meta.ret_btf;
 			regs[BPF_REG_0].btf_id = meta.ret_btf_id;
 		}
@@ -18676,6 +18700,25 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			goto patch_call_imm;
 		}
 
+		/* bpf_per_cpu_ptr() and bpf_this_cpu_ptr() */
+		if (env->insn_aux_data[i + delta].call_with_percpu_alloc_ptr) {
+			/* patch with 'r1 = *(u64 *)(r1 + 0)' since for percpu data,
+			 * bpf_mem_alloc() returns a ptr to the percpu data ptr.
+			 */
+			insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0);
+			insn_buf[1] = *insn;
+			cnt = 2;
+
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta += cnt - 1;
+			env->prog = prog = new_prog;
+			insn = new_prog->insnsi + i + delta;
+			goto patch_call_imm;
+		}
+
 		/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
 		 * and other inlining handlers are currently limited to 64 bit
 		 * only.
-- 
cgit v1.2.3-70-g09d2


From 9bc95a95abbe91e9315c1fe27dc124019bd2592c Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Sun, 27 Aug 2023 08:28:37 -0700
Subject: bpf: Mark BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE deprecated

Now 'BPF_MAP_TYPE_CGRP_STORAGE + local percpu ptr'
can cover all BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE functionality
and more. So mark BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE deprecated.
Also make changes in selftests/bpf/test_bpftool_synctypes.py
and selftest libbpf_str to fix otherwise test errors.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20230827152837.2003563-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h                              | 9 ++++++++-
 tools/include/uapi/linux/bpf.h                        | 9 ++++++++-
 tools/testing/selftests/bpf/prog_tests/libbpf_str.c   | 6 +++++-
 tools/testing/selftests/bpf/test_bpftool_synctypes.py | 9 +++++++++
 4 files changed, 30 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 8790b3962e4b..73b155e52204 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -932,7 +932,14 @@ enum bpf_map_type {
 	 */
 	BPF_MAP_TYPE_CGROUP_STORAGE = BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED,
 	BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
-	BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
+	BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE_DEPRECATED,
+	/* BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE is available to bpf programs
+	 * attaching to a cgroup. The new mechanism (BPF_MAP_TYPE_CGRP_STORAGE +
+	 * local percpu kptr) supports all BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE
+	 * functionality and more. So mark * BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE
+	 * deprecated.
+	 */
+	BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE = BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE_DEPRECATED,
 	BPF_MAP_TYPE_QUEUE,
 	BPF_MAP_TYPE_STACK,
 	BPF_MAP_TYPE_SK_STORAGE,
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 8790b3962e4b..73b155e52204 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -932,7 +932,14 @@ enum bpf_map_type {
 	 */
 	BPF_MAP_TYPE_CGROUP_STORAGE = BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED,
 	BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
-	BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
+	BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE_DEPRECATED,
+	/* BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE is available to bpf programs
+	 * attaching to a cgroup. The new mechanism (BPF_MAP_TYPE_CGRP_STORAGE +
+	 * local percpu kptr) supports all BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE
+	 * functionality and more. So mark * BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE
+	 * deprecated.
+	 */
+	BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE = BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE_DEPRECATED,
 	BPF_MAP_TYPE_QUEUE,
 	BPF_MAP_TYPE_STACK,
 	BPF_MAP_TYPE_SK_STORAGE,
diff --git a/tools/testing/selftests/bpf/prog_tests/libbpf_str.c b/tools/testing/selftests/bpf/prog_tests/libbpf_str.c
index efb8bd43653c..c440ea3311ed 100644
--- a/tools/testing/selftests/bpf/prog_tests/libbpf_str.c
+++ b/tools/testing/selftests/bpf/prog_tests/libbpf_str.c
@@ -142,10 +142,14 @@ static void test_libbpf_bpf_map_type_str(void)
 		/* Special case for map_type_name BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED
 		 * where it and BPF_MAP_TYPE_CGROUP_STORAGE have the same enum value
 		 * (map_type). For this enum value, libbpf_bpf_map_type_str() picks
-		 * BPF_MAP_TYPE_CGROUP_STORAGE.
+		 * BPF_MAP_TYPE_CGROUP_STORAGE. The same for
+		 * BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE_DEPRECATED and
+		 * BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE.
 		 */
 		if (strcmp(map_type_name, "BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED") == 0)
 			continue;
+		if (strcmp(map_type_name, "BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE_DEPRECATED") == 0)
+			continue;
 
 		ASSERT_STREQ(buf, map_type_name, "exp_str_value");
 	}
diff --git a/tools/testing/selftests/bpf/test_bpftool_synctypes.py b/tools/testing/selftests/bpf/test_bpftool_synctypes.py
index 0cfece7ff4f8..0ed67b6b31dd 100755
--- a/tools/testing/selftests/bpf/test_bpftool_synctypes.py
+++ b/tools/testing/selftests/bpf/test_bpftool_synctypes.py
@@ -509,6 +509,15 @@ def main():
     source_map_types.remove('cgroup_storage_deprecated')
     source_map_types.add('cgroup_storage')
 
+    # The same applied to BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE_DEPRECATED and
+    # BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE which share the same enum value
+    # and source_map_types picks
+    # BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE_DEPRECATED/percpu_cgroup_storage_deprecated.
+    # Replace 'percpu_cgroup_storage_deprecated' with 'percpu_cgroup_storage'
+    # so it aligns with what `bpftool map help` shows.
+    source_map_types.remove('percpu_cgroup_storage_deprecated')
+    source_map_types.add('percpu_cgroup_storage')
+
     help_map_types = map_info.get_map_help()
     help_map_options = map_info.get_options()
     map_info.close()
-- 
cgit v1.2.3-70-g09d2


From f498f6ab7adb461a68e13ea6d4443cb3636f2d93 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 28 Aug 2023 15:09:24 +0200
Subject: wifi: mac80211: rework ack_frame_id handling a bit

Take one more free bit to indicate it's IDR vs. internal
usage, to be able to carve out some bits here for other
internal usage, other than IDR handling with a full ACK
SKB, that is.

Reviewed-by: Benjamin Berg <benjamin.berg@intel.com>
Reviewed-by: Ilan Peer <ilan.peer@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     |  9 ++++++---
 net/mac80211/cfg.c         |  3 ++-
 net/mac80211/ieee80211_i.h |  5 +++++
 net/mac80211/status.c      |  4 ++--
 net/mac80211/tx.c          | 14 ++++++++++----
 5 files changed, 25 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 7c707358d15c..7f3b6f00f8a2 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1115,7 +1115,9 @@ ieee80211_rate_get_vht_nss(const struct ieee80211_tx_rate *rate)
  *	not valid if the interface is an MLD since we won't know which
  *	link the frame will be transmitted on
  * @hw_queue: HW queue to put the frame on, skb_get_queue_mapping() gives the AC
- * @ack_frame_id: internal frame ID for TX status, used internally
+ * @status_data: internal data for TX status handling, assigned privately,
+ *	see also &enum ieee80211_status_data for the internal documentation
+ * @status_data_idr: indicates status data is IDR allocated ID for ack frame
  * @tx_time_est: TX time estimate in units of 4us, used internally
  * @control: union part for control data
  * @control.rates: TX rates array to try
@@ -1155,10 +1157,11 @@ struct ieee80211_tx_info {
 	/* common information */
 	u32 flags;
 	u32 band:3,
-	    ack_frame_id:13,
+	    status_data_idr:1,
+	    status_data:13,
 	    hw_queue:4,
 	    tx_time_est:10;
-	/* 2 free bits */
+	/* 1 free bit */
 
 	union {
 		struct {
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 45e7a5d9c7d9..29a6da5ee77f 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -4034,7 +4034,8 @@ int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb,
 		return -ENOMEM;
 	}
 
-	IEEE80211_SKB_CB(skb)->ack_frame_id = id;
+	IEEE80211_SKB_CB(skb)->status_data_idr = 1;
+	IEEE80211_SKB_CB(skb)->status_data = id;
 
 	*cookie = ieee80211_mgmt_tx_cookie(local);
 	IEEE80211_SKB_CB(ack_skb)->ack.cookie = *cookie;
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 06bd406846d2..7b74cf96ee0a 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -85,6 +85,11 @@ extern const u8 ieee80211_ac_to_qos_mask[IEEE80211_NUM_ACS];
 
 #define IEEE80211_MAX_NAN_INSTANCE_ID 255
 
+enum ieee80211_status_data {
+	IEEE80211_STATUS_TYPE_MASK	= 0x00f,
+	IEEE80211_STATUS_TYPE_INVALID	= 0,
+	IEEE80211_STATUS_SUBDATA_MASK	= 0xff0,
+};
 
 /*
  * Keep a station's queues on the active list for deficit accounting purposes
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index 44d83da60aee..f24aceb59db0 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -633,7 +633,7 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local,
 	unsigned long flags;
 
 	spin_lock_irqsave(&local->ack_status_lock, flags);
-	skb = idr_remove(&local->ack_status_frames, info->ack_frame_id);
+	skb = idr_remove(&local->ack_status_frames, info->status_data);
 	spin_unlock_irqrestore(&local->ack_status_lock, flags);
 
 	if (!skb)
@@ -759,7 +759,7 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local,
 		}
 
 		rcu_read_unlock();
-	} else if (info->ack_frame_id) {
+	} else if (info->status_data_idr) {
 		ieee80211_report_ack_skb(local, skb, acked, dropped,
 					 ack_hwtstamp);
 	}
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 3a5b41c2ee3d..ae33f727c6a8 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -2942,7 +2942,10 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
 	memset(info, 0, sizeof(*info));
 
 	info->flags = info_flags;
-	info->ack_frame_id = info_id;
+	if (info_id) {
+		info->status_data = info_id;
+		info->status_data_idr = 1;
+	}
 	info->band = band;
 
 	if (likely(!cookie)) {
@@ -4639,9 +4642,12 @@ static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata,
 	}
 
 	if (unlikely(skb->sk &&
-		     skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS))
-		info->ack_frame_id = ieee80211_store_ack_skb(local, skb,
-							     &info->flags, NULL);
+		     skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)) {
+		info->status_data = ieee80211_store_ack_skb(local, skb,
+							    &info->flags, NULL);
+		if (info->status_data)
+			info->status_data_idr = 1;
+	}
 
 	dev_sw_netstats_tx_add(dev, skbs, len);
 	sta->deflink.tx_stats.packets[queue] += skbs;
-- 
cgit v1.2.3-70-g09d2


From 56cfb8ce1f7f6c4e5ca571a2ec0880e131cd0311 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 28 Aug 2023 13:59:33 +0200
Subject: wifi: cfg80211: add flush functions for wiphy work

There may be sometimes reasons to actually run the work
if it's pending, add flush functions for both regular and
delayed wiphy work that will do this.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 21 +++++++++++++++++++++
 net/wireless/core.c    | 34 ++++++++++++++++++++++++++++++++--
 net/wireless/core.h    |  3 ++-
 net/wireless/sysfs.c   |  4 ++--
 4 files changed, 57 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 3a4b684f89bf..d1964a6d0b35 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -5826,6 +5826,16 @@ void wiphy_work_queue(struct wiphy *wiphy, struct wiphy_work *work);
  */
 void wiphy_work_cancel(struct wiphy *wiphy, struct wiphy_work *work);
 
+/**
+ * wiphy_work_flush - flush previously queued work
+ * @wiphy: the wiphy, for debug purposes
+ * @work: the work to flush, this can be %NULL to flush all work
+ *
+ * Flush the work (i.e. run it if pending). This must be called
+ * under the wiphy mutex acquired by wiphy_lock().
+ */
+void wiphy_work_flush(struct wiphy *wiphy, struct wiphy_work *work);
+
 struct wiphy_delayed_work {
 	struct wiphy_work work;
 	struct wiphy *wiphy;
@@ -5869,6 +5879,17 @@ void wiphy_delayed_work_queue(struct wiphy *wiphy,
 void wiphy_delayed_work_cancel(struct wiphy *wiphy,
 			       struct wiphy_delayed_work *dwork);
 
+/**
+ * wiphy_delayed work_flush - flush previously queued delayed work
+ * @wiphy: the wiphy, for debug purposes
+ * @work: the work to flush
+ *
+ * Flush the work (i.e. run it if pending). This must be called
+ * under the wiphy mutex acquired by wiphy_lock().
+ */
+void wiphy_delayed_work_flush(struct wiphy *wiphy,
+			      struct wiphy_delayed_work *dwork);
+
 /**
  * struct wireless_dev - wireless device state
  *
diff --git a/net/wireless/core.c b/net/wireless/core.c
index fdb9d736a2e8..88042a647aaa 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -1049,7 +1049,8 @@ void wiphy_rfkill_start_polling(struct wiphy *wiphy)
 }
 EXPORT_SYMBOL(wiphy_rfkill_start_polling);
 
-void cfg80211_process_wiphy_works(struct cfg80211_registered_device *rdev)
+void cfg80211_process_wiphy_works(struct cfg80211_registered_device *rdev,
+				  struct wiphy_work *end)
 {
 	unsigned int runaway_limit = 100;
 	unsigned long flags;
@@ -1068,6 +1069,10 @@ void cfg80211_process_wiphy_works(struct cfg80211_registered_device *rdev)
 		wk->func(&rdev->wiphy, wk);
 
 		spin_lock_irqsave(&rdev->wiphy_work_lock, flags);
+
+		if (wk == end)
+			break;
+
 		if (WARN_ON(--runaway_limit == 0))
 			INIT_LIST_HEAD(&rdev->wiphy_work_list);
 	}
@@ -1118,7 +1123,7 @@ void wiphy_unregister(struct wiphy *wiphy)
 #endif
 
 	/* surely nothing is reachable now, clean up work */
-	cfg80211_process_wiphy_works(rdev);
+	cfg80211_process_wiphy_works(rdev, NULL);
 	wiphy_unlock(&rdev->wiphy);
 	rtnl_unlock();
 
@@ -1640,6 +1645,21 @@ void wiphy_work_cancel(struct wiphy *wiphy, struct wiphy_work *work)
 }
 EXPORT_SYMBOL_GPL(wiphy_work_cancel);
 
+void wiphy_work_flush(struct wiphy *wiphy, struct wiphy_work *work)
+{
+	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+	unsigned long flags;
+	bool run;
+
+	spin_lock_irqsave(&rdev->wiphy_work_lock, flags);
+	run = !work || !list_empty(&work->entry);
+	spin_unlock_irqrestore(&rdev->wiphy_work_lock, flags);
+
+	if (run)
+		cfg80211_process_wiphy_works(rdev, work);
+}
+EXPORT_SYMBOL_GPL(wiphy_work_flush);
+
 void wiphy_delayed_work_timer(struct timer_list *t)
 {
 	struct wiphy_delayed_work *dwork = from_timer(dwork, t, timer);
@@ -1672,6 +1692,16 @@ void wiphy_delayed_work_cancel(struct wiphy *wiphy,
 }
 EXPORT_SYMBOL_GPL(wiphy_delayed_work_cancel);
 
+void wiphy_delayed_work_flush(struct wiphy *wiphy,
+			      struct wiphy_delayed_work *dwork)
+{
+	lockdep_assert_held(&wiphy->mtx);
+
+	del_timer_sync(&dwork->timer);
+	wiphy_work_flush(wiphy, &dwork->work);
+}
+EXPORT_SYMBOL_GPL(wiphy_delayed_work_flush);
+
 static int __init cfg80211_init(void)
 {
 	int err;
diff --git a/net/wireless/core.h b/net/wireless/core.h
index d4976550e7f6..b9876b444e1b 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -469,7 +469,8 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
 			  struct net_device *dev, enum nl80211_iftype ntype,
 			  struct vif_params *params);
 void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev);
-void cfg80211_process_wiphy_works(struct cfg80211_registered_device *rdev);
+void cfg80211_process_wiphy_works(struct cfg80211_registered_device *rdev,
+				  struct wiphy_work *end);
 void cfg80211_process_wdev_events(struct wireless_dev *wdev);
 
 bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range,
diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c
index c629bac3f298..565511a3f461 100644
--- a/net/wireless/sysfs.c
+++ b/net/wireless/sysfs.c
@@ -105,14 +105,14 @@ static int wiphy_suspend(struct device *dev)
 			cfg80211_leave_all(rdev);
 			cfg80211_process_rdev_events(rdev);
 		}
-		cfg80211_process_wiphy_works(rdev);
+		cfg80211_process_wiphy_works(rdev, NULL);
 		if (rdev->ops->suspend)
 			ret = rdev_suspend(rdev, rdev->wiphy.wowlan_config);
 		if (ret == 1) {
 			/* Driver refuse to configure wowlan */
 			cfg80211_leave_all(rdev);
 			cfg80211_process_rdev_events(rdev);
-			cfg80211_process_wiphy_works(rdev);
+			cfg80211_process_wiphy_works(rdev, NULL);
 			ret = rdev_suspend(rdev, NULL);
 		}
 		if (ret == 0)
-- 
cgit v1.2.3-70-g09d2


From 076fc8775dafe995e94c106bb732bf2d42dedcea Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 29 Aug 2023 12:18:56 +0200
Subject: wifi: cfg80211: remove wdev mutex

Since we're now protecting everything with the wiphy mutex
(and were really using it for almost everything before),
there's no longer any real reason to have a separate wdev
mutex. It may feel better, but really has no value.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath6kl/cfg80211.c     |   4 +-
 drivers/net/wireless/marvell/mwifiex/11h.c     |   4 +-
 drivers/net/wireless/quantenna/qtnfmac/event.c |   4 +-
 include/net/cfg80211.h                         |   6 +-
 include/net/mac80211.h                         |  20 +-
 net/mac80211/cfg.c                             |  43 +--
 net/mac80211/debugfs_netdev.c                  |  10 +-
 net/mac80211/driver-ops.c                      |   2 -
 net/mac80211/driver-ops.h                      |  14 -
 net/mac80211/ibss.c                            |  39 +--
 net/mac80211/ieee80211_i.h                     |  30 +-
 net/mac80211/iface.c                           |  11 -
 net/mac80211/link.c                            |  34 +--
 net/mac80211/main.c                            |   7 +-
 net/mac80211/mesh.c                            |  16 +-
 net/mac80211/mlme.c                            | 123 +++-----
 net/mac80211/ocb.c                             |   6 +-
 net/mac80211/offchannel.c                      |   2 -
 net/mac80211/tdls.c                            |  26 +-
 net/mac80211/util.c                            |   5 -
 net/wireless/ap.c                              |  24 +-
 net/wireless/chan.c                            |  32 +-
 net/wireless/core.c                            |  24 +-
 net/wireless/core.h                            |  36 ---
 net/wireless/ibss.c                            |  76 +----
 net/wireless/mesh.c                            |  23 +-
 net/wireless/mlme.c                            |  21 +-
 net/wireless/nl80211.c                         | 398 ++++++-------------------
 net/wireless/ocb.c                             |  43 +--
 net/wireless/pmsr.c                            |   4 +-
 net/wireless/reg.c                             |  16 +-
 net/wireless/sme.c                             |  55 ++--
 net/wireless/util.c                            |  14 +-
 net/wireless/wext-compat.c                     |  43 +--
 net/wireless/wext-sme.c                        |  59 +---
 35 files changed, 300 insertions(+), 974 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c
index 0c2b8b1a10d5..1dba55c2d9dc 100644
--- a/drivers/net/wireless/ath/ath6kl/cfg80211.c
+++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c
@@ -1118,9 +1118,9 @@ void ath6kl_cfg80211_ch_switch_notify(struct ath6kl_vif *vif, int freq,
 				 ath6kl_band_2ghz.ht_cap.ht_supported) ?
 					NL80211_CHAN_HT20 : NL80211_CHAN_NO_HT);
 
-	mutex_lock(&vif->wdev.mtx);
+	wiphy_lock(vif->ar->wiphy);
 	cfg80211_ch_switch_notify(vif->ndev, &chandef, 0, 0);
-	mutex_unlock(&vif->wdev.mtx);
+	wiphy_unlock(vif->ar->wiphy);
 }
 
 static int ath6kl_cfg80211_add_key(struct wiphy *wiphy, struct net_device *ndev,
diff --git a/drivers/net/wireless/marvell/mwifiex/11h.c b/drivers/net/wireless/marvell/mwifiex/11h.c
index 2ea03725f188..da211372a481 100644
--- a/drivers/net/wireless/marvell/mwifiex/11h.c
+++ b/drivers/net/wireless/marvell/mwifiex/11h.c
@@ -287,7 +287,7 @@ void mwifiex_dfs_chan_sw_work_queue(struct work_struct *work)
 
 	mwifiex_dbg(priv->adapter, MSG,
 		    "indicating channel switch completion to kernel\n");
-	mutex_lock(&priv->wdev.mtx);
+	wiphy_lock(priv->wdev.wiphy);
 	cfg80211_ch_switch_notify(priv->netdev, &priv->dfs_chandef, 0, 0);
-	mutex_unlock(&priv->wdev.mtx);
+	wiphy_unlock(priv->wdev.wiphy);
 }
diff --git a/drivers/net/wireless/quantenna/qtnfmac/event.c b/drivers/net/wireless/quantenna/qtnfmac/event.c
index 31bc58e96ac0..3b283e93a13e 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/event.c
+++ b/drivers/net/wireless/quantenna/qtnfmac/event.c
@@ -477,9 +477,9 @@ qtnf_event_handle_freq_change(struct qtnf_wmac *mac,
 		if (!vif->netdev)
 			continue;
 
-		mutex_lock(&vif->wdev.mtx);
+		wiphy_lock(priv_to_wiphy(vif->mac));
 		cfg80211_ch_switch_notify(vif->netdev, &chandef, 0, 0);
-		mutex_unlock(&vif->wdev.mtx);
+		wiphy_unlock(priv_to_wiphy(vif->mac));
 	}
 
 	return 0;
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index d1964a6d0b35..aa9c26a03f30 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -5938,8 +5938,6 @@ void wiphy_delayed_work_flush(struct wiphy *wiphy,
  * @mgmt_registrations: list of registrations for management frames
  * @mgmt_registrations_need_update: mgmt registrations were updated,
  *	need to propagate the update to the driver
- * @mtx: mutex used to lock data in this struct, may be used by drivers
- *	and some API functions require it held
  * @beacon_interval: beacon interval used on this device for transmitting
  *	beacons, 0 when not valid
  * @address: The address for this device, valid only if @netdev is %NULL
@@ -5985,8 +5983,6 @@ struct wireless_dev {
 	struct list_head mgmt_registrations;
 	u8 mgmt_registrations_need_update:1;
 
-	struct mutex mtx;
-
 	bool use_4addr, is_running, registered, registering;
 
 	u8 address[ETH_ALEN] __aligned(sizeof(u16));
@@ -8589,7 +8585,7 @@ bool cfg80211_reg_can_beacon_relax(struct wiphy *wiphy,
  * @link_id: the link ID for MLO, must be 0 for non-MLO
  * @punct_bitmap: the new puncturing bitmap
  *
- * Caller must acquire wdev_lock, therefore must only be called from sleepable
+ * Caller must hold wiphy mutex, therefore must only be called from sleepable
  * driver context!
  */
 void cfg80211_ch_switch_notify(struct net_device *dev,
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 7f3b6f00f8a2..154592ce48e5 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -643,9 +643,7 @@ struct ieee80211_fils_discovery {
  * @pwr_reduction: power constraint of BSS.
  * @eht_support: does this BSS support EHT
  * @eht_puncturing: bitmap to indicate which channels are punctured in this BSS
- * @csa_active: marks whether a channel switch is going on. Internally it is
- *	write-protected by sdata_lock and local->mtx so holding either is fine
- *	for read access.
+ * @csa_active: marks whether a channel switch is going on.
  * @csa_punct_bitmap: new puncturing bitmap for channel switch
  * @mu_mimo_owner: indicates interface owns MU-MIMO capability
  * @chanctx_conf: The channel context this interface is assigned to, or %NULL
@@ -653,9 +651,7 @@ struct ieee80211_fils_discovery {
  *	path needing to access it; even though the netdev carrier will always
  *	be off when it is %NULL there can still be races and packets could be
  *	processed after it switches back to %NULL.
- * @color_change_active: marks whether a color change is ongoing. Internally it is
- *	write-protected by sdata_lock and local->mtx so holding either is fine
- *	for read access.
+ * @color_change_active: marks whether a color change is ongoing.
  * @color_change_color: the bss color that will be used after the change.
  * @ht_ldpc: in AP mode, indicates interface has HT LDPC capability.
  * @vht_ldpc: in AP mode, indicates interface has VHT LDPC capability.
@@ -1974,22 +1970,18 @@ struct ieee80211_vif *wdev_to_ieee80211_vif(struct wireless_dev *wdev);
  */
 struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif);
 
-/**
- * lockdep_vif_mutex_held - for lockdep checks on link poiners
- * @vif: the interface to check
- */
-static inline bool lockdep_vif_mutex_held(struct ieee80211_vif *vif)
+static inline bool lockdep_vif_wiphy_mutex_held(struct ieee80211_vif *vif)
 {
-	return lockdep_is_held(&ieee80211_vif_to_wdev(vif)->mtx);
+	return lockdep_is_held(&ieee80211_vif_to_wdev(vif)->wiphy->mtx);
 }
 
 #define link_conf_dereference_protected(vif, link_id)		\
 	rcu_dereference_protected((vif)->link_conf[link_id],	\
-				  lockdep_vif_mutex_held(vif))
+				  lockdep_vif_wiphy_mutex_held(vif))
 
 #define link_conf_dereference_check(vif, link_id)		\
 	rcu_dereference_check((vif)->link_conf[link_id],	\
-			      lockdep_vif_mutex_held(vif))
+			      lockdep_vif_wiphy_mutex_held(vif))
 
 /**
  * enum ieee80211_key_flags - key flags
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 0201a3320136..851d6ed68367 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -573,8 +573,7 @@ ieee80211_lookup_key(struct ieee80211_sub_if_data *sdata, int link_id,
 	struct ieee80211_key *key;
 
 	if (link_id >= 0) {
-		link = rcu_dereference_check(sdata->link[link_id],
-					     lockdep_is_held(&sdata->wdev.mtx));
+		link = sdata_dereference(sdata->link[link_id], sdata);
 		if (!link)
 			return NULL;
 	}
@@ -896,12 +895,10 @@ static int ieee80211_set_monitor_channel(struct wiphy *wiphy,
 		sdata = wiphy_dereference(local->hw.wiphy,
 					  local->monitor_sdata);
 		if (sdata) {
-			sdata_lock(sdata);
 			ieee80211_link_release_channel(&sdata->deflink);
 			ret = ieee80211_link_use_channel(&sdata->deflink,
 							 chandef,
 							 IEEE80211_CHANCTX_EXCLUSIVE);
-			sdata_unlock(sdata);
 		}
 	} else {
 		if (local->open_count == local->monitors) {
@@ -1490,7 +1487,7 @@ static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev,
 	struct ieee80211_bss_conf *link_conf;
 	u64 changed = 0;
 
-	sdata_assert_lock(sdata);
+	lockdep_assert_wiphy(wiphy);
 
 	link = sdata_dereference(sdata->link[params->link_id], sdata);
 	if (!link)
@@ -1549,7 +1546,6 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev,
 		sdata_dereference(sdata->link[link_id], sdata);
 	struct ieee80211_bss_conf *link_conf = link->conf;
 
-	sdata_assert_lock(sdata);
 	lockdep_assert_wiphy(local->hw.wiphy);
 
 	old_beacon = sdata_dereference(link->u.ap.beacon, sdata);
@@ -2163,14 +2159,7 @@ static int ieee80211_change_station(struct wiphy *wiphy,
 		}
 	}
 
-	/* we use sta_info_get_bss() so this might be different */
-	if (sdata != sta->sdata) {
-		mutex_lock_nested(&sta->sdata->wdev.mtx, 1);
-		err = sta_apply_parameters(local, sta, params);
-		mutex_unlock(&sta->sdata->wdev.mtx);
-	} else {
-		err = sta_apply_parameters(local, sta, params);
-	}
+	err = sta_apply_parameters(local, sta, params);
 	if (err)
 		return err;
 
@@ -3132,7 +3121,7 @@ int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata,
 	struct sta_info *sta;
 	bool tdls_peer_found = false;
 
-	lockdep_assert_held(&sdata->wdev.mtx);
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
 	if (WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION))
 		return -EINVAL;
@@ -3211,7 +3200,6 @@ static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev,
 	local->dynamic_ps_forced_timeout = timeout;
 
 	/* no change, but if automatic follow powersave */
-	sdata_lock(sdata);
 	for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) {
 		struct ieee80211_link_data *link;
 
@@ -3222,7 +3210,6 @@ static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev,
 		__ieee80211_request_smps_mgd(sdata, link,
 					     link->u.mgd.req_smps);
 	}
-	sdata_unlock(sdata);
 
 	if (ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS))
 		ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS);
@@ -3609,7 +3596,6 @@ static int __ieee80211_csa_finalize(struct ieee80211_sub_if_data *sdata)
 	u64 changed = 0;
 	int err;
 
-	sdata_assert_lock(sdata);
 	lockdep_assert_wiphy(local->hw.wiphy);
 
 	/*
@@ -3681,20 +3667,16 @@ void ieee80211_csa_finalize_work(struct wiphy *wiphy, struct wiphy_work *work)
 			     deflink.csa_finalize_work);
 	struct ieee80211_local *local = sdata->local;
 
-	sdata_lock(sdata);
 	lockdep_assert_wiphy(local->hw.wiphy);
 
 	/* AP might have been stopped while waiting for the lock. */
 	if (!sdata->vif.bss_conf.csa_active)
-		goto unlock;
+		return;
 
 	if (!ieee80211_sdata_running(sdata))
-		goto unlock;
+		return;
 
 	ieee80211_csa_finalize(sdata);
-
-unlock:
-	sdata_unlock(sdata);
 }
 
 static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata,
@@ -3850,7 +3832,6 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
 	u64 changed = 0;
 	int err;
 
-	sdata_assert_lock(sdata);
 	lockdep_assert_wiphy(local->hw.wiphy);
 
 	if (!list_empty(&local->roc_list) || local->scanning)
@@ -4665,7 +4646,6 @@ static int ieee80211_color_change_finalize(struct ieee80211_sub_if_data *sdata)
 	u64 changed = 0;
 	int err;
 
-	sdata_assert_lock(sdata);
 	lockdep_assert_wiphy(local->hw.wiphy);
 
 	sdata->vif.bss_conf.color_change_active = false;
@@ -4692,20 +4672,16 @@ void ieee80211_color_change_finalize_work(struct wiphy *wiphy,
 			     deflink.color_change_finalize_work);
 	struct ieee80211_local *local = sdata->local;
 
-	sdata_lock(sdata);
 	lockdep_assert_wiphy(local->hw.wiphy);
 
 	/* AP might have been stopped while waiting for the lock. */
 	if (!sdata->vif.bss_conf.color_change_active)
-		goto unlock;
+		return;
 
 	if (!ieee80211_sdata_running(sdata))
-		goto unlock;
+		return;
 
 	ieee80211_color_change_finalize(sdata);
-
-unlock:
-	sdata_unlock(sdata);
 }
 
 void ieee80211_color_collision_detection_work(struct work_struct *work)
@@ -4716,9 +4692,7 @@ void ieee80211_color_collision_detection_work(struct work_struct *work)
 			     color_collision_detect_work);
 	struct ieee80211_sub_if_data *sdata = link->sdata;
 
-	sdata_lock(sdata);
 	cfg80211_obss_color_collision_notify(sdata->dev, link->color_bitmap);
-	sdata_unlock(sdata);
 }
 
 void ieee80211_color_change_finish(struct ieee80211_vif *vif)
@@ -4762,7 +4736,6 @@ ieee80211_color_change(struct wiphy *wiphy, struct net_device *dev,
 	u64 changed = 0;
 	int err;
 
-	sdata_assert_lock(sdata);
 	lockdep_assert_wiphy(local->hw.wiphy);
 
 	if (sdata->vif.bss_conf.nontransmitted)
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index 706330fadc97..14a40348959a 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -322,7 +322,6 @@ static int ieee80211_set_smps(struct ieee80211_link_data *link,
 {
 	struct ieee80211_sub_if_data *sdata = link->sdata;
 	struct ieee80211_local *local = sdata->local;
-	int err;
 
 	if (sdata->vif.driver_flags & IEEE80211_VIF_DISABLE_SMPS_OVERRIDE)
 		return -EOPNOTSUPP;
@@ -340,11 +339,7 @@ static int ieee80211_set_smps(struct ieee80211_link_data *link,
 	if (sdata->vif.type != NL80211_IFTYPE_STATION)
 		return -EOPNOTSUPP;
 
-	sdata_lock(sdata);
-	err = __ieee80211_request_smps_mgd(link->sdata, link, smps_mode);
-	sdata_unlock(sdata);
-
-	return err;
+	return __ieee80211_request_smps_mgd(link->sdata, link, smps_mode);
 }
 
 static const char *smps_modes[IEEE80211_SMPS_NUM_MODES] = {
@@ -416,16 +411,13 @@ static ssize_t ieee80211_if_parse_tkip_mic_test(
 	case NL80211_IFTYPE_STATION:
 		fc |= cpu_to_le16(IEEE80211_FCTL_TODS);
 		/* BSSID SA DA */
-		sdata_lock(sdata);
 		if (!sdata->u.mgd.associated) {
-			sdata_unlock(sdata);
 			dev_kfree_skb(skb);
 			return -ENOTCONN;
 		}
 		memcpy(hdr->addr1, sdata->deflink.u.mgd.bssid, ETH_ALEN);
 		memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN);
 		memcpy(hdr->addr3, addr, ETH_ALEN);
-		sdata_unlock(sdata);
 		break;
 	default:
 		dev_kfree_skb(skb);
diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c
index 9fc110264808..919300750527 100644
--- a/net/mac80211/driver-ops.c
+++ b/net/mac80211/driver-ops.c
@@ -300,7 +300,6 @@ int drv_assign_vif_chanctx(struct ieee80211_local *local,
 	might_sleep();
 	lockdep_assert_wiphy(local->hw.wiphy);
 
-	drv_verify_link_exists(sdata, link_conf);
 	if (!check_sdata_in_driver(sdata))
 		return -EIO;
 
@@ -329,7 +328,6 @@ void drv_unassign_vif_chanctx(struct ieee80211_local *local,
 	might_sleep();
 	lockdep_assert_wiphy(local->hw.wiphy);
 
-	drv_verify_link_exists(sdata, link_conf);
 	if (!check_sdata_in_driver(sdata))
 		return;
 
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 2fac7dc2eb9d..554c7aa10cc2 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -996,14 +996,6 @@ static inline void drv_change_chanctx(struct ieee80211_local *local,
 	trace_drv_return_void(local);
 }
 
-static inline void drv_verify_link_exists(struct ieee80211_sub_if_data *sdata,
-					  struct ieee80211_bss_conf *link_conf)
-{
-	/* deflink always exists, so need to check only for other links */
-	if (sdata->deflink.conf != link_conf)
-		sdata_assert_lock(sdata);
-}
-
 int drv_assign_vif_chanctx(struct ieee80211_local *local,
 			   struct ieee80211_sub_if_data *sdata,
 			   struct ieee80211_bss_conf *link_conf,
@@ -1022,9 +1014,6 @@ static inline int drv_start_ap(struct ieee80211_local *local,
 {
 	int ret = 0;
 
-	/* make sure link_conf is protected */
-	drv_verify_link_exists(sdata, link_conf);
-
 	might_sleep();
 	lockdep_assert_wiphy(local->hw.wiphy);
 
@@ -1045,9 +1034,6 @@ static inline void drv_stop_ap(struct ieee80211_local *local,
 	might_sleep();
 	lockdep_assert_wiphy(local->hw.wiphy);
 
-	/* make sure link_conf is protected */
-	drv_verify_link_exists(sdata, link_conf);
-
 	if (!check_sdata_in_driver(sdata))
 		return;
 
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index b95098c13153..9907cea6457c 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -235,7 +235,6 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 	bool radar_required;
 	int err;
 
-	sdata_assert_lock(sdata);
 	lockdep_assert_wiphy(local->hw.wiphy);
 
 	/* Reset own TSF to allow time synchronization work. */
@@ -403,7 +402,7 @@ static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 	u32 rate_flags;
 	int shift;
 
-	sdata_assert_lock(sdata);
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
 	if (beacon_int < 10)
 		beacon_int = 10;
@@ -484,7 +483,7 @@ int ieee80211_ibss_csa_beacon(struct ieee80211_sub_if_data *sdata,
 	u16 capability = WLAN_CAPABILITY_IBSS;
 	u64 tsf;
 
-	sdata_assert_lock(sdata);
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
 	if (ifibss->privacy)
 		capability |= WLAN_CAPABILITY_PRIVACY;
@@ -526,7 +525,7 @@ int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata, u64 *changed)
 	struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
 	struct cfg80211_bss *cbss;
 
-	sdata_assert_lock(sdata);
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
 	/* When not connected/joined, sending CSA doesn't make sense. */
 	if (ifibss->state != IEEE80211_IBSS_MLME_JOINED)
@@ -648,7 +647,7 @@ static int ieee80211_sta_active_ibss(struct ieee80211_sub_if_data *sdata)
 	int active = 0;
 	struct sta_info *sta;
 
-	sdata_assert_lock(sdata);
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
 	rcu_read_lock();
 
@@ -734,16 +733,12 @@ static void ieee80211_csa_connection_drop_work(struct wiphy *wiphy,
 		container_of(work, struct ieee80211_sub_if_data,
 			     u.ibss.csa_connection_drop_work);
 
-	sdata_lock(sdata);
-
 	ieee80211_ibss_disconnect(sdata);
 	synchronize_rcu();
 	skb_queue_purge(&sdata->skb_queue);
 
 	/* trigger a scan to find another IBSS network to join */
 	wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work);
-
-	sdata_unlock(sdata);
 }
 
 static void ieee80211_ibss_csa_mark_radar(struct ieee80211_sub_if_data *sdata)
@@ -775,7 +770,7 @@ ieee80211_ibss_process_chanswitch(struct ieee80211_sub_if_data *sdata,
 	ieee80211_conn_flags_t conn_flags;
 	u32 vht_cap_info = 0;
 
-	sdata_assert_lock(sdata);
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
 	conn_flags = IEEE80211_CONN_DISABLE_VHT;
 
@@ -947,7 +942,7 @@ static void ieee80211_rx_mgmt_auth_ibss(struct ieee80211_sub_if_data *sdata,
 {
 	u16 auth_alg, auth_transaction;
 
-	sdata_assert_lock(sdata);
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
 	if (len < 24 + 6)
 		return;
@@ -1289,7 +1284,7 @@ static void ieee80211_sta_merge_ibss(struct ieee80211_sub_if_data *sdata)
 	struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
 	enum nl80211_bss_scan_width scan_width;
 
-	sdata_assert_lock(sdata);
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
 	mod_timer(&ifibss->timer,
 		  round_jiffies(jiffies + IEEE80211_IBSS_MERGE_INTERVAL));
@@ -1321,7 +1316,7 @@ static void ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata)
 	u16 capability;
 	int i;
 
-	sdata_assert_lock(sdata);
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
 	if (ifibss->fixed_bssid) {
 		memcpy(bssid, ifibss->bssid, ETH_ALEN);
@@ -1432,7 +1427,7 @@ static void ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata)
 	enum nl80211_bss_scan_width scan_width;
 	int active_ibss;
 
-	sdata_assert_lock(sdata);
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
 	active_ibss = ieee80211_sta_active_ibss(sdata);
 	ibss_dbg(sdata, "sta_find_ibss (active_ibss=%d)\n", active_ibss);
@@ -1526,7 +1521,7 @@ static void ieee80211_rx_mgmt_probe_req(struct ieee80211_sub_if_data *sdata,
 	struct beacon_data *presp;
 	u8 *pos, *end;
 
-	sdata_assert_lock(sdata);
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
 	presp = sdata_dereference(ifibss->presp, sdata);
 
@@ -1622,10 +1617,8 @@ void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
 	mgmt = (struct ieee80211_mgmt *) skb->data;
 	fc = le16_to_cpu(mgmt->frame_control);
 
-	sdata_lock(sdata);
-
 	if (!sdata->u.ibss.ssid_len)
-		goto mgmt_out; /* not ready to merge yet */
+		return; /* not ready to merge yet */
 
 	switch (fc & IEEE80211_FCTL_STYPE) {
 	case IEEE80211_STYPE_PROBE_REQ:
@@ -1665,9 +1658,6 @@ void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
 			break;
 		}
 	}
-
- mgmt_out:
-	sdata_unlock(sdata);
 }
 
 void ieee80211_ibss_work(struct ieee80211_sub_if_data *sdata)
@@ -1675,15 +1665,13 @@ void ieee80211_ibss_work(struct ieee80211_sub_if_data *sdata)
 	struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
 	struct sta_info *sta;
 
-	sdata_lock(sdata);
-
 	/*
 	 * Work could be scheduled after scan or similar
 	 * when we aren't even joined (or trying) with a
 	 * network.
 	 */
 	if (!ifibss->ssid_len)
-		goto out;
+		return;
 
 	spin_lock_bh(&ifibss->incomplete_lock);
 	while (!list_empty(&ifibss->incomplete_stations)) {
@@ -1709,9 +1697,6 @@ void ieee80211_ibss_work(struct ieee80211_sub_if_data *sdata)
 		WARN_ON(1);
 		break;
 	}
-
- out:
-	sdata_unlock(sdata);
 }
 
 static void ieee80211_ibss_timer(struct timer_list *t)
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 2d2a4445714e..b8465d205076 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -991,7 +991,7 @@ struct ieee80211_link_data {
 	struct ieee80211_key __rcu *default_beacon_key;
 
 	struct wiphy_work csa_finalize_work;
-	bool csa_block_tx; /* write-protected by sdata_lock and local->mtx */
+	bool csa_block_tx;
 
 	bool operating_11g_mode;
 
@@ -1135,28 +1135,8 @@ struct ieee80211_sub_if_data *vif_to_sdata(struct ieee80211_vif *p)
 	return container_of(p, struct ieee80211_sub_if_data, vif);
 }
 
-static inline void sdata_lock(struct ieee80211_sub_if_data *sdata)
-	__acquires(&sdata->wdev.mtx)
-{
-	mutex_lock(&sdata->wdev.mtx);
-	__acquire(&sdata->wdev.mtx);
-}
-
-static inline void sdata_unlock(struct ieee80211_sub_if_data *sdata)
-	__releases(&sdata->wdev.mtx)
-{
-	mutex_unlock(&sdata->wdev.mtx);
-	__release(&sdata->wdev.mtx);
-}
-
 #define sdata_dereference(p, sdata) \
-	rcu_dereference_protected(p, lockdep_is_held(&sdata->wdev.mtx))
-
-static inline void
-sdata_assert_lock(struct ieee80211_sub_if_data *sdata)
-{
-	lockdep_assert_held(&sdata->wdev.mtx);
-}
+	wiphy_dereference(sdata->local->hw.wiphy, p)
 
 static inline int
 ieee80211_chanwidth_get_shift(enum nl80211_chan_width width)
@@ -2034,8 +2014,10 @@ void ieee80211_link_init(struct ieee80211_sub_if_data *sdata,
 void ieee80211_link_stop(struct ieee80211_link_data *link);
 int ieee80211_vif_set_links(struct ieee80211_sub_if_data *sdata,
 			    u16 new_links, u16 dormant_links);
-void ieee80211_vif_clear_links(struct ieee80211_sub_if_data *sdata);
-int __ieee80211_set_active_links(struct ieee80211_vif *vif, u16 active_links);
+static inline void ieee80211_vif_clear_links(struct ieee80211_sub_if_data *sdata)
+{
+	ieee80211_vif_set_links(sdata, 0, 0);
+}
 
 /* tx handling */
 void ieee80211_clear_tx_pending(struct ieee80211_local *local);
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index fc407be04ce9..7e3acf670f0f 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -529,7 +529,6 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do
 	del_timer_sync(&local->dynamic_ps_timer);
 	wiphy_work_cancel(local->hw.wiphy, &local->dynamic_ps_enable_work);
 
-	sdata_lock(sdata);
 	WARN(ieee80211_vif_is_mld(&sdata->vif),
 	     "destroying interface with valid links 0x%04x\n",
 	     sdata->vif.valid_links);
@@ -542,7 +541,6 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do
 					  IEEE80211_QUEUE_STOP_REASON_CSA);
 		sdata->deflink.csa_block_tx = false;
 	}
-	sdata_unlock(sdata);
 
 	wiphy_work_cancel(local->hw.wiphy, &sdata->deflink.csa_finalize_work);
 	wiphy_work_cancel(local->hw.wiphy,
@@ -1133,7 +1131,6 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local)
 	snprintf(sdata->name, IFNAMSIZ, "%s-monitor",
 		 wiphy_name(local->hw.wiphy));
 	sdata->wdev.iftype = NL80211_IFTYPE_MONITOR;
-	mutex_init(&sdata->wdev.mtx);
 	sdata->wdev.wiphy = local->hw.wiphy;
 
 	ieee80211_sdata_init(local, sdata);
@@ -1159,17 +1156,14 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local)
 	rcu_assign_pointer(local->monitor_sdata, sdata);
 	mutex_unlock(&local->iflist_mtx);
 
-	sdata_lock(sdata);
 	ret = ieee80211_link_use_channel(&sdata->deflink, &local->monitor_chandef,
 					 IEEE80211_CHANCTX_EXCLUSIVE);
-	sdata_unlock(sdata);
 	if (ret) {
 		mutex_lock(&local->iflist_mtx);
 		RCU_INIT_POINTER(local->monitor_sdata, NULL);
 		mutex_unlock(&local->iflist_mtx);
 		synchronize_net();
 		drv_remove_interface(local, sdata);
-		mutex_destroy(&sdata->wdev.mtx);
 		kfree(sdata);
 		return ret;
 	}
@@ -1205,13 +1199,10 @@ void ieee80211_del_virtual_monitor(struct ieee80211_local *local)
 
 	synchronize_net();
 
-	sdata_lock(sdata);
 	ieee80211_link_release_channel(&sdata->deflink);
-	sdata_unlock(sdata);
 
 	drv_remove_interface(local, sdata);
 
-	mutex_destroy(&sdata->wdev.mtx);
 	kfree(sdata);
 }
 
@@ -2279,13 +2270,11 @@ void ieee80211_remove_interfaces(struct ieee80211_local *local)
 		 * we can't acquire the wiphy_lock() again there if already
 		 * inside this locked section.
 		 */
-		sdata_lock(sdata);
 		sdata->vif.cfg.arp_addr_cnt = 0;
 		if (sdata->vif.type == NL80211_IFTYPE_STATION &&
 		    sdata->u.mgd.associated)
 			ieee80211_vif_cfg_change_notify(sdata,
 							BSS_CHANGED_ARP_FILTER);
-		sdata_unlock(sdata);
 
 		list_del(&sdata->list);
 		cfg80211_unregister_wdev(&sdata->wdev);
diff --git a/net/mac80211/link.c b/net/mac80211/link.c
index 80571dcc57f5..2a78374f6f04 100644
--- a/net/mac80211/link.c
+++ b/net/mac80211/link.c
@@ -191,7 +191,7 @@ static int ieee80211_vif_update_links(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_link_data *old_data[IEEE80211_MLD_MAX_NUM_LINKS];
 	bool use_deflink = old_links == 0; /* set for error case */
 
-	sdata_assert_lock(sdata);
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
 	memset(to_free, 0, sizeof(links));
 
@@ -303,23 +303,6 @@ int ieee80211_vif_set_links(struct ieee80211_sub_if_data *sdata,
 	return ret;
 }
 
-void ieee80211_vif_clear_links(struct ieee80211_sub_if_data *sdata)
-{
-	struct link_container *links[IEEE80211_MLD_MAX_NUM_LINKS];
-
-	/*
-	 * The locking here is different because when we free links
-	 * in the station case we need to be able to cancel_work_sync()
-	 * something that also takes the lock.
-	 */
-
-	sdata_lock(sdata);
-	ieee80211_vif_update_links(sdata, links, 0, 0);
-	sdata_unlock(sdata);
-
-	ieee80211_free_links(sdata, links);
-}
-
 static int _ieee80211_set_active_links(struct ieee80211_sub_if_data *sdata,
 				       u16 active_links)
 {
@@ -447,14 +430,13 @@ static int _ieee80211_set_active_links(struct ieee80211_sub_if_data *sdata,
 	return 0;
 }
 
-int __ieee80211_set_active_links(struct ieee80211_vif *vif, u16 active_links)
+int ieee80211_set_active_links(struct ieee80211_vif *vif, u16 active_links)
 {
 	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
 	struct ieee80211_local *local = sdata->local;
 	u16 old_active;
 	int ret;
 
-	sdata_assert_lock(sdata);
 	lockdep_assert_wiphy(local->hw.wiphy);
 
 	old_active = sdata->vif.active_links;
@@ -475,18 +457,6 @@ int __ieee80211_set_active_links(struct ieee80211_vif *vif, u16 active_links)
 
 	return ret;
 }
-
-int ieee80211_set_active_links(struct ieee80211_vif *vif, u16 active_links)
-{
-	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
-	int ret;
-
-	sdata_lock(sdata);
-	ret = __ieee80211_set_active_links(vif, active_links);
-	sdata_unlock(sdata);
-
-	return ret;
-}
 EXPORT_SYMBOL_GPL(ieee80211_set_active_links);
 
 void ieee80211_set_active_links_async(struct ieee80211_vif *vif,
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 411e44239bb9..0ab603850a85 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -365,13 +365,10 @@ static void ieee80211_restart_work(struct work_struct *work)
 			 */
 			wiphy_work_cancel(local->hw.wiphy,
 					  &sdata->u.mgd.csa_connection_drop_work);
-			if (sdata->vif.bss_conf.csa_active) {
-				sdata_lock(sdata);
+			if (sdata->vif.bss_conf.csa_active)
 				ieee80211_sta_connection_lost(sdata,
 							      WLAN_REASON_UNSPECIFIED,
 							      false);
-				sdata_unlock(sdata);
-			}
 		}
 		wiphy_delayed_work_flush(local->hw.wiphy,
 					 &sdata->dec_tailroom_needed_wk);
@@ -473,7 +470,6 @@ static int ieee80211_ifa_changed(struct notifier_block *nb,
 	 */
 	mutex_lock_nested(&local->hw.wiphy->mtx, 1);
 	__acquire(&local->hw.wiphy->mtx);
-	sdata_lock(sdata);
 
 	/* Copy the addresses to the vif config list */
 	ifa = rtnl_dereference(idev->ifa_list);
@@ -490,7 +486,6 @@ static int ieee80211_ifa_changed(struct notifier_block *nb,
 	if (ifmgd->associated)
 		ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_ARP_FILTER);
 
-	sdata_unlock(sdata);
 	wiphy_unlock(local->hw.wiphy);
 
 	return NOTIFY_OK;
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index af8c5fc2db14..0d0fbae51b61 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -1291,7 +1291,7 @@ ieee80211_mesh_process_chnswitch(struct ieee80211_sub_if_data *sdata,
 	ieee80211_conn_flags_t conn_flags = 0;
 	u32 vht_cap_info = 0;
 
-	sdata_assert_lock(sdata);
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
 	sband = ieee80211_get_sband(sdata);
 	if (!sband)
@@ -1559,7 +1559,7 @@ int ieee80211_mesh_csa_beacon(struct ieee80211_sub_if_data *sdata,
 	struct mesh_csa_settings *tmp_csa_settings;
 	int ret = 0;
 
-	lockdep_assert_held(&sdata->wdev.mtx);
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
 	tmp_csa_settings = kmalloc(sizeof(*tmp_csa_settings),
 				   GFP_ATOMIC);
@@ -1691,11 +1691,11 @@ void ieee80211_mesh_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_mgmt *mgmt;
 	u16 stype;
 
-	sdata_lock(sdata);
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
 	/* mesh already went down */
 	if (!sdata->u.mesh.mesh_id_len)
-		goto out;
+		return;
 
 	rx_status = IEEE80211_SKB_RXCB(skb);
 	mgmt = (struct ieee80211_mgmt *) skb->data;
@@ -1714,8 +1714,6 @@ void ieee80211_mesh_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
 		ieee80211_mesh_rx_mgmt_action(sdata, mgmt, skb->len, rx_status);
 		break;
 	}
-out:
-	sdata_unlock(sdata);
 }
 
 static void mesh_bss_info_changed(struct ieee80211_sub_if_data *sdata)
@@ -1745,11 +1743,11 @@ void ieee80211_mesh_work(struct ieee80211_sub_if_data *sdata)
 {
 	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
 
-	sdata_lock(sdata);
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
 	/* mesh already went down */
 	if (!sdata->u.mesh.mesh_id_len)
-		goto out;
+		return;
 
 	if (ifmsh->preq_queue_len &&
 	    time_after(jiffies,
@@ -1767,8 +1765,6 @@ void ieee80211_mesh_work(struct ieee80211_sub_if_data *sdata)
 
 	if (test_and_clear_bit(MESH_WORK_MBSS_CHANGED, &ifmsh->wrkq_flags))
 		mesh_bss_info_changed(sdata);
-out:
-	sdata_unlock(sdata);
 }
 
 
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 195e7202d51d..6d0a29749e8c 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1,4 +1,3 @@
-// SPDX-License-Identifier: GPL-2.0-only
 /*
  * BSS client mode implementation
  * Copyright 2003-2008, Jouni Malinen <j@w1.fi>
@@ -175,7 +174,7 @@ ieee80211_handle_puncturing_bitmap(struct ieee80211_link_data *link,
 static void run_again(struct ieee80211_sub_if_data *sdata,
 		      unsigned long timeout)
 {
-	sdata_assert_lock(sdata);
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
 	if (!timer_pending(&sdata->u.mgd.timer) ||
 	    time_before(timeout, sdata->u.mgd.timer.expires))
@@ -1401,7 +1400,7 @@ static int ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
 						      assoc_data->ie,
 						      assoc_data->ie_len);
 
-	sdata_assert_lock(sdata);
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
 	size = local->hw.extra_tx_headroom +
 	       sizeof(*mgmt) + /* bit too much but doesn't matter */
@@ -1689,14 +1688,13 @@ static void ieee80211_chswitch_work(struct wiphy *wiphy,
 	if (!ieee80211_sdata_running(sdata))
 		return;
 
-	sdata_lock(sdata);
 	lockdep_assert_wiphy(local->hw.wiphy);
 
 	if (!ifmgd->associated)
-		goto out;
+		return;
 
 	if (!link->conf->csa_active)
-		goto out;
+		return;
 
 	/*
 	 * using reservation isn't immediate as it may be deferred until later
@@ -1712,7 +1710,7 @@ static void ieee80211_chswitch_work(struct wiphy *wiphy,
 		 * reservations
 		 */
 		if (link->reserved_ready)
-			goto out;
+			return;
 
 		ret = ieee80211_link_use_reserved_context(link);
 		if (ret) {
@@ -1721,10 +1719,8 @@ static void ieee80211_chswitch_work(struct wiphy *wiphy,
 				   ret);
 			wiphy_work_queue(sdata->local->hw.wiphy,
 					 &ifmgd->csa_connection_drop_work);
-			goto out;
 		}
-
-		goto out;
+		return;
 	}
 
 	if (!cfg80211_chandef_identical(&link->conf->chandef,
@@ -1733,16 +1729,13 @@ static void ieee80211_chswitch_work(struct wiphy *wiphy,
 			   "failed to finalize channel switch, disconnecting\n");
 		wiphy_work_queue(sdata->local->hw.wiphy,
 				 &ifmgd->csa_connection_drop_work);
-		goto out;
+		return;
 	}
 
 	link->u.mgd.csa_waiting_bcn = true;
 
 	ieee80211_sta_reset_beacon_monitor(sdata);
 	ieee80211_sta_reset_conn_monitor(sdata);
-
-out:
-	sdata_unlock(sdata);
 }
 
 static void ieee80211_chswitch_post_beacon(struct ieee80211_link_data *link)
@@ -1752,7 +1745,7 @@ static void ieee80211_chswitch_post_beacon(struct ieee80211_link_data *link)
 	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
 	int ret;
 
-	sdata_assert_lock(sdata);
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
 	WARN_ON(!link->conf->csa_active);
 
@@ -1846,7 +1839,6 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link,
 	unsigned long timeout;
 	int res;
 
-	sdata_assert_lock(sdata);
 	lockdep_assert_wiphy(local->hw.wiphy);
 
 	if (!cbss)
@@ -2884,7 +2876,6 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 		.subtype = stype,
 	};
 
-	sdata_assert_lock(sdata);
 	lockdep_assert_wiphy(local->hw.wiphy);
 
 	if (WARN_ON_ONCE(tx && !frame_buf))
@@ -3223,19 +3214,16 @@ static void ieee80211_mgd_probe_ap(struct ieee80211_sub_if_data *sdata,
 	if (!ieee80211_sdata_running(sdata))
 		return;
 
-	sdata_lock(sdata);
-
 	if (!ifmgd->associated)
-		goto out;
+		return;
 
-	if (sdata->local->tmp_channel || sdata->local->scanning) {
-		goto out;
-	}
+	if (sdata->local->tmp_channel || sdata->local->scanning)
+		return;
 
 	if (sdata->local->suspending) {
 		/* reschedule after resume */
 		ieee80211_reset_ap_probe(sdata);
-		goto out;
+		return;
 	}
 
 	if (beacon) {
@@ -3263,14 +3251,12 @@ static void ieee80211_mgd_probe_ap(struct ieee80211_sub_if_data *sdata,
 	ifmgd->flags |= IEEE80211_STA_CONNECTION_POLL;
 
 	if (already)
-		goto out;
+		return;
 
 	ieee80211_recalc_ps(sdata->local);
 
 	ifmgd->probe_send_count = 0;
 	ieee80211_mgd_probe_ap_send(sdata);
- out:
-	sdata_unlock(sdata);
 }
 
 struct sk_buff *ieee80211_ap_probereq_get(struct ieee80211_hw *hw,
@@ -3283,12 +3269,12 @@ struct sk_buff *ieee80211_ap_probereq_get(struct ieee80211_hw *hw,
 	const struct element *ssid;
 	int ssid_len;
 
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
+
 	if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION ||
 		    ieee80211_vif_is_mld(&sdata->vif)))
 		return NULL;
 
-	sdata_assert_lock(sdata);
-
 	if (ifmgd->associated)
 		cbss = sdata->deflink.u.mgd.bss;
 	else if (ifmgd->auth_data)
@@ -3335,7 +3321,7 @@ static void ieee80211_report_disconnect(struct ieee80211_sub_if_data *sdata,
 	drv_event_callback(sdata->local, sdata, &event);
 }
 
-static void ___ieee80211_disconnect(struct ieee80211_sub_if_data *sdata)
+static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata)
 {
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
@@ -3394,13 +3380,6 @@ static void ___ieee80211_disconnect(struct ieee80211_sub_if_data *sdata)
 	ifmgd->reconnect = false;
 }
 
-static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata)
-{
-	sdata_lock(sdata);
-	___ieee80211_disconnect(sdata);
-	sdata_unlock(sdata);
-}
-
 static void ieee80211_beacon_connection_loss_work(struct wiphy *wiphy,
 						  struct wiphy_work *work)
 {
@@ -3482,7 +3461,6 @@ static void ieee80211_destroy_auth_data(struct ieee80211_sub_if_data *sdata,
 {
 	struct ieee80211_mgd_auth_data *auth_data = sdata->u.mgd.auth_data;
 
-	sdata_assert_lock(sdata);
 	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
 	if (!assoc) {
@@ -3522,7 +3500,6 @@ static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata,
 {
 	struct ieee80211_mgd_assoc_data *assoc_data = sdata->u.mgd.assoc_data;
 
-	sdata_assert_lock(sdata);
 	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
 	if (status != ASSOC_SUCCESS) {
@@ -3638,7 +3615,7 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
 		.subtype = IEEE80211_STYPE_AUTH,
 	};
 
-	sdata_assert_lock(sdata);
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
 	if (len < 24 + 6)
 		return;
@@ -3796,7 +3773,7 @@ static void ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
 	u16 reason_code = le16_to_cpu(mgmt->u.deauth.reason_code);
 
-	sdata_assert_lock(sdata);
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
 	if (len < 24 + 2)
 		return;
@@ -3840,7 +3817,7 @@ static void ieee80211_rx_mgmt_disassoc(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
 	u16 reason_code;
 
-	sdata_assert_lock(sdata);
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
 	if (len < 24 + 2)
 		return;
@@ -5253,7 +5230,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
 	u8 ap_mld_addr[ETH_ALEN] __aligned(2);
 	unsigned int link_id;
 
-	sdata_assert_lock(sdata);
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
 	if (!assoc_data)
 		return;
@@ -5453,7 +5430,7 @@ static void ieee80211_rx_bss_info(struct ieee80211_link_data *link,
 	struct ieee80211_bss *bss;
 	struct ieee80211_channel *channel;
 
-	sdata_assert_lock(sdata);
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
 	channel = ieee80211_get_channel_khz(local->hw.wiphy,
 					ieee80211_rx_status_to_khz(rx_status));
@@ -5480,7 +5457,7 @@ static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_link_data *link,
 
 	ifmgd = &sdata->u.mgd;
 
-	sdata_assert_lock(sdata);
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
 	/*
 	 * According to Draft P802.11ax D6.0 clause 26.17.2.3.2:
@@ -5691,21 +5668,16 @@ static void ieee80211_ml_reconf_work(struct wiphy *wiphy,
 	u16 new_valid_links, new_active_links, new_dormant_links;
 	int ret;
 
-	sdata_lock(sdata);
-	if (!sdata->u.mgd.removed_links) {
-		sdata_unlock(sdata);
+	if (!sdata->u.mgd.removed_links)
 		return;
-	}
 
 	sdata_info(sdata,
 		   "MLO Reconfiguration: work: valid=0x%x, removed=0x%x\n",
 		   sdata->vif.valid_links, sdata->u.mgd.removed_links);
 
 	new_valid_links = sdata->vif.valid_links & ~sdata->u.mgd.removed_links;
-	if (new_valid_links == sdata->vif.valid_links) {
-		sdata_unlock(sdata);
+	if (new_valid_links == sdata->vif.valid_links)
 		return;
-	}
 
 	if (!new_valid_links ||
 	    !(new_valid_links & ~sdata->vif.dormant_links)) {
@@ -5721,8 +5693,7 @@ static void ieee80211_ml_reconf_work(struct wiphy *wiphy,
 				BIT(ffs(new_valid_links &
 					~sdata->vif.dormant_links) - 1);
 
-		ret = __ieee80211_set_active_links(&sdata->vif,
-						   new_active_links);
+		ret = ieee80211_set_active_links(&sdata->vif, new_active_links);
 		if (ret) {
 			sdata_info(sdata,
 				   "Failed setting active links\n");
@@ -5741,11 +5712,9 @@ out:
 	if (!ret)
 		cfg80211_links_removed(sdata->dev, sdata->u.mgd.removed_links);
 	else
-		___ieee80211_disconnect(sdata);
+		__ieee80211_disconnect(sdata);
 
 	sdata->u.mgd.removed_links = 0;
-
-	sdata_unlock(sdata);
 }
 
 static void ieee80211_ml_reconfiguration(struct ieee80211_sub_if_data *sdata,
@@ -5873,7 +5842,6 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link,
 		.from_ap = true,
 	};
 
-	sdata_assert_lock(sdata);
 	lockdep_assert_wiphy(local->hw.wiphy);
 
 	/* Process beacon from the current BSS */
@@ -6183,17 +6151,17 @@ void ieee80211_sta_rx_queued_ext(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_hdr *hdr;
 	u16 fc;
 
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
+
 	rx_status = (struct ieee80211_rx_status *) skb->cb;
 	hdr = (struct ieee80211_hdr *) skb->data;
 	fc = le16_to_cpu(hdr->frame_control);
 
-	sdata_lock(sdata);
 	switch (fc & IEEE80211_FCTL_STYPE) {
 	case IEEE80211_STYPE_S1G_BEACON:
 		ieee80211_rx_mgmt_beacon(link, hdr, skb->len, rx_status);
 		break;
 	}
-	sdata_unlock(sdata);
 }
 
 void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
@@ -6205,17 +6173,17 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
 	u16 fc;
 	int ies_len;
 
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
+
 	rx_status = (struct ieee80211_rx_status *) skb->cb;
 	mgmt = (struct ieee80211_mgmt *) skb->data;
 	fc = le16_to_cpu(mgmt->frame_control);
 
-	sdata_lock(sdata);
-
 	if (rx_status->link_valid) {
 		link = sdata_dereference(sdata->link[rx_status->link_id],
 					 sdata);
 		if (!link)
-			goto out;
+			return;
 	}
 
 	switch (fc & IEEE80211_FCTL_STYPE) {
@@ -6298,8 +6266,6 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
 		}
 		break;
 	}
-out:
-	sdata_unlock(sdata);
 }
 
 static void ieee80211_sta_timer(struct timer_list *t)
@@ -6334,7 +6300,7 @@ static int ieee80211_auth(struct ieee80211_sub_if_data *sdata)
 		.subtype = IEEE80211_STYPE_AUTH,
 	};
 
-	sdata_assert_lock(sdata);
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
 	if (WARN_ON_ONCE(!auth_data))
 		return -EINVAL;
@@ -6403,7 +6369,7 @@ static int ieee80211_do_assoc(struct ieee80211_sub_if_data *sdata)
 	struct ieee80211_local *local = sdata->local;
 	int ret;
 
-	sdata_assert_lock(sdata);
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
 	assoc_data->tries++;
 	if (assoc_data->tries > IEEE80211_ASSOC_MAX_TRIES) {
@@ -6459,7 +6425,7 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata)
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
 
-	sdata_lock(sdata);
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
 	if (ifmgd->status_received) {
 		__le16 fc = ifmgd->status_fc;
@@ -6594,8 +6560,6 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata)
 				WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, false);
 		}
 	}
-
-	sdata_unlock(sdata);
 }
 
 static void ieee80211_sta_bcn_mon_timer(struct timer_list *t)
@@ -6682,7 +6646,7 @@ void ieee80211_mgd_quiesce(struct ieee80211_sub_if_data *sdata)
 	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
 	u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN];
 
-	sdata_lock(sdata);
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
 	if (ifmgd->auth_data || ifmgd->assoc_data) {
 		const u8 *ap_addr = ifmgd->auth_data ?
@@ -6734,8 +6698,6 @@ void ieee80211_mgd_quiesce(struct ieee80211_sub_if_data *sdata)
 		memcpy(bssid, sdata->vif.cfg.ap_addr, ETH_ALEN);
 		ieee80211_mgd_deauth(sdata, &req);
 	}
-
-	sdata_unlock(sdata);
 }
 #endif
 
@@ -6743,11 +6705,10 @@ void ieee80211_sta_restart(struct ieee80211_sub_if_data *sdata)
 {
 	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
 
-	sdata_lock(sdata);
-	if (!ifmgd->associated) {
-		sdata_unlock(sdata);
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
+
+	if (!ifmgd->associated)
 		return;
-	}
 
 	if (sdata->flags & IEEE80211_SDATA_DISCONNECT_RESUME) {
 		sdata->flags &= ~IEEE80211_SDATA_DISCONNECT_RESUME;
@@ -6755,7 +6716,6 @@ void ieee80211_sta_restart(struct ieee80211_sub_if_data *sdata)
 		ieee80211_sta_connection_lost(sdata,
 					      WLAN_REASON_UNSPECIFIED,
 					      true);
-		sdata_unlock(sdata);
 		return;
 	}
 
@@ -6765,11 +6725,8 @@ void ieee80211_sta_restart(struct ieee80211_sub_if_data *sdata)
 		ieee80211_sta_connection_lost(sdata,
 					      WLAN_REASON_UNSPECIFIED,
 					      true);
-		sdata_unlock(sdata);
 		return;
 	}
-
-	sdata_unlock(sdata);
 }
 
 static void ieee80211_request_smps_mgd_work(struct wiphy *wiphy,
@@ -6779,10 +6736,8 @@ static void ieee80211_request_smps_mgd_work(struct wiphy *wiphy,
 		container_of(work, struct ieee80211_link_data,
 			     u.mgd.request_smps_work);
 
-	sdata_lock(link->sdata);
 	__ieee80211_request_smps_mgd(link->sdata, link,
 				     link->u.mgd.driver_smps_mode);
-	sdata_unlock(link->sdata);
 }
 
 /* interface setup */
@@ -7830,7 +7785,6 @@ void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata)
 	wiphy_delayed_work_cancel(sdata->local->hw.wiphy,
 				  &ifmgd->ml_reconf_work);
 
-	sdata_lock(sdata);
 	if (ifmgd->assoc_data)
 		ieee80211_destroy_assoc_data(sdata, ASSOC_TIMEOUT);
 	if (ifmgd->auth_data)
@@ -7846,7 +7800,6 @@ void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata)
 	ifmgd->assoc_req_ies_len = 0;
 	spin_unlock_bh(&ifmgd->teardown_lock);
 	del_timer_sync(&ifmgd->timer);
-	sdata_unlock(sdata);
 }
 
 void ieee80211_cqm_rssi_notify(struct ieee80211_vif *vif,
diff --git a/net/mac80211/ocb.c b/net/mac80211/ocb.c
index 7661e96454b2..6e2965ffb809 100644
--- a/net/mac80211/ocb.c
+++ b/net/mac80211/ocb.c
@@ -124,11 +124,11 @@ void ieee80211_ocb_work(struct ieee80211_sub_if_data *sdata)
 	struct ieee80211_if_ocb *ifocb = &sdata->u.ocb;
 	struct sta_info *sta;
 
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
+
 	if (ifocb->joined != true)
 		return;
 
-	sdata_lock(sdata);
-
 	spin_lock_bh(&ifocb->incomplete_lock);
 	while (!list_empty(&ifocb->incomplete_stations)) {
 		sta = list_first_entry(&ifocb->incomplete_stations,
@@ -144,8 +144,6 @@ void ieee80211_ocb_work(struct ieee80211_sub_if_data *sdata)
 
 	if (test_and_clear_bit(OCB_WORK_HOUSEKEEPING, &ifocb->wrkq_flags))
 		ieee80211_ocb_housekeeping(sdata);
-
-	sdata_unlock(sdata);
 }
 
 static void ieee80211_ocb_housekeeping_timer(struct timer_list *t)
diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c
index 8325fbb1645e..be377ed12baa 100644
--- a/net/mac80211/offchannel.c
+++ b/net/mac80211/offchannel.c
@@ -826,13 +826,11 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
 		break;
 	case NL80211_IFTYPE_STATION:
 	case NL80211_IFTYPE_P2P_CLIENT:
-		sdata_lock(sdata);
 		if (!sdata->u.mgd.associated ||
 		    (params->offchan && params->wait &&
 		     local->ops->remain_on_channel &&
 		     memcmp(sdata->vif.cfg.ap_addr, mgmt->bssid, ETH_ALEN)))
 			need_offchan = true;
-		sdata_unlock(sdata);
 		break;
 	case NL80211_IFTYPE_P2P_DEVICE:
 		need_offchan = true;
diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c
index 9bcb0c2bba7d..ba14f570cda7 100644
--- a/net/mac80211/tdls.c
+++ b/net/mac80211/tdls.c
@@ -1465,22 +1465,18 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev,
 	/* protect possible bss_conf changes and avoid concurrency in
 	 * ieee80211_bss_info_change_notify()
 	 */
-	sdata_lock(sdata);
 	tdls_dbg(sdata, "TDLS oper %d peer %pM\n", oper, peer);
 
 	switch (oper) {
 	case NL80211_TDLS_ENABLE_LINK:
 		if (sdata->vif.bss_conf.csa_active) {
 			tdls_dbg(sdata, "TDLS: disallow link during CSA\n");
-			ret = -EBUSY;
-			break;
+			return -EBUSY;
 		}
 
 		sta = sta_info_get(sdata, peer);
-		if (!sta) {
-			ret = -ENOLINK;
-			break;
-		}
+		if (!sta)
+			return -ENOLINK;
 
 		iee80211_tdls_recalc_chanctx(sdata, sta);
 		iee80211_tdls_recalc_ht_protection(sdata, sta);
@@ -1489,7 +1485,6 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev,
 
 		WARN_ON_ONCE(is_zero_ether_addr(sdata->u.mgd.tdls_peer) ||
 			     !ether_addr_equal(sdata->u.mgd.tdls_peer, peer));
-		ret = 0;
 		break;
 	case NL80211_TDLS_DISABLE_LINK:
 		/*
@@ -1511,24 +1506,23 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev,
 		iee80211_tdls_recalc_ht_protection(sdata, NULL);
 
 		iee80211_tdls_recalc_chanctx(sdata, NULL);
+		if (ret)
+			return ret;
 		break;
 	default:
-		ret = -ENOTSUPP;
-		break;
+		return -ENOTSUPP;
 	}
 
-	if (ret == 0 && ether_addr_equal(sdata->u.mgd.tdls_peer, peer)) {
+	if (ether_addr_equal(sdata->u.mgd.tdls_peer, peer)) {
 		wiphy_delayed_work_cancel(sdata->local->hw.wiphy,
 					  &sdata->u.mgd.tdls_peer_del_work);
 		eth_zero_addr(sdata->u.mgd.tdls_peer);
 	}
 
-	if (ret == 0)
-		wiphy_work_queue(sdata->local->hw.wiphy,
-				 &sdata->deflink.u.mgd.request_smps_work);
+	wiphy_work_queue(sdata->local->hw.wiphy,
+			 &sdata->deflink.u.mgd.request_smps_work);
 
-	sdata_unlock(sdata);
-	return ret;
+	return 0;
 }
 
 void ieee80211_tdls_oper_request(struct ieee80211_vif *vif, const u8 *peer,
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 7833043b0a4e..4aefb9483aa9 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -2664,7 +2664,6 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 		if (!ieee80211_sdata_running(sdata))
 			continue;
 
-		sdata_lock(sdata);
 		if (ieee80211_vif_is_mld(&sdata->vif)) {
 			struct ieee80211_bss_conf *old[IEEE80211_MLD_MAX_NUM_LINKS] = {
 				[0] = &sdata->vif.bss_conf,
@@ -2796,7 +2795,6 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 		case NL80211_IFTYPE_NAN:
 			res = ieee80211_reconfig_nan(sdata);
 			if (res < 0) {
-				sdata_unlock(sdata);
 				ieee80211_handle_reconfig_failure(local);
 				return res;
 			}
@@ -2814,7 +2812,6 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 			WARN_ON(1);
 			break;
 		}
-		sdata_unlock(sdata);
 
 		if (active_links)
 			ieee80211_set_active_links(&sdata->vif, active_links);
@@ -2844,7 +2841,6 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 		if (!ieee80211_sdata_running(sdata))
 			continue;
 
-		sdata_lock(sdata);
 		switch (sdata->vif.type) {
 		case NL80211_IFTYPE_AP_VLAN:
 		case NL80211_IFTYPE_AP:
@@ -2853,7 +2849,6 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 		default:
 			break;
 		}
-		sdata_unlock(sdata);
 	}
 
 	/* add back keys */
diff --git a/net/wireless/ap.c b/net/wireless/ap.c
index 0962770303b2..9a9a870806f5 100644
--- a/net/wireless/ap.c
+++ b/net/wireless/ap.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
  * Parts of this file are
- * Copyright (C) 2022 Intel Corporation
+ * Copyright (C) 2022-2023 Intel Corporation
  */
 #include <linux/ieee80211.h>
 #include <linux/export.h>
@@ -18,7 +18,7 @@ static int ___cfg80211_stop_ap(struct cfg80211_registered_device *rdev,
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	int err;
 
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	if (!rdev->ops->stop_ap)
 		return -EOPNOTSUPP;
@@ -52,9 +52,9 @@ static int ___cfg80211_stop_ap(struct cfg80211_registered_device *rdev,
 	return err;
 }
 
-int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev,
-		       struct net_device *dev, int link_id,
-		       bool notify)
+int cfg80211_stop_ap(struct cfg80211_registered_device *rdev,
+		     struct net_device *dev, int link_id,
+		     bool notify)
 {
 	unsigned int link;
 	int ret = 0;
@@ -72,17 +72,3 @@ int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev,
 
 	return ret;
 }
-
-int cfg80211_stop_ap(struct cfg80211_registered_device *rdev,
-		     struct net_device *dev, int link_id,
-		     bool notify)
-{
-	struct wireless_dev *wdev = dev->ieee80211_ptr;
-	int err;
-
-	wdev_lock(wdev);
-	err = __cfg80211_stop_ap(rdev, dev, link_id, notify);
-	wdev_unlock(wdev);
-
-	return err;
-}
diff --git a/net/wireless/chan.c b/net/wireless/chan.c
index b2469e2c1e70..2af3aaee7493 100644
--- a/net/wireless/chan.c
+++ b/net/wireless/chan.c
@@ -713,7 +713,7 @@ bool cfg80211_beaconing_iface_active(struct wireless_dev *wdev)
 {
 	unsigned int link;
 
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	switch (wdev->iftype) {
 	case NL80211_IFTYPE_AP:
@@ -782,18 +782,14 @@ static bool cfg80211_is_wiphy_oper_chan(struct wiphy *wiphy,
 {
 	struct wireless_dev *wdev;
 
+	lockdep_assert_wiphy(wiphy);
+
 	list_for_each_entry(wdev, &wiphy->wdev_list, list) {
-		wdev_lock(wdev);
-		if (!cfg80211_beaconing_iface_active(wdev)) {
-			wdev_unlock(wdev);
+		if (!cfg80211_beaconing_iface_active(wdev))
 			continue;
-		}
 
-		if (cfg80211_wdev_on_sub_chan(wdev, chan, false)) {
-			wdev_unlock(wdev);
+		if (cfg80211_wdev_on_sub_chan(wdev, chan, false))
 			return true;
-		}
-		wdev_unlock(wdev);
 	}
 
 	return false;
@@ -1325,10 +1321,7 @@ static bool cfg80211_ir_permissive_chan(struct wiphy *wiphy,
 	list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
 		bool ret;
 
-		wdev_lock(wdev);
 		ret = cfg80211_ir_permissive_check_wdev(iftype, wdev, chan);
-		wdev_unlock(wdev);
-
 		if (ret)
 			return ret;
 	}
@@ -1437,17 +1430,10 @@ EXPORT_SYMBOL(cfg80211_any_usable_channels);
 struct cfg80211_chan_def *wdev_chandef(struct wireless_dev *wdev,
 				       unsigned int link_id)
 {
-	/*
-	 * We need to sort out the locking here - in some cases
-	 * where we get here we really just don't care (yet)
-	 * about the valid links, but in others we do. But we
-	 * get here with various driver cases, so we cannot
-	 * easily require the wdev mutex.
-	 */
-	if (link_id || wdev->valid_links & BIT(0)) {
-		ASSERT_WDEV_LOCK(wdev);
-		WARN_ON(!(wdev->valid_links & BIT(link_id)));
-	}
+	lockdep_assert_wiphy(wdev->wiphy);
+
+	WARN_ON(wdev->valid_links && !(wdev->valid_links & BIT(link_id)));
+	WARN_ON(!wdev->valid_links && link_id > 0);
 
 	switch (wdev->iftype) {
 	case NL80211_IFTYPE_MESH_POINT:
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 88042a647aaa..c419177278da 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -1278,14 +1278,13 @@ void cfg80211_update_iface_num(struct cfg80211_registered_device *rdev,
 		rdev->num_running_monitor_ifaces += num;
 }
 
-void __cfg80211_leave(struct cfg80211_registered_device *rdev,
-		      struct wireless_dev *wdev)
+void cfg80211_leave(struct cfg80211_registered_device *rdev,
+		    struct wireless_dev *wdev)
 {
 	struct net_device *dev = wdev->netdev;
 	struct cfg80211_sched_scan_request *pos, *tmp;
 
 	lockdep_assert_held(&rdev->wiphy.mtx);
-	ASSERT_WDEV_LOCK(wdev);
 
 	cfg80211_pmsr_wdev_down(wdev);
 
@@ -1293,7 +1292,7 @@ void __cfg80211_leave(struct cfg80211_registered_device *rdev,
 
 	switch (wdev->iftype) {
 	case NL80211_IFTYPE_ADHOC:
-		__cfg80211_leave_ibss(rdev, dev, true);
+		cfg80211_leave_ibss(rdev, dev, true);
 		break;
 	case NL80211_IFTYPE_P2P_CLIENT:
 	case NL80211_IFTYPE_STATION:
@@ -1313,14 +1312,14 @@ void __cfg80211_leave(struct cfg80211_registered_device *rdev,
 				    WLAN_REASON_DEAUTH_LEAVING, true);
 		break;
 	case NL80211_IFTYPE_MESH_POINT:
-		__cfg80211_leave_mesh(rdev, dev);
+		cfg80211_leave_mesh(rdev, dev);
 		break;
 	case NL80211_IFTYPE_AP:
 	case NL80211_IFTYPE_P2P_GO:
-		__cfg80211_stop_ap(rdev, dev, -1, true);
+		cfg80211_stop_ap(rdev, dev, -1, true);
 		break;
 	case NL80211_IFTYPE_OCB:
-		__cfg80211_leave_ocb(rdev, dev);
+		cfg80211_leave_ocb(rdev, dev);
 		break;
 	case NL80211_IFTYPE_P2P_DEVICE:
 	case NL80211_IFTYPE_NAN:
@@ -1338,14 +1337,6 @@ void __cfg80211_leave(struct cfg80211_registered_device *rdev,
 	}
 }
 
-void cfg80211_leave(struct cfg80211_registered_device *rdev,
-		    struct wireless_dev *wdev)
-{
-	wdev_lock(wdev);
-	__cfg80211_leave(rdev, wdev);
-	wdev_unlock(wdev);
-}
-
 void cfg80211_stop_iface(struct wiphy *wiphy, struct wireless_dev *wdev,
 			 gfp_t gfp)
 {
@@ -1370,7 +1361,6 @@ EXPORT_SYMBOL(cfg80211_stop_iface);
 
 void cfg80211_init_wdev(struct wireless_dev *wdev)
 {
-	mutex_init(&wdev->mtx);
 	INIT_LIST_HEAD(&wdev->event_list);
 	spin_lock_init(&wdev->event_lock);
 	INIT_LIST_HEAD(&wdev->mgmt_registrations);
@@ -1533,7 +1523,6 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
 	case NETDEV_UP:
 		wiphy_lock(&rdev->wiphy);
 		cfg80211_update_iface_num(rdev, wdev->iftype, 1);
-		wdev_lock(wdev);
 		switch (wdev->iftype) {
 #ifdef CONFIG_CFG80211_WEXT
 		case NL80211_IFTYPE_ADHOC:
@@ -1563,7 +1552,6 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
 		default:
 			break;
 		}
-		wdev_unlock(wdev);
 		rdev->opencount++;
 
 		/*
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 5dc76ea3b84e..98f41d9d2ba7 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -235,28 +235,6 @@ void cfg80211_init_wdev(struct wireless_dev *wdev);
 void cfg80211_register_wdev(struct cfg80211_registered_device *rdev,
 			    struct wireless_dev *wdev);
 
-static inline void wdev_lock(struct wireless_dev *wdev)
-	__acquires(wdev)
-{
-	lockdep_assert_held(&wdev->wiphy->mtx);
-	mutex_lock(&wdev->mtx);
-	__acquire(wdev->mtx);
-}
-
-static inline void wdev_unlock(struct wireless_dev *wdev)
-	__releases(wdev)
-{
-	lockdep_assert_held(&wdev->wiphy->mtx);
-	__release(wdev->mtx);
-	mutex_unlock(&wdev->mtx);
-}
-
-static inline void ASSERT_WDEV_LOCK(struct wireless_dev *wdev)
-{
-	lockdep_assert_held(&wdev->wiphy->mtx);
-	lockdep_assert_held(&wdev->mtx);
-}
-
 static inline bool cfg80211_has_monitors_only(struct cfg80211_registered_device *rdev)
 {
 	lockdep_assert_held(&rdev->wiphy.mtx);
@@ -340,8 +318,6 @@ int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
 			 struct cfg80211_ibss_params *params,
 			 struct cfg80211_cached_keys *connkeys);
 void cfg80211_clear_ibss(struct net_device *dev, bool nowext);
-int __cfg80211_leave_ibss(struct cfg80211_registered_device *rdev,
-			  struct net_device *dev, bool nowext);
 int cfg80211_leave_ibss(struct cfg80211_registered_device *rdev,
 			struct net_device *dev, bool nowext);
 void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid,
@@ -356,8 +332,6 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev,
 			 struct net_device *dev,
 			 struct mesh_setup *setup,
 			 const struct mesh_config *conf);
-int __cfg80211_leave_mesh(struct cfg80211_registered_device *rdev,
-			  struct net_device *dev);
 int cfg80211_leave_mesh(struct cfg80211_registered_device *rdev,
 			struct net_device *dev);
 int cfg80211_set_mesh_channel(struct cfg80211_registered_device *rdev,
@@ -365,21 +339,13 @@ int cfg80211_set_mesh_channel(struct cfg80211_registered_device *rdev,
 			      struct cfg80211_chan_def *chandef);
 
 /* OCB */
-int __cfg80211_join_ocb(struct cfg80211_registered_device *rdev,
-			struct net_device *dev,
-			struct ocb_setup *setup);
 int cfg80211_join_ocb(struct cfg80211_registered_device *rdev,
 		      struct net_device *dev,
 		      struct ocb_setup *setup);
-int __cfg80211_leave_ocb(struct cfg80211_registered_device *rdev,
-			 struct net_device *dev);
 int cfg80211_leave_ocb(struct cfg80211_registered_device *rdev,
 		       struct net_device *dev);
 
 /* AP */
-int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev,
-		       struct net_device *dev, int link,
-		       bool notify);
 int cfg80211_stop_ap(struct cfg80211_registered_device *rdev,
 		     struct net_device *dev, int link,
 		     bool notify);
@@ -557,8 +523,6 @@ int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev,
 void cfg80211_update_iface_num(struct cfg80211_registered_device *rdev,
 			       enum nl80211_iftype iftype, int num);
 
-void __cfg80211_leave(struct cfg80211_registered_device *rdev,
-		      struct wireless_dev *wdev);
 void cfg80211_leave(struct cfg80211_registered_device *rdev,
 		    struct wireless_dev *wdev);
 
diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c
index e6fdb0b8187d..9f02ee5f08be 100644
--- a/net/wireless/ibss.c
+++ b/net/wireless/ibss.c
@@ -3,7 +3,7 @@
  * Some IBSS support code for cfg80211.
  *
  * Copyright 2009	Johannes Berg <johannes@sipsolutions.net>
- * Copyright (C) 2020-2022 Intel Corporation
+ * Copyright (C) 2020-2023 Intel Corporation
  */
 
 #include <linux/etherdevice.h>
@@ -93,7 +93,6 @@ int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
 	int err;
 
 	lockdep_assert_held(&rdev->wiphy.mtx);
-	ASSERT_WDEV_LOCK(wdev);
 
 	if (wdev->u.ibss.ssid_len)
 		return -EALREADY;
@@ -151,13 +150,13 @@ int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
 	return 0;
 }
 
-static void __cfg80211_clear_ibss(struct net_device *dev, bool nowext)
+void cfg80211_clear_ibss(struct net_device *dev, bool nowext)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
 	int i;
 
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	kfree_sensitive(wdev->connect_keys);
 	wdev->connect_keys = NULL;
@@ -187,22 +186,13 @@ static void __cfg80211_clear_ibss(struct net_device *dev, bool nowext)
 	cfg80211_sched_dfs_chan_update(rdev);
 }
 
-void cfg80211_clear_ibss(struct net_device *dev, bool nowext)
-{
-	struct wireless_dev *wdev = dev->ieee80211_ptr;
-
-	wdev_lock(wdev);
-	__cfg80211_clear_ibss(dev, nowext);
-	wdev_unlock(wdev);
-}
-
-int __cfg80211_leave_ibss(struct cfg80211_registered_device *rdev,
-			  struct net_device *dev, bool nowext)
+int cfg80211_leave_ibss(struct cfg80211_registered_device *rdev,
+			struct net_device *dev, bool nowext)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	int err;
 
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	if (!wdev->u.ibss.ssid_len)
 		return -ENOLINK;
@@ -213,24 +203,11 @@ int __cfg80211_leave_ibss(struct cfg80211_registered_device *rdev,
 		return err;
 
 	wdev->conn_owner_nlportid = 0;
-	__cfg80211_clear_ibss(dev, nowext);
+	cfg80211_clear_ibss(dev, nowext);
 
 	return 0;
 }
 
-int cfg80211_leave_ibss(struct cfg80211_registered_device *rdev,
-			struct net_device *dev, bool nowext)
-{
-	struct wireless_dev *wdev = dev->ieee80211_ptr;
-	int err;
-
-	wdev_lock(wdev);
-	err = __cfg80211_leave_ibss(rdev, dev, nowext);
-	wdev_unlock(wdev);
-
-	return err;
-}
-
 #ifdef CONFIG_CFG80211_WEXT
 int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev,
 			    struct wireless_dev *wdev)
@@ -239,7 +216,7 @@ int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev,
 	enum nl80211_band band;
 	int i, err;
 
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	if (!wdev->wext.ibss.beacon_interval)
 		wdev->wext.ibss.beacon_interval = 100;
@@ -336,11 +313,9 @@ int cfg80211_ibss_wext_siwfreq(struct net_device *dev,
 	if (wdev->wext.ibss.chandef.chan == chan)
 		return 0;
 
-	wdev_lock(wdev);
 	err = 0;
 	if (wdev->u.ibss.ssid_len)
-		err = __cfg80211_leave_ibss(rdev, dev, true);
-	wdev_unlock(wdev);
+		err = cfg80211_leave_ibss(rdev, dev, true);
 
 	if (err)
 		return err;
@@ -354,11 +329,7 @@ int cfg80211_ibss_wext_siwfreq(struct net_device *dev,
 		wdev->wext.ibss.channel_fixed = false;
 	}
 
-	wdev_lock(wdev);
-	err = cfg80211_ibss_wext_join(rdev, wdev);
-	wdev_unlock(wdev);
-
-	return err;
+	return cfg80211_ibss_wext_join(rdev, wdev);
 }
 
 int cfg80211_ibss_wext_giwfreq(struct net_device *dev,
@@ -372,12 +343,10 @@ int cfg80211_ibss_wext_giwfreq(struct net_device *dev,
 	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC))
 		return -EINVAL;
 
-	wdev_lock(wdev);
 	if (wdev->u.ibss.current_bss)
 		chan = wdev->u.ibss.current_bss->pub.channel;
 	else if (wdev->wext.ibss.chandef.chan)
 		chan = wdev->wext.ibss.chandef.chan;
-	wdev_unlock(wdev);
 
 	if (chan) {
 		freq->m = chan->center_freq;
@@ -405,11 +374,9 @@ int cfg80211_ibss_wext_siwessid(struct net_device *dev,
 	if (!rdev->ops->join_ibss)
 		return -EOPNOTSUPP;
 
-	wdev_lock(wdev);
 	err = 0;
 	if (wdev->u.ibss.ssid_len)
-		err = __cfg80211_leave_ibss(rdev, dev, true);
-	wdev_unlock(wdev);
+		err = cfg80211_leave_ibss(rdev, dev, true);
 
 	if (err)
 		return err;
@@ -422,11 +389,7 @@ int cfg80211_ibss_wext_siwessid(struct net_device *dev,
 	wdev->wext.ibss.ssid = wdev->u.ibss.ssid;
 	wdev->wext.ibss.ssid_len = len;
 
-	wdev_lock(wdev);
-	err = cfg80211_ibss_wext_join(rdev, wdev);
-	wdev_unlock(wdev);
-
-	return err;
+	return cfg80211_ibss_wext_join(rdev, wdev);
 }
 
 int cfg80211_ibss_wext_giwessid(struct net_device *dev,
@@ -441,7 +404,6 @@ int cfg80211_ibss_wext_giwessid(struct net_device *dev,
 
 	data->flags = 0;
 
-	wdev_lock(wdev);
 	if (wdev->u.ibss.ssid_len) {
 		data->flags = 1;
 		data->length = wdev->u.ibss.ssid_len;
@@ -451,7 +413,6 @@ int cfg80211_ibss_wext_giwessid(struct net_device *dev,
 		data->length = wdev->wext.ibss.ssid_len;
 		memcpy(ssid, wdev->wext.ibss.ssid, data->length);
 	}
-	wdev_unlock(wdev);
 
 	return 0;
 }
@@ -491,11 +452,9 @@ int cfg80211_ibss_wext_siwap(struct net_device *dev,
 	    ether_addr_equal(bssid, wdev->wext.ibss.bssid))
 		return 0;
 
-	wdev_lock(wdev);
 	err = 0;
 	if (wdev->u.ibss.ssid_len)
-		err = __cfg80211_leave_ibss(rdev, dev, true);
-	wdev_unlock(wdev);
+		err = cfg80211_leave_ibss(rdev, dev, true);
 
 	if (err)
 		return err;
@@ -506,11 +465,7 @@ int cfg80211_ibss_wext_siwap(struct net_device *dev,
 	} else
 		wdev->wext.ibss.bssid = NULL;
 
-	wdev_lock(wdev);
-	err = cfg80211_ibss_wext_join(rdev, wdev);
-	wdev_unlock(wdev);
-
-	return err;
+	return cfg80211_ibss_wext_join(rdev, wdev);
 }
 
 int cfg80211_ibss_wext_giwap(struct net_device *dev,
@@ -525,7 +480,6 @@ int cfg80211_ibss_wext_giwap(struct net_device *dev,
 
 	ap_addr->sa_family = ARPHRD_ETHER;
 
-	wdev_lock(wdev);
 	if (wdev->u.ibss.current_bss)
 		memcpy(ap_addr->sa_data, wdev->u.ibss.current_bss->pub.bssid,
 		       ETH_ALEN);
@@ -534,8 +488,6 @@ int cfg80211_ibss_wext_giwap(struct net_device *dev,
 	else
 		eth_zero_addr(ap_addr->sa_data);
 
-	wdev_unlock(wdev);
-
 	return 0;
 }
 #endif
diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c
index 59a3c5c092b1..dc75abdb8f2e 100644
--- a/net/wireless/mesh.c
+++ b/net/wireless/mesh.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
  * Portions
- * Copyright (C) 2022 Intel Corporation
+ * Copyright (C) 2022-2023 Intel Corporation
  */
 #include <linux/ieee80211.h>
 #include <linux/export.h>
@@ -109,7 +109,7 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev,
 
 	BUILD_BUG_ON(IEEE80211_MAX_SSID_LEN != IEEE80211_MAX_MESH_ID_LEN);
 
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT)
 		return -EOPNOTSUPP;
@@ -257,13 +257,13 @@ int cfg80211_set_mesh_channel(struct cfg80211_registered_device *rdev,
 	return 0;
 }
 
-int __cfg80211_leave_mesh(struct cfg80211_registered_device *rdev,
-			  struct net_device *dev)
+int cfg80211_leave_mesh(struct cfg80211_registered_device *rdev,
+			struct net_device *dev)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	int err;
 
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT)
 		return -EOPNOTSUPP;
@@ -287,16 +287,3 @@ int __cfg80211_leave_mesh(struct cfg80211_registered_device *rdev,
 
 	return err;
 }
-
-int cfg80211_leave_mesh(struct cfg80211_registered_device *rdev,
-			struct net_device *dev)
-{
-	struct wireless_dev *wdev = dev->ieee80211_ptr;
-	int err;
-
-	wdev_lock(wdev);
-	err = __cfg80211_leave_mesh(rdev, dev);
-	wdev_unlock(wdev);
-
-	return err;
-}
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index 775cac4d6100..cc7ae9ea84ea 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -4,7 +4,7 @@
  *
  * Copyright (c) 2009, Jouni Malinen <j@w1.fi>
  * Copyright (c) 2015		Intel Deutschland GmbH
- * Copyright (C) 2019-2020, 2022 Intel Corporation
+ * Copyright (C) 2019-2020, 2022-2023 Intel Corporation
  */
 
 #include <linux/kernel.h>
@@ -149,7 +149,7 @@ void cfg80211_rx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len)
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	struct ieee80211_mgmt *mgmt = (void *)buf;
 
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	trace_cfg80211_rx_mlme_mgmt(dev, buf, len);
 
@@ -214,7 +214,7 @@ void cfg80211_tx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len,
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	struct ieee80211_mgmt *mgmt = (void *)buf;
 
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	trace_cfg80211_tx_mlme_mgmt(dev, buf, len, reconnect);
 
@@ -262,7 +262,7 @@ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev,
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	if (!req->bss)
 		return -ENOENT;
@@ -331,7 +331,7 @@ int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev,
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	int err, i, j;
 
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	for (i = 1; i < ARRAY_SIZE(req->links); i++) {
 		if (!req->links[i].bss)
@@ -393,7 +393,7 @@ int cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev,
 		.local_state_change = local_state_change,
 	};
 
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	if (local_state_change &&
 	    (!wdev->connected ||
@@ -423,7 +423,7 @@ int cfg80211_mlme_disassoc(struct cfg80211_registered_device *rdev,
 	};
 	int err;
 
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	if (!wdev->connected)
 		return -ENOTCONN;
@@ -446,7 +446,7 @@ void cfg80211_mlme_down(struct cfg80211_registered_device *rdev,
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	u8 bssid[ETH_ALEN];
 
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	if (!rdev->ops->deauth)
 		return;
@@ -726,6 +726,8 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
 	const struct ieee80211_mgmt *mgmt;
 	u16 stype;
 
+	lockdep_assert_wiphy(&rdev->wiphy);
+
 	if (!wdev->wiphy->mgmt_stypes)
 		return -EOPNOTSUPP;
 
@@ -748,8 +750,6 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
 	    mgmt->u.action.category != WLAN_CATEGORY_PUBLIC) {
 		int err = 0;
 
-		wdev_lock(wdev);
-
 		switch (wdev->iftype) {
 		case NL80211_IFTYPE_ADHOC:
 			/*
@@ -814,7 +814,6 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
 			err = -EOPNOTSUPP;
 			break;
 		}
-		wdev_unlock(wdev);
 
 		if (err)
 			return err;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index f4298104a2f4..71a0a6e34bdb 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -1544,7 +1544,7 @@ nl80211_parse_connkeys(struct cfg80211_registered_device *rdev,
 
 static int nl80211_key_allowed(struct wireless_dev *wdev)
 {
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	switch (wdev->iftype) {
 	case NL80211_IFTYPE_AP:
@@ -3423,13 +3423,8 @@ static int nl80211_set_channel(struct sk_buff *skb, struct genl_info *info)
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	int link_id = nl80211_link_id_or_invalid(info->attrs);
 	struct net_device *netdev = info->user_ptr[1];
-	int ret;
-
-	wdev_lock(netdev->ieee80211_ptr);
-	ret = __nl80211_set_channel(rdev, netdev, info, link_id);
-	wdev_unlock(netdev->ieee80211_ptr);
 
-	return ret;
+	return __nl80211_set_channel(rdev, netdev, info, link_id);
 }
 
 static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
@@ -3536,7 +3531,6 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 			txq_params.link_id =
 				nl80211_link_id_or_invalid(info->attrs);
 
-			wdev_lock(netdev->ieee80211_ptr);
 			if (txq_params.link_id >= 0 &&
 			    !(netdev->ieee80211_ptr->valid_links &
 			      BIT(txq_params.link_id)))
@@ -3547,7 +3541,6 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 			else
 				result = rdev_set_txq_params(rdev, netdev,
 							     &txq_params);
-			wdev_unlock(netdev->ieee80211_ptr);
 			if (result)
 				goto out;
 		}
@@ -3557,12 +3550,10 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 		int link_id = nl80211_link_id_or_invalid(info->attrs);
 
 		if (wdev) {
-			wdev_lock(wdev);
 			result = __nl80211_set_channel(
 				rdev,
 				nl80211_can_set_dev_channel(wdev) ? netdev : NULL,
 				info, link_id);
-			wdev_unlock(wdev);
 		} else {
 			result = __nl80211_set_channel(rdev, netdev, info, link_id);
 		}
@@ -3870,33 +3861,31 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag
 			goto nla_put_failure;
 	}
 
-	wdev_lock(wdev);
 	switch (wdev->iftype) {
 	case NL80211_IFTYPE_AP:
 	case NL80211_IFTYPE_P2P_GO:
 		if (wdev->u.ap.ssid_len &&
 		    nla_put(msg, NL80211_ATTR_SSID, wdev->u.ap.ssid_len,
 			    wdev->u.ap.ssid))
-			goto nla_put_failure_locked;
+			goto nla_put_failure;
 		break;
 	case NL80211_IFTYPE_STATION:
 	case NL80211_IFTYPE_P2P_CLIENT:
 		if (wdev->u.client.ssid_len &&
 		    nla_put(msg, NL80211_ATTR_SSID, wdev->u.client.ssid_len,
 			    wdev->u.client.ssid))
-			goto nla_put_failure_locked;
+			goto nla_put_failure;
 		break;
 	case NL80211_IFTYPE_ADHOC:
 		if (wdev->u.ibss.ssid_len &&
 		    nla_put(msg, NL80211_ATTR_SSID, wdev->u.ibss.ssid_len,
 			    wdev->u.ibss.ssid))
-			goto nla_put_failure_locked;
+			goto nla_put_failure;
 		break;
 	default:
 		/* nothing */
 		break;
 	}
-	wdev_unlock(wdev);
 
 	if (rdev->ops->get_txq_stats) {
 		struct cfg80211_txq_stats txqstats = {};
@@ -3943,8 +3932,6 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag
 	genlmsg_end(msg, hdr);
 	return 0;
 
- nla_put_failure_locked:
-	wdev_unlock(wdev);
  nla_put_failure:
 	genlmsg_cancel(msg, hdr);
 	return -EMSGSIZE;
@@ -4191,7 +4178,6 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
 		if (netif_running(dev))
 			return -EBUSY;
 
-		wdev_lock(wdev);
 		BUILD_BUG_ON(IEEE80211_MAX_SSID_LEN !=
 			     IEEE80211_MAX_MESH_ID_LEN);
 		wdev->u.mesh.id_up_len =
@@ -4199,7 +4185,6 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
 		memcpy(wdev->u.mesh.id,
 		       nla_data(info->attrs[NL80211_ATTR_MESH_ID]),
 		       wdev->u.mesh.id_up_len);
-		wdev_unlock(wdev);
 	}
 
 	if (info->attrs[NL80211_ATTR_4ADDR]) {
@@ -4300,7 +4285,6 @@ static int _nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
 	case NL80211_IFTYPE_MESH_POINT:
 		if (!info->attrs[NL80211_ATTR_MESH_ID])
 			break;
-		wdev_lock(wdev);
 		BUILD_BUG_ON(IEEE80211_MAX_SSID_LEN !=
 			     IEEE80211_MAX_MESH_ID_LEN);
 		wdev->u.mesh.id_up_len =
@@ -4308,7 +4292,6 @@ static int _nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
 		memcpy(wdev->u.mesh.id,
 		       nla_data(info->attrs[NL80211_ATTR_MESH_ID]),
 		       wdev->u.mesh.id_up_len);
-		wdev_unlock(wdev);
 		break;
 	case NL80211_IFTYPE_NAN:
 	case NL80211_IFTYPE_P2P_DEVICE:
@@ -4599,79 +4582,67 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info)
 	    !(key.p.mode == NL80211_KEY_SET_TX))
 		return -EINVAL;
 
-	wdev_lock(wdev);
-
 	if (key.def) {
-		if (!rdev->ops->set_default_key) {
-			err = -EOPNOTSUPP;
-			goto out;
-		}
+		if (!rdev->ops->set_default_key)
+			return -EOPNOTSUPP;
 
 		err = nl80211_key_allowed(wdev);
 		if (err)
-			goto out;
+			return err;
 
 		err = nl80211_validate_key_link_id(info, wdev, link_id, false);
 		if (err)
-			goto out;
+			return err;
 
 		err = rdev_set_default_key(rdev, dev, link_id, key.idx,
 					   key.def_uni, key.def_multi);
 
 		if (err)
-			goto out;
+			return err;
 
 #ifdef CONFIG_CFG80211_WEXT
 		wdev->wext.default_key = key.idx;
 #endif
+		return 0;
 	} else if (key.defmgmt) {
-		if (key.def_uni || !key.def_multi) {
-			err = -EINVAL;
-			goto out;
-		}
+		if (key.def_uni || !key.def_multi)
+			return -EINVAL;
 
-		if (!rdev->ops->set_default_mgmt_key) {
-			err = -EOPNOTSUPP;
-			goto out;
-		}
+		if (!rdev->ops->set_default_mgmt_key)
+			return -EOPNOTSUPP;
 
 		err = nl80211_key_allowed(wdev);
 		if (err)
-			goto out;
+			return err;
 
 		err = nl80211_validate_key_link_id(info, wdev, link_id, false);
 		if (err)
-			goto out;
+			return err;
 
 		err = rdev_set_default_mgmt_key(rdev, dev, link_id, key.idx);
 		if (err)
-			goto out;
+			return err;
 
 #ifdef CONFIG_CFG80211_WEXT
 		wdev->wext.default_mgmt_key = key.idx;
 #endif
+		return 0;
 	} else if (key.defbeacon) {
-		if (key.def_uni || !key.def_multi) {
-			err = -EINVAL;
-			goto out;
-		}
+		if (key.def_uni || !key.def_multi)
+			return -EINVAL;
 
-		if (!rdev->ops->set_default_beacon_key) {
-			err = -EOPNOTSUPP;
-			goto out;
-		}
+		if (!rdev->ops->set_default_beacon_key)
+			return -EOPNOTSUPP;
 
 		err = nl80211_key_allowed(wdev);
 		if (err)
-			goto out;
+			return err;
 
 		err = nl80211_validate_key_link_id(info, wdev, link_id, false);
 		if (err)
-			goto out;
+			return err;
 
-		err = rdev_set_default_beacon_key(rdev, dev, link_id, key.idx);
-		if (err)
-			goto out;
+		return rdev_set_default_beacon_key(rdev, dev, link_id, key.idx);
 	} else if (key.p.mode == NL80211_KEY_SET_TX &&
 		   wiphy_ext_feature_isset(&rdev->wiphy,
 					   NL80211_EXT_FEATURE_EXT_KEY_ID)) {
@@ -4680,25 +4651,19 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info)
 		if (info->attrs[NL80211_ATTR_MAC])
 			mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
 
-		if (!mac_addr || key.idx < 0 || key.idx > 1) {
-			err = -EINVAL;
-			goto out;
-		}
+		if (!mac_addr || key.idx < 0 || key.idx > 1)
+			return -EINVAL;
 
 		err = nl80211_validate_key_link_id(info, wdev, link_id, true);
 		if (err)
-			goto out;
+			return err;
 
-		err = rdev_add_key(rdev, dev, link_id, key.idx,
-				   NL80211_KEYTYPE_PAIRWISE,
-				   mac_addr, &key.p);
-	} else {
-		err = -EINVAL;
+		return rdev_add_key(rdev, dev, link_id, key.idx,
+				    NL80211_KEYTYPE_PAIRWISE,
+				    mac_addr, &key.p);
 	}
- out:
-	wdev_unlock(wdev);
 
-	return err;
+	return -EINVAL;
 }
 
 static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info)
@@ -4751,7 +4716,6 @@ static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info)
 		return -EINVAL;
 	}
 
-	wdev_lock(wdev);
 	err = nl80211_key_allowed(wdev);
 	if (err)
 		GENL_SET_ERR_MSG(info, "key not allowed");
@@ -4767,7 +4731,6 @@ static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info)
 		if (err)
 			GENL_SET_ERR_MSG(info, "key addition failed");
 	}
-	wdev_unlock(wdev);
 
 	return err;
 }
@@ -4808,7 +4771,6 @@ static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info)
 	if (!rdev->ops->del_key)
 		return -EOPNOTSUPP;
 
-	wdev_lock(wdev);
 	err = nl80211_key_allowed(wdev);
 
 	if (key.type == NL80211_KEYTYPE_GROUP && mac_addr &&
@@ -4832,7 +4794,6 @@ static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info)
 			wdev->wext.default_mgmt_key = -1;
 	}
 #endif
-	wdev_unlock(wdev);
 
 	return err;
 }
@@ -6072,20 +6033,18 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 		goto out;
 	}
 
-	wdev_lock(wdev);
-
 	if (info->attrs[NL80211_ATTR_TX_RATES]) {
 		err = nl80211_parse_tx_bitrate_mask(info, info->attrs,
 						    NL80211_ATTR_TX_RATES,
 						    &params->beacon_rate,
 						    dev, false, link_id);
 		if (err)
-			goto out_unlock;
+			goto out;
 
 		err = validate_beacon_tx_rate(rdev, params->chandef.chan->band,
 					      &params->beacon_rate);
 		if (err)
-			goto out_unlock;
+			goto out;
 	}
 
 	if (info->attrs[NL80211_ATTR_SMPS_MODE]) {
@@ -6098,19 +6057,19 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 			if (!(rdev->wiphy.features &
 			      NL80211_FEATURE_STATIC_SMPS)) {
 				err = -EINVAL;
-				goto out_unlock;
+				goto out;
 			}
 			break;
 		case NL80211_SMPS_DYNAMIC:
 			if (!(rdev->wiphy.features &
 			      NL80211_FEATURE_DYNAMIC_SMPS)) {
 				err = -EINVAL;
-				goto out_unlock;
+				goto out;
 			}
 			break;
 		default:
 			err = -EINVAL;
-			goto out_unlock;
+			goto out;
 		}
 	} else {
 		params->smps_mode = NL80211_SMPS_OFF;
@@ -6119,7 +6078,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 	params->pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]);
 	if (params->pbss && !rdev->wiphy.bands[NL80211_BAND_60GHZ]) {
 		err = -EOPNOTSUPP;
-		goto out_unlock;
+		goto out;
 	}
 
 	if (info->attrs[NL80211_ATTR_ACL_POLICY]) {
@@ -6127,7 +6086,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 		if (IS_ERR(params->acl)) {
 			err = PTR_ERR(params->acl);
 			params->acl = NULL;
-			goto out_unlock;
+			goto out;
 		}
 	}
 
@@ -6139,7 +6098,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 					info->attrs[NL80211_ATTR_HE_OBSS_PD],
 					&params->he_obss_pd);
 		if (err)
-			goto out_unlock;
+			goto out;
 	}
 
 	if (info->attrs[NL80211_ATTR_FILS_DISCOVERY]) {
@@ -6147,7 +6106,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 						   info->attrs[NL80211_ATTR_FILS_DISCOVERY],
 						   params);
 		if (err)
-			goto out_unlock;
+			goto out;
 	}
 
 	if (info->attrs[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP]) {
@@ -6155,7 +6114,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 			rdev, info->attrs[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP],
 			params);
 		if (err)
-			goto out_unlock;
+			goto out;
 	}
 
 	if (info->attrs[NL80211_ATTR_MBSSID_CONFIG]) {
@@ -6166,17 +6125,17 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 							params->beacon.mbssid_ies->cnt :
 							0);
 		if (err)
-			goto out_unlock;
+			goto out;
 	}
 
 	if (!params->mbssid_config.ema && params->beacon.rnr_ies) {
 		err = -EINVAL;
-		goto out_unlock;
+		goto out;
 	}
 
 	err = nl80211_calculate_ap_params(params);
 	if (err)
-		goto out_unlock;
+		goto out;
 
 	if (info->attrs[NL80211_ATTR_AP_SETTINGS_FLAGS])
 		params->flags = nla_get_u32(
@@ -6188,7 +6147,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 	    info->attrs[NL80211_ATTR_SOCKET_OWNER] &&
 	    wdev->conn_owner_nlportid != info->snd_portid) {
 		err = -EINVAL;
-		goto out_unlock;
+		goto out;
 	}
 
 	/* FIXME: validate MLO/link-id against driver capabilities */
@@ -6206,8 +6165,6 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 
 		nl80211_send_ap_started(wdev, link_id);
 	}
-out_unlock:
-	wdev_unlock(wdev);
 out:
 	kfree(params->acl);
 	kfree(params->beacon.mbssid_ies);
@@ -6244,9 +6201,7 @@ static int nl80211_set_beacon(struct sk_buff *skb, struct genl_info *info)
 	if (err)
 		goto out;
 
-	wdev_lock(wdev);
 	err = rdev_change_beacon(rdev, dev, &params);
-	wdev_unlock(wdev);
 
 out:
 	kfree(params.mbssid_ies);
@@ -7305,9 +7260,7 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
 	}
 
 	/* driver will call cfg80211_check_station_change() */
-	wdev_lock(dev->ieee80211_ptr);
 	err = rdev_change_station(rdev, dev, mac_addr, &params);
-	wdev_unlock(dev->ieee80211_ptr);
 
  out_put_vlan:
 	dev_put(params.vlan);
@@ -7575,7 +7528,6 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 
 	/* be aware of params.vlan when changing code here */
 
-	wdev_lock(dev->ieee80211_ptr);
 	if (wdev->valid_links) {
 		if (params.link_sta_params.link_id < 0) {
 			err = -EINVAL;
@@ -7593,7 +7545,6 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 	}
 	err = rdev_add_station(rdev, dev, mac_addr, &params);
 out:
-	wdev_unlock(dev->ieee80211_ptr);
 	dev_put(params.vlan);
 	return err;
 }
@@ -7603,7 +7554,6 @@ static int nl80211_del_station(struct sk_buff *skb, struct genl_info *info)
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	struct net_device *dev = info->user_ptr[1];
 	struct station_del_parameters params;
-	int ret;
 
 	memset(&params, 0, sizeof(params));
 
@@ -7651,11 +7601,7 @@ static int nl80211_del_station(struct sk_buff *skb, struct genl_info *info)
 		params.reason_code = WLAN_REASON_PREV_AUTH_NOT_VALID;
 	}
 
-	wdev_lock(dev->ieee80211_ptr);
-	ret = rdev_del_station(rdev, dev, &params);
-	wdev_unlock(dev->ieee80211_ptr);
-
-	return ret;
+	return rdev_del_station(rdev, dev, &params);
 }
 
 static int nl80211_send_mpath(struct sk_buff *msg, u32 portid, u32 seq,
@@ -7974,9 +7920,7 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info)
 {
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	struct net_device *dev = info->user_ptr[1];
-	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	struct bss_parameters params;
-	int err;
 
 	memset(&params, 0, sizeof(params));
 	params.link_id = nl80211_link_id_or_invalid(info->attrs);
@@ -8039,11 +7983,7 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info)
 	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
 		return -EOPNOTSUPP;
 
-	wdev_lock(wdev);
-	err = rdev_change_bss(rdev, dev, &params);
-	wdev_unlock(wdev);
-
-	return err;
+	return rdev_change_bss(rdev, dev, &params);
 }
 
 static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info)
@@ -8114,13 +8054,11 @@ static int nl80211_get_mesh_config(struct sk_buff *skb,
 	if (!rdev->ops->get_mesh_config)
 		return -EOPNOTSUPP;
 
-	wdev_lock(wdev);
 	/* If not connected, get default parameters */
 	if (!wdev->u.mesh.id_len)
 		memcpy(&cur_params, &default_mesh_config, sizeof(cur_params));
 	else
 		err = rdev_get_mesh_config(rdev, dev, &cur_params);
-	wdev_unlock(wdev);
 
 	if (err)
 		return err;
@@ -8496,15 +8434,12 @@ static int nl80211_update_mesh_config(struct sk_buff *skb,
 	if (err)
 		return err;
 
-	wdev_lock(wdev);
 	if (!wdev->u.mesh.id_len)
 		err = -ENOLINK;
 
 	if (!err)
 		err = rdev_update_mesh_config(rdev, dev, mask, &cfg);
 
-	wdev_unlock(wdev);
-
 	return err;
 }
 
@@ -8995,7 +8930,7 @@ static bool cfg80211_off_channel_oper_allowed(struct wireless_dev *wdev,
 	unsigned int link_id;
 	bool all_ok = true;
 
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	if (!cfg80211_beaconing_iface_active(wdev))
 		return true;
@@ -9245,7 +9180,6 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
 
 	request->n_channels = i;
 
-	wdev_lock(wdev);
 	for (i = 0; i < request->n_channels; i++) {
 		struct ieee80211_channel *chan = request->channels[i];
 
@@ -9254,12 +9188,10 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
 			continue;
 
 		if (!cfg80211_wdev_on_sub_chan(wdev, chan, true)) {
-			wdev_unlock(wdev);
 			err = -EBUSY;
 			goto out_free;
 		}
 	}
-	wdev_unlock(wdev);
 
 	i = 0;
 	if (n_ssids) {
@@ -10265,9 +10197,7 @@ skip_beacons:
 			goto free;
 	}
 
-	wdev_lock(wdev);
 	err = rdev_channel_switch(rdev, dev, &params);
-	wdev_unlock(wdev);
 
 free:
 	kfree(params.beacon_after.mbssid_ies);
@@ -10290,7 +10220,7 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb,
 	void *hdr;
 	struct nlattr *bss;
 
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	hdr = nl80211hdr_put(msg, NETLINK_CB(cb->skb).portid, seq, flags,
 			     NL80211_CMD_NEW_SCAN_RESULTS);
@@ -10439,7 +10369,6 @@ static int nl80211_dump_scan(struct sk_buff *skb, struct netlink_callback *cb)
 	/* nl80211_prepare_wdev_dump acquired it in the successful case */
 	__acquire(&rdev->wiphy.mtx);
 
-	wdev_lock(wdev);
 	spin_lock_bh(&rdev->bss_lock);
 
 	/*
@@ -10465,7 +10394,6 @@ static int nl80211_dump_scan(struct sk_buff *skb, struct netlink_callback *cb)
 	}
 
 	spin_unlock_bh(&rdev->bss_lock);
-	wdev_unlock(wdev);
 
 	cb->args[2] = idx;
 	wiphy_unlock(&rdev->wiphy);
@@ -10588,9 +10516,7 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb)
 	}
 
 	while (1) {
-		wdev_lock(wdev);
 		res = rdev_dump_survey(rdev, wdev->netdev, survey_idx, &survey);
-		wdev_unlock(wdev);
 		if (res == -ENOENT)
 			break;
 		if (res)
@@ -10763,9 +10689,7 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
 	if (!req.bss)
 		return -ENOENT;
 
-	wdev_lock(dev->ieee80211_ptr);
 	err = cfg80211_mlme_auth(rdev, dev, &req);
-	wdev_unlock(dev->ieee80211_ptr);
 
 	cfg80211_put_bss(&rdev->wiphy, req.bss);
 
@@ -11180,8 +11104,6 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
 
 	err = nl80211_crypto_settings(rdev, info, &req.crypto, 1);
 	if (!err) {
-		wdev_lock(dev->ieee80211_ptr);
-
 		err = cfg80211_mlme_assoc(rdev, dev, &req);
 
 		if (!err && info->attrs[NL80211_ATTR_SOCKET_OWNER]) {
@@ -11190,8 +11112,6 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
 			memcpy(dev->ieee80211_ptr->disconnect_bssid,
 			       ap_addr, ETH_ALEN);
 		}
-
-		wdev_unlock(dev->ieee80211_ptr);
 	}
 
 free:
@@ -11208,7 +11128,7 @@ static int nl80211_deauthenticate(struct sk_buff *skb, struct genl_info *info)
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	struct net_device *dev = info->user_ptr[1];
 	const u8 *ie = NULL, *bssid;
-	int ie_len = 0, err;
+	int ie_len = 0;
 	u16 reason_code;
 	bool local_state_change;
 
@@ -11244,11 +11164,8 @@ static int nl80211_deauthenticate(struct sk_buff *skb, struct genl_info *info)
 
 	local_state_change = !!info->attrs[NL80211_ATTR_LOCAL_STATE_CHANGE];
 
-	wdev_lock(dev->ieee80211_ptr);
-	err = cfg80211_mlme_deauth(rdev, dev, bssid, ie, ie_len, reason_code,
-				   local_state_change);
-	wdev_unlock(dev->ieee80211_ptr);
-	return err;
+	return cfg80211_mlme_deauth(rdev, dev, bssid, ie, ie_len, reason_code,
+				    local_state_change);
 }
 
 static int nl80211_disassociate(struct sk_buff *skb, struct genl_info *info)
@@ -11256,7 +11173,7 @@ static int nl80211_disassociate(struct sk_buff *skb, struct genl_info *info)
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	struct net_device *dev = info->user_ptr[1];
 	const u8 *ie = NULL, *bssid;
-	int ie_len = 0, err;
+	int ie_len = 0;
 	u16 reason_code;
 	bool local_state_change;
 
@@ -11292,11 +11209,8 @@ static int nl80211_disassociate(struct sk_buff *skb, struct genl_info *info)
 
 	local_state_change = !!info->attrs[NL80211_ATTR_LOCAL_STATE_CHANGE];
 
-	wdev_lock(dev->ieee80211_ptr);
-	err = cfg80211_mlme_disassoc(rdev, dev, bssid, ie, ie_len, reason_code,
-				     local_state_change);
-	wdev_unlock(dev->ieee80211_ptr);
-	return err;
+	return cfg80211_mlme_disassoc(rdev, dev, bssid, ie, ie_len, reason_code,
+				      local_state_change);
 }
 
 static bool
@@ -11474,13 +11388,11 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
 	ibss.userspace_handles_dfs =
 		nla_get_flag(info->attrs[NL80211_ATTR_HANDLE_DFS]);
 
-	wdev_lock(dev->ieee80211_ptr);
 	err = __cfg80211_join_ibss(rdev, dev, &ibss, connkeys);
 	if (err)
 		kfree_sensitive(connkeys);
 	else if (info->attrs[NL80211_ATTR_SOCKET_OWNER])
 		dev->ieee80211_ptr->conn_owner_nlportid = info->snd_portid;
-	wdev_unlock(dev->ieee80211_ptr);
 
 	return err;
 }
@@ -12013,8 +11925,6 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
 	if (nla_get_flag(info->attrs[NL80211_ATTR_MLO_SUPPORT]))
 		connect.flags |= CONNECT_REQ_MLO_SUPPORT;
 
-	wdev_lock(dev->ieee80211_ptr);
-
 	err = cfg80211_connect(rdev, dev, &connect, connkeys,
 			       connect.prev_bssid);
 	if (err)
@@ -12029,8 +11939,6 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
 			eth_zero_addr(dev->ieee80211_ptr->disconnect_bssid);
 	}
 
-	wdev_unlock(dev->ieee80211_ptr);
-
 	return err;
 }
 
@@ -12044,7 +11952,6 @@ static int nl80211_update_connect_params(struct sk_buff *skb,
 	bool fils_sk_offload;
 	u32 auth_type;
 	u32 changed = 0;
-	int ret;
 
 	if (!rdev->ops->update_connect_params)
 		return -EOPNOTSUPP;
@@ -12105,14 +12012,10 @@ static int nl80211_update_connect_params(struct sk_buff *skb,
 		changed |= UPDATE_AUTH_TYPE;
 	}
 
-	wdev_lock(dev->ieee80211_ptr);
 	if (!wdev->connected)
-		ret = -ENOLINK;
-	else
-		ret = rdev_update_connect_params(rdev, dev, &connect, changed);
-	wdev_unlock(dev->ieee80211_ptr);
+		return -ENOLINK;
 
-	return ret;
+	return rdev_update_connect_params(rdev, dev, &connect, changed);
 }
 
 static int nl80211_disconnect(struct sk_buff *skb, struct genl_info *info)
@@ -12120,7 +12023,6 @@ static int nl80211_disconnect(struct sk_buff *skb, struct genl_info *info)
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	struct net_device *dev = info->user_ptr[1];
 	u16 reason;
-	int ret;
 
 	if (dev->ieee80211_ptr->conn_owner_nlportid &&
 	    dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid)
@@ -12138,10 +12040,7 @@ static int nl80211_disconnect(struct sk_buff *skb, struct genl_info *info)
 	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
 		return -EOPNOTSUPP;
 
-	wdev_lock(dev->ieee80211_ptr);
-	ret = cfg80211_disconnect(rdev, dev, reason, true);
-	wdev_unlock(dev->ieee80211_ptr);
-	return ret;
+	return cfg80211_disconnect(rdev, dev, reason, true);
 }
 
 static int nl80211_wiphy_netns(struct sk_buff *skb, struct genl_info *info)
@@ -12352,7 +12251,6 @@ static int nl80211_remain_on_channel(struct sk_buff *skb,
 	if (err)
 		return err;
 
-	wdev_lock(wdev);
 	if (!cfg80211_off_channel_oper_allowed(wdev, chandef.chan)) {
 		const struct cfg80211_chan_def *oper_chandef, *compat_chandef;
 
@@ -12361,7 +12259,6 @@ static int nl80211_remain_on_channel(struct sk_buff *skb,
 		if (WARN_ON(!oper_chandef)) {
 			/* cannot happen since we must beacon to get here */
 			WARN_ON(1);
-			wdev_unlock(wdev);
 			return -EBUSY;
 		}
 
@@ -12369,12 +12266,9 @@ static int nl80211_remain_on_channel(struct sk_buff *skb,
 		compat_chandef = cfg80211_chandef_compatible(&chandef,
 							     oper_chandef);
 
-		if (compat_chandef != &chandef) {
-			wdev_unlock(wdev);
+		if (compat_chandef != &chandef)
 			return -EBUSY;
-		}
 	}
-	wdev_unlock(wdev);
 
 	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
 	if (!msg)
@@ -12433,23 +12327,18 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb,
 	unsigned int link_id = nl80211_link_id(info->attrs);
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	struct net_device *dev = info->user_ptr[1];
-	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	int err;
 
 	if (!rdev->ops->set_bitrate_mask)
 		return -EOPNOTSUPP;
 
-	wdev_lock(wdev);
 	err = nl80211_parse_tx_bitrate_mask(info, info->attrs,
 					    NL80211_ATTR_TX_RATES, &mask,
 					    dev, true, link_id);
 	if (err)
-		goto out;
+		return err;
 
-	err = rdev_set_bitrate_mask(rdev, dev, link_id, NULL, &mask);
-out:
-	wdev_unlock(wdev);
-	return err;
+	return rdev_set_bitrate_mask(rdev, dev, link_id, NULL, &mask);
 }
 
 static int nl80211_register_mgmt(struct sk_buff *skb, struct genl_info *info)
@@ -12578,12 +12467,9 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info)
 	if (!chandef.chan && params.offchan)
 		return -EINVAL;
 
-	wdev_lock(wdev);
 	if (params.offchan &&
-	    !cfg80211_off_channel_oper_allowed(wdev, chandef.chan)) {
-		wdev_unlock(wdev);
+	    !cfg80211_off_channel_oper_allowed(wdev, chandef.chan))
 		return -EBUSY;
-	}
 
 	params.link_id = nl80211_link_id_or_invalid(info->attrs);
 	/*
@@ -12592,11 +12478,8 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info)
 	 * to the driver.
 	 */
 	if (params.link_id >= 0 &&
-	    !(wdev->valid_links & BIT(params.link_id))) {
-		wdev_unlock(wdev);
+	    !(wdev->valid_links & BIT(params.link_id)))
 		return -EINVAL;
-	}
-	wdev_unlock(wdev);
 
 	params.buf = nla_data(info->attrs[NL80211_ATTR_FRAME]);
 	params.len = nla_len(info->attrs[NL80211_ATTR_FRAME]);
@@ -12866,8 +12749,8 @@ static int nl80211_set_cqm_rssi(struct genl_info *info,
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	struct net_device *dev = info->user_ptr[1];
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
-	int i, err;
 	s32 prev = S32_MIN;
+	int i;
 
 	/* Check all values negative and sorted */
 	for (i = 0; i < n_thresholds; i++) {
@@ -12881,9 +12764,7 @@ static int nl80211_set_cqm_rssi(struct genl_info *info,
 	    wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)
 		return -EOPNOTSUPP;
 
-	wdev_lock(wdev);
 	cfg80211_cqm_config_free(wdev);
-	wdev_unlock(wdev);
 
 	if (n_thresholds <= 1 && rdev->ops->set_cqm_rssi_config) {
 		if (n_thresholds == 0 || thresholds[0] == 0) /* Disabling */
@@ -12900,17 +12781,14 @@ static int nl80211_set_cqm_rssi(struct genl_info *info,
 	if (n_thresholds == 1 && thresholds[0] == 0) /* Disabling */
 		n_thresholds = 0;
 
-	wdev_lock(wdev);
 	if (n_thresholds) {
 		struct cfg80211_cqm_config *cqm_config;
 
 		cqm_config = kzalloc(struct_size(cqm_config, rssi_thresholds,
 						 n_thresholds),
 				     GFP_KERNEL);
-		if (!cqm_config) {
-			err = -ENOMEM;
-			goto unlock;
-		}
+		if (!cqm_config)
+			return -ENOMEM;
 
 		cqm_config->rssi_hyst = hysteresis;
 		cqm_config->n_rssi_thresholds = n_thresholds;
@@ -12921,12 +12799,7 @@ static int nl80211_set_cqm_rssi(struct genl_info *info,
 		wdev->cqm_config = cqm_config;
 	}
 
-	err = cfg80211_cqm_rssi_update(rdev, dev);
-
-unlock:
-	wdev_unlock(wdev);
-
-	return err;
+	return cfg80211_cqm_rssi_update(rdev, dev);
 }
 
 static int nl80211_set_cqm(struct sk_buff *skb, struct genl_info *info)
@@ -13108,11 +12981,9 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info)
 		setup.control_port_over_nl80211 = true;
 	}
 
-	wdev_lock(dev->ieee80211_ptr);
 	err = __cfg80211_join_mesh(rdev, dev, &setup, &cfg);
 	if (!err && info->attrs[NL80211_ATTR_SOCKET_OWNER])
 		dev->ieee80211_ptr->conn_owner_nlportid = info->snd_portid;
-	wdev_unlock(dev->ieee80211_ptr);
 
 	return err;
 }
@@ -14056,21 +13927,13 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info)
 	if (tb[NL80211_REKEY_DATA_AKM])
 		rekey_data.akm = nla_get_u32(tb[NL80211_REKEY_DATA_AKM]);
 
-	wdev_lock(wdev);
-	if (!wdev->connected) {
-		err = -ENOTCONN;
-		goto out;
-	}
+	if (!wdev->connected)
+		return -ENOTCONN;
 
-	if (!rdev->ops->set_rekey_data) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
+	if (!rdev->ops->set_rekey_data)
+		return -EOPNOTSUPP;
 
-	err = rdev_set_rekey_data(rdev, dev, &rekey_data);
- out:
-	wdev_unlock(wdev);
-	return err;
+	return rdev_set_rekey_data(rdev, dev, &rekey_data);
 }
 
 static int nl80211_register_unexpected_frame(struct sk_buff *skb,
@@ -15274,11 +15137,9 @@ static int nl80211_set_qos_map(struct sk_buff *skb,
 		memcpy(qos_map->up, pos, IEEE80211_QOS_MAP_LEN_MIN);
 	}
 
-	wdev_lock(dev->ieee80211_ptr);
 	ret = nl80211_key_allowed(dev->ieee80211_ptr);
 	if (!ret)
 		ret = rdev_set_qos_map(rdev, dev, qos_map);
-	wdev_unlock(dev->ieee80211_ptr);
 
 	kfree(qos_map);
 	return ret;
@@ -15292,7 +15153,6 @@ static int nl80211_add_tx_ts(struct sk_buff *skb, struct genl_info *info)
 	const u8 *peer;
 	u8 tsid, up;
 	u16 admitted_time = 0;
-	int err;
 
 	if (!(rdev->wiphy.features & NL80211_FEATURE_SUPPORTS_WMM_ADMISSION))
 		return -EOPNOTSUPP;
@@ -15322,34 +15182,25 @@ static int nl80211_add_tx_ts(struct sk_buff *skb, struct genl_info *info)
 			return -EINVAL;
 	}
 
-	wdev_lock(wdev);
 	switch (wdev->iftype) {
 	case NL80211_IFTYPE_STATION:
 	case NL80211_IFTYPE_P2P_CLIENT:
 		if (wdev->connected)
 			break;
-		err = -ENOTCONN;
-		goto out;
+		return -ENOTCONN;
 	default:
-		err = -EOPNOTSUPP;
-		goto out;
+		return -EOPNOTSUPP;
 	}
 
-	err = rdev_add_tx_ts(rdev, dev, tsid, peer, up, admitted_time);
-
- out:
-	wdev_unlock(wdev);
-	return err;
+	return rdev_add_tx_ts(rdev, dev, tsid, peer, up, admitted_time);
 }
 
 static int nl80211_del_tx_ts(struct sk_buff *skb, struct genl_info *info)
 {
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	struct net_device *dev = info->user_ptr[1];
-	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	const u8 *peer;
 	u8 tsid;
-	int err;
 
 	if (!info->attrs[NL80211_ATTR_TSID] || !info->attrs[NL80211_ATTR_MAC])
 		return -EINVAL;
@@ -15357,11 +15208,7 @@ static int nl80211_del_tx_ts(struct sk_buff *skb, struct genl_info *info)
 	tsid = nla_get_u8(info->attrs[NL80211_ATTR_TSID]);
 	peer = nla_data(info->attrs[NL80211_ATTR_MAC]);
 
-	wdev_lock(wdev);
-	err = rdev_del_tx_ts(rdev, dev, tsid, peer);
-	wdev_unlock(wdev);
-
-	return err;
+	return rdev_del_tx_ts(rdev, dev, tsid, peer);
 }
 
 static int nl80211_tdls_channel_switch(struct sk_buff *skb,
@@ -15417,11 +15264,7 @@ static int nl80211_tdls_channel_switch(struct sk_buff *skb,
 	addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
 	oper_class = nla_get_u8(info->attrs[NL80211_ATTR_OPER_CLASS]);
 
-	wdev_lock(wdev);
-	err = rdev_tdls_channel_switch(rdev, dev, addr, oper_class, &chandef);
-	wdev_unlock(wdev);
-
-	return err;
+	return rdev_tdls_channel_switch(rdev, dev, addr, oper_class, &chandef);
 }
 
 static int nl80211_tdls_cancel_channel_switch(struct sk_buff *skb,
@@ -15429,7 +15272,6 @@ static int nl80211_tdls_cancel_channel_switch(struct sk_buff *skb,
 {
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	struct net_device *dev = info->user_ptr[1];
-	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	const u8 *addr;
 
 	if (!rdev->ops->tdls_channel_switch ||
@@ -15450,9 +15292,7 @@ static int nl80211_tdls_cancel_channel_switch(struct sk_buff *skb,
 
 	addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
 
-	wdev_lock(wdev);
 	rdev_tdls_cancel_channel_switch(rdev, dev, addr);
-	wdev_unlock(wdev);
 
 	return 0;
 }
@@ -15485,7 +15325,6 @@ static int nl80211_set_pmk(struct sk_buff *skb, struct genl_info *info)
 	struct net_device *dev = info->user_ptr[1];
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	struct cfg80211_pmk_conf pmk_conf = {};
-	int ret;
 
 	if (wdev->iftype != NL80211_IFTYPE_STATION &&
 	    wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)
@@ -15498,34 +15337,24 @@ static int nl80211_set_pmk(struct sk_buff *skb, struct genl_info *info)
 	if (!info->attrs[NL80211_ATTR_MAC] || !info->attrs[NL80211_ATTR_PMK])
 		return -EINVAL;
 
-	wdev_lock(wdev);
-	if (!wdev->connected) {
-		ret = -ENOTCONN;
-		goto out;
-	}
+	if (!wdev->connected)
+		return -ENOTCONN;
 
 	pmk_conf.aa = nla_data(info->attrs[NL80211_ATTR_MAC]);
-	if (memcmp(pmk_conf.aa, wdev->u.client.connected_addr, ETH_ALEN)) {
-		ret = -EINVAL;
-		goto out;
-	}
+	if (memcmp(pmk_conf.aa, wdev->u.client.connected_addr, ETH_ALEN))
+		return -EINVAL;
 
 	pmk_conf.pmk = nla_data(info->attrs[NL80211_ATTR_PMK]);
 	pmk_conf.pmk_len = nla_len(info->attrs[NL80211_ATTR_PMK]);
 	if (pmk_conf.pmk_len != WLAN_PMK_LEN &&
-	    pmk_conf.pmk_len != WLAN_PMK_LEN_SUITE_B_192) {
-		ret = -EINVAL;
-		goto out;
-	}
+	    pmk_conf.pmk_len != WLAN_PMK_LEN_SUITE_B_192)
+		return -EINVAL;
 
 	if (info->attrs[NL80211_ATTR_PMKR0_NAME])
 		pmk_conf.pmk_r0_name =
 			nla_data(info->attrs[NL80211_ATTR_PMKR0_NAME]);
 
-	ret = rdev_set_pmk(rdev, dev, &pmk_conf);
-out:
-	wdev_unlock(wdev);
-	return ret;
+	return rdev_set_pmk(rdev, dev, &pmk_conf);
 }
 
 static int nl80211_del_pmk(struct sk_buff *skb, struct genl_info *info)
@@ -15534,7 +15363,6 @@ static int nl80211_del_pmk(struct sk_buff *skb, struct genl_info *info)
 	struct net_device *dev = info->user_ptr[1];
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	const u8 *aa;
-	int ret;
 
 	if (wdev->iftype != NL80211_IFTYPE_STATION &&
 	    wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)
@@ -15547,12 +15375,8 @@ static int nl80211_del_pmk(struct sk_buff *skb, struct genl_info *info)
 	if (!info->attrs[NL80211_ATTR_MAC])
 		return -EINVAL;
 
-	wdev_lock(wdev);
 	aa = nla_data(info->attrs[NL80211_ATTR_MAC]);
-	ret = rdev_del_pmk(rdev, dev, aa);
-	wdev_unlock(wdev);
-
-	return ret;
+	return rdev_del_pmk(rdev, dev, aa);
 }
 
 static int nl80211_external_auth(struct sk_buff *skb, struct genl_info *info)
@@ -15626,8 +15450,6 @@ static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info)
 		return -EINVAL;
 	}
 
-	wdev_lock(wdev);
-
 	switch (wdev->iftype) {
 	case NL80211_IFTYPE_AP:
 	case NL80211_IFTYPE_P2P_GO:
@@ -15636,21 +15458,16 @@ static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info)
 	case NL80211_IFTYPE_ADHOC:
 		if (wdev->u.ibss.current_bss)
 			break;
-		err = -ENOTCONN;
-		goto out;
+		return -ENOTCONN;
 	case NL80211_IFTYPE_STATION:
 	case NL80211_IFTYPE_P2P_CLIENT:
 		if (wdev->connected)
 			break;
-		err = -ENOTCONN;
-		goto out;
+		return -ENOTCONN;
 	default:
-		err = -EOPNOTSUPP;
-		goto out;
+		return -EOPNOTSUPP;
 	}
 
-	wdev_unlock(wdev);
-
 	buf = nla_data(info->attrs[NL80211_ATTR_FRAME]);
 	len = nla_len(info->attrs[NL80211_ATTR_FRAME]);
 	dest = nla_data(info->attrs[NL80211_ATTR_MAC]);
@@ -15666,9 +15483,6 @@ static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info)
 	if (!err && !dont_wait_for_ack)
 		nl_set_extack_cookie_u64(info->extack, cookie);
 	return err;
- out:
-	wdev_unlock(wdev);
-	return err;
 }
 
 static int nl80211_get_ftm_responder_stats(struct sk_buff *skb,
@@ -15946,8 +15760,6 @@ static int nl80211_set_tid_config(struct sk_buff *skb,
 	if (info->attrs[NL80211_ATTR_MAC])
 		tid_config->peer = nla_data(info->attrs[NL80211_ATTR_MAC]);
 
-	wdev_lock(dev->ieee80211_ptr);
-
 	nla_for_each_nested(tid, info->attrs[NL80211_ATTR_TID_CONFIG],
 			    rem_conf) {
 		ret = nla_parse_nested(attrs, NL80211_TID_CONFIG_ATTR_MAX,
@@ -15969,7 +15781,6 @@ static int nl80211_set_tid_config(struct sk_buff *skb,
 
 bad_tid_conf:
 	kfree(tid_config);
-	wdev_unlock(dev->ieee80211_ptr);
 	return ret;
 }
 
@@ -16066,9 +15877,7 @@ static int nl80211_color_change(struct sk_buff *skb, struct genl_info *info)
 		params.counter_offset_presp = offset;
 	}
 
-	wdev_lock(wdev);
 	err = rdev_color_change(rdev, dev, &params);
-	wdev_unlock(wdev);
 
 out:
 	kfree(params.beacon_next.mbssid_ies);
@@ -16124,7 +15933,6 @@ static int nl80211_add_link(struct sk_buff *skb, struct genl_info *info)
 	    !is_valid_ether_addr(nla_data(info->attrs[NL80211_ATTR_MAC])))
 		return -EINVAL;
 
-	wdev_lock(wdev);
 	wdev->valid_links |= BIT(link_id);
 	ether_addr_copy(wdev->links[link_id].addr,
 			nla_data(info->attrs[NL80211_ATTR_MAC]));
@@ -16134,7 +15942,6 @@ static int nl80211_add_link(struct sk_buff *skb, struct genl_info *info)
 		wdev->valid_links &= ~BIT(link_id);
 		eth_zero_addr(wdev->links[link_id].addr);
 	}
-	wdev_unlock(wdev);
 
 	return ret;
 }
@@ -16156,9 +15963,7 @@ static int nl80211_remove_link(struct sk_buff *skb, struct genl_info *info)
 		return -EINVAL;
 	}
 
-	wdev_lock(wdev);
 	cfg80211_remove_link(wdev, link_id);
-	wdev_unlock(wdev);
 
 	return 0;
 }
@@ -16248,14 +16053,10 @@ nl80211_add_mod_link_station(struct sk_buff *skb, struct genl_info *info,
 	if (err)
 		return err;
 
-	wdev_lock(dev->ieee80211_ptr);
 	if (add)
-		err = rdev_add_link_station(rdev, dev, &params);
-	else
-		err = rdev_mod_link_station(rdev, dev, &params);
-	wdev_unlock(dev->ieee80211_ptr);
+		return rdev_add_link_station(rdev, dev, &params);
 
-	return err;
+	return rdev_mod_link_station(rdev, dev, &params);
 }
 
 static int
@@ -16276,7 +16077,6 @@ nl80211_remove_link_station(struct sk_buff *skb, struct genl_info *info)
 	struct link_station_del_parameters params = {};
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	struct net_device *dev = info->user_ptr[1];
-	int ret;
 
 	if (!rdev->ops->del_link_station)
 		return -EOPNOTSUPP;
@@ -16288,11 +16088,7 @@ nl80211_remove_link_station(struct sk_buff *skb, struct genl_info *info)
 	params.mld_mac = nla_data(info->attrs[NL80211_ATTR_MLD_ADDR]);
 	params.link_id = nla_get_u8(info->attrs[NL80211_ATTR_MLO_LINK_ID]);
 
-	wdev_lock(dev->ieee80211_ptr);
-	ret = rdev_del_link_station(rdev, dev, &params);
-	wdev_unlock(dev->ieee80211_ptr);
-
-	return ret;
+	return rdev_del_link_station(rdev, dev, &params);
 }
 
 static int nl80211_set_hw_timestamp(struct sk_buff *skb,
@@ -18300,7 +18096,7 @@ void cfg80211_links_removed(struct net_device *dev, u16 link_mask)
 	struct nlattr *links;
 	void *hdr;
 
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 	trace_cfg80211_links_removed(dev, link_mask);
 
 	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION &&
@@ -19354,7 +19150,7 @@ void cfg80211_ch_switch_notify(struct net_device *dev,
 	struct wiphy *wiphy = wdev->wiphy;
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
 
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 	WARN_INVALID_LINK_ID(wdev, link_id);
 
 	trace_cfg80211_ch_switch_notify(dev, chandef, link_id, punct_bitmap);
@@ -19399,7 +19195,7 @@ void cfg80211_ch_switch_started_notify(struct net_device *dev,
 	struct wiphy *wiphy = wdev->wiphy;
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
 
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 	WARN_INVALID_LINK_ID(wdev, link_id);
 
 	trace_cfg80211_ch_switch_started_notify(dev, chandef, link_id,
@@ -19422,7 +19218,7 @@ int cfg80211_bss_color_notify(struct net_device *dev,
 	struct sk_buff *msg;
 	void *hdr;
 
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	trace_cfg80211_bss_color_notify(dev, cmd, count, color_bitmap);
 
diff --git a/net/wireless/ocb.c b/net/wireless/ocb.c
index 29afaf3da54f..7d2d67f13ad9 100644
--- a/net/wireless/ocb.c
+++ b/net/wireless/ocb.c
@@ -4,7 +4,7 @@
  *
  * Copyright: (c) 2014 Czech Technical University in Prague
  *            (c) 2014 Volkswagen Group Research
- * Copyright (C) 2022 Intel Corporation
+ * Copyright (C) 2022-2023 Intel Corporation
  * Author:    Rostislav Lisovy <rostislav.lisovy@fel.cvut.cz>
  * Funded by: Volkswagen Group Research
  */
@@ -15,14 +15,14 @@
 #include "core.h"
 #include "rdev-ops.h"
 
-int __cfg80211_join_ocb(struct cfg80211_registered_device *rdev,
-			struct net_device *dev,
-			struct ocb_setup *setup)
+int cfg80211_join_ocb(struct cfg80211_registered_device *rdev,
+		      struct net_device *dev,
+		      struct ocb_setup *setup)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	int err;
 
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_OCB)
 		return -EOPNOTSUPP;
@@ -40,27 +40,13 @@ int __cfg80211_join_ocb(struct cfg80211_registered_device *rdev,
 	return err;
 }
 
-int cfg80211_join_ocb(struct cfg80211_registered_device *rdev,
-		      struct net_device *dev,
-		      struct ocb_setup *setup)
-{
-	struct wireless_dev *wdev = dev->ieee80211_ptr;
-	int err;
-
-	wdev_lock(wdev);
-	err = __cfg80211_join_ocb(rdev, dev, setup);
-	wdev_unlock(wdev);
-
-	return err;
-}
-
-int __cfg80211_leave_ocb(struct cfg80211_registered_device *rdev,
-			 struct net_device *dev)
+int cfg80211_leave_ocb(struct cfg80211_registered_device *rdev,
+		       struct net_device *dev)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	int err;
 
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_OCB)
 		return -EOPNOTSUPP;
@@ -77,16 +63,3 @@ int __cfg80211_leave_ocb(struct cfg80211_registered_device *rdev,
 
 	return err;
 }
-
-int cfg80211_leave_ocb(struct cfg80211_registered_device *rdev,
-		       struct net_device *dev)
-{
-	struct wireless_dev *wdev = dev->ieee80211_ptr;
-	int err;
-
-	wdev_lock(wdev);
-	err = __cfg80211_leave_ocb(rdev, dev);
-	wdev_unlock(wdev);
-
-	return err;
-}
diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c
index 9611aa0bd051..e106dcea3977 100644
--- a/net/wireless/pmsr.c
+++ b/net/wireless/pmsr.c
@@ -600,7 +600,7 @@ static void cfg80211_pmsr_process_abort(struct wireless_dev *wdev)
 	struct cfg80211_pmsr_request *req, *tmp;
 	LIST_HEAD(free_list);
 
-	lockdep_assert_held(&wdev->mtx);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	spin_lock_bh(&wdev->pmsr_lock);
 	list_for_each_entry_safe(req, tmp, &wdev->pmsr_list, list) {
@@ -623,9 +623,7 @@ void cfg80211_pmsr_free_wk(struct work_struct *work)
 						 pmsr_free_wk);
 
 	wiphy_lock(wdev->wiphy);
-	wdev_lock(wdev);
 	cfg80211_pmsr_process_abort(wdev);
-	wdev_unlock(wdev);
 	wiphy_unlock(wdev->wiphy);
 }
 
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 1cdaf273d775..f86ee1a6daad 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -2342,12 +2342,11 @@ static bool reg_wdev_chan_valid(struct wiphy *wiphy, struct wireless_dev *wdev)
 	bool ret;
 	int link;
 
-	wdev_lock(wdev);
 	iftype = wdev->iftype;
 
 	/* make sure the interface is active */
 	if (!wdev->netdev || !netif_running(wdev->netdev))
-		goto wdev_inactive_unlock;
+		return true;
 
 	for (link = 0; link < ARRAY_SIZE(wdev->links); link++) {
 		struct ieee80211_channel *chan;
@@ -2407,8 +2406,6 @@ static bool reg_wdev_chan_valid(struct wiphy *wiphy, struct wireless_dev *wdev)
 			break;
 		}
 
-		wdev_unlock(wdev);
-
 		switch (iftype) {
 		case NL80211_IFTYPE_AP:
 		case NL80211_IFTYPE_P2P_GO:
@@ -2429,16 +2426,8 @@ static bool reg_wdev_chan_valid(struct wiphy *wiphy, struct wireless_dev *wdev)
 		default:
 			break;
 		}
-
-		wdev_lock(wdev);
 	}
 
-	wdev_unlock(wdev);
-
-	return true;
-
-wdev_inactive_unlock:
-	wdev_unlock(wdev);
 	return true;
 }
 
@@ -3577,13 +3566,10 @@ static bool is_wiphy_all_set_reg_flag(enum ieee80211_regulatory_flags flag)
 	for_each_rdev(rdev) {
 		wiphy_lock(&rdev->wiphy);
 		list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
-			wdev_lock(wdev);
 			if (!(wdev->wiphy->regulatory_flags & flag)) {
-				wdev_unlock(wdev);
 				wiphy_unlock(&rdev->wiphy);
 				return false;
 			}
-			wdev_unlock(wdev);
 		}
 		wiphy_unlock(&rdev->wiphy);
 	}
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index c271f30b58fa..50fcb27e6dab 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -67,7 +67,7 @@ static int cfg80211_conn_scan(struct wireless_dev *wdev)
 	struct cfg80211_scan_request *request;
 	int n_channels, err;
 
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	if (rdev->scan_req || rdev->scan_msg)
 		return -EBUSY;
@@ -151,7 +151,7 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev,
 	struct cfg80211_assoc_request req = {};
 	int err;
 
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	if (!wdev->conn)
 		return 0;
@@ -255,16 +255,13 @@ void cfg80211_conn_work(struct work_struct *work)
 		if (!wdev->netdev)
 			continue;
 
-		wdev_lock(wdev);
-		if (!netif_running(wdev->netdev)) {
-			wdev_unlock(wdev);
+		if (!netif_running(wdev->netdev))
 			continue;
-		}
+
 		if (!wdev->conn ||
-		    wdev->conn->state == CFG80211_CONN_CONNECTED) {
-			wdev_unlock(wdev);
+		    wdev->conn->state == CFG80211_CONN_CONNECTED)
 			continue;
-		}
+
 		if (wdev->conn->params.bssid) {
 			memcpy(bssid_buf, wdev->conn->params.bssid, ETH_ALEN);
 			bssid = bssid_buf;
@@ -279,7 +276,6 @@ void cfg80211_conn_work(struct work_struct *work)
 			cr.timeout_reason = treason;
 			__cfg80211_connect_result(wdev->netdev, &cr, false);
 		}
-		wdev_unlock(wdev);
 	}
 
 	wiphy_unlock(&rdev->wiphy);
@@ -300,7 +296,7 @@ static struct cfg80211_bss *cfg80211_get_conn_bss(struct wireless_dev *wdev)
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
 	struct cfg80211_bss *bss;
 
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	bss = cfg80211_get_bss(wdev->wiphy, wdev->conn->params.channel,
 			       wdev->conn->params.bssid,
@@ -317,13 +313,13 @@ static struct cfg80211_bss *cfg80211_get_conn_bss(struct wireless_dev *wdev)
 	return bss;
 }
 
-static void __cfg80211_sme_scan_done(struct net_device *dev)
+void cfg80211_sme_scan_done(struct net_device *dev)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
 	struct cfg80211_bss *bss;
 
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	if (!wdev->conn)
 		return;
@@ -339,15 +335,6 @@ static void __cfg80211_sme_scan_done(struct net_device *dev)
 		schedule_work(&rdev->conn_work);
 }
 
-void cfg80211_sme_scan_done(struct net_device *dev)
-{
-	struct wireless_dev *wdev = dev->ieee80211_ptr;
-
-	wdev_lock(wdev);
-	__cfg80211_sme_scan_done(dev);
-	wdev_unlock(wdev);
-}
-
 void cfg80211_sme_rx_auth(struct wireless_dev *wdev, const u8 *buf, size_t len)
 {
 	struct wiphy *wiphy = wdev->wiphy;
@@ -355,7 +342,7 @@ void cfg80211_sme_rx_auth(struct wireless_dev *wdev, const u8 *buf, size_t len)
 	struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf;
 	u16 status_code = le16_to_cpu(mgmt->u.auth.status_code);
 
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	if (!wdev->conn || wdev->conn->state == CFG80211_CONN_CONNECTED)
 		return;
@@ -705,11 +692,9 @@ static bool cfg80211_is_all_idle(void)
 	for_each_rdev(rdev) {
 		wiphy_lock(&rdev->wiphy);
 		list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
-			wdev_lock(wdev);
 			if (wdev->conn || wdev->connected ||
 			    cfg80211_beaconing_iface_active(wdev))
 				is_all_idle = false;
-			wdev_unlock(wdev);
 		}
 		wiphy_unlock(&rdev->wiphy);
 	}
@@ -763,7 +748,7 @@ void __cfg80211_connect_result(struct net_device *dev,
 	const u8 *connected_addr;
 	bool bss_not_found = false;
 
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION &&
 		    wdev->iftype != NL80211_IFTYPE_P2P_CLIENT))
@@ -1095,7 +1080,7 @@ void __cfg80211_roamed(struct wireless_dev *wdev,
 	unsigned int link;
 	const u8 *connected_addr;
 
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION &&
 		    wdev->iftype != NL80211_IFTYPE_P2P_CLIENT))
@@ -1299,7 +1284,7 @@ EXPORT_SYMBOL(cfg80211_roamed);
 void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid,
 					const u8 *td_bitmap, u8 td_bitmap_len)
 {
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION &&
 		    wdev->iftype != NL80211_IFTYPE_P2P_CLIENT))
@@ -1355,7 +1340,7 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
 	union iwreq_data wrqu;
 #endif
 
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION &&
 		    wdev->iftype != NL80211_IFTYPE_P2P_CLIENT))
@@ -1445,7 +1430,7 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev,
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	int err;
 
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	/*
 	 * If we have an ssid_len, we're trying to connect or are
@@ -1551,7 +1536,7 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev,
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	int err = 0;
 
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	kfree_sensitive(wdev->connect_keys);
 	wdev->connect_keys = NULL;
@@ -1587,19 +1572,18 @@ void cfg80211_autodisconnect_wk(struct work_struct *work)
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
 
 	wiphy_lock(wdev->wiphy);
-	wdev_lock(wdev);
 
 	if (wdev->conn_owner_nlportid) {
 		switch (wdev->iftype) {
 		case NL80211_IFTYPE_ADHOC:
-			__cfg80211_leave_ibss(rdev, wdev->netdev, false);
+			cfg80211_leave_ibss(rdev, wdev->netdev, false);
 			break;
 		case NL80211_IFTYPE_AP:
 		case NL80211_IFTYPE_P2P_GO:
-			__cfg80211_stop_ap(rdev, wdev->netdev, -1, false);
+			cfg80211_stop_ap(rdev, wdev->netdev, -1, false);
 			break;
 		case NL80211_IFTYPE_MESH_POINT:
-			__cfg80211_leave_mesh(rdev, wdev->netdev);
+			cfg80211_leave_mesh(rdev, wdev->netdev);
 			break;
 		case NL80211_IFTYPE_STATION:
 		case NL80211_IFTYPE_P2P_CLIENT:
@@ -1624,6 +1608,5 @@ void cfg80211_autodisconnect_wk(struct work_struct *work)
 		}
 	}
 
-	wdev_unlock(wdev);
 	wiphy_unlock(wdev->wiphy);
 }
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 1783ab9d57a3..fff99fe43fdd 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -1044,7 +1044,6 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev)
 		list_del(&ev->list);
 		spin_unlock_irqrestore(&wdev->event_lock, flags);
 
-		wdev_lock(wdev);
 		switch (ev->type) {
 		case EVENT_CONNECT_RESULT:
 			__cfg80211_connect_result(
@@ -1066,7 +1065,7 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev)
 					       ev->ij.channel);
 			break;
 		case EVENT_STOPPED:
-			__cfg80211_leave(wiphy_to_rdev(wdev->wiphy), wdev);
+			cfg80211_leave(wiphy_to_rdev(wdev->wiphy), wdev);
 			break;
 		case EVENT_PORT_AUTHORIZED:
 			__cfg80211_port_authorized(wdev, ev->pa.bssid,
@@ -1074,7 +1073,6 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev)
 						   ev->pa.td_bitmap_len);
 			break;
 		}
-		wdev_unlock(wdev);
 
 		kfree(ev);
 
@@ -1124,9 +1122,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
 			return -EBUSY;
 
 		dev->ieee80211_ptr->use_4addr = false;
-		wdev_lock(dev->ieee80211_ptr);
 		rdev_set_qos_map(rdev, dev, NULL);
-		wdev_unlock(dev->ieee80211_ptr);
 
 		switch (otype) {
 		case NL80211_IFTYPE_AP:
@@ -1138,10 +1134,8 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
 			break;
 		case NL80211_IFTYPE_STATION:
 		case NL80211_IFTYPE_P2P_CLIENT:
-			wdev_lock(dev->ieee80211_ptr);
 			cfg80211_disconnect(rdev, dev,
 					    WLAN_REASON_DEAUTH_LEAVING, true);
-			wdev_unlock(dev->ieee80211_ptr);
 			break;
 		case NL80211_IFTYPE_MESH_POINT:
 			/* mesh should be handled? */
@@ -2647,12 +2641,12 @@ void cfg80211_remove_link(struct wireless_dev *wdev, unsigned int link_id)
 {
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
 
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	switch (wdev->iftype) {
 	case NL80211_IFTYPE_AP:
 	case NL80211_IFTYPE_P2P_GO:
-		__cfg80211_stop_ap(rdev, wdev->netdev, link_id, true);
+		cfg80211_stop_ap(rdev, wdev->netdev, link_id, true);
 		break;
 	default:
 		/* per-link not relevant */
@@ -2677,12 +2671,10 @@ void cfg80211_remove_links(struct wireless_dev *wdev)
 	if (wdev->iftype != NL80211_IFTYPE_AP)
 		return;
 
-	wdev_lock(wdev);
 	if (wdev->valid_links) {
 		for_each_valid_link(wdev, link_id)
 			cfg80211_remove_link(wdev, link_id);
 	}
-	wdev_unlock(wdev);
 }
 
 int cfg80211_remove_virtual_intf(struct cfg80211_registered_device *rdev,
diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c
index e3acfac7430a..d23ce088bffa 100644
--- a/net/wireless/wext-compat.c
+++ b/net/wireless/wext-compat.c
@@ -415,10 +415,10 @@ int cfg80211_wext_giwretry(struct net_device *dev,
 }
 EXPORT_WEXT_HANDLER(cfg80211_wext_giwretry);
 
-static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
-				     struct net_device *dev, bool pairwise,
-				     const u8 *addr, bool remove, bool tx_key,
-				     int idx, struct key_params *params)
+static int cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
+				   struct net_device *dev, bool pairwise,
+				   const u8 *addr, bool remove, bool tx_key,
+				   int idx, struct key_params *params)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	int err, i;
@@ -471,7 +471,7 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
 			 */
 			if (idx == wdev->wext.default_key &&
 			    wdev->iftype == NL80211_IFTYPE_ADHOC) {
-				__cfg80211_leave_ibss(rdev, wdev->netdev, true);
+				cfg80211_leave_ibss(rdev, wdev->netdev, true);
 				rejoin = true;
 			}
 
@@ -552,7 +552,7 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
 			 */
 			if (wdev->iftype == NL80211_IFTYPE_ADHOC &&
 			    wdev->wext.default_key == -1) {
-				__cfg80211_leave_ibss(rdev, wdev->netdev, true);
+				cfg80211_leave_ibss(rdev, wdev->netdev, true);
 				rejoin = true;
 			}
 			err = rdev_set_default_key(rdev, dev, -1, idx, true,
@@ -580,21 +580,6 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
 	return 0;
 }
 
-static int cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
-				   struct net_device *dev, bool pairwise,
-				   const u8 *addr, bool remove, bool tx_key,
-				   int idx, struct key_params *params)
-{
-	int err;
-
-	wdev_lock(dev->ieee80211_ptr);
-	err = __cfg80211_set_encryption(rdev, dev, pairwise, addr,
-					remove, tx_key, idx, params);
-	wdev_unlock(dev->ieee80211_ptr);
-
-	return err;
-}
-
 static int cfg80211_wext_siwencode(struct net_device *dev,
 				   struct iw_request_info *info,
 				   union iwreq_data *wrqu, char *keybuf)
@@ -639,7 +624,6 @@ static int cfg80211_wext_siwencode(struct net_device *dev,
 	else if (erq->length == 0) {
 		/* No key data - just set the default TX key index */
 		err = 0;
-		wdev_lock(wdev);
 		if (wdev->connected ||
 		    (wdev->iftype == NL80211_IFTYPE_ADHOC &&
 		     wdev->u.ibss.current_bss))
@@ -647,7 +631,6 @@ static int cfg80211_wext_siwencode(struct net_device *dev,
 						   true);
 		if (!err)
 			wdev->wext.default_key = idx;
-		wdev_unlock(wdev);
 		goto out;
 	}
 
@@ -697,12 +680,8 @@ static int cfg80211_wext_siwencodeext(struct net_device *dev,
 	    !rdev->ops->set_default_key)
 		return -EOPNOTSUPP;
 
-	wdev_lock(wdev);
-	if (wdev->valid_links) {
-		wdev_unlock(wdev);
+	if (wdev->valid_links)
 		return -EOPNOTSUPP;
-	}
-	wdev_unlock(wdev);
 
 	switch (ext->alg) {
 	case IW_ENCODE_ALG_NONE:
@@ -1341,13 +1320,11 @@ static int cfg80211_wext_giwrate(struct net_device *dev,
 		return -EOPNOTSUPP;
 
 	err = 0;
-	wdev_lock(wdev);
 	if (!wdev->valid_links && wdev->links[0].client.current_bss)
 		memcpy(addr, wdev->links[0].client.current_bss->pub.bssid,
 		       ETH_ALEN);
 	else
 		err = -EOPNOTSUPP;
-	wdev_unlock(wdev);
 	if (err)
 		return err;
 
@@ -1387,17 +1364,15 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev)
 		return NULL;
 
 	/* Grab BSSID of current BSS, if any */
-	wdev_lock(wdev);
+	wiphy_lock(&rdev->wiphy);
 	if (wdev->valid_links || !wdev->links[0].client.current_bss) {
-		wdev_unlock(wdev);
+		wiphy_unlock(&rdev->wiphy);
 		return NULL;
 	}
 	memcpy(bssid, wdev->links[0].client.current_bss->pub.bssid, ETH_ALEN);
-	wdev_unlock(wdev);
 
 	memset(&sinfo, 0, sizeof(sinfo));
 
-	wiphy_lock(&rdev->wiphy);
 	ret = rdev_get_station(rdev, dev, bssid, &sinfo);
 	wiphy_unlock(&rdev->wiphy);
 
diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c
index f3eaa3388694..8edd9ada69d0 100644
--- a/net/wireless/wext-sme.c
+++ b/net/wireless/wext-sme.c
@@ -23,7 +23,7 @@ int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev,
 	int err, i;
 
 	ASSERT_RTNL();
-	ASSERT_WDEV_LOCK(wdev);
+	lockdep_assert_wiphy(wdev->wiphy);
 
 	if (!netif_running(wdev->netdev))
 		return 0;
@@ -87,15 +87,11 @@ int cfg80211_mgd_wext_siwfreq(struct net_device *dev,
 			return -EINVAL;
 	}
 
-	wdev_lock(wdev);
-
 	if (wdev->conn) {
 		bool event = true;
 
-		if (wdev->wext.connect.channel == chan) {
-			err = 0;
-			goto out;
-		}
+		if (wdev->wext.connect.channel == chan)
+			return 0;
 
 		/* if SSID set, we'll try right again, avoid event */
 		if (wdev->wext.connect.ssid_len)
@@ -103,14 +99,11 @@ int cfg80211_mgd_wext_siwfreq(struct net_device *dev,
 		err = cfg80211_disconnect(rdev, dev,
 					  WLAN_REASON_DEAUTH_LEAVING, event);
 		if (err)
-			goto out;
+			return err;
 	}
 
 	wdev->wext.connect.channel = chan;
-	err = cfg80211_mgd_wext_connect(rdev, wdev);
- out:
-	wdev_unlock(wdev);
-	return err;
+	return cfg80211_mgd_wext_connect(rdev, wdev);
 }
 
 int cfg80211_mgd_wext_giwfreq(struct net_device *dev,
@@ -127,12 +120,10 @@ int cfg80211_mgd_wext_giwfreq(struct net_device *dev,
 	if (wdev->valid_links)
 		return -EOPNOTSUPP;
 
-	wdev_lock(wdev);
 	if (wdev->links[0].client.current_bss)
 		chan = wdev->links[0].client.current_bss->pub.channel;
 	else if (wdev->wext.connect.channel)
 		chan = wdev->wext.connect.channel;
-	wdev_unlock(wdev);
 
 	if (chan) {
 		freq->m = chan->center_freq;
@@ -164,17 +155,13 @@ int cfg80211_mgd_wext_siwessid(struct net_device *dev,
 	if (len > 0 && ssid[len - 1] == '\0')
 		len--;
 
-	wdev_lock(wdev);
-
-	err = 0;
-
 	if (wdev->conn) {
 		bool event = true;
 
 		if (wdev->wext.connect.ssid && len &&
 		    len == wdev->wext.connect.ssid_len &&
 		    memcmp(wdev->wext.connect.ssid, ssid, len) == 0)
-			goto out;
+			return 0;
 
 		/* if SSID set now, we'll try to connect, avoid event */
 		if (len)
@@ -182,7 +169,7 @@ int cfg80211_mgd_wext_siwessid(struct net_device *dev,
 		err = cfg80211_disconnect(rdev, dev,
 					  WLAN_REASON_DEAUTH_LEAVING, event);
 		if (err)
-			goto out;
+			return err;
 	}
 
 	wdev->wext.prev_bssid_valid = false;
@@ -194,10 +181,7 @@ int cfg80211_mgd_wext_siwessid(struct net_device *dev,
 	wdev->wext.connect.crypto.control_port_ethertype =
 					cpu_to_be16(ETH_P_PAE);
 
-	err = cfg80211_mgd_wext_connect(rdev, wdev);
- out:
-	wdev_unlock(wdev);
-	return err;
+	return cfg80211_mgd_wext_connect(rdev, wdev);
 }
 
 int cfg80211_mgd_wext_giwessid(struct net_device *dev,
@@ -216,7 +200,6 @@ int cfg80211_mgd_wext_giwessid(struct net_device *dev,
 
 	data->flags = 0;
 
-	wdev_lock(wdev);
 	if (wdev->links[0].client.current_bss) {
 		const struct element *ssid_elem;
 
@@ -238,7 +221,6 @@ int cfg80211_mgd_wext_giwessid(struct net_device *dev,
 		data->length = wdev->wext.connect.ssid_len;
 		memcpy(ssid, wdev->wext.connect.ssid, data->length);
 	}
-	wdev_unlock(wdev);
 
 	return ret;
 }
@@ -263,23 +245,20 @@ int cfg80211_mgd_wext_siwap(struct net_device *dev,
 	if (is_zero_ether_addr(bssid) || is_broadcast_ether_addr(bssid))
 		bssid = NULL;
 
-	wdev_lock(wdev);
-
 	if (wdev->conn) {
-		err = 0;
 		/* both automatic */
 		if (!bssid && !wdev->wext.connect.bssid)
-			goto out;
+			return 0;
 
 		/* fixed already - and no change */
 		if (wdev->wext.connect.bssid && bssid &&
 		    ether_addr_equal(bssid, wdev->wext.connect.bssid))
-			goto out;
+			return 0;
 
 		err = cfg80211_disconnect(rdev, dev,
 					  WLAN_REASON_DEAUTH_LEAVING, false);
 		if (err)
-			goto out;
+			return err;
 	}
 
 	if (bssid) {
@@ -288,10 +267,7 @@ int cfg80211_mgd_wext_siwap(struct net_device *dev,
 	} else
 		wdev->wext.connect.bssid = NULL;
 
-	err = cfg80211_mgd_wext_connect(rdev, wdev);
- out:
-	wdev_unlock(wdev);
-	return err;
+	return cfg80211_mgd_wext_connect(rdev, wdev);
 }
 
 int cfg80211_mgd_wext_giwap(struct net_device *dev,
@@ -306,18 +282,15 @@ int cfg80211_mgd_wext_giwap(struct net_device *dev,
 
 	ap_addr->sa_family = ARPHRD_ETHER;
 
-	wdev_lock(wdev);
-	if (wdev->valid_links) {
-		wdev_unlock(wdev);
+	if (wdev->valid_links)
 		return -EOPNOTSUPP;
-	}
+
 	if (wdev->links[0].client.current_bss)
 		memcpy(ap_addr->sa_data,
 		       wdev->links[0].client.current_bss->pub.bssid,
 		       ETH_ALEN);
 	else
 		eth_zero_addr(ap_addr->sa_data);
-	wdev_unlock(wdev);
 
 	return 0;
 }
@@ -339,7 +312,6 @@ int cfg80211_wext_siwgenie(struct net_device *dev,
 		ie = NULL;
 
 	wiphy_lock(wdev->wiphy);
-	wdev_lock(wdev);
 
 	/* no change */
 	err = 0;
@@ -370,7 +342,6 @@ int cfg80211_wext_siwgenie(struct net_device *dev,
 	/* userspace better not think we'll reconnect */
 	err = 0;
  out:
-	wdev_unlock(wdev);
 	wiphy_unlock(wdev->wiphy);
 	return err;
 }
@@ -396,7 +367,6 @@ int cfg80211_wext_siwmlme(struct net_device *dev,
 		return -EINVAL;
 
 	wiphy_lock(&rdev->wiphy);
-	wdev_lock(wdev);
 	switch (mlme->cmd) {
 	case IW_MLME_DEAUTH:
 	case IW_MLME_DISASSOC:
@@ -406,7 +376,6 @@ int cfg80211_wext_siwmlme(struct net_device *dev,
 		err = -EOPNOTSUPP;
 		break;
 	}
-	wdev_unlock(wdev);
 	wiphy_unlock(&rdev->wiphy);
 
 	return err;
-- 
cgit v1.2.3-70-g09d2


From 2ae5c9248e06dac2c2360be26b4e25f673238337 Mon Sep 17 00:00:00 2001
From: Jeff Johnson <quic_jjohnson@quicinc.com>
Date: Thu, 31 Aug 2023 11:22:58 -0700
Subject: wifi: mac80211: Use flexible array in struct ieee80211_tim_ie

Currently struct ieee80211_tim_ie defines:
	u8 virtual_map[1];

Per the guidance in [1] change this to be a flexible array.

Per the discussion in [2] wrap the virtual_map in a union with a u8
item in order to preserve the existing expectation that the
virtual_map must contain at least one octet (at least when used in a
non-S1G PPDU). This means that no driver changes are required.

[1] https://docs.kernel.org/process/deprecated.html#zero-length-and-one-element-arrays
[2] https://lore.kernel.org/linux-wireless/202308301529.AC90A9EF98@keescook/

Suggested-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Jeff Johnson <quic_jjohnson@quicinc.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20230831-ieee80211_tim_ie-v3-2-e10ff584ab5d@quicinc.com
[add wifi prefix]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index bd2f6e19c357..340d7e0f6bf7 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -951,17 +951,24 @@ struct ieee80211_wide_bw_chansw_ie {
  * @dtim_count: DTIM Count
  * @dtim_period: DTIM Period
  * @bitmap_ctrl: Bitmap Control
+ * @required_octet: "Syntatic sugar" to force the struct size to the
+ *                  minimum valid size when carried in a non-S1G PPDU
  * @virtual_map: Partial Virtual Bitmap
  *
  * This structure represents the payload of the "TIM element" as
- * described in IEEE Std 802.11-2020 section 9.4.2.5.
+ * described in IEEE Std 802.11-2020 section 9.4.2.5. Note that this
+ * definition is only applicable when the element is carried in a
+ * non-S1G PPDU. When the TIM is carried in an S1G PPDU, the Bitmap
+ * Control and Partial Virtual Bitmap may not be present.
  */
 struct ieee80211_tim_ie {
 	u8 dtim_count;
 	u8 dtim_period;
 	u8 bitmap_ctrl;
-	/* variable size: 1 - 251 bytes */
-	u8 virtual_map[1];
+	union {
+		u8 required_octet;
+		DECLARE_FLEX_ARRAY(u8, virtual_map);
+	};
 } __packed;
 
 /**
-- 
cgit v1.2.3-70-g09d2


From e8c1841278a78362f7034f3de415096ddb19f097 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 28 Aug 2023 09:54:39 +0200
Subject: wifi: cfg80211: annotate iftype_data pointer with sparse

There were are a number of cases in mac80211 and iwlwifi (at
least) that used the sband->iftype_data pointer directly,
instead of using the accessors to find the right array entry
to use.

Make sparse warn when such a thing is done.

To not have a lot of casts, add two helper functions/macros

 - ieee80211_set_sband_iftype_data()
 - for_each_sband_iftype_data()

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath11k/mac.c              | 15 +++---
 drivers/net/wireless/ath/ath12k/mac.c              | 12 ++---
 drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c |  4 +-
 drivers/net/wireless/mediatek/mt76/mt7915/init.c   |  9 ++--
 drivers/net/wireless/mediatek/mt76/mt7921/main.c   |  9 ++--
 drivers/net/wireless/mediatek/mt76/mt7996/init.c   |  3 +-
 drivers/net/wireless/quantenna/qtnfmac/commands.c  |  5 +-
 drivers/net/wireless/quantenna/qtnfmac/core.c      |  2 +-
 drivers/net/wireless/realtek/rtw89/core.c          | 15 +++---
 drivers/net/wireless/realtek/rtw89/regd.c          |  2 +-
 drivers/net/wireless/virtual/mac80211_hwsim.c      | 30 +++++------
 include/net/cfg80211.h                             | 59 ++++++++++++++++++++--
 net/mac80211/main.c                                |  7 +--
 net/wireless/chan.c                                |  5 +-
 net/wireless/core.c                                |  8 ++-
 net/wireless/nl80211.c                             |  6 +--
 16 files changed, 112 insertions(+), 79 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c
index c071bf5841af..6ed036b51dba 100644
--- a/drivers/net/wireless/ath/ath11k/mac.c
+++ b/drivers/net/wireless/ath/ath11k/mac.c
@@ -5893,8 +5893,9 @@ static void ath11k_mac_setup_he_cap(struct ath11k *ar,
 					       ar->mac.iftype[NL80211_BAND_2GHZ],
 					       NL80211_BAND_2GHZ);
 		band = &ar->mac.sbands[NL80211_BAND_2GHZ];
-		band->iftype_data = ar->mac.iftype[NL80211_BAND_2GHZ];
-		band->n_iftype_data = count;
+		_ieee80211_set_sband_iftype_data(band,
+						 ar->mac.iftype[NL80211_BAND_2GHZ],
+						 count);
 	}
 
 	if (cap->supported_bands & WMI_HOST_WLAN_5G_CAP) {
@@ -5902,8 +5903,9 @@ static void ath11k_mac_setup_he_cap(struct ath11k *ar,
 					       ar->mac.iftype[NL80211_BAND_5GHZ],
 					       NL80211_BAND_5GHZ);
 		band = &ar->mac.sbands[NL80211_BAND_5GHZ];
-		band->iftype_data = ar->mac.iftype[NL80211_BAND_5GHZ];
-		band->n_iftype_data = count;
+		_ieee80211_set_sband_iftype_data(band,
+						 ar->mac.iftype[NL80211_BAND_5GHZ],
+						 count);
 	}
 
 	if (cap->supported_bands & WMI_HOST_WLAN_5G_CAP &&
@@ -5912,8 +5914,9 @@ static void ath11k_mac_setup_he_cap(struct ath11k *ar,
 					       ar->mac.iftype[NL80211_BAND_6GHZ],
 					       NL80211_BAND_6GHZ);
 		band = &ar->mac.sbands[NL80211_BAND_6GHZ];
-		band->iftype_data = ar->mac.iftype[NL80211_BAND_6GHZ];
-		band->n_iftype_data = count;
+		_ieee80211_set_sband_iftype_data(band,
+						 ar->mac.iftype[NL80211_BAND_6GHZ],
+						 count);
 	}
 }
 
diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c
index 88346e66bb75..24113709972d 100644
--- a/drivers/net/wireless/ath/ath12k/mac.c
+++ b/drivers/net/wireless/ath/ath12k/mac.c
@@ -4647,8 +4647,8 @@ static void ath12k_mac_setup_sband_iftype_data(struct ath12k *ar,
 							  ar->mac.iftype[band],
 							  band);
 		sband = &ar->mac.sbands[band];
-		sband->iftype_data = ar->mac.iftype[band];
-		sband->n_iftype_data = count;
+		_ieee80211_set_sband_iftype_data(sband, ar->mac.iftype[band],
+						 count);
 	}
 
 	if (cap->supported_bands & WMI_HOST_WLAN_5G_CAP) {
@@ -4657,8 +4657,8 @@ static void ath12k_mac_setup_sband_iftype_data(struct ath12k *ar,
 							  ar->mac.iftype[band],
 							  band);
 		sband = &ar->mac.sbands[band];
-		sband->iftype_data = ar->mac.iftype[band];
-		sband->n_iftype_data = count;
+		_ieee80211_set_sband_iftype_data(sband, ar->mac.iftype[band],
+						 count);
 	}
 
 	if (cap->supported_bands & WMI_HOST_WLAN_5G_CAP &&
@@ -4668,8 +4668,8 @@ static void ath12k_mac_setup_sband_iftype_data(struct ath12k *ar,
 							  ar->mac.iftype[band],
 							  band);
 		sband = &ar->mac.sbands[band];
-		sband->iftype_data = ar->mac.iftype[band];
-		sband->n_iftype_data = count;
+		_ieee80211_set_sband_iftype_data(sband, ar->mac.iftype[band],
+						 count);
 	}
 }
 
diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c
index 31176897b746..cff1f97536e3 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c
@@ -1077,8 +1077,8 @@ static void iwl_init_he_hw_capab(struct iwl_trans *trans,
 
 	memcpy(iftype_data, iwl_he_eht_capa, sizeof(iwl_he_eht_capa));
 
-	sband->iftype_data = iftype_data;
-	sband->n_iftype_data = ARRAY_SIZE(iwl_he_eht_capa);
+	_ieee80211_set_sband_iftype_data(sband, iftype_data,
+					 ARRAY_SIZE(iwl_he_eht_capa));
 
 	for (i = 0; i < sband->n_iftype_data; i++)
 		iwl_nvm_fixup_sband_iftd(trans, data, sband, &iftype_data[i],
diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/init.c b/drivers/net/wireless/mediatek/mt76/mt7915/init.c
index 35fdf4f98d80..b27d04e02aba 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7915/init.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7915/init.c
@@ -1127,8 +1127,7 @@ void mt7915_set_stream_he_caps(struct mt7915_phy *phy)
 		n = mt7915_init_he_caps(phy, NL80211_BAND_2GHZ, data);
 
 		band = &phy->mt76->sband_2g.sband;
-		band->iftype_data = data;
-		band->n_iftype_data = n;
+		_ieee80211_set_sband_iftype_data(band, data, n);
 	}
 
 	if (phy->mt76->cap.has_5ghz) {
@@ -1136,8 +1135,7 @@ void mt7915_set_stream_he_caps(struct mt7915_phy *phy)
 		n = mt7915_init_he_caps(phy, NL80211_BAND_5GHZ, data);
 
 		band = &phy->mt76->sband_5g.sband;
-		band->iftype_data = data;
-		band->n_iftype_data = n;
+		_ieee80211_set_sband_iftype_data(band, data, n);
 	}
 
 	if (phy->mt76->cap.has_6ghz) {
@@ -1145,8 +1143,7 @@ void mt7915_set_stream_he_caps(struct mt7915_phy *phy)
 		n = mt7915_init_he_caps(phy, NL80211_BAND_6GHZ, data);
 
 		band = &phy->mt76->sband_6g.sband;
-		band->iftype_data = data;
-		band->n_iftype_data = n;
+		_ieee80211_set_sband_iftype_data(band, data, n);
 	}
 }
 
diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
index 0844d28b3223..62e6da1386aa 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
@@ -196,8 +196,7 @@ void mt7921_set_stream_he_caps(struct mt792x_phy *phy)
 		n = mt7921_init_he_caps(phy, NL80211_BAND_2GHZ, data);
 
 		band = &phy->mt76->sband_2g.sband;
-		band->iftype_data = data;
-		band->n_iftype_data = n;
+		_ieee80211_set_sband_iftype_data(band, data, n);
 	}
 
 	if (phy->mt76->cap.has_5ghz) {
@@ -205,16 +204,14 @@ void mt7921_set_stream_he_caps(struct mt792x_phy *phy)
 		n = mt7921_init_he_caps(phy, NL80211_BAND_5GHZ, data);
 
 		band = &phy->mt76->sband_5g.sband;
-		band->iftype_data = data;
-		band->n_iftype_data = n;
+		_ieee80211_set_sband_iftype_data(band, data, n);
 
 		if (phy->mt76->cap.has_6ghz) {
 			data = phy->iftype[NL80211_BAND_6GHZ];
 			n = mt7921_init_he_caps(phy, NL80211_BAND_6GHZ, data);
 
 			band = &phy->mt76->sband_6g.sband;
-			band->iftype_data = data;
-			band->n_iftype_data = n;
+			_ieee80211_set_sband_iftype_data(band, data, n);
 		}
 	}
 }
diff --git a/drivers/net/wireless/mediatek/mt76/mt7996/init.c b/drivers/net/wireless/mediatek/mt76/mt7996/init.c
index 26e03b28935f..0d6cc214ce10 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7996/init.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7996/init.c
@@ -827,8 +827,7 @@ __mt7996_set_stream_he_eht_caps(struct mt7996_phy *phy,
 		n++;
 	}
 
-	sband->iftype_data = data;
-	sband->n_iftype_data = n;
+	_ieee80211_set_sband_iftype_data(sband, data, n);
 }
 
 void mt7996_set_stream_he_eht_caps(struct mt7996_phy *phy)
diff --git a/drivers/net/wireless/quantenna/qtnfmac/commands.c b/drivers/net/wireless/quantenna/qtnfmac/commands.c
index 68ae9c7ea95a..9540ad6196d7 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/commands.c
+++ b/drivers/net/wireless/quantenna/qtnfmac/commands.c
@@ -1335,7 +1335,7 @@ static int qtnf_cmd_band_fill_iftype(const u8 *data,
 		return -EINVAL;
 	}
 
-	kfree(band->iftype_data);
+	kfree((__force void *)band->iftype_data);
 	band->iftype_data = NULL;
 	band->n_iftype_data = tlv->n_iftype_data;
 	if (band->n_iftype_data == 0)
@@ -1347,7 +1347,8 @@ static int qtnf_cmd_band_fill_iftype(const u8 *data,
 		band->n_iftype_data = 0;
 		return -ENOMEM;
 	}
-	band->iftype_data = iftype_data;
+
+	_ieee80211_set_sband_iftype_data(band, iftype_data, tlv->n_iftype_data);
 
 	for (i = 0; i < band->n_iftype_data; i++)
 		qtnf_cmd_conv_iftype(iftype_data++, &tlv->iftype_data[i]);
diff --git a/drivers/net/wireless/quantenna/qtnfmac/core.c b/drivers/net/wireless/quantenna/qtnfmac/core.c
index 2a63ffdc4b2c..677bac835330 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/core.c
+++ b/drivers/net/wireless/quantenna/qtnfmac/core.c
@@ -535,7 +535,7 @@ static void qtnf_core_mac_detach(struct qtnf_bus *bus, unsigned int macid)
 		if (!wiphy->bands[band])
 			continue;
 
-		kfree(wiphy->bands[band]->iftype_data);
+		kfree((__force void *)wiphy->bands[band]->iftype_data);
 		wiphy->bands[band]->n_iftype_data = 0;
 
 		kfree(wiphy->bands[band]->channels);
diff --git a/drivers/net/wireless/realtek/rtw89/core.c b/drivers/net/wireless/realtek/rtw89/core.c
index 856f3543eff2..fc686954b3dd 100644
--- a/drivers/net/wireless/realtek/rtw89/core.c
+++ b/drivers/net/wireless/realtek/rtw89/core.c
@@ -3359,8 +3359,7 @@ static void rtw89_init_he_cap(struct rtw89_dev *rtwdev,
 		idx++;
 	}
 
-	sband->iftype_data = iftype_data;
-	sband->n_iftype_data = idx;
+	_ieee80211_set_sband_iftype_data(sband, iftype_data, idx);
 }
 
 static int rtw89_core_set_supported_band(struct rtw89_dev *rtwdev)
@@ -3405,11 +3404,11 @@ err:
 	hw->wiphy->bands[NL80211_BAND_5GHZ] = NULL;
 	hw->wiphy->bands[NL80211_BAND_6GHZ] = NULL;
 	if (sband_2ghz)
-		kfree(sband_2ghz->iftype_data);
+		kfree((__force void *)sband_2ghz->iftype_data);
 	if (sband_5ghz)
-		kfree(sband_5ghz->iftype_data);
+		kfree((__force void *)sband_5ghz->iftype_data);
 	if (sband_6ghz)
-		kfree(sband_6ghz->iftype_data);
+		kfree((__force void *)sband_6ghz->iftype_data);
 	kfree(sband_2ghz);
 	kfree(sband_5ghz);
 	kfree(sband_6ghz);
@@ -3421,11 +3420,11 @@ static void rtw89_core_clr_supported_band(struct rtw89_dev *rtwdev)
 	struct ieee80211_hw *hw = rtwdev->hw;
 
 	if (hw->wiphy->bands[NL80211_BAND_2GHZ])
-		kfree(hw->wiphy->bands[NL80211_BAND_2GHZ]->iftype_data);
+		kfree((__force void *)hw->wiphy->bands[NL80211_BAND_2GHZ]->iftype_data);
 	if (hw->wiphy->bands[NL80211_BAND_5GHZ])
-		kfree(hw->wiphy->bands[NL80211_BAND_5GHZ]->iftype_data);
+		kfree((__force void *)hw->wiphy->bands[NL80211_BAND_5GHZ]->iftype_data);
 	if (hw->wiphy->bands[NL80211_BAND_6GHZ])
-		kfree(hw->wiphy->bands[NL80211_BAND_6GHZ]->iftype_data);
+		kfree((__force void *)hw->wiphy->bands[NL80211_BAND_6GHZ]->iftype_data);
 	kfree(hw->wiphy->bands[NL80211_BAND_2GHZ]);
 	kfree(hw->wiphy->bands[NL80211_BAND_5GHZ]);
 	kfree(hw->wiphy->bands[NL80211_BAND_6GHZ]);
diff --git a/drivers/net/wireless/realtek/rtw89/regd.c b/drivers/net/wireless/realtek/rtw89/regd.c
index 9e2328db1865..c956a8b971c6 100644
--- a/drivers/net/wireless/realtek/rtw89/regd.c
+++ b/drivers/net/wireless/realtek/rtw89/regd.c
@@ -377,7 +377,7 @@ bottom:
 		return;
 
 	wiphy->bands[NL80211_BAND_6GHZ] = NULL;
-	kfree(sband->iftype_data);
+	kfree((__force void *)sband->iftype_data);
 	kfree(sband);
 }
 
diff --git a/drivers/net/wireless/virtual/mac80211_hwsim.c b/drivers/net/wireless/virtual/mac80211_hwsim.c
index f5a0880da3fc..36f2d2388ddd 100644
--- a/drivers/net/wireless/virtual/mac80211_hwsim.c
+++ b/drivers/net/wireless/virtual/mac80211_hwsim.c
@@ -4899,25 +4899,19 @@ static const struct ieee80211_sband_iftype_data sband_capa_6ghz[] = {
 
 static void mac80211_hwsim_sband_capab(struct ieee80211_supported_band *sband)
 {
-	u16 n_iftype_data;
-
-	if (sband->band == NL80211_BAND_2GHZ) {
-		n_iftype_data = ARRAY_SIZE(sband_capa_2ghz);
-		sband->iftype_data =
-			(struct ieee80211_sband_iftype_data *)sband_capa_2ghz;
-	} else if (sband->band == NL80211_BAND_5GHZ) {
-		n_iftype_data = ARRAY_SIZE(sband_capa_5ghz);
-		sband->iftype_data =
-			(struct ieee80211_sband_iftype_data *)sband_capa_5ghz;
-	} else if (sband->band == NL80211_BAND_6GHZ) {
-		n_iftype_data = ARRAY_SIZE(sband_capa_6ghz);
-		sband->iftype_data =
-			(struct ieee80211_sband_iftype_data *)sband_capa_6ghz;
-	} else {
-		return;
+	switch (sband->band) {
+	case NL80211_BAND_2GHZ:
+		ieee80211_set_sband_iftype_data(sband, sband_capa_2ghz);
+		break;
+	case NL80211_BAND_5GHZ:
+		ieee80211_set_sband_iftype_data(sband, sband_capa_5ghz);
+		break;
+	case NL80211_BAND_6GHZ:
+		ieee80211_set_sband_iftype_data(sband, sband_capa_6ghz);
+		break;
+	default:
+		break;
 	}
-
-	sband->n_iftype_data = n_iftype_data;
 }
 
 #ifdef CONFIG_MAC80211_MESH
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index aa9c26a03f30..922fd9e0d9b4 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -410,6 +410,19 @@ struct ieee80211_sta_eht_cap {
 	u8 eht_ppe_thres[IEEE80211_EHT_PPE_THRES_MAX_LEN];
 };
 
+/* sparse defines __CHECKER__; see Documentation/dev-tools/sparse.rst */
+#ifdef __CHECKER__
+/*
+ * This is used to mark the sband->iftype_data pointer which is supposed
+ * to be an array with special access semantics (per iftype), but a lot
+ * of code got it wrong in the past, so with this marking sparse will be
+ * noisy when the pointer is used directly.
+ */
+# define __iftd		__attribute__((noderef, address_space(__iftype_data)))
+#else
+# define __iftd
+#endif /* __CHECKER__ */
+
 /**
  * struct ieee80211_sband_iftype_data - sband data per interface type
  *
@@ -543,9 +556,47 @@ struct ieee80211_supported_band {
 	struct ieee80211_sta_s1g_cap s1g_cap;
 	struct ieee80211_edmg edmg_cap;
 	u16 n_iftype_data;
-	const struct ieee80211_sband_iftype_data *iftype_data;
+	const struct ieee80211_sband_iftype_data __iftd *iftype_data;
 };
 
+/**
+ * _ieee80211_set_sband_iftype_data - set sband iftype data array
+ * @sband: the sband to initialize
+ * @iftd: the iftype data array pointer
+ * @n_iftd: the length of the iftype data array
+ *
+ * Set the sband iftype data array; use this where the length cannot
+ * be derived from the ARRAY_SIZE() of the argument, but prefer
+ * ieee80211_set_sband_iftype_data() where it can be used.
+ */
+static inline void
+_ieee80211_set_sband_iftype_data(struct ieee80211_supported_band *sband,
+				 const struct ieee80211_sband_iftype_data *iftd,
+				 u16 n_iftd)
+{
+	sband->iftype_data = (const void __iftd __force *)iftd;
+	sband->n_iftype_data = n_iftd;
+}
+
+/**
+ * ieee80211_set_sband_iftype_data - set sband iftype data array
+ * @sband: the sband to initialize
+ * @iftd: the iftype data array
+ */
+#define ieee80211_set_sband_iftype_data(sband, iftd)	\
+	_ieee80211_set_sband_iftype_data(sband, iftd, ARRAY_SIZE(iftd))
+
+/**
+ * for_each_sband_iftype_data - iterate sband iftype data entries
+ * @sband: the sband whose iftype_data array to iterate
+ * @i: iterator counter
+ * @iftd: iftype data pointer to set
+ */
+#define for_each_sband_iftype_data(sband, i, iftd)				\
+	for (i = 0, iftd = (const void __force *)&(sband)->iftype_data[i];	\
+	     i < (sband)->n_iftype_data;					\
+	     i++, iftd = (const void __force *)&(sband)->iftype_data[i])
+
 /**
  * ieee80211_get_sband_iftype_data - return sband data for a given iftype
  * @sband: the sband to search for the STA on
@@ -557,6 +608,7 @@ static inline const struct ieee80211_sband_iftype_data *
 ieee80211_get_sband_iftype_data(const struct ieee80211_supported_band *sband,
 				u8 iftype)
 {
+	const struct ieee80211_sband_iftype_data *data;
 	int i;
 
 	if (WARN_ON(iftype >= NL80211_IFTYPE_MAX))
@@ -565,10 +617,7 @@ ieee80211_get_sband_iftype_data(const struct ieee80211_supported_band *sband,
 	if (iftype == NL80211_IFTYPE_AP_VLAN)
 		iftype = NL80211_IFTYPE_AP;
 
-	for (i = 0; i < sband->n_iftype_data; i++)  {
-		const struct ieee80211_sband_iftype_data *data =
-			&sband->iftype_data[i];
-
+	for_each_sband_iftype_data(sband, i, data) {
 		if (data->types_mask & BIT(iftype))
 			return data;
 	}
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index eabf6c1bf3ff..bf8f72c412ee 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -1066,6 +1066,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
 	supp_he = false;
 	supp_eht = false;
 	for (band = 0; band < NUM_NL80211_BANDS; band++) {
+		const struct ieee80211_sband_iftype_data *iftd;
 		struct ieee80211_supported_band *sband;
 
 		sband = local->hw.wiphy->bands[band];
@@ -1112,11 +1113,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
 		supp_ht = supp_ht || sband->ht_cap.ht_supported;
 		supp_vht = supp_vht || sband->vht_cap.vht_supported;
 
-		for (i = 0; i < sband->n_iftype_data; i++) {
-			const struct ieee80211_sband_iftype_data *iftd;
-
-			iftd = &sband->iftype_data[i];
-
+		for_each_sband_iftype_data(sband, i, iftd) {
 			supp_he = supp_he || iftd->he_cap.has_he;
 			supp_eht = supp_eht || iftd->eht_cap.has_eht;
 		}
diff --git a/net/wireless/chan.c b/net/wireless/chan.c
index 2af3aaee7493..842190dfa100 100644
--- a/net/wireless/chan.c
+++ b/net/wireless/chan.c
@@ -6,7 +6,7 @@
  *
  * Copyright 2009	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
- * Copyright 2018-2022	Intel Corporation
+ * Copyright 2018-2023	Intel Corporation
  */
 
 #include <linux/export.h>
@@ -1162,8 +1162,7 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy,
 		if (!sband)
 			return false;
 
-		for (i = 0; i < sband->n_iftype_data; i++) {
-			iftd = &sband->iftype_data[i];
+		for_each_sband_iftype_data(sband, i, iftd) {
 			if (!iftd->eht_cap.has_eht)
 				continue;
 
diff --git a/net/wireless/core.c b/net/wireless/core.c
index c419177278da..b0f6ae3ce78f 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -5,7 +5,7 @@
  * Copyright 2006-2010		Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright 2015-2017	Intel Deutschland GmbH
- * Copyright (C) 2018-2022 Intel Corporation
+ * Copyright (C) 2018-2023 Intel Corporation
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -821,6 +821,7 @@ int wiphy_register(struct wiphy *wiphy)
 
 	/* sanity check supported bands/channels */
 	for (band = 0; band < NUM_NL80211_BANDS; band++) {
+		const struct ieee80211_sband_iftype_data *iftd;
 		u16 types = 0;
 		bool have_he = false;
 
@@ -877,14 +878,11 @@ int wiphy_register(struct wiphy *wiphy)
 				return -EINVAL;
 		}
 
-		for (i = 0; i < sband->n_iftype_data; i++) {
-			const struct ieee80211_sband_iftype_data *iftd;
+		for_each_sband_iftype_data(sband, i, iftd) {
 			bool has_ap, has_non_ap;
 			u32 ap_bits = BIT(NL80211_IFTYPE_AP) |
 				      BIT(NL80211_IFTYPE_P2P_GO);
 
-			iftd = &sband->iftype_data[i];
-
 			if (WARN_ON(!iftd->types_mask))
 				return -EINVAL;
 			if (WARN_ON(types & iftd->types_mask))
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 71a0a6e34bdb..ab0aea7dca7d 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -1913,20 +1913,20 @@ static int nl80211_send_band_rateinfo(struct sk_buff *msg,
 		struct nlattr *nl_iftype_data =
 			nla_nest_start_noflag(msg,
 					      NL80211_BAND_ATTR_IFTYPE_DATA);
+		const struct ieee80211_sband_iftype_data *iftd;
 		int err;
 
 		if (!nl_iftype_data)
 			return -ENOBUFS;
 
-		for (i = 0; i < sband->n_iftype_data; i++) {
+		for_each_sband_iftype_data(sband, i, iftd) {
 			struct nlattr *iftdata;
 
 			iftdata = nla_nest_start_noflag(msg, i + 1);
 			if (!iftdata)
 				return -ENOBUFS;
 
-			err = nl80211_send_iftype_data(msg, sband,
-						       &sband->iftype_data[i]);
+			err = nl80211_send_iftype_data(msg, sband, iftd);
 			if (err)
 				return err;
 
-- 
cgit v1.2.3-70-g09d2


From a469a5938d1fd98e50119893f22541fe6e269f02 Mon Sep 17 00:00:00 2001
From: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Date: Mon, 28 Aug 2023 13:04:10 +0300
Subject: wifi: mac80211: add support for mld in ieee80211_chswitch_done

This allows to finalize the CSA per link.
In case the switch didn't work, tear down the MLD connection.
Also pass the ieee80211_bss_conf to post_channel_switch to let the
driver know which link completed the switch.

Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230828130311.3d3eacc88436.Ic2d14e2285aa1646216a56806cfd4a8d0054437c@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/intel/iwlegacy/4965-mac.c     |  2 +-
 drivers/net/wireless/intel/iwlegacy/common.c       |  2 +-
 drivers/net/wireless/intel/iwlwifi/dvm/mac80211.c  |  6 ++--
 drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c  |  2 +-
 drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c  | 10 ++++---
 drivers/net/wireless/intel/iwlwifi/mvm/mvm.h       |  3 +-
 .../net/wireless/intel/iwlwifi/mvm/time-event.c    |  2 +-
 drivers/net/wireless/ti/wlcore/event.c             |  2 +-
 drivers/net/wireless/ti/wlcore/main.c              |  6 ++--
 include/net/mac80211.h                             |  8 +++--
 net/mac80211/cfg.c                                 | 35 ++++++++++++----------
 net/mac80211/driver-ops.h                          |  6 ++--
 net/mac80211/mlme.c                                | 27 +++++++++++------
 net/mac80211/trace.h                               | 11 ++++---
 14 files changed, 73 insertions(+), 49 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/intel/iwlegacy/4965-mac.c b/drivers/net/wireless/intel/iwlegacy/4965-mac.c
index 0a4aa3c678c1..69276266ce6f 100644
--- a/drivers/net/wireless/intel/iwlegacy/4965-mac.c
+++ b/drivers/net/wireless/intel/iwlegacy/4965-mac.c
@@ -6122,7 +6122,7 @@ il4965_mac_channel_switch(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 	if (il->ops->set_channel_switch(il, ch_switch)) {
 		clear_bit(S_CHANNEL_SWITCH_PENDING, &il->status);
 		il->switch_channel = 0;
-		ieee80211_chswitch_done(il->vif, false);
+		ieee80211_chswitch_done(il->vif, false, 0);
 	}
 
 out:
diff --git a/drivers/net/wireless/intel/iwlegacy/common.c b/drivers/net/wireless/intel/iwlegacy/common.c
index 96002121bb8b..054fef680aba 100644
--- a/drivers/net/wireless/intel/iwlegacy/common.c
+++ b/drivers/net/wireless/intel/iwlegacy/common.c
@@ -4090,7 +4090,7 @@ il_chswitch_done(struct il_priv *il, bool is_success)
 		return;
 
 	if (test_and_clear_bit(S_CHANNEL_SWITCH_PENDING, &il->status))
-		ieee80211_chswitch_done(il->vif, is_success);
+		ieee80211_chswitch_done(il->vif, is_success, 0);
 }
 EXPORT_SYMBOL(il_chswitch_done);
 
diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/dvm/mac80211.c
index b1939ff275b5..5f3d5b15f727 100644
--- a/drivers/net/wireless/intel/iwlwifi/dvm/mac80211.c
+++ b/drivers/net/wireless/intel/iwlwifi/dvm/mac80211.c
@@ -2,7 +2,7 @@
 /******************************************************************************
  *
  * Copyright(c) 2003 - 2014 Intel Corporation. All rights reserved.
- * Copyright (C) 2018 - 2019, 2022 Intel Corporation
+ * Copyright(C) 2018 - 2019, 2022 - 2023 Intel Corporation
  *
  * Portions of this file are derived from the ipw3945 project, as well
  * as portions of the ieee80211 subsystem header files.
@@ -1001,7 +1001,7 @@ static void iwlagn_mac_channel_switch(struct ieee80211_hw *hw,
 	if (priv->lib->set_channel_switch(priv, ch_switch)) {
 		clear_bit(STATUS_CHANNEL_SWITCH_PENDING, &priv->status);
 		priv->switch_channel = 0;
-		ieee80211_chswitch_done(ctx->vif, false);
+		ieee80211_chswitch_done(ctx->vif, false, 0);
 	}
 
 out:
@@ -1024,7 +1024,7 @@ void iwl_chswitch_done(struct iwl_priv *priv, bool is_success)
 		return;
 
 	if (ctx->vif)
-		ieee80211_chswitch_done(ctx->vif, is_success);
+		ieee80211_chswitch_done(ctx->vif, is_success, 0);
 }
 
 static void iwlagn_configure_filter(struct ieee80211_hw *hw,
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c
index 7369a45f7f2b..b28d998c65c5 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c
@@ -1839,7 +1839,7 @@ void iwl_mvm_channel_switch_start_notif(struct iwl_mvm *mvm,
 
 		iwl_mvm_csa_client_absent(mvm, vif);
 		cancel_delayed_work(&mvmvif->csa_work);
-		ieee80211_chswitch_done(vif, true);
+		ieee80211_chswitch_done(vif, true, 0);
 		break;
 	default:
 		/* should never happen */
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
index 5918c1f2b10c..921f72dcddac 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
@@ -1370,7 +1370,8 @@ int iwl_mvm_set_tx_power(struct iwl_mvm *mvm, struct ieee80211_vif *vif,
 }
 
 int iwl_mvm_post_channel_switch(struct ieee80211_hw *hw,
-				struct ieee80211_vif *vif)
+				struct ieee80211_vif *vif,
+				struct ieee80211_bss_conf *link_conf)
 {
 	struct iwl_mvm_vif *mvmvif = iwl_mvm_vif_from_mac80211(vif);
 	struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw);
@@ -1452,7 +1453,8 @@ void iwl_mvm_abort_channel_switch(struct ieee80211_hw *hw,
 	mvmvif->csa_failed = true;
 	mutex_unlock(&mvm->mutex);
 
-	iwl_mvm_post_channel_switch(hw, vif);
+	/* If we're here, we can't support MLD */
+	iwl_mvm_post_channel_switch(hw, vif, &vif->bss_conf);
 }
 
 void iwl_mvm_channel_switch_disconnect_wk(struct work_struct *wk)
@@ -1464,7 +1466,7 @@ void iwl_mvm_channel_switch_disconnect_wk(struct work_struct *wk)
 	vif = container_of((void *)mvmvif, struct ieee80211_vif, drv_priv);
 
 	/* Trigger disconnect (should clear the CSA state) */
-	ieee80211_chswitch_done(vif, false);
+	ieee80211_chswitch_done(vif, false, 0);
 }
 
 static u8
@@ -5535,7 +5537,7 @@ void iwl_mvm_channel_switch_rx_beacon(struct ieee80211_hw *hw,
 		if (mvmvif->csa_misbehave) {
 			/* Second time, give up on this AP*/
 			iwl_mvm_abort_channel_switch(hw, vif);
-			ieee80211_chswitch_done(vif, false);
+			ieee80211_chswitch_done(vif, false, 0);
 			mvmvif->csa_misbehave = false;
 			return;
 		}
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h
index b18c91c5dd5d..dda13f4351c3 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h
@@ -2427,7 +2427,8 @@ static inline u8 iwl_mvm_phy_band_from_nl80211(enum nl80211_band band)
 /* Channel Switch */
 void iwl_mvm_channel_switch_disconnect_wk(struct work_struct *wk);
 int iwl_mvm_post_channel_switch(struct ieee80211_hw *hw,
-				struct ieee80211_vif *vif);
+				struct ieee80211_vif *vif,
+				struct ieee80211_bss_conf *link);
 
 /* Channel Context */
 /**
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c b/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c
index 5f0e7144a951..e1f6cea649c3 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c
@@ -223,7 +223,7 @@ iwl_mvm_te_handle_notify_csa(struct iwl_mvm *mvm,
 		}
 		iwl_mvm_csa_client_absent(mvm, te_data->vif);
 		cancel_delayed_work(&mvmvif->csa_work);
-		ieee80211_chswitch_done(te_data->vif, true);
+		ieee80211_chswitch_done(te_data->vif, true, 0);
 		break;
 	default:
 		/* should never happen */
diff --git a/drivers/net/wireless/ti/wlcore/event.c b/drivers/net/wireless/ti/wlcore/event.c
index 46ab69eab26a..1e082d039b82 100644
--- a/drivers/net/wireless/ti/wlcore/event.c
+++ b/drivers/net/wireless/ti/wlcore/event.c
@@ -229,7 +229,7 @@ void wlcore_event_channel_switch(struct wl1271 *wl,
 		vif = wl12xx_wlvif_to_vif(wlvif);
 
 		if (wlvif->bss_type == BSS_TYPE_STA_BSS) {
-			ieee80211_chswitch_done(vif, success);
+			ieee80211_chswitch_done(vif, success, 0);
 			cancel_delayed_work(&wlvif->channel_switch_work);
 		} else {
 			set_bit(WLVIF_FLAG_BEACON_DISABLED, &wlvif->flags);
diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c
index bf21611872a3..b7e68d2721c1 100644
--- a/drivers/net/wireless/ti/wlcore/main.c
+++ b/drivers/net/wireless/ti/wlcore/main.c
@@ -2043,7 +2043,7 @@ static void wlcore_channel_switch_work(struct work_struct *work)
 		goto out;
 
 	vif = wl12xx_wlvif_to_vif(wlvif);
-	ieee80211_chswitch_done(vif, false);
+	ieee80211_chswitch_done(vif, false, 0);
 
 	ret = pm_runtime_resume_and_get(wl->dev);
 	if (ret < 0)
@@ -3030,7 +3030,7 @@ static int wlcore_unset_assoc(struct wl1271 *wl, struct wl12xx_vif *wlvif)
 		struct ieee80211_vif *vif = wl12xx_wlvif_to_vif(wlvif);
 
 		wl12xx_cmd_stop_channel_switch(wl, wlvif);
-		ieee80211_chswitch_done(vif, false);
+		ieee80211_chswitch_done(vif, false, 0);
 		cancel_delayed_work(&wlvif->channel_switch_work);
 	}
 
@@ -5451,7 +5451,7 @@ static void wl12xx_op_channel_switch(struct ieee80211_hw *hw,
 
 	if (unlikely(wl->state == WLCORE_STATE_OFF)) {
 		if (test_bit(WLVIF_FLAG_STA_ASSOCIATED, &wlvif->flags))
-			ieee80211_chswitch_done(vif, false);
+			ieee80211_chswitch_done(vif, false, 0);
 		goto out;
 	} else if (unlikely(wl->state != WLCORE_STATE_ON)) {
 		goto out;
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 154592ce48e5..09fe4601bf59 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -4539,7 +4539,8 @@ struct ieee80211_ops {
 				  struct ieee80211_channel_switch *ch_switch);
 
 	int (*post_channel_switch)(struct ieee80211_hw *hw,
-				   struct ieee80211_vif *vif);
+				   struct ieee80211_vif *vif,
+				   struct ieee80211_bss_conf *link_conf);
 	void (*abort_channel_switch)(struct ieee80211_hw *hw,
 				     struct ieee80211_vif *vif);
 	void (*channel_switch_rx_beacon)(struct ieee80211_hw *hw,
@@ -6537,11 +6538,14 @@ void ieee80211_radar_detected(struct ieee80211_hw *hw);
  * ieee80211_chswitch_done - Complete channel switch process
  * @vif: &struct ieee80211_vif pointer from the add_interface callback.
  * @success: make the channel switch successful or not
+ * @link_id: the link_id on which the switch was done. Ignored if success is
+ *	false.
  *
  * Complete the channel switch post-process: set the new operational channel
  * and wake up the suspended queues.
  */
-void ieee80211_chswitch_done(struct ieee80211_vif *vif, bool success);
+void ieee80211_chswitch_done(struct ieee80211_vif *vif, bool success,
+			     unsigned int link_id);
 
 /**
  * ieee80211_channel_switch_disconnect - disconnect due to channel switch error
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 851d6ed68367..490ee6f52d6e 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -3590,8 +3590,9 @@ static int ieee80211_set_after_csa_beacon(struct ieee80211_sub_if_data *sdata,
 	return 0;
 }
 
-static int __ieee80211_csa_finalize(struct ieee80211_sub_if_data *sdata)
+static int __ieee80211_csa_finalize(struct ieee80211_link_data *link_data)
 {
+	struct ieee80211_sub_if_data *sdata = link_data->sdata;
 	struct ieee80211_local *local = sdata->local;
 	u64 changed = 0;
 	int err;
@@ -3605,20 +3606,20 @@ static int __ieee80211_csa_finalize(struct ieee80211_sub_if_data *sdata)
 	 * completed successfully
 	 */
 
-	if (sdata->deflink.reserved_chanctx) {
+	if (link_data->reserved_chanctx) {
 		/*
 		 * with multi-vif csa driver may call ieee80211_csa_finish()
 		 * many times while waiting for other interfaces to use their
 		 * reservations
 		 */
-		if (sdata->deflink.reserved_ready)
+		if (link_data->reserved_ready)
 			return 0;
 
 		return ieee80211_link_use_reserved_context(&sdata->deflink);
 	}
 
 	if (!cfg80211_chandef_identical(&sdata->vif.bss_conf.chandef,
-					&sdata->deflink.csa_chandef))
+					&link_data->csa_chandef))
 		return -EINVAL;
 
 	sdata->vif.bss_conf.csa_active = false;
@@ -3635,25 +3636,27 @@ static int __ieee80211_csa_finalize(struct ieee80211_sub_if_data *sdata)
 
 	ieee80211_link_info_change_notify(sdata, &sdata->deflink, changed);
 
-	if (sdata->deflink.csa_block_tx) {
+	if (link_data->csa_block_tx) {
 		ieee80211_wake_vif_queues(local, sdata,
 					  IEEE80211_QUEUE_STOP_REASON_CSA);
-		sdata->deflink.csa_block_tx = false;
+		link_data->csa_block_tx = false;
 	}
 
-	err = drv_post_channel_switch(sdata);
+	err = drv_post_channel_switch(link_data);
 	if (err)
 		return err;
 
-	cfg80211_ch_switch_notify(sdata->dev, &sdata->deflink.csa_chandef, 0,
+	cfg80211_ch_switch_notify(sdata->dev, &link_data->csa_chandef, 0,
 				  sdata->vif.bss_conf.eht_puncturing);
 
 	return 0;
 }
 
-static void ieee80211_csa_finalize(struct ieee80211_sub_if_data *sdata)
+static void ieee80211_csa_finalize(struct ieee80211_link_data *link_data)
 {
-	if (__ieee80211_csa_finalize(sdata)) {
+	struct ieee80211_sub_if_data *sdata = link_data->sdata;
+
+	if (__ieee80211_csa_finalize(link_data)) {
 		sdata_info(sdata, "failed to finalize CSA, disconnecting\n");
 		cfg80211_stop_iface(sdata->local->hw.wiphy, &sdata->wdev,
 				    GFP_KERNEL);
@@ -3662,21 +3665,21 @@ static void ieee80211_csa_finalize(struct ieee80211_sub_if_data *sdata)
 
 void ieee80211_csa_finalize_work(struct wiphy *wiphy, struct wiphy_work *work)
 {
-	struct ieee80211_sub_if_data *sdata =
-		container_of(work, struct ieee80211_sub_if_data,
-			     deflink.csa_finalize_work);
+	struct ieee80211_link_data *link =
+		container_of(work, struct ieee80211_link_data, csa_finalize_work);
+	struct ieee80211_sub_if_data *sdata = link->sdata;
 	struct ieee80211_local *local = sdata->local;
 
 	lockdep_assert_wiphy(local->hw.wiphy);
 
 	/* AP might have been stopped while waiting for the lock. */
-	if (!sdata->vif.bss_conf.csa_active)
+	if (!link->conf->csa_active)
 		return;
 
 	if (!ieee80211_sdata_running(sdata))
 		return;
 
-	ieee80211_csa_finalize(sdata);
+	ieee80211_csa_finalize(link);
 }
 
 static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata,
@@ -3919,7 +3922,7 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
 		drv_channel_switch_beacon(sdata, &params->chandef);
 	} else {
 		/* if the beacon didn't change, we can finalize immediately */
-		ieee80211_csa_finalize(sdata);
+		ieee80211_csa_finalize(&sdata->deflink);
 	}
 
 out:
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 554c7aa10cc2..77048b9065e6 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -1126,8 +1126,9 @@ drv_pre_channel_switch(struct ieee80211_sub_if_data *sdata,
 }
 
 static inline int
-drv_post_channel_switch(struct ieee80211_sub_if_data *sdata)
+drv_post_channel_switch(struct ieee80211_link_data *link)
 {
+	struct ieee80211_sub_if_data *sdata = link->sdata;
 	struct ieee80211_local *local = sdata->local;
 	int ret = 0;
 
@@ -1139,7 +1140,8 @@ drv_post_channel_switch(struct ieee80211_sub_if_data *sdata)
 
 	trace_drv_post_channel_switch(local, sdata);
 	if (local->ops->post_channel_switch)
-		ret = local->ops->post_channel_switch(&local->hw, &sdata->vif);
+		ret = local->ops->post_channel_switch(&local->hw, &sdata->vif,
+						      link->conf);
 	trace_drv_return_int(local, ret);
 	return ret;
 }
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 6d0a29749e8c..0f295c5403b3 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1763,7 +1763,7 @@ static void ieee80211_chswitch_post_beacon(struct ieee80211_link_data *link)
 	 */
 	link->u.mgd.beacon_crc_valid = false;
 
-	ret = drv_post_channel_switch(sdata);
+	ret = drv_post_channel_switch(link);
 	if (ret) {
 		sdata_info(sdata,
 			   "driver post channel switch failed, disconnecting\n");
@@ -1775,25 +1775,34 @@ static void ieee80211_chswitch_post_beacon(struct ieee80211_link_data *link)
 	cfg80211_ch_switch_notify(sdata->dev, &link->reserved_chandef, 0, 0);
 }
 
-void ieee80211_chswitch_done(struct ieee80211_vif *vif, bool success)
+void ieee80211_chswitch_done(struct ieee80211_vif *vif, bool success,
+			     unsigned int link_id)
 {
 	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
-	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
 
-	if (WARN_ON(ieee80211_vif_is_mld(&sdata->vif)))
-		success = false;
+	trace_api_chswitch_done(sdata, success, link_id);
+
+	rcu_read_lock();
 
-	trace_api_chswitch_done(sdata, success);
 	if (!success) {
 		sdata_info(sdata,
 			   "driver channel switch failed, disconnecting\n");
 		wiphy_work_queue(sdata->local->hw.wiphy,
-				 &ifmgd->csa_connection_drop_work);
+				 &sdata->u.mgd.csa_connection_drop_work);
 	} else {
+		struct ieee80211_link_data *link =
+			rcu_dereference(sdata->link[link_id]);
+
+		if (WARN_ON(!link)) {
+			rcu_read_unlock();
+			return;
+		}
+
 		wiphy_delayed_work_queue(sdata->local->hw.wiphy,
-					 &sdata->deflink.u.mgd.chswitch_work,
-					 0);
+					 &link->u.mgd.chswitch_work, 0);
 	}
+
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL(ieee80211_chswitch_done);
 
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index b8c53b4a710b..032718d5b298 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -2839,23 +2839,26 @@ TRACE_EVENT(api_sta_block_awake,
 );
 
 TRACE_EVENT(api_chswitch_done,
-	TP_PROTO(struct ieee80211_sub_if_data *sdata, bool success),
+	TP_PROTO(struct ieee80211_sub_if_data *sdata, bool success,
+		 unsigned int link_id),
 
-	TP_ARGS(sdata, success),
+	TP_ARGS(sdata, success, link_id),
 
 	TP_STRUCT__entry(
 		VIF_ENTRY
 		__field(bool, success)
+		__field(unsigned int, link_id)
 	),
 
 	TP_fast_assign(
 		VIF_ASSIGN;
 		__entry->success = success;
+		__entry->link_id = link_id;
 	),
 
 	TP_printk(
-		VIF_PR_FMT " success=%d",
-		VIF_PR_ARG, __entry->success
+		VIF_PR_FMT " success=%d link_id=%d",
+		VIF_PR_ARG, __entry->success, __entry->link_id
 	)
 );
 
-- 
cgit v1.2.3-70-g09d2


From 5806ef25bc6e6cf0c04005ff25a4585437d567de Mon Sep 17 00:00:00 2001
From: Benjamin Berg <benjamin.berg@intel.com>
Date: Sun, 27 Aug 2023 14:05:22 +0300
Subject: wifi: cfg80211: add ieee80211_fragment_element to public API

This function will be used by the kunit tests within cfg80211. As it
is generally useful, move it from mac80211 to cfg80211.

Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230827135854.5af9391659f5.Ie534ed6591ba02be8572d4d7242394f29e3af04b@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h     | 12 ++++++++++++
 net/mac80211/ieee80211_i.h |  2 --
 net/mac80211/util.c        | 29 -----------------------------
 net/wireless/util.c        | 29 +++++++++++++++++++++++++++++
 4 files changed, 41 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 922fd9e0d9b4..f22b22d7d4e8 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -8873,6 +8873,18 @@ static inline size_t ieee80211_ie_split(const u8 *ies, size_t ielen,
 	return ieee80211_ie_split_ric(ies, ielen, ids, n_ids, NULL, 0, offset);
 }
 
+/**
+ * ieee80211_fragment_element - fragment the last element in skb
+ * @skb: The skbuf that the element was added to
+ * @len_pos: Pointer to length of the element to fragment
+ * @frag_id: The element ID to use for fragments
+ *
+ * This function fragments all data after @len_pos, adding fragmentation
+ * elements with the given ID as appropriate. The SKB will grow in size
+ * accordingly.
+ */
+void ieee80211_fragment_element(struct sk_buff *skb, u8 *len_pos, u8 frag_id);
+
 /**
  * cfg80211_report_wowlan_wakeup - report wakeup from WoWLAN
  * @wdev: the wireless device reporting the wakeup
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index b8465d205076..a6fd87f0c035 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -2308,8 +2308,6 @@ ieee802_11_parse_elems(const u8 *start, size_t len, bool action,
 	return ieee802_11_parse_elems_crc(start, len, action, 0, 0, bss);
 }
 
-void ieee80211_fragment_element(struct sk_buff *skb, u8 *len_pos, u8 frag_id);
-
 extern const int ieee802_1d_to_ac[8];
 
 static inline int ieee80211_ac_from_tid(int tid)
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index b0232a2b963e..ed113028794a 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -5101,32 +5101,3 @@ u8 *ieee80211_ie_build_eht_cap(u8 *pos,
 
 	return pos;
 }
-
-void ieee80211_fragment_element(struct sk_buff *skb, u8 *len_pos, u8 frag_id)
-{
-	unsigned int elem_len;
-
-	if (!len_pos)
-		return;
-
-	elem_len = skb->data + skb->len - len_pos - 1;
-
-	while (elem_len > 255) {
-		/* this one is 255 */
-		*len_pos = 255;
-		/* remaining data gets smaller */
-		elem_len -= 255;
-		/* make space for the fragment ID/len in SKB */
-		skb_put(skb, 2);
-		/* shift back the remaining data to place fragment ID/len */
-		memmove(len_pos + 255 + 3, len_pos + 255 + 1, elem_len);
-		/* place the fragment ID */
-		len_pos += 255 + 1;
-		*len_pos = frag_id;
-		/* and point to fragment length to update later */
-		len_pos++;
-	}
-
-	*len_pos = elem_len;
-}
-EXPORT_SYMBOL_IF_KUNIT(ieee80211_fragment_element);
diff --git a/net/wireless/util.c b/net/wireless/util.c
index fff99fe43fdd..56cbd9979a3f 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -1966,6 +1966,35 @@ size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen,
 }
 EXPORT_SYMBOL(ieee80211_ie_split_ric);
 
+void ieee80211_fragment_element(struct sk_buff *skb, u8 *len_pos, u8 frag_id)
+{
+	unsigned int elem_len;
+
+	if (!len_pos)
+		return;
+
+	elem_len = skb->data + skb->len - len_pos - 1;
+
+	while (elem_len > 255) {
+		/* this one is 255 */
+		*len_pos = 255;
+		/* remaining data gets smaller */
+		elem_len -= 255;
+		/* make space for the fragment ID/len in SKB */
+		skb_put(skb, 2);
+		/* shift back the remaining data to place fragment ID/len */
+		memmove(len_pos + 255 + 3, len_pos + 255 + 1, elem_len);
+		/* place the fragment ID */
+		len_pos += 255 + 1;
+		*len_pos = frag_id;
+		/* and point to fragment length to update later */
+		len_pos++;
+	}
+
+	*len_pos = elem_len;
+}
+EXPORT_SYMBOL(ieee80211_fragment_element);
+
 bool ieee80211_operating_class_to_band(u8 operating_class,
 				       enum nl80211_band *band)
 {
-- 
cgit v1.2.3-70-g09d2


From 762c8dc7f269b748babe32dd19d2084ce1b3f31f Mon Sep 17 00:00:00 2001
From: Zhengchao Shao <shaozhengchao@huawei.com>
Date: Mon, 11 Sep 2023 20:50:45 +0800
Subject: net: dst: remove unnecessary input parameter in dst_alloc and
 dst_init

Since commit 1202cdd66531("Remove DECnet support from kernel") has been
merged, all callers pass in the initial_ref value of 1 when they call
dst_alloc(). Therefore, remove initial_ref when the dst_alloc() is
declared and replace initial_ref with 1 in dst_alloc().
Also when all callers call dst_init(), the value of initial_ref is 1.
Therefore, remove the input parameter initial_ref of the dst_init() and
replace initial_ref with the value 1 in dst_init.

Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
Link: https://lore.kernel.org/r/20230911125045.346390-1-shaozhengchao@huawei.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/dst.h         |  4 ++--
 net/core/dst.c            | 10 +++++-----
 net/ipv4/route.c          |  6 +++---
 net/ipv6/route.c          |  4 ++--
 net/openvswitch/actions.c |  4 ++--
 net/sched/sch_frag.c      |  4 ++--
 net/xfrm/xfrm_policy.c    |  2 +-
 7 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/net/dst.h b/include/net/dst.h
index 78884429deed..f8b8599a0600 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -392,10 +392,10 @@ static inline int dst_discard(struct sk_buff *skb)
 {
 	return dst_discard_out(&init_net, skb->sk, skb);
 }
-void *dst_alloc(struct dst_ops *ops, struct net_device *dev, int initial_ref,
+void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
 		int initial_obsolete, unsigned short flags);
 void dst_init(struct dst_entry *dst, struct dst_ops *ops,
-	      struct net_device *dev, int initial_ref, int initial_obsolete,
+	      struct net_device *dev, int initial_obsolete,
 	      unsigned short flags);
 struct dst_entry *dst_destroy(struct dst_entry *dst);
 void dst_dev_put(struct dst_entry *dst);
diff --git a/net/core/dst.c b/net/core/dst.c
index 980e2fd2f013..6838d3212c37 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -45,7 +45,7 @@ const struct dst_metrics dst_default_metrics = {
 EXPORT_SYMBOL(dst_default_metrics);
 
 void dst_init(struct dst_entry *dst, struct dst_ops *ops,
-	      struct net_device *dev, int initial_ref, int initial_obsolete,
+	      struct net_device *dev, int initial_obsolete,
 	      unsigned short flags)
 {
 	dst->dev = dev;
@@ -66,7 +66,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
 	dst->tclassid = 0;
 #endif
 	dst->lwtstate = NULL;
-	rcuref_init(&dst->__rcuref, initial_ref);
+	rcuref_init(&dst->__rcuref, 1);
 	INIT_LIST_HEAD(&dst->rt_uncached);
 	dst->__use = 0;
 	dst->lastuse = jiffies;
@@ -77,7 +77,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
 EXPORT_SYMBOL(dst_init);
 
 void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
-		int initial_ref, int initial_obsolete, unsigned short flags)
+		int initial_obsolete, unsigned short flags)
 {
 	struct dst_entry *dst;
 
@@ -90,7 +90,7 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
 	if (!dst)
 		return NULL;
 
-	dst_init(dst, ops, dev, initial_ref, initial_obsolete, flags);
+	dst_init(dst, ops, dev, initial_obsolete, flags);
 
 	return dst;
 }
@@ -270,7 +270,7 @@ static void __metadata_dst_init(struct metadata_dst *md_dst,
 	struct dst_entry *dst;
 
 	dst = &md_dst->dst;
-	dst_init(dst, &dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE,
+	dst_init(dst, &dst_blackhole_ops, NULL, DST_OBSOLETE_NONE,
 		 DST_METADATA | DST_NOCOUNT);
 	memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst));
 	md_dst->type = type;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 66f419e7f9a7..fb3045692b99 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1630,7 +1630,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
 {
 	struct rtable *rt;
 
-	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
+	rt = dst_alloc(&ipv4_dst_ops, dev, DST_OBSOLETE_FORCE_CHK,
 		       (noxfrm ? DST_NOXFRM : 0));
 
 	if (rt) {
@@ -1658,7 +1658,7 @@ struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
 {
 	struct rtable *new_rt;
 
-	new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
+	new_rt = dst_alloc(&ipv4_dst_ops, dev, DST_OBSOLETE_FORCE_CHK,
 			   rt->dst.flags);
 
 	if (new_rt) {
@@ -2832,7 +2832,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 	struct rtable *ort = (struct rtable *) dst_orig;
 	struct rtable *rt;
 
-	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
+	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, DST_OBSOLETE_DEAD, 0);
 	if (rt) {
 		struct dst_entry *new = &rt->dst;
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 9c687b357e6a..9d8dfc7423e4 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -341,7 +341,7 @@ struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
 			       int flags)
 {
 	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
-					1, DST_OBSOLETE_FORCE_CHK, flags);
+					DST_OBSOLETE_FORCE_CHK, flags);
 
 	if (rt) {
 		rt6_info_init(rt);
@@ -2655,7 +2655,7 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
 	struct net_device *loopback_dev = net->loopback_dev;
 	struct dst_entry *new = NULL;
 
-	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
+	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev,
 		       DST_OBSOLETE_DEAD, 0);
 	if (rt) {
 		rt6_info_init(rt);
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index fd66014d8a76..5f8094acd056 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -873,7 +873,7 @@ static void ovs_fragment(struct net *net, struct vport *vport,
 
 		prepare_frag(vport, skb, orig_network_offset,
 			     ovs_key_mac_proto(key));
-		dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL, 1,
+		dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL,
 			 DST_OBSOLETE_NONE, DST_NOCOUNT);
 		ovs_rt.dst.dev = vport->dev;
 
@@ -890,7 +890,7 @@ static void ovs_fragment(struct net *net, struct vport *vport,
 		prepare_frag(vport, skb, orig_network_offset,
 			     ovs_key_mac_proto(key));
 		memset(&ovs_rt, 0, sizeof(ovs_rt));
-		dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL, 1,
+		dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL,
 			 DST_OBSOLETE_NONE, DST_NOCOUNT);
 		ovs_rt.dst.dev = vport->dev;
 
diff --git a/net/sched/sch_frag.c b/net/sched/sch_frag.c
index a9bd0a235890..ce63414185fd 100644
--- a/net/sched/sch_frag.c
+++ b/net/sched/sch_frag.c
@@ -96,7 +96,7 @@ static int sch_fragment(struct net *net, struct sk_buff *skb,
 		unsigned long orig_dst;
 
 		sch_frag_prepare_frag(skb, xmit);
-		dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL, 1,
+		dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL,
 			 DST_OBSOLETE_NONE, DST_NOCOUNT);
 		sch_frag_rt.dst.dev = skb->dev;
 
@@ -112,7 +112,7 @@ static int sch_fragment(struct net *net, struct sk_buff *skb,
 
 		sch_frag_prepare_frag(skb, xmit);
 		memset(&sch_frag_rt, 0, sizeof(sch_frag_rt));
-		dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL, 1,
+		dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL,
 			 DST_OBSOLETE_NONE, DST_NOCOUNT);
 		sch_frag_rt.dst.dev = skb->dev;
 
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index d6b405782b63..c4c4fc29ccf5 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2561,7 +2561,7 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
 	default:
 		BUG();
 	}
-	xdst = dst_alloc(dst_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
+	xdst = dst_alloc(dst_ops, NULL, DST_OBSOLETE_NONE, 0);
 
 	if (likely(xdst)) {
 		memset_after(xdst, 0, u.dst);
-- 
cgit v1.2.3-70-g09d2


From 11445469dec803730ac7f78a5ad2a5ed131fa941 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 11 Sep 2023 17:05:29 +0000
Subject: net: sock_release_ownership() cleanup

sock_release_ownership() should only be called by user
owning the socket lock.

After prior commit, we can remove one condition.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/sock.h | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index b770261fbdaf..676146e9d181 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1823,12 +1823,11 @@ static inline bool sock_owned_by_user_nocheck(const struct sock *sk)
 
 static inline void sock_release_ownership(struct sock *sk)
 {
-	if (sock_owned_by_user_nocheck(sk)) {
-		sk->sk_lock.owned = 0;
+	DEBUG_NET_WARN_ON_ONCE(!sock_owned_by_user_nocheck(sk));
+	sk->sk_lock.owned = 0;
 
-		/* The sk_lock has mutex_unlock() semantics: */
-		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
-	}
+	/* The sk_lock has mutex_unlock() semantics: */
+	mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
 }
 
 /* no reclassification while locks are held */
-- 
cgit v1.2.3-70-g09d2


From 133c4c0d37175f510a10fa9bed51e223936073fc Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 11 Sep 2023 17:05:31 +0000
Subject: tcp: defer regular ACK while processing socket backlog

This idea came after a particular workload requested
the quickack attribute set on routes, and a performance
drop was noticed for large bulk transfers.

For high throughput flows, it is best to use one cpu
running the user thread issuing socket system calls,
and a separate cpu to process incoming packets from BH context.
(With TSO/GRO, bottleneck is usually the 'user' cpu)

Problem is the user thread can spend a lot of time while holding
the socket lock, forcing BH handler to queue most of incoming
packets in the socket backlog.

Whenever the user thread releases the socket lock, it must first
process all accumulated packets in the backlog, potentially
adding latency spikes. Due to flood mitigation, having too many
packets in the backlog increases chance of unexpected drops.

Backlog processing unfortunately shifts a fair amount of cpu cycles
from the BH cpu to the 'user' cpu, thus reducing max throughput.

This patch takes advantage of the backlog processing,
and the fact that ACK are mostly cumulative.

The idea is to detect we are in the backlog processing
and defer all eligible ACK into a single one,
sent from tcp_release_cb().

This saves cpu cycles on both sides, and network resources.

Performance of a single TCP flow on a 200Gbit NIC:

- Throughput is increased by 20% (100Gbit -> 120Gbit).
- Number of generated ACK per second shrinks from 240,000 to 40,000.
- Number of backlog drops per second shrinks from 230 to 0.

Benchmark context:
 - Regular netperf TCP_STREAM (no zerocopy)
 - Intel(R) Xeon(R) Platinum 8481C (Saphire Rapids)
 - MAX_SKB_FRAGS = 17 (~60KB per GRO packet)

This feature is guarded by a new sysctl, and enabled by default:
 /proc/sys/net/ipv4/tcp_backlog_ack_defer

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Dave Taht <dave.taht@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 Documentation/networking/ip-sysctl.rst |  7 +++++++
 include/linux/tcp.h                    | 14 ++++++++------
 include/net/netns/ipv4.h               |  1 +
 net/ipv4/sysctl_net_ipv4.c             |  9 +++++++++
 net/ipv4/tcp_input.c                   |  8 ++++++++
 net/ipv4/tcp_ipv4.c                    |  1 +
 net/ipv4/tcp_output.c                  |  5 ++++-
 7 files changed, 38 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index a66054d0763a..5bfa1837968c 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -745,6 +745,13 @@ tcp_comp_sack_nr - INTEGER
 
 	Default : 44
 
+tcp_backlog_ack_defer - BOOLEAN
+	If set, user thread processing socket backlog tries sending
+	one ACK for the whole queue. This helps to avoid potential
+	long latencies at end of a TCP socket syscall.
+
+	Default : true
+
 tcp_slow_start_after_idle - BOOLEAN
 	If set, provide RFC2861 behavior and time out the congestion
 	window after an idle period.  An idle period is defined at
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 3c5efeeb024f..44d946161d4a 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -463,15 +463,17 @@ enum tsq_enum {
 	TCP_MTU_REDUCED_DEFERRED,  /* tcp_v{4|6}_err() could not call
 				    * tcp_v{4|6}_mtu_reduced()
 				    */
+	TCP_ACK_DEFERRED,	   /* TX pure ack is deferred */
 };
 
 enum tsq_flags {
-	TSQF_THROTTLED			= (1UL << TSQ_THROTTLED),
-	TSQF_QUEUED			= (1UL << TSQ_QUEUED),
-	TCPF_TSQ_DEFERRED		= (1UL << TCP_TSQ_DEFERRED),
-	TCPF_WRITE_TIMER_DEFERRED	= (1UL << TCP_WRITE_TIMER_DEFERRED),
-	TCPF_DELACK_TIMER_DEFERRED	= (1UL << TCP_DELACK_TIMER_DEFERRED),
-	TCPF_MTU_REDUCED_DEFERRED	= (1UL << TCP_MTU_REDUCED_DEFERRED),
+	TSQF_THROTTLED			= BIT(TSQ_THROTTLED),
+	TSQF_QUEUED			= BIT(TSQ_QUEUED),
+	TCPF_TSQ_DEFERRED		= BIT(TCP_TSQ_DEFERRED),
+	TCPF_WRITE_TIMER_DEFERRED	= BIT(TCP_WRITE_TIMER_DEFERRED),
+	TCPF_DELACK_TIMER_DEFERRED	= BIT(TCP_DELACK_TIMER_DEFERRED),
+	TCPF_MTU_REDUCED_DEFERRED	= BIT(TCP_MTU_REDUCED_DEFERRED),
+	TCPF_ACK_DEFERRED		= BIT(TCP_ACK_DEFERRED),
 };
 
 #define tcp_sk(ptr) container_of_const(ptr, struct tcp_sock, inet_conn.icsk_inet.sk)
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 7a41c4791536..d96d05b08819 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -132,6 +132,7 @@ struct netns_ipv4 {
 	u8 sysctl_tcp_syncookies;
 	u8 sysctl_tcp_migrate_req;
 	u8 sysctl_tcp_comp_sack_nr;
+	u8 sysctl_tcp_backlog_ack_defer;
 	int sysctl_tcp_reordering;
 	u8 sysctl_tcp_retries1;
 	u8 sysctl_tcp_retries2;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 6ac890b4073f..e7f024d93572 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1366,6 +1366,15 @@ static struct ctl_table ipv4_net_table[] = {
 		.proc_handler	= proc_dou8vec_minmax,
 		.extra1		= SYSCTL_ZERO,
 	},
+	{
+		.procname	= "tcp_backlog_ack_defer",
+		.data		= &init_net.ipv4.sysctl_tcp_backlog_ack_defer,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
 	{
 		.procname       = "tcp_reflect_tos",
 		.data           = &init_net.ipv4.sysctl_tcp_reflect_tos,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 06fe1cf645d5..41b471748437 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5553,6 +5553,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
 	    tcp_in_quickack_mode(sk) ||
 	    /* Protocol state mandates a one-time immediate ACK */
 	    inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) {
+		/* If we are running from __release_sock() in user context,
+		 * Defer the ack until tcp_release_cb().
+		 */
+		if (sock_owned_by_user_nocheck(sk) &&
+		    READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_backlog_ack_defer)) {
+			set_bit(TCP_ACK_DEFERRED, &sk->sk_tsq_flags);
+			return;
+		}
 send_now:
 		tcp_send_ack(sk);
 		return;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 27140e5cdc06..f13eb7e23d03 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -3263,6 +3263,7 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
+	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b4cac12d0e63..1fc1f879cfd6 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1077,7 +1077,8 @@ static void tcp_tasklet_func(struct tasklet_struct *t)
 #define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED |		\
 			  TCPF_WRITE_TIMER_DEFERRED |	\
 			  TCPF_DELACK_TIMER_DEFERRED |	\
-			  TCPF_MTU_REDUCED_DEFERRED)
+			  TCPF_MTU_REDUCED_DEFERRED |	\
+			  TCPF_ACK_DEFERRED)
 /**
  * tcp_release_cb - tcp release_sock() callback
  * @sk: socket
@@ -1114,6 +1115,8 @@ void tcp_release_cb(struct sock *sk)
 		inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
 		__sock_put(sk);
 	}
+	if ((flags & TCPF_ACK_DEFERRED) && inet_csk_ack_scheduled(sk))
+		tcp_send_ack(sk);
 }
 EXPORT_SYMBOL(tcp_release_cb);
 
-- 
cgit v1.2.3-70-g09d2


From 2b5dcb31a19a2e0acd869b12c9db9b2d696ef544 Mon Sep 17 00:00:00 2001
From: Leon Hwang <hffilwlqm@gmail.com>
Date: Tue, 12 Sep 2023 23:04:41 +0800
Subject: bpf, x64: Fix tailcall infinite loop

From commit ebf7d1f508a73871 ("bpf, x64: rework pro/epilogue and tailcall
handling in JIT"), the tailcall on x64 works better than before.

From commit e411901c0b775a3a ("bpf: allow for tailcalls in BPF subprograms
for x64 JIT"), tailcall is able to run in BPF subprograms on x64.

From commit 5b92a28aae4dd0f8 ("bpf: Support attaching tracing BPF program
to other BPF programs"), BPF program is able to trace other BPF programs.

How about combining them all together?

1. FENTRY/FEXIT on a BPF subprogram.
2. A tailcall runs in the BPF subprogram.
3. The tailcall calls the subprogram's caller.

As a result, a tailcall infinite loop comes up. And the loop would halt
the machine.

As we know, in tail call context, the tail_call_cnt propagates by stack
and rax register between BPF subprograms. So do in trampolines.

Fixes: ebf7d1f508a7 ("bpf, x64: rework pro/epilogue and tailcall handling in JIT")
Fixes: e411901c0b77 ("bpf: allow for tailcalls in BPF subprograms for x64 JIT")
Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Leon Hwang <hffilwlqm@gmail.com>
Link: https://lore.kernel.org/r/20230912150442.2009-3-hffilwlqm@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c | 28 ++++++++++++++++++++++------
 include/linux/bpf.h         |  5 +++++
 kernel/bpf/trampoline.c     |  4 ++--
 kernel/bpf/verifier.c       |  3 +++
 4 files changed, 32 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index bcca1c9b9a02..2846c21d75bf 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1022,6 +1022,10 @@ static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
 
 #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
 
+/* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */
+#define RESTORE_TAIL_CALL_CNT(stack)				\
+	EMIT3_off32(0x48, 0x8B, 0x85, -round_up(stack, 8) - 8)
+
 static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
 		  int oldproglen, struct jit_context *ctx, bool jmp_padding)
 {
@@ -1627,9 +1631,7 @@ st:			if (is_imm8(insn->off))
 
 			func = (u8 *) __bpf_call_base + imm32;
 			if (tail_call_reachable) {
-				/* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */
-				EMIT3_off32(0x48, 0x8B, 0x85,
-					    -round_up(bpf_prog->aux->stack_depth, 8) - 8);
+				RESTORE_TAIL_CALL_CNT(bpf_prog->aux->stack_depth);
 				if (!imm32)
 					return -EINVAL;
 				offs = 7 + x86_call_depth_emit_accounting(&prog, func);
@@ -2404,6 +2406,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 	 *                     [ ...        ]
 	 *                     [ stack_arg2 ]
 	 * RBP - arg_stack_off [ stack_arg1 ]
+	 * RSP                 [ tail_call_cnt ] BPF_TRAMP_F_TAIL_CALL_CTX
 	 */
 
 	/* room for return value of orig_call or fentry prog */
@@ -2468,6 +2471,8 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 	else
 		/* sub rsp, stack_size */
 		EMIT4(0x48, 0x83, 0xEC, stack_size);
+	if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
+		EMIT1(0x50);		/* push rax */
 	/* mov QWORD PTR [rbp - rbx_off], rbx */
 	emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_6, -rbx_off);
 
@@ -2520,9 +2525,15 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 		restore_regs(m, &prog, regs_off);
 		save_args(m, &prog, arg_stack_off, true);
 
+		if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
+			/* Before calling the original function, restore the
+			 * tail_call_cnt from stack to rax.
+			 */
+			RESTORE_TAIL_CALL_CNT(stack_size);
+
 		if (flags & BPF_TRAMP_F_ORIG_STACK) {
-			emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, 8);
-			EMIT2(0xff, 0xd0); /* call *rax */
+			emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, 8);
+			EMIT2(0xff, 0xd3); /* call *rbx */
 		} else {
 			/* call original function */
 			if (emit_rsb_call(&prog, orig_call, prog)) {
@@ -2573,7 +2584,12 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 			ret = -EINVAL;
 			goto cleanup;
 		}
-	}
+	} else if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
+		/* Before running the original function, restore the
+		 * tail_call_cnt from stack to rax.
+		 */
+		RESTORE_TAIL_CALL_CNT(stack_size);
+
 	/* restore return value of orig_call or fentry prog back into RAX */
 	if (save_ret)
 		emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8);
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 87eeb3a46a1d..b9e573159432 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1035,6 +1035,11 @@ struct btf_func_model {
  */
 #define BPF_TRAMP_F_SHARE_IPMODIFY	BIT(6)
 
+/* Indicate that current trampoline is in a tail call context. Then, it has to
+ * cache and restore tail_call_cnt to avoid infinite tail call loop.
+ */
+#define BPF_TRAMP_F_TAIL_CALL_CTX	BIT(7)
+
 /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50
  * bytes on x86.
  */
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 53ff50cac61e..e97aeda3a86b 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -415,8 +415,8 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut
 		goto out;
 	}
 
-	/* clear all bits except SHARE_IPMODIFY */
-	tr->flags &= BPF_TRAMP_F_SHARE_IPMODIFY;
+	/* clear all bits except SHARE_IPMODIFY and TAIL_CALL_CTX */
+	tr->flags &= (BPF_TRAMP_F_SHARE_IPMODIFY | BPF_TRAMP_F_TAIL_CALL_CTX);
 
 	if (tlinks[BPF_TRAMP_FEXIT].nr_links ||
 	    tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index dbba2b806017..18e673c0ac15 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -19774,6 +19774,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 	if (!tr)
 		return -ENOMEM;
 
+	if (tgt_prog && tgt_prog->aux->tail_call_reachable)
+		tr->flags = BPF_TRAMP_F_TAIL_CALL_CTX;
+
 	prog->aux->dst_trampoline = tr;
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From 8c73d5248dcf112611654bcd32352dc330b02397 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 13 Sep 2023 09:34:25 +0200
Subject: wifi: cfg80211: fix kernel-doc for wiphy_delayed_work_flush()

Clearly, there's no space in the function name, not sure how
that could've happened. Put the underscore that it should be.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Fixes: 56cfb8ce1f7f ("wifi: cfg80211: add flush functions for wiphy work")
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index f22b22d7d4e8..134d9e0b73c9 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -5929,7 +5929,7 @@ void wiphy_delayed_work_cancel(struct wiphy *wiphy,
 			       struct wiphy_delayed_work *dwork);
 
 /**
- * wiphy_delayed work_flush - flush previously queued delayed work
+ * wiphy_delayed_work_flush - flush previously queued delayed work
  * @wiphy: the wiphy, for debug purposes
  * @work: the work to flush
  *
-- 
cgit v1.2.3-70-g09d2


From 5add321c329b1746589b51359259666ca3dbe219 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 29 Aug 2023 12:17:43 +0200
Subject: wifi: cfg80211: remove scan_width support

There really isn't any support for scanning at different
channel widths than 20 MHz since there's no way to set it.
Remove this support for now, if somebody wants to maintain
this whole thing later we can revisit how it should work.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/wil6210/wmi.c             |  2 -
 .../broadcom/brcm80211/brcmfmac/cfg80211.c         |  1 -
 include/net/cfg80211.h                             | 63 +------------------
 include/uapi/linux/nl80211.h                       |  2 +-
 net/mac80211/ibss.c                                | 30 ++-------
 net/mac80211/ieee80211_i.h                         |  3 +-
 net/mac80211/ocb.c                                 |  5 +-
 net/mac80211/scan.c                                | 71 +++++-----------------
 net/wireless/mesh.c                                |  5 +-
 net/wireless/nl80211.c                             |  1 -
 net/wireless/scan.c                                | 23 +------
 net/wireless/trace.h                               |  6 +-
 net/wireless/util.c                                | 14 ++---
 13 files changed, 35 insertions(+), 191 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/ath/wil6210/wmi.c b/drivers/net/wireless/ath/wil6210/wmi.c
index 6a5976a2944c..6fdb77d4c59e 100644
--- a/drivers/net/wireless/ath/wil6210/wmi.c
+++ b/drivers/net/wireless/ath/wil6210/wmi.c
@@ -870,7 +870,6 @@ static void wmi_evt_rx_mgmt(struct wil6210_vif *vif, int id, void *d, int len)
 		struct cfg80211_bss *bss;
 		struct cfg80211_inform_bss bss_data = {
 			.chan = channel,
-			.scan_width = NL80211_BSS_CHAN_WIDTH_20,
 			.signal = signal,
 			.boottime_ns = ktime_to_ns(ktime_get_boottime()),
 		};
@@ -1389,7 +1388,6 @@ wmi_evt_sched_scan_result(struct wil6210_vif *vif, int id, void *d, int len)
 	u32 d_len;
 	struct cfg80211_bss *bss;
 	struct cfg80211_inform_bss bss_data = {
-		.scan_width = NL80211_BSS_CHAN_WIDTH_20,
 		.boottime_ns = ktime_to_ns(ktime_get_boottime()),
 	};
 
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
index 2a90bb24ba77..94b4a7b8793d 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
@@ -3367,7 +3367,6 @@ static s32 brcmf_inform_single_bss(struct brcmf_cfg80211_info *cfg,
 
 	freq = ieee80211_channel_to_frequency(channel, band);
 	bss_data.chan = ieee80211_get_channel(wiphy, freq);
-	bss_data.scan_width = NL80211_BSS_CHAN_WIDTH_20;
 	bss_data.boottime_ns = ktime_to_ns(ktime_get_boottime());
 
 	notify_capability = le16_to_cpu(bi->capability);
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 134d9e0b73c9..2d3fa4a29781 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2536,7 +2536,6 @@ struct cfg80211_scan_6ghz_params {
  * @n_ssids: number of SSIDs
  * @channels: channels to scan on.
  * @n_channels: total number of channels to scan
- * @scan_width: channel width for scanning
  * @ie: optional information element(s) to add into Probe Request or %NULL
  * @ie_len: length of ie in octets
  * @duration: how long to listen on each channel, in TUs. If
@@ -2566,7 +2565,6 @@ struct cfg80211_scan_request {
 	struct cfg80211_ssid *ssids;
 	int n_ssids;
 	u32 n_channels;
-	enum nl80211_bss_scan_width scan_width;
 	const u8 *ie;
 	size_t ie_len;
 	u16 duration;
@@ -2661,7 +2659,6 @@ struct cfg80211_bss_select_adjust {
  * @ssids: SSIDs to scan for (passed in the probe_reqs in active scans)
  * @n_ssids: number of SSIDs
  * @n_channels: total number of channels to scan
- * @scan_width: channel width for scanning
  * @ie: optional information element(s) to add into Probe Request or %NULL
  * @ie_len: length of ie in octets
  * @flags: control flags from &enum nl80211_scan_flags
@@ -2709,7 +2706,6 @@ struct cfg80211_sched_scan_request {
 	struct cfg80211_ssid *ssids;
 	int n_ssids;
 	u32 n_channels;
-	enum nl80211_bss_scan_width scan_width;
 	const u8 *ie;
 	size_t ie_len;
 	u32 flags;
@@ -2757,7 +2753,6 @@ enum cfg80211_signal_type {
 /**
  * struct cfg80211_inform_bss - BSS inform data
  * @chan: channel the frame was received on
- * @scan_width: scan width that was used
  * @signal: signal strength value, according to the wiphy's
  *	signal type
  * @boottime_ns: timestamp (CLOCK_BOOTTIME) when the information was
@@ -2777,7 +2772,6 @@ enum cfg80211_signal_type {
  */
 struct cfg80211_inform_bss {
 	struct ieee80211_channel *chan;
-	enum nl80211_bss_scan_width scan_width;
 	s32 signal;
 	u64 boottime_ns;
 	u64 parent_tsf;
@@ -2811,7 +2805,6 @@ struct cfg80211_bss_ies {
  * for use in scan results and similar.
  *
  * @channel: channel this BSS is on
- * @scan_width: width of the control channel
  * @bssid: BSSID of the BSS
  * @beacon_interval: the beacon interval as from the frame
  * @capability: the capability field in host byte order
@@ -2841,7 +2834,6 @@ struct cfg80211_bss_ies {
  */
 struct cfg80211_bss {
 	struct ieee80211_channel *channel;
-	enum nl80211_bss_scan_width scan_width;
 
 	const struct cfg80211_bss_ies __rcu *ies;
 	const struct cfg80211_bss_ies __rcu *beacon_ies;
@@ -6321,13 +6313,11 @@ ieee80211_get_response_rate(struct ieee80211_supported_band *sband,
 /**
  * ieee80211_mandatory_rates - get mandatory rates for a given band
  * @sband: the band to look for rates in
- * @scan_width: width of the control channel
  *
  * This function returns a bitmap of the mandatory rates for the given
  * band, bits are set according to the rate position in the bitrates array.
  */
-u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband,
-			      enum nl80211_bss_scan_width scan_width);
+u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband);
 
 /*
  * Radiotap parsing functions -- for controlled injection support
@@ -6988,22 +6978,6 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy,
 			       struct ieee80211_mgmt *mgmt, size_t len,
 			       gfp_t gfp);
 
-static inline struct cfg80211_bss * __must_check
-cfg80211_inform_bss_width_frame(struct wiphy *wiphy,
-				struct ieee80211_channel *rx_channel,
-				enum nl80211_bss_scan_width scan_width,
-				struct ieee80211_mgmt *mgmt, size_t len,
-				s32 signal, gfp_t gfp)
-{
-	struct cfg80211_inform_bss data = {
-		.chan = rx_channel,
-		.scan_width = scan_width,
-		.signal = signal,
-	};
-
-	return cfg80211_inform_bss_frame_data(wiphy, &data, mgmt, len, gfp);
-}
-
 static inline struct cfg80211_bss * __must_check
 cfg80211_inform_bss_frame(struct wiphy *wiphy,
 			  struct ieee80211_channel *rx_channel,
@@ -7012,7 +6986,6 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy,
 {
 	struct cfg80211_inform_bss data = {
 		.chan = rx_channel,
-		.scan_width = NL80211_BSS_CHAN_WIDTH_20,
 		.signal = signal,
 	};
 
@@ -7114,26 +7087,6 @@ cfg80211_inform_bss_data(struct wiphy *wiphy,
 			 u16 beacon_interval, const u8 *ie, size_t ielen,
 			 gfp_t gfp);
 
-static inline struct cfg80211_bss * __must_check
-cfg80211_inform_bss_width(struct wiphy *wiphy,
-			  struct ieee80211_channel *rx_channel,
-			  enum nl80211_bss_scan_width scan_width,
-			  enum cfg80211_bss_frame_type ftype,
-			  const u8 *bssid, u64 tsf, u16 capability,
-			  u16 beacon_interval, const u8 *ie, size_t ielen,
-			  s32 signal, gfp_t gfp)
-{
-	struct cfg80211_inform_bss data = {
-		.chan = rx_channel,
-		.scan_width = scan_width,
-		.signal = signal,
-	};
-
-	return cfg80211_inform_bss_data(wiphy, &data, ftype, bssid, tsf,
-					capability, beacon_interval, ie, ielen,
-					gfp);
-}
-
 static inline struct cfg80211_bss * __must_check
 cfg80211_inform_bss(struct wiphy *wiphy,
 		    struct ieee80211_channel *rx_channel,
@@ -7144,7 +7097,6 @@ cfg80211_inform_bss(struct wiphy *wiphy,
 {
 	struct cfg80211_inform_bss data = {
 		.chan = rx_channel,
-		.scan_width = NL80211_BSS_CHAN_WIDTH_20,
 		.signal = signal,
 	};
 
@@ -7229,19 +7181,6 @@ void cfg80211_bss_iter(struct wiphy *wiphy,
 				    void *data),
 		       void *iter_data);
 
-static inline enum nl80211_bss_scan_width
-cfg80211_chandef_to_scan_width(const struct cfg80211_chan_def *chandef)
-{
-	switch (chandef->width) {
-	case NL80211_CHAN_WIDTH_5:
-		return NL80211_BSS_CHAN_WIDTH_5;
-	case NL80211_CHAN_WIDTH_10:
-		return NL80211_BSS_CHAN_WIDTH_10;
-	default:
-		return NL80211_BSS_CHAN_WIDTH_20;
-	}
-}
-
 /**
  * cfg80211_rx_mlme_mgmt - notification of processed MLME management frame
  * @dev: network device
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 88eb85c63029..b628126e06fa 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -5038,7 +5038,7 @@ enum nl80211_bss_scan_width {
  *	elements from a Beacon frame (bin); not present if no Beacon frame has
  *	yet been received
  * @NL80211_BSS_CHAN_WIDTH: channel width of the control channel
- *	(u32, enum nl80211_bss_scan_width)
+ *	(u32, enum nl80211_bss_scan_width) - No longer used!
  * @NL80211_BSS_BEACON_TSF: TSF of the last received beacon (u64)
  *	(not present if no beacon frame has been received yet)
  * @NL80211_BSS_PRESP_DATA: the data in @NL80211_BSS_INFORMATION_ELEMENTS and
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index 9907cea6457c..55ec34602b53 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -377,7 +377,6 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 		  round_jiffies(jiffies + IEEE80211_IBSS_MERGE_INTERVAL));
 
 	bss_meta.chan = chan;
-	bss_meta.scan_width = cfg80211_chandef_to_scan_width(&chandef);
 	bss = cfg80211_inform_bss_frame_data(local->hw.wiphy, &bss_meta, mgmt,
 					     presp->head_len, GFP_KERNEL);
 
@@ -595,7 +594,6 @@ ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, const u8 *bssid,
 	struct sta_info *sta;
 	struct ieee80211_chanctx_conf *chanctx_conf;
 	struct ieee80211_supported_band *sband;
-	enum nl80211_bss_scan_width scan_width;
 	int band;
 
 	/*
@@ -624,7 +622,6 @@ ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, const u8 *bssid,
 	if (WARN_ON_ONCE(!chanctx_conf))
 		return NULL;
 	band = chanctx_conf->def.chan->band;
-	scan_width = cfg80211_chandef_to_scan_width(&chanctx_conf->def);
 	rcu_read_unlock();
 
 	sta = sta_info_alloc(sdata, addr, GFP_KERNEL);
@@ -636,7 +633,7 @@ ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, const u8 *bssid,
 	/* make sure mandatory rates are always added */
 	sband = local->hw.wiphy->bands[band];
 	sta->sta.deflink.supp_rates[band] = supp_rates |
-			ieee80211_mandatory_rates(sband, scan_width);
+			ieee80211_mandatory_rates(sband);
 
 	return ieee80211_ibss_finish_sta(sta);
 }
@@ -975,7 +972,6 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata,
 {
 	struct sta_info *sta;
 	enum nl80211_band band = rx_status->band;
-	enum nl80211_bss_scan_width scan_width;
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_supported_band *sband;
 	bool rates_updated = false;
@@ -1001,15 +997,9 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata,
 			u32 prev_rates;
 
 			prev_rates = sta->sta.deflink.supp_rates[band];
-			/* make sure mandatory rates are always added */
-			scan_width = NL80211_BSS_CHAN_WIDTH_20;
-			if (rx_status->bw == RATE_INFO_BW_5)
-				scan_width = NL80211_BSS_CHAN_WIDTH_5;
-			else if (rx_status->bw == RATE_INFO_BW_10)
-				scan_width = NL80211_BSS_CHAN_WIDTH_10;
 
 			sta->sta.deflink.supp_rates[band] = supp_rates |
-				ieee80211_mandatory_rates(sband, scan_width);
+				ieee80211_mandatory_rates(sband);
 			if (sta->sta.deflink.supp_rates[band] != prev_rates) {
 				ibss_dbg(sdata,
 					 "updated supp_rates set for %pM based on beacon/probe_resp (0x%x -> 0x%x)\n",
@@ -1196,7 +1186,6 @@ void ieee80211_ibss_rx_no_sta(struct ieee80211_sub_if_data *sdata,
 	struct sta_info *sta;
 	struct ieee80211_chanctx_conf *chanctx_conf;
 	struct ieee80211_supported_band *sband;
-	enum nl80211_bss_scan_width scan_width;
 	int band;
 
 	/*
@@ -1222,7 +1211,6 @@ void ieee80211_ibss_rx_no_sta(struct ieee80211_sub_if_data *sdata,
 		return;
 	}
 	band = chanctx_conf->def.chan->band;
-	scan_width = cfg80211_chandef_to_scan_width(&chanctx_conf->def);
 	rcu_read_unlock();
 
 	sta = sta_info_alloc(sdata, addr, GFP_ATOMIC);
@@ -1232,7 +1220,7 @@ void ieee80211_ibss_rx_no_sta(struct ieee80211_sub_if_data *sdata,
 	/* make sure mandatory rates are always added */
 	sband = local->hw.wiphy->bands[band];
 	sta->sta.deflink.supp_rates[band] = supp_rates |
-			ieee80211_mandatory_rates(sband, scan_width);
+			ieee80211_mandatory_rates(sband);
 
 	spin_lock(&ifibss->incomplete_lock);
 	list_add(&sta->list, &ifibss->incomplete_stations);
@@ -1282,7 +1270,6 @@ static void ieee80211_ibss_sta_expire(struct ieee80211_sub_if_data *sdata)
 static void ieee80211_sta_merge_ibss(struct ieee80211_sub_if_data *sdata)
 {
 	struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
-	enum nl80211_bss_scan_width scan_width;
 
 	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
@@ -1304,9 +1291,8 @@ static void ieee80211_sta_merge_ibss(struct ieee80211_sub_if_data *sdata)
 	sdata_info(sdata,
 		   "No active IBSS STAs - trying to scan for other IBSS networks with same SSID (merge)\n");
 
-	scan_width = cfg80211_chandef_to_scan_width(&ifibss->chandef);
 	ieee80211_request_ibss_scan(sdata, ifibss->ssid, ifibss->ssid_len,
-				    NULL, 0, scan_width);
+				    NULL, 0);
 }
 
 static void ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata)
@@ -1424,7 +1410,6 @@ static void ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata)
 	struct cfg80211_bss *cbss;
 	struct ieee80211_channel *chan = NULL;
 	const u8 *bssid = NULL;
-	enum nl80211_bss_scan_width scan_width;
 	int active_ibss;
 
 	lockdep_assert_wiphy(sdata->local->hw.wiphy);
@@ -1483,8 +1468,6 @@ static void ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata)
 
 		sdata_info(sdata, "Trigger new scan to find an IBSS to join\n");
 
-		scan_width = cfg80211_chandef_to_scan_width(&ifibss->chandef);
-
 		if (ifibss->fixed_channel) {
 			num = ieee80211_ibss_setup_scan_channels(local->hw.wiphy,
 								 &ifibss->chandef,
@@ -1492,11 +1475,10 @@ static void ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata)
 								 ARRAY_SIZE(channels));
 			ieee80211_request_ibss_scan(sdata, ifibss->ssid,
 						    ifibss->ssid_len, channels,
-						    num, scan_width);
+						    num);
 		} else {
 			ieee80211_request_ibss_scan(sdata, ifibss->ssid,
-						    ifibss->ssid_len, NULL,
-						    0, scan_width);
+						    ifibss->ssid_len, NULL, 0);
 		}
 	} else {
 		int interval = IEEE80211_SCAN_INTERVAL;
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index e7dc4cdcdcde..e443a8e5e9be 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1911,8 +1911,7 @@ void ieee80211_scan_work(struct wiphy *wiphy, struct wiphy_work *work);
 int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata,
 				const u8 *ssid, u8 ssid_len,
 				struct ieee80211_channel **channels,
-				unsigned int n_channels,
-				enum nl80211_bss_scan_width scan_width);
+				unsigned int n_channels);
 int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata,
 			   struct cfg80211_scan_request *req);
 void ieee80211_scan_cancel(struct ieee80211_local *local);
diff --git a/net/mac80211/ocb.c b/net/mac80211/ocb.c
index 6e2965ffb809..449af4e1cca4 100644
--- a/net/mac80211/ocb.c
+++ b/net/mac80211/ocb.c
@@ -44,7 +44,6 @@ void ieee80211_ocb_rx_no_sta(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_chanctx_conf *chanctx_conf;
 	struct ieee80211_supported_band *sband;
-	enum nl80211_bss_scan_width scan_width;
 	struct sta_info *sta;
 	int band;
 
@@ -66,7 +65,6 @@ void ieee80211_ocb_rx_no_sta(struct ieee80211_sub_if_data *sdata,
 		return;
 	}
 	band = chanctx_conf->def.chan->band;
-	scan_width = cfg80211_chandef_to_scan_width(&chanctx_conf->def);
 	rcu_read_unlock();
 
 	sta = sta_info_alloc(sdata, addr, GFP_ATOMIC);
@@ -75,8 +73,7 @@ void ieee80211_ocb_rx_no_sta(struct ieee80211_sub_if_data *sdata,
 
 	/* Add only mandatory rates for now */
 	sband = local->hw.wiphy->bands[band];
-	sta->sta.deflink.supp_rates[band] =
-		ieee80211_mandatory_rates(sband, scan_width);
+	sta->sta.deflink.supp_rates[band] = ieee80211_mandatory_rates(sband);
 
 	spin_lock(&ifocb->incomplete_lock);
 	list_add(&sta->list, &ifocb->incomplete_stations);
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 58d525e41f6b..24fa06105378 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -187,12 +187,6 @@ ieee80211_bss_info_update(struct ieee80211_local *local,
 	else if (ieee80211_hw_check(&local->hw, SIGNAL_UNSPEC))
 		bss_meta.signal = (rx_status->signal * 100) / local->hw.max_signal;
 
-	bss_meta.scan_width = NL80211_BSS_CHAN_WIDTH_20;
-	if (rx_status->bw == RATE_INFO_BW_5)
-		bss_meta.scan_width = NL80211_BSS_CHAN_WIDTH_5;
-	else if (rx_status->bw == RATE_INFO_BW_10)
-		bss_meta.scan_width = NL80211_BSS_CHAN_WIDTH_10;
-
 	bss_meta.chan = channel;
 
 	rcu_read_lock();
@@ -315,22 +309,11 @@ void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb)
 		ieee80211_rx_bss_put(local, bss);
 }
 
-static void
-ieee80211_prepare_scan_chandef(struct cfg80211_chan_def *chandef,
-			       enum nl80211_bss_scan_width scan_width)
+static void ieee80211_prepare_scan_chandef(struct cfg80211_chan_def *chandef)
 {
 	memset(chandef, 0, sizeof(*chandef));
-	switch (scan_width) {
-	case NL80211_BSS_CHAN_WIDTH_5:
-		chandef->width = NL80211_CHAN_WIDTH_5;
-		break;
-	case NL80211_BSS_CHAN_WIDTH_10:
-		chandef->width = NL80211_CHAN_WIDTH_10;
-		break;
-	default:
-		chandef->width = NL80211_CHAN_WIDTH_20_NOHT;
-		break;
-	}
+
+	chandef->width = NL80211_CHAN_WIDTH_20_NOHT;
 }
 
 /* return false if no more work */
@@ -378,7 +361,7 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_sub_if_data *sdata)
 	}
 
 	local->hw_scan_req->req.n_channels = n_chans;
-	ieee80211_prepare_scan_chandef(&chandef, req->scan_width);
+	ieee80211_prepare_scan_chandef(&chandef);
 
 	if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT)
 		flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT;
@@ -919,7 +902,6 @@ static void ieee80211_scan_state_set_channel(struct ieee80211_local *local,
 {
 	int skip;
 	struct ieee80211_channel *chan;
-	enum nl80211_bss_scan_width oper_scan_width;
 	struct cfg80211_scan_request *scan_req;
 
 	scan_req = rcu_dereference_protected(local->scan_req,
@@ -933,42 +915,21 @@ static void ieee80211_scan_state_set_channel(struct ieee80211_local *local,
 	local->scan_chandef.freq1_offset = chan->freq_offset;
 	local->scan_chandef.center_freq2 = 0;
 
-	/* For scanning on the S1G band, ignore scan_width (which is constant
-	 * across all channels) for now since channel width is specific to each
-	 * channel. Detect the required channel width here and likely revisit
-	 * later. Maybe scan_width could be used to build the channel scan list?
+	/* For scanning on the S1G band, detect the channel width according to
+	 * the channel being scanned.
 	 */
 	if (chan->band == NL80211_BAND_S1GHZ) {
 		local->scan_chandef.width = ieee80211_s1g_channel_width(chan);
 		goto set_channel;
 	}
 
-	switch (scan_req->scan_width) {
-	case NL80211_BSS_CHAN_WIDTH_5:
-		local->scan_chandef.width = NL80211_CHAN_WIDTH_5;
-		break;
-	case NL80211_BSS_CHAN_WIDTH_10:
-		local->scan_chandef.width = NL80211_CHAN_WIDTH_10;
-		break;
-	default:
-	case NL80211_BSS_CHAN_WIDTH_20:
-		/* If scanning on oper channel, use whatever channel-type
-		 * is currently in use.
-		 */
-		oper_scan_width = cfg80211_chandef_to_scan_width(
-					&local->_oper_chandef);
-		if (chan == local->_oper_chandef.chan &&
-		    oper_scan_width == scan_req->scan_width)
-			local->scan_chandef = local->_oper_chandef;
-		else
-			local->scan_chandef.width = NL80211_CHAN_WIDTH_20_NOHT;
-		break;
-	case NL80211_BSS_CHAN_WIDTH_1:
-	case NL80211_BSS_CHAN_WIDTH_2:
-		/* shouldn't get here, S1G handled above */
-		WARN_ON(1);
-		break;
-	}
+	/* If scanning on oper channel, use whatever channel-type
+	 * is currently in use.
+	 */
+	if (chan == local->_oper_chandef.chan)
+		local->scan_chandef = local->_oper_chandef;
+	else
+		local->scan_chandef.width = NL80211_CHAN_WIDTH_20_NOHT;
 
 set_channel:
 	if (ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL))
@@ -1152,8 +1113,7 @@ int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata,
 int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata,
 				const u8 *ssid, u8 ssid_len,
 				struct ieee80211_channel **channels,
-				unsigned int n_channels,
-				enum nl80211_bss_scan_width scan_width)
+				unsigned int n_channels)
 {
 	struct ieee80211_local *local = sdata->local;
 	int ret = -EBUSY, i, n_ch = 0;
@@ -1210,7 +1170,6 @@ int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata,
 
 	local->int_scan_req->ssids = &local->scan_ssid;
 	local->int_scan_req->n_ssids = 1;
-	local->int_scan_req->scan_width = scan_width;
 	memcpy(local->int_scan_req->ssids[0].ssid, ssid, IEEE80211_MAX_SSID_LEN);
 	local->int_scan_req->ssids[0].ssid_len = ssid_len;
 
@@ -1311,7 +1270,7 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
 		goto out;
 	}
 
-	ieee80211_prepare_scan_chandef(&chandef, req->scan_width);
+	ieee80211_prepare_scan_chandef(&chandef);
 
 	ieee80211_build_preq_ies(sdata, ie, num_bands * iebufsz,
 				 &sched_scan_ies, req->ie,
diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c
index dc75abdb8f2e..83306979fbe2 100644
--- a/net/wireless/mesh.c
+++ b/net/wireless/mesh.c
@@ -172,7 +172,6 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev,
 	 * basic rates
 	 */
 	if (!setup->basic_rates) {
-		enum nl80211_bss_scan_width scan_width;
 		struct ieee80211_supported_band *sband =
 				rdev->wiphy.bands[setup->chandef.chan->band];
 
@@ -193,9 +192,7 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev,
 				}
 			}
 		} else {
-			scan_width = cfg80211_chandef_to_scan_width(&setup->chandef);
-			setup->basic_rates = ieee80211_mandatory_rates(sband,
-								       scan_width);
+			setup->basic_rates = ieee80211_mandatory_rates(sband);
 		}
 	}
 
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index ab0aea7dca7d..f2dd4c85a10f 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -10283,7 +10283,6 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb,
 	    nla_put_u32(msg, NL80211_BSS_FREQUENCY, res->channel->center_freq) ||
 	    nla_put_u32(msg, NL80211_BSS_FREQUENCY_OFFSET,
 			res->channel->freq_offset) ||
-	    nla_put_u32(msg, NL80211_BSS_CHAN_WIDTH, res->scan_width) ||
 	    nla_put_u32(msg, NL80211_BSS_SEEN_MS_AGO,
 			jiffies_to_msecs(jiffies - intbss->ts)))
 		goto nla_put_failure;
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index ae4d000009fe..a5758edf53b8 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -1638,8 +1638,6 @@ static bool cfg80211_combine_bsses(struct cfg80211_registered_device *rdev,
 			continue;
 		if (bss->pub.channel != new->pub.channel)
 			continue;
-		if (bss->pub.scan_width != new->pub.scan_width)
-			continue;
 		if (rcu_access_pointer(bss->pub.beacon_ies))
 			continue;
 		ies = rcu_access_pointer(bss->pub.ies);
@@ -1936,8 +1934,7 @@ EXPORT_SYMBOL(cfg80211_get_ies_channel_number);
  */
 static struct ieee80211_channel *
 cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen,
-			 struct ieee80211_channel *channel,
-			 enum nl80211_bss_scan_width scan_width)
+			 struct ieee80211_channel *channel)
 {
 	u32 freq;
 	int channel_number;
@@ -1977,16 +1974,6 @@ cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen,
 		return channel;
 	}
 
-	if (scan_width == NL80211_BSS_CHAN_WIDTH_10 ||
-	    scan_width == NL80211_BSS_CHAN_WIDTH_5) {
-		/*
-		 * Ignore channel number in 5 and 10 MHz channels where there
-		 * may not be an n:1 or 1:n mapping between frequencies and
-		 * channel numbers.
-		 */
-		return channel;
-	}
-
 	/*
 	 * Use the channel determined through the payload channel number
 	 * instead of the RX channel reported by the driver.
@@ -2046,14 +2033,12 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy,
 	channel = data->channel;
 	if (!channel)
 		channel = cfg80211_get_bss_channel(wiphy, data->ie, data->ielen,
-						   drv_data->chan,
-						   drv_data->scan_width);
+						   drv_data->chan);
 	if (!channel)
 		return NULL;
 
 	memcpy(tmp.pub.bssid, data->bssid, ETH_ALEN);
 	tmp.pub.channel = channel;
-	tmp.pub.scan_width = drv_data->scan_width;
 	if (data->bss_source != BSS_SOURCE_STA_PROFILE)
 		tmp.pub.signal = drv_data->signal;
 	else
@@ -2814,8 +2799,7 @@ cfg80211_inform_single_bss_frame_data(struct wiphy *wiphy,
 			variable = ext->u.s1g_beacon.variable;
 	}
 
-	channel = cfg80211_get_bss_channel(wiphy, variable,
-					   ielen, data->chan, data->scan_width);
+	channel = cfg80211_get_bss_channel(wiphy, variable, ielen, data->chan);
 	if (!channel)
 		return NULL;
 
@@ -2868,7 +2852,6 @@ cfg80211_inform_single_bss_frame_data(struct wiphy *wiphy,
 	tmp.pub.beacon_interval = beacon_int;
 	tmp.pub.capability = capability;
 	tmp.pub.channel = channel;
-	tmp.pub.scan_width = data->scan_width;
 	tmp.pub.signal = data->signal;
 	tmp.ts_boottime = data->boottime_ns;
 	tmp.parent_tsf = data->parent_tsf;
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 617c0d0dfa96..126c3a03e43e 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -3590,7 +3590,6 @@ TRACE_EVENT(cfg80211_inform_bss_frame,
 	TP_STRUCT__entry(
 		WIPHY_ENTRY
 		CHAN_ENTRY
-		__field(enum nl80211_bss_scan_width, scan_width)
 		__dynamic_array(u8, mgmt, len)
 		__field(s32, signal)
 		__field(u64, ts_boottime)
@@ -3600,7 +3599,6 @@ TRACE_EVENT(cfg80211_inform_bss_frame,
 	TP_fast_assign(
 		WIPHY_ASSIGN;
 		CHAN_ASSIGN(data->chan);
-		__entry->scan_width = data->scan_width;
 		if (mgmt)
 			memcpy(__get_dynamic_array(mgmt), mgmt, len);
 		__entry->signal = data->signal;
@@ -3609,8 +3607,8 @@ TRACE_EVENT(cfg80211_inform_bss_frame,
 		MAC_ASSIGN(parent_bssid, data->parent_bssid);
 	),
 	TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT
-		  "(scan_width: %d) signal: %d, tsb:%llu, detect_tsf:%llu, tsf_bssid: %pM",
-		  WIPHY_PR_ARG, CHAN_PR_ARG, __entry->scan_width,
+		  "signal: %d, tsb:%llu, detect_tsf:%llu, tsf_bssid: %pM",
+		  WIPHY_PR_ARG, CHAN_PR_ARG,
 		  __entry->signal, (unsigned long long)__entry->ts_boottime,
 		  (unsigned long long)__entry->parent_tsf,
 		  __entry->parent_bssid)
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 56cbd9979a3f..213c9405e645 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -43,8 +43,7 @@ ieee80211_get_response_rate(struct ieee80211_supported_band *sband,
 }
 EXPORT_SYMBOL(ieee80211_get_response_rate);
 
-u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband,
-			      enum nl80211_bss_scan_width scan_width)
+u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband)
 {
 	struct ieee80211_rate *bitrates;
 	u32 mandatory_rates = 0;
@@ -54,15 +53,10 @@ u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband,
 	if (WARN_ON(!sband))
 		return 1;
 
-	if (sband->band == NL80211_BAND_2GHZ) {
-		if (scan_width == NL80211_BSS_CHAN_WIDTH_5 ||
-		    scan_width == NL80211_BSS_CHAN_WIDTH_10)
-			mandatory_flag = IEEE80211_RATE_MANDATORY_G;
-		else
-			mandatory_flag = IEEE80211_RATE_MANDATORY_B;
-	} else {
+	if (sband->band == NL80211_BAND_2GHZ)
+		mandatory_flag = IEEE80211_RATE_MANDATORY_B;
+	else
 		mandatory_flag = IEEE80211_RATE_MANDATORY_A;
-	}
 
 	bitrates = sband->bitrates;
 	for (i = 0; i < sband->n_bitrates; i++)
-- 
cgit v1.2.3-70-g09d2


From 0cfaec25995ad3be316631b945be7ced81daa4e7 Mon Sep 17 00:00:00 2001
From: Aloka Dixit <quic_alokad@quicinc.com>
Date: Thu, 27 Jul 2023 10:40:56 -0700
Subject: wifi: nl80211: fixes to FILS discovery updates

Add a new flag 'update' which is set to true during start_ap()
if (and only if) one of the following two conditions are met:
- Userspace passed an empty nested attribute which indicates that
  the feature should be disabled and templates deleted.
- Userspace passed all the parameters for the nested attribute.

Existing configuration will not be changed while the flag
remains false.

Add similar changes for unsolicited broadcast probe response
transmission.

Signed-off-by: Aloka Dixit <quic_alokad@quicinc.com>
Reviewed-by: Jeff Johnson <quic_jjohnson@quicinc.com>
Link: https://lore.kernel.org/r/20230727174100.11721-2-quic_alokad@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  4 ++++
 include/uapi/linux/nl80211.h | 11 +++++++----
 net/wireless/nl80211.c       | 16 +++++++++++++++-
 3 files changed, 26 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 2d3fa4a29781..aa8c4538f93d 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1338,6 +1338,7 @@ struct cfg80211_acl_data {
  * struct cfg80211_fils_discovery - FILS discovery parameters from
  * IEEE Std 802.11ai-2016, Annex C.3 MIB detail.
  *
+ * @update: Set to true if the feature configuration should be updated.
  * @min_interval: Minimum packet interval in TUs (0 - 10000)
  * @max_interval: Maximum packet interval in TUs (0 - 10000)
  * @tmpl_len: Template length
@@ -1345,6 +1346,7 @@ struct cfg80211_acl_data {
  *	frame headers.
  */
 struct cfg80211_fils_discovery {
+	bool update;
 	u32 min_interval;
 	u32 max_interval;
 	size_t tmpl_len;
@@ -1355,6 +1357,7 @@ struct cfg80211_fils_discovery {
  * struct cfg80211_unsol_bcast_probe_resp - Unsolicited broadcast probe
  *	response parameters in 6GHz.
  *
+ * @update: Set to true if the feature configuration should be updated.
  * @interval: Packet interval in TUs. Maximum allowed is 20 TU, as mentioned
  *	in IEEE P802.11ax/D6.0 26.17.2.3.2 - AP behavior for fast passive
  *	scanning
@@ -1362,6 +1365,7 @@ struct cfg80211_fils_discovery {
  * @tmpl: Template data for probe response
  */
 struct cfg80211_unsol_bcast_probe_resp {
+	bool update;
 	u32 interval;
 	size_t tmpl_len;
 	const u8 *tmpl;
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index b628126e06fa..f797ab7a6547 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2690,11 +2690,13 @@ enum nl80211_commands {
  *
  * @NL80211_ATTR_FILS_DISCOVERY: Optional parameter to configure FILS
  *	discovery. It is a nested attribute, see
- *	&enum nl80211_fils_discovery_attributes.
+ *	&enum nl80211_fils_discovery_attributes. Userspace should pass an empty
+ *	nested attribute to disable this feature and delete the templates.
  *
  * @NL80211_ATTR_UNSOL_BCAST_PROBE_RESP: Optional parameter to configure
  *	unsolicited broadcast probe response. It is a nested attribute, see
- *	&enum nl80211_unsol_bcast_probe_resp_attributes.
+ *	&enum nl80211_unsol_bcast_probe_resp_attributes. Userspace should pass an empty
+ *	nested attribute to disable this feature and delete the templates.
  *
  * @NL80211_ATTR_S1G_CAPABILITY: S1G Capability information element (from
  *	association request when used with NL80211_CMD_NEW_STATION)
@@ -7606,7 +7608,7 @@ enum nl80211_iftype_akm_attributes {
  * @NL80211_FILS_DISCOVERY_ATTR_INT_MIN: Minimum packet interval (u32, TU).
  *	Allowed range: 0..10000 (TU = Time Unit)
  * @NL80211_FILS_DISCOVERY_ATTR_INT_MAX: Maximum packet interval (u32, TU).
- *	Allowed range: 0..10000 (TU = Time Unit)
+ *	Allowed range: 0..10000 (TU = Time Unit). If set to 0, the feature is disabled.
  * @NL80211_FILS_DISCOVERY_ATTR_TMPL: Template data for FILS discovery action
  *	frame including the headers.
  *
@@ -7639,7 +7641,8 @@ enum nl80211_fils_discovery_attributes {
  *
  * @NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT: Maximum packet interval (u32, TU).
  *	Allowed range: 0..20 (TU = Time Unit). IEEE P802.11ax/D6.0
- *	26.17.2.3.2 (AP behavior for fast passive scanning).
+ *	26.17.2.3.2 (AP behavior for fast passive scanning). If set to 0, the feature is
+ *	disabled.
  * @NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL: Unsolicited broadcast probe response
  *	frame template (binary).
  *
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index f2dd4c85a10f..53618a147907 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -5647,6 +5647,13 @@ static int nl80211_parse_fils_discovery(struct cfg80211_registered_device *rdev,
 	if (ret)
 		return ret;
 
+	if (!tb[NL80211_FILS_DISCOVERY_ATTR_INT_MIN] &&
+	    !tb[NL80211_FILS_DISCOVERY_ATTR_INT_MAX] &&
+	    !tb[NL80211_FILS_DISCOVERY_ATTR_TMPL]) {
+		fd->update = true;
+		return 0;
+	}
+
 	if (!tb[NL80211_FILS_DISCOVERY_ATTR_INT_MIN] ||
 	    !tb[NL80211_FILS_DISCOVERY_ATTR_INT_MAX] ||
 	    !tb[NL80211_FILS_DISCOVERY_ATTR_TMPL])
@@ -5656,7 +5663,7 @@ static int nl80211_parse_fils_discovery(struct cfg80211_registered_device *rdev,
 	fd->tmpl = nla_data(tb[NL80211_FILS_DISCOVERY_ATTR_TMPL]);
 	fd->min_interval = nla_get_u32(tb[NL80211_FILS_DISCOVERY_ATTR_INT_MIN]);
 	fd->max_interval = nla_get_u32(tb[NL80211_FILS_DISCOVERY_ATTR_INT_MAX]);
-
+	fd->update = true;
 	return 0;
 }
 
@@ -5679,6 +5686,12 @@ nl80211_parse_unsol_bcast_probe_resp(struct cfg80211_registered_device *rdev,
 	if (ret)
 		return ret;
 
+	if (!tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT] &&
+	    !tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL]) {
+		presp->update = true;
+		return 0;
+	}
+
 	if (!tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT] ||
 	    !tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL])
 		return -EINVAL;
@@ -5686,6 +5699,7 @@ nl80211_parse_unsol_bcast_probe_resp(struct cfg80211_registered_device *rdev,
 	presp->tmpl = nla_data(tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL]);
 	presp->tmpl_len = nla_len(tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL]);
 	presp->interval = nla_get_u32(tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT]);
+	presp->update = true;
 	return 0;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 66f85d57b7109baf8a7d5ee04049ac9412611d35 Mon Sep 17 00:00:00 2001
From: Aloka Dixit <quic_alokad@quicinc.com>
Date: Thu, 27 Jul 2023 10:40:58 -0700
Subject: wifi: cfg80211: modify prototype for change_beacon

Modify the prototype for change_beacon() in struct cfg80211_op to
accept cfg80211_ap_settings instead of cfg80211_beacon_data so that
it can process data in addition to beacons.
Modify the prototypes of ieee80211_change_beacon() and driver specific
functions accordingly.

Signed-off-by: Aloka Dixit <quic_alokad@quicinc.com>
Reviewed-by: Jeff Johnson <quic_jjohnson@quicinc.com>
Link: https://lore.kernel.org/r/20230727174100.11721-4-quic_alokad@quicinc.com
[while at it, remove pointless "if (info)" check in tracing that just
 makes all the lines longer than they need be - it's never NULL]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath6kl/cfg80211.c         |  4 +-
 drivers/net/wireless/ath/wil6210/cfg80211.c        |  3 +-
 .../broadcom/brcm80211/brcmfmac/cfg80211.c         |  4 +-
 drivers/net/wireless/marvell/mwifiex/cfg80211.c    |  3 +-
 drivers/net/wireless/microchip/wilc1000/cfg80211.c |  4 +-
 drivers/net/wireless/quantenna/qtnfmac/cfg80211.c  |  4 +-
 drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c  |  6 +-
 include/net/cfg80211.h                             |  2 +-
 net/mac80211/cfg.c                                 | 14 ++--
 net/wireless/nl80211.c                             | 16 +++--
 net/wireless/rdev-ops.h                            |  2 +-
 net/wireless/trace.h                               | 74 +++++++++++-----------
 12 files changed, 73 insertions(+), 63 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c
index 1dba55c2d9dc..eea60e2fca44 100644
--- a/drivers/net/wireless/ath/ath6kl/cfg80211.c
+++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c
@@ -2954,7 +2954,7 @@ static int ath6kl_start_ap(struct wiphy *wiphy, struct net_device *dev,
 }
 
 static int ath6kl_change_beacon(struct wiphy *wiphy, struct net_device *dev,
-				struct cfg80211_beacon_data *beacon)
+				struct cfg80211_ap_settings *params)
 {
 	struct ath6kl_vif *vif = netdev_priv(dev);
 
@@ -2964,7 +2964,7 @@ static int ath6kl_change_beacon(struct wiphy *wiphy, struct net_device *dev,
 	if (vif->next_mode != AP_NETWORK)
 		return -EOPNOTSUPP;
 
-	return ath6kl_set_ies(vif, beacon);
+	return ath6kl_set_ies(vif, &params->beacon);
 }
 
 static int ath6kl_stop_ap(struct wiphy *wiphy, struct net_device *dev,
diff --git a/drivers/net/wireless/ath/wil6210/cfg80211.c b/drivers/net/wireless/ath/wil6210/cfg80211.c
index 40f9a7ef8980..dfbb478ae274 100644
--- a/drivers/net/wireless/ath/wil6210/cfg80211.c
+++ b/drivers/net/wireless/ath/wil6210/cfg80211.c
@@ -2082,11 +2082,12 @@ void wil_cfg80211_ap_recovery(struct wil6210_priv *wil)
 
 static int wil_cfg80211_change_beacon(struct wiphy *wiphy,
 				      struct net_device *ndev,
-				      struct cfg80211_beacon_data *bcon)
+				      struct cfg80211_ap_settings *params)
 {
 	struct wil6210_priv *wil = wiphy_to_wil(wiphy);
 	struct wireless_dev *wdev = ndev->ieee80211_ptr;
 	struct wil6210_vif *vif = ndev_to_vif(ndev);
+	struct cfg80211_beacon_data *bcon = &params->beacon;
 	int rc;
 	u32 privacy = 0;
 
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
index 94b4a7b8793d..9012456e1a18 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
@@ -5415,13 +5415,13 @@ static int brcmf_cfg80211_stop_ap(struct wiphy *wiphy, struct net_device *ndev,
 
 static s32
 brcmf_cfg80211_change_beacon(struct wiphy *wiphy, struct net_device *ndev,
-			     struct cfg80211_beacon_data *info)
+			     struct cfg80211_ap_settings *info)
 {
 	struct brcmf_if *ifp = netdev_priv(ndev);
 
 	brcmf_dbg(TRACE, "Enter\n");
 
-	return brcmf_config_ap_mgmt_ie(ifp->vif, info);
+	return brcmf_config_ap_mgmt_ie(ifp->vif, &info->beacon);
 }
 
 static int
diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c
index ba4e29713a8c..70473be42d7b 100644
--- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c
+++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c
@@ -1835,10 +1835,11 @@ static int mwifiex_cfg80211_set_cqm_rssi_config(struct wiphy *wiphy,
  */
 static int mwifiex_cfg80211_change_beacon(struct wiphy *wiphy,
 					  struct net_device *dev,
-					  struct cfg80211_beacon_data *data)
+					  struct cfg80211_ap_settings *params)
 {
 	struct mwifiex_private *priv = mwifiex_netdev_get_priv(dev);
 	struct mwifiex_adapter *adapter = priv->adapter;
+	struct cfg80211_beacon_data *data = &params->beacon;
 
 	mwifiex_cancel_scan(adapter);
 
diff --git a/drivers/net/wireless/microchip/wilc1000/cfg80211.c b/drivers/net/wireless/microchip/wilc1000/cfg80211.c
index b545d93c6e37..3447470d3d02 100644
--- a/drivers/net/wireless/microchip/wilc1000/cfg80211.c
+++ b/drivers/net/wireless/microchip/wilc1000/cfg80211.c
@@ -1441,11 +1441,11 @@ static int start_ap(struct wiphy *wiphy, struct net_device *dev,
 }
 
 static int change_beacon(struct wiphy *wiphy, struct net_device *dev,
-			 struct cfg80211_beacon_data *beacon)
+			 struct cfg80211_ap_settings *params)
 {
 	struct wilc_vif *vif = netdev_priv(dev);
 
-	return wilc_add_beacon(vif, 0, 0, beacon);
+	return wilc_add_beacon(vif, 0, 0, &params->beacon);
 }
 
 static int stop_ap(struct wiphy *wiphy, struct net_device *dev,
diff --git a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
index 73e6f9408b51..9388adcdcac1 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
+++ b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
@@ -331,11 +331,11 @@ out:
 }
 
 static int qtnf_change_beacon(struct wiphy *wiphy, struct net_device *dev,
-			      struct cfg80211_beacon_data *info)
+			      struct cfg80211_ap_settings *info)
 {
 	struct qtnf_vif *vif = qtnf_netdev_get_priv(dev);
 
-	return qtnf_mgmt_set_appie(vif, info);
+	return qtnf_mgmt_set_appie(vif, &info->beacon);
 }
 
 static int qtnf_start_ap(struct wiphy *wiphy, struct net_device *dev,
diff --git a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c
index 2ae7843abdf7..5ddc2d9a6060 100644
--- a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c
+++ b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c
@@ -2319,11 +2319,13 @@ static int cfg80211_rtw_start_ap(struct wiphy *wiphy, struct net_device *ndev,
 }
 
 static int cfg80211_rtw_change_beacon(struct wiphy *wiphy, struct net_device *ndev,
-		struct cfg80211_beacon_data *info)
+		struct cfg80211_ap_settings *info)
 {
 	struct adapter *adapter = rtw_netdev_priv(ndev);
 
-	return rtw_add_beacon(adapter, info->head, info->head_len, info->tail, info->tail_len);
+	return rtw_add_beacon(adapter, info->beacon.head,
+			      info->beacon.head_len, info->beacon.tail,
+			      info->beacon.tail_len);
 }
 
 static int cfg80211_rtw_stop_ap(struct wiphy *wiphy, struct net_device *ndev,
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index aa8c4538f93d..d69841f64459 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -4495,7 +4495,7 @@ struct cfg80211_ops {
 	int	(*start_ap)(struct wiphy *wiphy, struct net_device *dev,
 			    struct cfg80211_ap_settings *settings);
 	int	(*change_beacon)(struct wiphy *wiphy, struct net_device *dev,
-				 struct cfg80211_beacon_data *info);
+				 struct cfg80211_ap_settings *info);
 	int	(*stop_ap)(struct wiphy *wiphy, struct net_device *dev,
 			   unsigned int link_id);
 
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index e28a22ebe581..e17ce9b8b8cb 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1477,10 +1477,12 @@ error:
 }
 
 static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev,
-				   struct cfg80211_beacon_data *params)
+				   struct cfg80211_ap_settings *params)
+
 {
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct ieee80211_link_data *link;
+	struct cfg80211_beacon_data *beacon = &params->beacon;
 	struct beacon_data *old;
 	int err;
 	struct ieee80211_bss_conf *link_conf;
@@ -1488,7 +1490,7 @@ static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev,
 
 	lockdep_assert_wiphy(wiphy);
 
-	link = sdata_dereference(sdata->link[params->link_id], sdata);
+	link = sdata_dereference(sdata->link[beacon->link_id], sdata);
 	if (!link)
 		return -ENOLINK;
 
@@ -1504,14 +1506,14 @@ static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev,
 	if (!old)
 		return -ENOENT;
 
-	err = ieee80211_assign_beacon(sdata, link, params, NULL, NULL,
+	err = ieee80211_assign_beacon(sdata, link, beacon, NULL, NULL,
 				      &changed);
 	if (err < 0)
 		return err;
 
-	if (params->he_bss_color_valid &&
-	    params->he_bss_color.enabled != link_conf->he_bss_color.enabled) {
-		link_conf->he_bss_color.enabled = params->he_bss_color.enabled;
+	if (beacon->he_bss_color_valid &&
+	    beacon->he_bss_color.enabled != link_conf->he_bss_color.enabled) {
+		link_conf->he_bss_color.enabled = beacon->he_bss_color.enabled;
 		changed |= BSS_CHANGED_HE_BSS_COLOR;
 	}
 
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 53618a147907..6449072e8def 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -6198,7 +6198,7 @@ static int nl80211_set_beacon(struct sk_buff *skb, struct genl_info *info)
 	unsigned int link_id = nl80211_link_id(info->attrs);
 	struct net_device *dev = info->user_ptr[1];
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
-	struct cfg80211_beacon_data params;
+	struct cfg80211_ap_settings *params;
 	int err;
 
 	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
@@ -6211,15 +6211,21 @@ static int nl80211_set_beacon(struct sk_buff *skb, struct genl_info *info)
 	if (!wdev->links[link_id].ap.beacon_interval)
 		return -EINVAL;
 
-	err = nl80211_parse_beacon(rdev, info->attrs, &params, info->extack);
+	params = kzalloc(sizeof(*params), GFP_KERNEL);
+	if (!params)
+		return -ENOMEM;
+
+	err = nl80211_parse_beacon(rdev, info->attrs, &params->beacon,
+				   info->extack);
 	if (err)
 		goto out;
 
-	err = rdev_change_beacon(rdev, dev, &params);
+	err = rdev_change_beacon(rdev, dev, params);
 
 out:
-	kfree(params.mbssid_ies);
-	kfree(params.rnr_ies);
+	kfree(params->beacon.mbssid_ies);
+	kfree(params->beacon.rnr_ies);
+	kfree(params);
 	return err;
 }
 
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index 90bb7ac4b930..c6a2c07e380b 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -173,7 +173,7 @@ static inline int rdev_start_ap(struct cfg80211_registered_device *rdev,
 
 static inline int rdev_change_beacon(struct cfg80211_registered_device *rdev,
 				     struct net_device *dev,
-				     struct cfg80211_beacon_data *info)
+				     struct cfg80211_ap_settings *info)
 {
 	int ret;
 	trace_rdev_change_beacon(&rdev->wiphy, dev, info);
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 126c3a03e43e..1557dc1d58e2 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -615,49 +615,47 @@ TRACE_EVENT(rdev_start_ap,
 
 TRACE_EVENT(rdev_change_beacon,
 	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
-		 struct cfg80211_beacon_data *info),
+		 struct cfg80211_ap_settings *info),
 	TP_ARGS(wiphy, netdev, info),
 	TP_STRUCT__entry(
 		WIPHY_ENTRY
 		NETDEV_ENTRY
 		__field(int, link_id)
-		__dynamic_array(u8, head, info ? info->head_len : 0)
-		__dynamic_array(u8, tail, info ? info->tail_len : 0)
-		__dynamic_array(u8, beacon_ies, info ? info->beacon_ies_len : 0)
-		__dynamic_array(u8, proberesp_ies,
-				info ? info->proberesp_ies_len : 0)
-		__dynamic_array(u8, assocresp_ies,
-				info ? info->assocresp_ies_len : 0)
-		__dynamic_array(u8, probe_resp, info ? info->probe_resp_len : 0)
-	),
-	TP_fast_assign(
-		WIPHY_ASSIGN;
-		NETDEV_ASSIGN;
-		if (info) {
-			__entry->link_id = info->link_id;
-			if (info->head)
-				memcpy(__get_dynamic_array(head), info->head,
-				       info->head_len);
-			if (info->tail)
-				memcpy(__get_dynamic_array(tail), info->tail,
-				       info->tail_len);
-			if (info->beacon_ies)
-				memcpy(__get_dynamic_array(beacon_ies),
-				       info->beacon_ies, info->beacon_ies_len);
-			if (info->proberesp_ies)
-				memcpy(__get_dynamic_array(proberesp_ies),
-				       info->proberesp_ies,
-				       info->proberesp_ies_len);
-			if (info->assocresp_ies)
-				memcpy(__get_dynamic_array(assocresp_ies),
-				       info->assocresp_ies,
-				       info->assocresp_ies_len);
-			if (info->probe_resp)
-				memcpy(__get_dynamic_array(probe_resp),
-				       info->probe_resp, info->probe_resp_len);
-		} else {
-			__entry->link_id = -1;
-		}
+		__dynamic_array(u8, head, info->beacon.head_len)
+		__dynamic_array(u8, tail, info->beacon.tail_len)
+		__dynamic_array(u8, beacon_ies, info->beacon.beacon_ies_len)
+		__dynamic_array(u8, proberesp_ies, info->beacon.proberesp_ies_len)
+		__dynamic_array(u8, assocresp_ies, info->beacon.assocresp_ies_len)
+		__dynamic_array(u8, probe_resp, info->beacon.probe_resp_len)
+	),
+	TP_fast_assign(
+		WIPHY_ASSIGN;
+		NETDEV_ASSIGN;
+		__entry->link_id = info->beacon.link_id;
+		if (info->beacon.head)
+			memcpy(__get_dynamic_array(head),
+			       info->beacon.head,
+			       info->beacon.head_len);
+		if (info->beacon.tail)
+			memcpy(__get_dynamic_array(tail),
+			       info->beacon.tail,
+			       info->beacon.tail_len);
+		if (info->beacon.beacon_ies)
+			memcpy(__get_dynamic_array(beacon_ies),
+			       info->beacon.beacon_ies,
+			       info->beacon.beacon_ies_len);
+		if (info->beacon.proberesp_ies)
+			memcpy(__get_dynamic_array(proberesp_ies),
+			       info->beacon.proberesp_ies,
+			       info->beacon.proberesp_ies_len);
+		if (info->beacon.assocresp_ies)
+			memcpy(__get_dynamic_array(assocresp_ies),
+			       info->beacon.assocresp_ies,
+			       info->beacon.assocresp_ies_len);
+		if (info->beacon.probe_resp)
+			memcpy(__get_dynamic_array(probe_resp),
+			       info->beacon.probe_resp,
+			       info->beacon.probe_resp_len);
 	),
 	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id:%d",
 		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id)
-- 
cgit v1.2.3-70-g09d2


From b13b6bbfbb627884f18982600f7b5a5200652531 Mon Sep 17 00:00:00 2001
From: Abhishek Kumar <kuabhs@chromium.org>
Date: Thu, 29 Jun 2023 03:52:54 +0000
Subject: wifi: cfg80211: call reg_call_notifier on beacon hints

Currently the channel property updates are not propagated to
driver. This causes issues in the discovery of hidden SSIDs and
fails to connect to them.
This change defines a new wiphy flag which when enabled by vendor
driver, the reg_call_notifier callback will be trigger on beacon
hints. This ensures that the channel property changes are visible
to the vendor driver. The vendor changes the channels for active
scans. This fixes the discovery issue of hidden SSID.

Signed-off-by: Abhishek Kumar <kuabhs@chromium.org>
Link: https://lore.kernel.org/r/20230629035254.1.I059fe585f9f9e896c2d51028ef804d197c8c009e@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h |  3 +++
 net/wireless/reg.c     | 20 ++++++++++++--------
 2 files changed, 15 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index d69841f64459..0551a38d6760 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -4861,6 +4861,8 @@ struct cfg80211_ops {
  * @WIPHY_FLAG_SUPPORTS_EXT_KCK_32: The device supports 32-byte KCK keys.
  * @WIPHY_FLAG_NOTIFY_REGDOM_BY_DRIVER: The device could handle reg notify for
  *	NL80211_REGDOM_SET_BY_DRIVER.
+ * @WIPHY_FLAG_CHANNEL_CHANGE_ON_BEACON: reg_call_notifier() is called if driver
+ *	set this flag to update channels on beacon hints.
  */
 enum wiphy_flags {
 	WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK		= BIT(0),
@@ -4887,6 +4889,7 @@ enum wiphy_flags {
 	WIPHY_FLAG_SUPPORTS_5_10_MHZ		= BIT(22),
 	WIPHY_FLAG_HAS_CHANNEL_SWITCH		= BIT(23),
 	WIPHY_FLAG_NOTIFY_REGDOM_BY_DRIVER	= BIT(24),
+	WIPHY_FLAG_CHANNEL_CHANGE_ON_BEACON     = BIT(25),
 };
 
 /**
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 8a99eafa4b62..33e2570f2bd6 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -2151,6 +2151,13 @@ static bool reg_is_world_roaming(struct wiphy *wiphy)
 	return false;
 }
 
+static void reg_call_notifier(struct wiphy *wiphy,
+			      struct regulatory_request *request)
+{
+	if (wiphy->reg_notifier)
+		wiphy->reg_notifier(wiphy, request);
+}
+
 static void handle_reg_beacon(struct wiphy *wiphy, unsigned int chan_idx,
 			      struct reg_beacon *reg_beacon)
 {
@@ -2158,6 +2165,7 @@ static void handle_reg_beacon(struct wiphy *wiphy, unsigned int chan_idx,
 	struct ieee80211_channel *chan;
 	bool channel_changed = false;
 	struct ieee80211_channel chan_before;
+	struct regulatory_request *lr = get_last_request();
 
 	sband = wiphy->bands[reg_beacon->chan.band];
 	chan = &sband->channels[chan_idx];
@@ -2183,8 +2191,11 @@ static void handle_reg_beacon(struct wiphy *wiphy, unsigned int chan_idx,
 		channel_changed = true;
 	}
 
-	if (channel_changed)
+	if (channel_changed) {
 		nl80211_send_beacon_hint_event(wiphy, &chan_before, chan);
+		if (wiphy->flags & WIPHY_FLAG_CHANNEL_CHANGE_ON_BEACON)
+			reg_call_notifier(wiphy, lr);
+	}
 }
 
 /*
@@ -2327,13 +2338,6 @@ static void reg_process_ht_flags(struct wiphy *wiphy)
 		reg_process_ht_flags_band(wiphy, wiphy->bands[band]);
 }
 
-static void reg_call_notifier(struct wiphy *wiphy,
-			      struct regulatory_request *request)
-{
-	if (wiphy->reg_notifier)
-		wiphy->reg_notifier(wiphy, request);
-}
-
 static bool reg_wdev_chan_valid(struct wiphy *wiphy, struct wireless_dev *wdev)
 {
 	struct cfg80211_chan_def chandef = {};
-- 
cgit v1.2.3-70-g09d2


From 30ca8b0c4d6c9fb1d76e5894b1e8bf7c6a12224d Mon Sep 17 00:00:00 2001
From: Aditya Kumar Singh <quic_adisi@quicinc.com>
Date: Tue, 12 Sep 2023 10:48:55 +0530
Subject: wifi: cfg80211: export DFS CAC time and usable state helper functions

cfg80211 has cfg80211_chandef_dfs_usable() function to know whether
at least one channel in the chandef is in usable state or not. Also,
cfg80211_chandef_dfs_cac_time() function is there which tells the CAC
time required for the given chandef.

Make these two functions visible to drivers by exporting their symbol
to global list of kernel symbols.

Lower level drivers can make use of these two functions to be aware
if CAC is required on the given chandef and for how long. For example
drivers which maintains the CAC state internally can make use of these.

Signed-off-by: Aditya Kumar Singh <quic_adisi@quicinc.com>
Reviewed-by: Jeff Johnson <quic_jjohnson@quicinc.com>
Link: https://lore.kernel.org/r/20230912051857.2284-2-quic_adisi@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 24 ++++++++++++++++++++++++
 net/wireless/chan.c    |  2 ++
 net/wireless/core.h    | 17 -----------------
 3 files changed, 26 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 0551a38d6760..a5841a96284d 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1002,6 +1002,30 @@ int cfg80211_chandef_dfs_required(struct wiphy *wiphy,
 				  const struct cfg80211_chan_def *chandef,
 				  enum nl80211_iftype iftype);
 
+/**
+ * cfg80211_chandef_dfs_usable - checks if chandef is DFS usable and we
+ *				 can/need start CAC on such channel
+ * @wiphy: the wiphy to validate against
+ * @chandef: the channel definition to check
+ *
+ * Return: true if all channels available and at least
+ *	   one channel requires CAC (NL80211_DFS_USABLE)
+ */
+bool cfg80211_chandef_dfs_usable(struct wiphy *wiphy,
+				 const struct cfg80211_chan_def *chandef);
+
+/**
+ * cfg80211_chandef_dfs_cac_time - get the DFS CAC time (in ms) for given
+ *				   channel definition
+ * @wiphy: the wiphy to validate against
+ * @chandef: the channel definition to check
+ *
+ * Returns: DFS CAC time (in ms) which applies for this channel definition
+ */
+unsigned int
+cfg80211_chandef_dfs_cac_time(struct wiphy *wiphy,
+			      const struct cfg80211_chan_def *chandef);
+
 /**
  * nl80211_send_chandef - sends the channel definition.
  * @msg: the msg to send channel definition
diff --git a/net/wireless/chan.c b/net/wireless/chan.c
index 842190dfa100..2d21e423abdb 100644
--- a/net/wireless/chan.c
+++ b/net/wireless/chan.c
@@ -666,6 +666,7 @@ bool cfg80211_chandef_dfs_usable(struct wiphy *wiphy,
 
 	return (r1 + r2 > 0);
 }
+EXPORT_SYMBOL(cfg80211_chandef_dfs_usable);
 
 /*
  * Checks if center frequency of chan falls with in the bandwidth
@@ -965,6 +966,7 @@ cfg80211_chandef_dfs_cac_time(struct wiphy *wiphy,
 
 	return max(t1, t2);
 }
+EXPORT_SYMBOL(cfg80211_chandef_dfs_cac_time);
 
 static bool cfg80211_secondary_chans_ok(struct wiphy *wiphy,
 					u32 center_freq, u32 bandwidth,
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 98f41d9d2ba7..866f0a6934e6 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -452,29 +452,12 @@ int cfg80211_scan(struct cfg80211_registered_device *rdev);
 
 extern struct work_struct cfg80211_disconnect_work;
 
-/**
- * cfg80211_chandef_dfs_usable - checks if chandef is DFS usable
- * @wiphy: the wiphy to validate against
- * @chandef: the channel definition to check
- *
- * Checks if chandef is usable and we can/need start CAC on such channel.
- *
- * Return: true if all channels available and at least
- *	   one channel requires CAC (NL80211_DFS_USABLE)
- */
-bool cfg80211_chandef_dfs_usable(struct wiphy *wiphy,
-				 const struct cfg80211_chan_def *chandef);
-
 void cfg80211_set_dfs_state(struct wiphy *wiphy,
 			    const struct cfg80211_chan_def *chandef,
 			    enum nl80211_dfs_state dfs_state);
 
 void cfg80211_dfs_channels_update_work(struct work_struct *work);
 
-unsigned int
-cfg80211_chandef_dfs_cac_time(struct wiphy *wiphy,
-			      const struct cfg80211_chan_def *chandef);
-
 void cfg80211_sched_dfs_chan_update(struct cfg80211_registered_device *rdev);
 
 int
-- 
cgit v1.2.3-70-g09d2


From 3e99b4d282195435a9271fb738c4b146a9d35a5a Mon Sep 17 00:00:00 2001
From: Stephen Douthit <stephen.douthit@gmail.com>
Date: Mon, 13 Feb 2023 15:40:24 -0500
Subject: wifi: mac80211: Sanity check tx bitrate if not provided by driver

If the driver doesn't fill NL80211_STA_INFO_TX_BITRATE in sta_set_sinfo()
then as a fallback sta->deflink.tx_stats.last_rate is used.  Unfortunately
there's no guarantee that this has actually been set before it's used.

Originally found when 'iw <dev> link' would always return a tx rate of
6Mbps regardless of actual link speed for the QCA9337 running firmware
WLAN.TF.2.1-00021-QCARMSWP-1 in my netbook.

Use the sanity check logic from ieee80211_fill_rx_status() and refactor
that to use the new inline function.

Signed-off-by: Stephen Douthit <stephen.douthit@gmail.com>
Link: https://lore.kernel.org/r/20230213204024.3377-1-stephen.douthit@gmail.com
[change to bool ..._rate_valid() instead of int ..._rate_invalid()]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h  | 5 +++++
 net/mac80211/airtime.c  | 2 +-
 net/mac80211/sta_info.c | 3 ++-
 3 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 09fe4601bf59..8d993f6ab919 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1078,6 +1078,11 @@ struct ieee80211_tx_rate {
 
 #define IEEE80211_MAX_TX_RETRY		31
 
+static inline bool ieee80211_rate_valid(struct ieee80211_tx_rate *rate)
+{
+	return rate->idx >= 0 && rate->count > 0;
+}
+
 static inline void ieee80211_rate_set_vht(struct ieee80211_tx_rate *rate,
 					  u8 mcs, u8 nss)
 {
diff --git a/net/mac80211/airtime.c b/net/mac80211/airtime.c
index 14be7b526f1d..fdf8b658fede 100644
--- a/net/mac80211/airtime.c
+++ b/net/mac80211/airtime.c
@@ -557,7 +557,7 @@ static int ieee80211_fill_rx_status(struct ieee80211_rx_status *stat,
 	if (ieee80211_fill_rate_info(hw, stat, band, ri))
 		return 0;
 
-	if (rate->idx < 0 || !rate->count)
+	if (!ieee80211_rate_valid(rate))
 		return -1;
 
 	if (rate->flags & IEEE80211_TX_RC_160_MHZ_WIDTH)
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 7243c6aa2161..ba36fc29e532 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -2703,7 +2703,8 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
 	}
 
 	if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE)) &&
-	    !sta->sta.valid_links) {
+	    !sta->sta.valid_links &&
+	    ieee80211_rate_valid(&sta->deflink.tx_stats.last_rate)) {
 		sta_set_rate_info_tx(sta, &sta->deflink.tx_stats.last_rate,
 				     &sinfo->txrate);
 		sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BITRATE);
-- 
cgit v1.2.3-70-g09d2


From 89de9921dfa77e43b985bde99a6031ab66511020 Mon Sep 17 00:00:00 2001
From: Paul M Stillwell Jr <paul.m.stillwell.jr@intel.com>
Date: Wed, 6 Sep 2023 13:57:01 -0600
Subject: virtchnl: Add CRC stripping capability

Some VFs may want to disable CRC stripping on incoming packets so create
an offload for that. The VF already sends information about configuring
its RX queues so use that structure to indicate that the CRC stripping
should be enabled or not.

Signed-off-by: Paul M Stillwell Jr <paul.m.stillwell.jr@intel.com>
Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Reviewed-by: Paul Menzel <pmenzel@molgen.mpg.de>
Signed-off-by: Ahmed Zaki <ahmed.zaki@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/linux/avf/virtchnl.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index d0807ad43f93..dd71d3009771 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -240,6 +240,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource);
 #define VIRTCHNL_VF_OFFLOAD_REQ_QUEUES		BIT(6)
 /* used to negotiate communicating link speeds in Mbps */
 #define VIRTCHNL_VF_CAP_ADV_LINK_SPEED		BIT(7)
+#define  VIRTCHNL_VF_OFFLOAD_CRC		BIT(10)
 #define VIRTCHNL_VF_OFFLOAD_VLAN_V2		BIT(15)
 #define VIRTCHNL_VF_OFFLOAD_VLAN		BIT(16)
 #define VIRTCHNL_VF_OFFLOAD_RX_POLLING		BIT(17)
@@ -295,7 +296,13 @@ VIRTCHNL_CHECK_STRUCT_LEN(24, virtchnl_txq_info);
 /* VIRTCHNL_OP_CONFIG_RX_QUEUE
  * VF sends this message to set up parameters for one RX queue.
  * External data buffer contains one instance of virtchnl_rxq_info.
- * PF configures requested queue and returns a status code.
+ * PF configures requested queue and returns a status code. The
+ * crc_disable flag disables CRC stripping on the VF. Setting
+ * the crc_disable flag to 1 will disable CRC stripping for each
+ * queue in the VF where the flag is set. The VIRTCHNL_VF_OFFLOAD_CRC
+ * offload must have been set prior to sending this info or the PF
+ * will ignore the request. This flag should be set the same for
+ * all of the queues for a VF.
  */
 
 /* Rx queue config info */
@@ -307,7 +314,7 @@ struct virtchnl_rxq_info {
 	u16 splithdr_enabled; /* deprecated with AVF 1.0 */
 	u32 databuffer_size;
 	u32 max_pkt_size;
-	u8 pad0;
+	u8 crc_disable;
 	u8 rxdid;
 	u8 pad1[2];
 	u64 dma_ring_addr;
-- 
cgit v1.2.3-70-g09d2


From 9c386911b16d4bf2f24e55ed42bfa397073b4b84 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 14 Sep 2023 09:13:10 +0200
Subject: wifi: cfg80211: fix kernel-doc for wiphy_delayed_work_flush()

Now that I fixed the function name, we see the parameters
are wrong as well. Fix that too.

Change-Id: I1a4cfea446875998a5a242ca36acc8244991a199
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index a5841a96284d..8fcfe1869424 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -5954,7 +5954,7 @@ void wiphy_delayed_work_cancel(struct wiphy *wiphy,
 /**
  * wiphy_delayed_work_flush - flush previously queued delayed work
  * @wiphy: the wiphy, for debug purposes
- * @work: the work to flush
+ * @dwork: the delayed work to flush
  *
  * Flush the work (i.e. run it if pending). This must be called
  * under the wiphy mutex acquired by wiphy_lock().
-- 
cgit v1.2.3-70-g09d2


From 81b36803ac139827538ac5ce4028e750a3c53f53 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 09:17:21 +0000
Subject: udp: introduce udp->udp_flags

According to syzbot, it is time to use proper atomic flags
for various UDP flags.

Add udp_flags field, and convert udp->corkflag to first
bit in it.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/udp.h | 28 +++++++++++++++++++++-------
 net/ipv4/udp.c      | 12 ++++++------
 net/ipv6/udp.c      |  6 +++---
 3 files changed, 30 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 43c1fb2d2c21..23f0693e0d9c 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -32,14 +32,20 @@ static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask)
 	return (num + net_hash_mix(net)) & mask;
 }
 
+enum {
+	UDP_FLAGS_CORK,		/* Cork is required */
+};
+
 struct udp_sock {
 	/* inet_sock has to be the first member */
 	struct inet_sock inet;
 #define udp_port_hash		inet.sk.__sk_common.skc_u16hashes[0]
 #define udp_portaddr_hash	inet.sk.__sk_common.skc_u16hashes[1]
 #define udp_portaddr_node	inet.sk.__sk_common.skc_portaddr_node
+
+	unsigned long	 udp_flags;
+
 	int		 pending;	/* Any pending frames ? */
-	unsigned int	 corkflag;	/* Cork is required */
 	__u8		 encap_type;	/* Is this an Encapsulation socket? */
 	unsigned char	 no_check6_tx:1,/* Send zero UDP6 checksums on TX? */
 			 no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */
@@ -51,6 +57,11 @@ struct udp_sock {
 			 gro_enabled:1,	/* Request GRO aggregation */
 			 accept_udp_l4:1,
 			 accept_udp_fraglist:1;
+/* indicator bits used by pcflag: */
+#define UDPLITE_BIT      0x1  		/* set by udplite proto init function */
+#define UDPLITE_SEND_CC  0x2  		/* set via udplite setsockopt         */
+#define UDPLITE_RECV_CC  0x4		/* set via udplite setsocktopt        */
+	__u8		 pcflag;        /* marks socket as UDP-Lite if > 0    */
 	/*
 	 * Following member retains the information to create a UDP header
 	 * when the socket is uncorked.
@@ -62,12 +73,6 @@ struct udp_sock {
 	 */
 	__u16		 pcslen;
 	__u16		 pcrlen;
-/* indicator bits used by pcflag: */
-#define UDPLITE_BIT      0x1  		/* set by udplite proto init function */
-#define UDPLITE_SEND_CC  0x2  		/* set via udplite setsockopt         */
-#define UDPLITE_RECV_CC  0x4		/* set via udplite setsocktopt        */
-	__u8		 pcflag;        /* marks socket as UDP-Lite if > 0    */
-	__u8		 unused[3];
 	/*
 	 * For encapsulation sockets.
 	 */
@@ -95,6 +100,15 @@ struct udp_sock {
 	int		forward_threshold;
 };
 
+#define udp_test_bit(nr, sk)			\
+	test_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags)
+#define udp_set_bit(nr, sk)			\
+	set_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags)
+#define udp_clear_bit(nr, sk)			\
+	clear_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags)
+#define udp_assign_bit(nr, sk, val)		\
+	assign_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags, val)
+
 #define UDP_MAX_SEGMENTS	(1 << 6UL)
 
 #define udp_sk(ptr) container_of_const(ptr, struct udp_sock, inet.sk)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index f39b9c844580..9709f8a532dc 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1051,7 +1051,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	u8 tos, scope;
 	__be16 dport;
 	int err, is_udplite = IS_UDPLITE(sk);
-	int corkreq = READ_ONCE(up->corkflag) || msg->msg_flags&MSG_MORE;
+	int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE;
 	int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
 	struct sk_buff *skb;
 	struct ip_options_data opt_copy;
@@ -1315,11 +1315,11 @@ void udp_splice_eof(struct socket *sock)
 	struct sock *sk = sock->sk;
 	struct udp_sock *up = udp_sk(sk);
 
-	if (!up->pending || READ_ONCE(up->corkflag))
+	if (!up->pending || udp_test_bit(CORK, sk))
 		return;
 
 	lock_sock(sk);
-	if (up->pending && !READ_ONCE(up->corkflag))
+	if (up->pending && !udp_test_bit(CORK, sk))
 		udp_push_pending_frames(sk);
 	release_sock(sk);
 }
@@ -2658,9 +2658,9 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 	switch (optname) {
 	case UDP_CORK:
 		if (val != 0) {
-			WRITE_ONCE(up->corkflag, 1);
+			udp_set_bit(CORK, sk);
 		} else {
-			WRITE_ONCE(up->corkflag, 0);
+			udp_clear_bit(CORK, sk);
 			lock_sock(sk);
 			push_pending_frames(sk);
 			release_sock(sk);
@@ -2783,7 +2783,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
 
 	switch (optname) {
 	case UDP_CORK:
-		val = READ_ONCE(up->corkflag);
+		val = udp_test_bit(CORK, sk);
 		break;
 
 	case UDP_ENCAP:
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 86b5d509a468..0c6973cd22ce 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1332,7 +1332,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	int addr_len = msg->msg_namelen;
 	bool connected = false;
 	int ulen = len;
-	int corkreq = READ_ONCE(up->corkflag) || msg->msg_flags&MSG_MORE;
+	int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE;
 	int err;
 	int is_udplite = IS_UDPLITE(sk);
 	int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
@@ -1644,11 +1644,11 @@ static void udpv6_splice_eof(struct socket *sock)
 	struct sock *sk = sock->sk;
 	struct udp_sock *up = udp_sk(sk);
 
-	if (!up->pending || READ_ONCE(up->corkflag))
+	if (!up->pending || udp_test_bit(CORK, sk))
 		return;
 
 	lock_sock(sk);
-	if (up->pending && !READ_ONCE(up->corkflag))
+	if (up->pending && !udp_test_bit(CORK, sk))
 		udp_v6_push_pending_frames(sk);
 	release_sock(sk);
 }
-- 
cgit v1.2.3-70-g09d2


From a0002127cd746fcaa182ad3386ef6931c37f3bda Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 09:17:22 +0000
Subject: udp: move udp->no_check6_tx to udp->udp_flags

syzbot reported that udp->no_check6_tx can be read locklessly.
Use one atomic bit from udp->udp_flags

Fixes: 1c19448c9ba6 ("net: Make enabling of zero UDP6 csums more restrictive")
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/udp.h | 10 +++++-----
 net/ipv4/udp.c      |  4 ++--
 net/ipv6/udp.c      |  4 ++--
 3 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 23f0693e0d9c..e3f2a6c7ac1d 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -34,6 +34,7 @@ static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask)
 
 enum {
 	UDP_FLAGS_CORK,		/* Cork is required */
+	UDP_FLAGS_NO_CHECK6_TX, /* Send zero UDP6 checksums on TX? */
 };
 
 struct udp_sock {
@@ -47,8 +48,7 @@ struct udp_sock {
 
 	int		 pending;	/* Any pending frames ? */
 	__u8		 encap_type;	/* Is this an Encapsulation socket? */
-	unsigned char	 no_check6_tx:1,/* Send zero UDP6 checksums on TX? */
-			 no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */
+	unsigned char	 no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */
 			 encap_enabled:1, /* This socket enabled encap
 					   * processing; UDP tunnels and
 					   * different encapsulation layer set
@@ -115,7 +115,7 @@ struct udp_sock {
 
 static inline void udp_set_no_check6_tx(struct sock *sk, bool val)
 {
-	udp_sk(sk)->no_check6_tx = val;
+	udp_assign_bit(NO_CHECK6_TX, sk, val);
 }
 
 static inline void udp_set_no_check6_rx(struct sock *sk, bool val)
@@ -123,9 +123,9 @@ static inline void udp_set_no_check6_rx(struct sock *sk, bool val)
 	udp_sk(sk)->no_check6_rx = val;
 }
 
-static inline bool udp_get_no_check6_tx(struct sock *sk)
+static inline bool udp_get_no_check6_tx(const struct sock *sk)
 {
-	return udp_sk(sk)->no_check6_tx;
+	return udp_test_bit(NO_CHECK6_TX, sk);
 }
 
 static inline bool udp_get_no_check6_rx(struct sock *sk)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 9709f8a532dc..0c6998291c99 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2694,7 +2694,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	case UDP_NO_CHECK6_TX:
-		up->no_check6_tx = valbool;
+		udp_set_no_check6_tx(sk, valbool);
 		break;
 
 	case UDP_NO_CHECK6_RX:
@@ -2791,7 +2791,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	case UDP_NO_CHECK6_TX:
-		val = up->no_check6_tx;
+		val = udp_get_no_check6_tx(sk);
 		break;
 
 	case UDP_NO_CHECK6_RX:
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 0c6973cd22ce..469df0ca561f 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1241,7 +1241,7 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
 			kfree_skb(skb);
 			return -EINVAL;
 		}
-		if (udp_sk(sk)->no_check6_tx) {
+		if (udp_get_no_check6_tx(sk)) {
 			kfree_skb(skb);
 			return -EINVAL;
 		}
@@ -1262,7 +1262,7 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
 
 	if (is_udplite)
 		csum = udplite_csum(skb);
-	else if (udp_sk(sk)->no_check6_tx) {   /* UDP csum disabled */
+	else if (udp_get_no_check6_tx(sk)) {   /* UDP csum disabled */
 		skb->ip_summed = CHECKSUM_NONE;
 		goto send;
 	} else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
-- 
cgit v1.2.3-70-g09d2


From bcbc1b1de884647aa0318bf74eb7f293d72a1e40 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 09:17:23 +0000
Subject: udp: move udp->no_check6_rx to udp->udp_flags

syzbot reported that udp->no_check6_rx can be read locklessly.
Use one atomic bit from udp->udp_flags.

Fixes: 1c19448c9ba6 ("net: Make enabling of zero UDP6 csums more restrictive")
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/udp.h | 10 +++++-----
 net/ipv4/udp.c      |  4 ++--
 net/ipv6/udp.c      |  6 +++---
 3 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index e3f2a6c7ac1d..8d4c3835b1b2 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -35,6 +35,7 @@ static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask)
 enum {
 	UDP_FLAGS_CORK,		/* Cork is required */
 	UDP_FLAGS_NO_CHECK6_TX, /* Send zero UDP6 checksums on TX? */
+	UDP_FLAGS_NO_CHECK6_RX, /* Allow zero UDP6 checksums on RX? */
 };
 
 struct udp_sock {
@@ -48,8 +49,7 @@ struct udp_sock {
 
 	int		 pending;	/* Any pending frames ? */
 	__u8		 encap_type;	/* Is this an Encapsulation socket? */
-	unsigned char	 no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */
-			 encap_enabled:1, /* This socket enabled encap
+	unsigned char	 encap_enabled:1, /* This socket enabled encap
 					   * processing; UDP tunnels and
 					   * different encapsulation layer set
 					   * this
@@ -120,7 +120,7 @@ static inline void udp_set_no_check6_tx(struct sock *sk, bool val)
 
 static inline void udp_set_no_check6_rx(struct sock *sk, bool val)
 {
-	udp_sk(sk)->no_check6_rx = val;
+	udp_assign_bit(NO_CHECK6_RX, sk, val);
 }
 
 static inline bool udp_get_no_check6_tx(const struct sock *sk)
@@ -128,9 +128,9 @@ static inline bool udp_get_no_check6_tx(const struct sock *sk)
 	return udp_test_bit(NO_CHECK6_TX, sk);
 }
 
-static inline bool udp_get_no_check6_rx(struct sock *sk)
+static inline bool udp_get_no_check6_rx(const struct sock *sk)
 {
-	return udp_sk(sk)->no_check6_rx;
+	return udp_test_bit(NO_CHECK6_RX, sk);
 }
 
 static inline void udp_cmsg_recv(struct msghdr *msg, struct sock *sk,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 0c6998291c99..cb32826a1db2 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2698,7 +2698,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	case UDP_NO_CHECK6_RX:
-		up->no_check6_rx = valbool;
+		udp_set_no_check6_rx(sk, valbool);
 		break;
 
 	case UDP_SEGMENT:
@@ -2795,7 +2795,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	case UDP_NO_CHECK6_RX:
-		val = up->no_check6_rx;
+		val = udp_get_no_check6_rx(sk);
 		break;
 
 	case UDP_SEGMENT:
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 469df0ca561f..6e1ea3029260 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -858,7 +858,7 @@ start_lookup:
 		/* If zero checksum and no_check is not on for
 		 * the socket then skip it.
 		 */
-		if (!uh->check && !udp_sk(sk)->no_check6_rx)
+		if (!uh->check && !udp_get_no_check6_rx(sk))
 			continue;
 		if (!first) {
 			first = sk;
@@ -980,7 +980,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 		if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst))
 			udp6_sk_rx_dst_set(sk, dst);
 
-		if (!uh->check && !udp_sk(sk)->no_check6_rx) {
+		if (!uh->check && !udp_get_no_check6_rx(sk)) {
 			if (refcounted)
 				sock_put(sk);
 			goto report_csum_error;
@@ -1002,7 +1002,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 	/* Unicast */
 	sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
 	if (sk) {
-		if (!uh->check && !udp_sk(sk)->no_check6_rx)
+		if (!uh->check && !udp_get_no_check6_rx(sk))
 			goto report_csum_error;
 		return udp6_unicast_rcv_skb(sk, skb, uh);
 	}
-- 
cgit v1.2.3-70-g09d2


From e1dc0615c6b08ef36414f08c011965b8fb56198b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 09:17:24 +0000
Subject: udp: move udp->gro_enabled to udp->udp_flags

syzbot reported that udp->gro_enabled can be read locklessly.
Use one atomic bit from udp->udp_flags.

Fixes: e20cf8d3f1f7 ("udp: implement GRO for plain UDP sockets.")
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/udp.h    | 2 +-
 net/ipv4/udp.c         | 6 +++---
 net/ipv4/udp_offload.c | 4 ++--
 net/ipv6/udp.c         | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 8d4c3835b1b2..b344bd2e41fc 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -36,6 +36,7 @@ enum {
 	UDP_FLAGS_CORK,		/* Cork is required */
 	UDP_FLAGS_NO_CHECK6_TX, /* Send zero UDP6 checksums on TX? */
 	UDP_FLAGS_NO_CHECK6_RX, /* Allow zero UDP6 checksums on RX? */
+	UDP_FLAGS_GRO_ENABLED,	/* Request GRO aggregation */
 };
 
 struct udp_sock {
@@ -54,7 +55,6 @@ struct udp_sock {
 					   * different encapsulation layer set
 					   * this
 					   */
-			 gro_enabled:1,	/* Request GRO aggregation */
 			 accept_udp_l4:1,
 			 accept_udp_fraglist:1;
 /* indicator bits used by pcflag: */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index cb32826a1db2..1debc10a0f02 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1868,7 +1868,7 @@ try_again:
 						      (struct sockaddr *)sin);
 	}
 
-	if (udp_sk(sk)->gro_enabled)
+	if (udp_test_bit(GRO_ENABLED, sk))
 		udp_cmsg_recv(msg, sk, skb);
 
 	if (inet_cmsg_flags(inet))
@@ -2713,7 +2713,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 		/* when enabling GRO, accept the related GSO packet type */
 		if (valbool)
 			udp_tunnel_encap_enable(sk->sk_socket);
-		up->gro_enabled = valbool;
+		udp_assign_bit(GRO_ENABLED, sk, valbool);
 		up->accept_udp_l4 = valbool;
 		release_sock(sk);
 		break;
@@ -2803,7 +2803,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	case UDP_GRO:
-		val = up->gro_enabled;
+		val = udp_test_bit(GRO_ENABLED, sk);
 		break;
 
 	/* The following two cannot be changed on UDP sockets, the return is
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 0f46b3c2e4ac..6c95d28d0c4a 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -557,10 +557,10 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
 	NAPI_GRO_CB(skb)->is_flist = 0;
 	if (!sk || !udp_sk(sk)->gro_receive) {
 		if (skb->dev->features & NETIF_F_GRO_FRAGLIST)
-			NAPI_GRO_CB(skb)->is_flist = sk ? !udp_sk(sk)->gro_enabled : 1;
+			NAPI_GRO_CB(skb)->is_flist = sk ? !udp_test_bit(GRO_ENABLED, sk) : 1;
 
 		if ((!sk && (skb->dev->features & NETIF_F_GRO_UDP_FWD)) ||
-		    (sk && udp_sk(sk)->gro_enabled) || NAPI_GRO_CB(skb)->is_flist)
+		    (sk && udp_test_bit(GRO_ENABLED, sk)) || NAPI_GRO_CB(skb)->is_flist)
 			return call_gro_receive(udp_gro_receive_segment, head, skb);
 
 		/* no GRO, be sure flush the current packet */
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 6e1ea3029260..2c3281879b6d 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -413,7 +413,7 @@ try_again:
 						      (struct sockaddr *)sin6);
 	}
 
-	if (udp_sk(sk)->gro_enabled)
+	if (udp_test_bit(GRO_ENABLED, sk))
 		udp_cmsg_recv(msg, sk, skb);
 
 	if (np->rxopt.all)
-- 
cgit v1.2.3-70-g09d2


From f5f52f0884a595ff99ab1a608643fe4025fca2d5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 09:17:26 +0000
Subject: udp: move udp->accept_udp_{l4|fraglist} to udp->udp_flags

These are read locklessly, move them to udp_flags to fix data-races.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/udp.h | 16 +++++++++-------
 net/ipv4/udp.c      |  2 +-
 2 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index b344bd2e41fc..bb2b87adfbea 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -37,6 +37,8 @@ enum {
 	UDP_FLAGS_NO_CHECK6_TX, /* Send zero UDP6 checksums on TX? */
 	UDP_FLAGS_NO_CHECK6_RX, /* Allow zero UDP6 checksums on RX? */
 	UDP_FLAGS_GRO_ENABLED,	/* Request GRO aggregation */
+	UDP_FLAGS_ACCEPT_FRAGLIST,
+	UDP_FLAGS_ACCEPT_L4,
 };
 
 struct udp_sock {
@@ -50,13 +52,11 @@ struct udp_sock {
 
 	int		 pending;	/* Any pending frames ? */
 	__u8		 encap_type;	/* Is this an Encapsulation socket? */
-	unsigned char	 encap_enabled:1, /* This socket enabled encap
+	unsigned char	 encap_enabled:1; /* This socket enabled encap
 					   * processing; UDP tunnels and
 					   * different encapsulation layer set
 					   * this
 					   */
-			 accept_udp_l4:1,
-			 accept_udp_fraglist:1;
 /* indicator bits used by pcflag: */
 #define UDPLITE_BIT      0x1  		/* set by udplite proto init function */
 #define UDPLITE_SEND_CC  0x2  		/* set via udplite setsockopt         */
@@ -149,10 +149,12 @@ static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb)
 	if (!skb_is_gso(skb))
 		return false;
 
-	if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && !udp_sk(sk)->accept_udp_l4)
+	if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 &&
+	    !udp_test_bit(ACCEPT_L4, sk))
 		return true;
 
-	if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST && !udp_sk(sk)->accept_udp_fraglist)
+	if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST &&
+	    !udp_test_bit(ACCEPT_FRAGLIST, sk))
 		return true;
 
 	return false;
@@ -160,8 +162,8 @@ static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb)
 
 static inline void udp_allow_gso(struct sock *sk)
 {
-	udp_sk(sk)->accept_udp_l4 = 1;
-	udp_sk(sk)->accept_udp_fraglist = 1;
+	udp_set_bit(ACCEPT_L4, sk);
+	udp_set_bit(ACCEPT_FRAGLIST, sk);
 }
 
 #define udp_portaddr_for_each_entry(__sk, list) \
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index db43907b9a3e..75ba86a87bb6 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2716,7 +2716,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 		if (valbool)
 			udp_tunnel_encap_enable(sk->sk_socket);
 		udp_assign_bit(GRO_ENABLED, sk, valbool);
-		up->accept_udp_l4 = valbool;
+		udp_assign_bit(ACCEPT_L4, sk, valbool);
 		release_sock(sk);
 		break;
 
-- 
cgit v1.2.3-70-g09d2


From ac9a7f4ce5dda1472e8f44096f33066c6ec1a3b4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 09:17:27 +0000
Subject: udp: lockless UDP_ENCAP_L2TPINUDP / UDP_GRO

Move udp->encap_enabled to udp->udp_flags.

Add udp_test_and_set_bit() helper to allow lockless
udp_tunnel_encap_enable() implementation.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/udp.h        |  9 ++++-----
 include/net/udp_tunnel.h   |  9 +++------
 net/ipv4/udp.c             | 10 +++-------
 net/ipv4/udp_tunnel_core.c |  2 +-
 net/ipv6/udp.c             |  2 +-
 5 files changed, 12 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index bb2b87adfbea..0cf83270a4a2 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -39,6 +39,7 @@ enum {
 	UDP_FLAGS_GRO_ENABLED,	/* Request GRO aggregation */
 	UDP_FLAGS_ACCEPT_FRAGLIST,
 	UDP_FLAGS_ACCEPT_L4,
+	UDP_FLAGS_ENCAP_ENABLED, /* This socket enabled encap */
 };
 
 struct udp_sock {
@@ -52,11 +53,7 @@ struct udp_sock {
 
 	int		 pending;	/* Any pending frames ? */
 	__u8		 encap_type;	/* Is this an Encapsulation socket? */
-	unsigned char	 encap_enabled:1; /* This socket enabled encap
-					   * processing; UDP tunnels and
-					   * different encapsulation layer set
-					   * this
-					   */
+
 /* indicator bits used by pcflag: */
 #define UDPLITE_BIT      0x1  		/* set by udplite proto init function */
 #define UDPLITE_SEND_CC  0x2  		/* set via udplite setsockopt         */
@@ -104,6 +101,8 @@ struct udp_sock {
 	test_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags)
 #define udp_set_bit(nr, sk)			\
 	set_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags)
+#define udp_test_and_set_bit(nr, sk)		\
+	test_and_set_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags)
 #define udp_clear_bit(nr, sk)			\
 	clear_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags)
 #define udp_assign_bit(nr, sk, val)		\
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 0ca9b7a11baf..29251c3519cf 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -174,16 +174,13 @@ static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum)
 }
 #endif
 
-static inline void udp_tunnel_encap_enable(struct socket *sock)
+static inline void udp_tunnel_encap_enable(struct sock *sk)
 {
-	struct udp_sock *up = udp_sk(sock->sk);
-
-	if (up->encap_enabled)
+	if (udp_test_and_set_bit(ENCAP_ENABLED, sk))
 		return;
 
-	up->encap_enabled = 1;
 #if IS_ENABLED(CONFIG_IPV6)
-	if (sock->sk->sk_family == PF_INET6)
+	if (READ_ONCE(sk->sk_family) == PF_INET6)
 		ipv6_stub->udpv6_encap_enable();
 #endif
 	udp_encap_enable();
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 75ba86a87bb6..637a4faf9aff 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2618,7 +2618,7 @@ void udp_destroy_sock(struct sock *sk)
 			if (encap_destroy)
 				encap_destroy(sk);
 		}
-		if (up->encap_enabled)
+		if (udp_test_bit(ENCAP_ENABLED, sk))
 			static_branch_dec(&udp_encap_needed_key);
 	}
 }
@@ -2685,9 +2685,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 			fallthrough;
 		case UDP_ENCAP_L2TPINUDP:
 			up->encap_type = val;
-			lock_sock(sk);
-			udp_tunnel_encap_enable(sk->sk_socket);
-			release_sock(sk);
+			udp_tunnel_encap_enable(sk);
 			break;
 		default:
 			err = -ENOPROTOOPT;
@@ -2710,14 +2708,12 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	case UDP_GRO:
-		lock_sock(sk);
 
 		/* when enabling GRO, accept the related GSO packet type */
 		if (valbool)
-			udp_tunnel_encap_enable(sk->sk_socket);
+			udp_tunnel_encap_enable(sk);
 		udp_assign_bit(GRO_ENABLED, sk, valbool);
 		udp_assign_bit(ACCEPT_L4, sk, valbool);
-		release_sock(sk);
 		break;
 
 	/*
diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c
index 9b18f371af0d..1e7e4aecdc48 100644
--- a/net/ipv4/udp_tunnel_core.c
+++ b/net/ipv4/udp_tunnel_core.c
@@ -78,7 +78,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
 	udp_sk(sk)->gro_receive = cfg->gro_receive;
 	udp_sk(sk)->gro_complete = cfg->gro_complete;
 
-	udp_tunnel_encap_enable(sock);
+	udp_tunnel_encap_enable(sk);
 }
 EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock);
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 2c3281879b6d..90688877e900 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1670,7 +1670,7 @@ void udpv6_destroy_sock(struct sock *sk)
 			if (encap_destroy)
 				encap_destroy(sk);
 		}
-		if (up->encap_enabled) {
+		if (udp_test_bit(ENCAP_ENABLED, sk)) {
 			static_branch_dec(&udpv6_encap_needed_key);
 			udp_encap_disable();
 		}
-- 
cgit v1.2.3-70-g09d2


From 729549aa350c56a777bb342941ed4d69b6585769 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 09:17:29 +0000
Subject: udplite: remove UDPLITE_BIT

This flag is set but never read, we can remove it.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/udp.h | 5 ++---
 net/ipv4/udplite.c  | 1 -
 net/ipv6/udplite.c  | 1 -
 3 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 0cf83270a4a2..58156edec009 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -55,9 +55,8 @@ struct udp_sock {
 	__u8		 encap_type;	/* Is this an Encapsulation socket? */
 
 /* indicator bits used by pcflag: */
-#define UDPLITE_BIT      0x1  		/* set by udplite proto init function */
-#define UDPLITE_SEND_CC  0x2  		/* set via udplite setsockopt         */
-#define UDPLITE_RECV_CC  0x4		/* set via udplite setsocktopt        */
+#define UDPLITE_SEND_CC  0x1  		/* set via udplite setsockopt         */
+#define UDPLITE_RECV_CC  0x2		/* set via udplite setsocktopt        */
 	__u8		 pcflag;        /* marks socket as UDP-Lite if > 0    */
 	/*
 	 * Following member retains the information to create a UDP header
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 39ecdad1b50c..af37af3ab727 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -21,7 +21,6 @@ EXPORT_SYMBOL(udplite_table);
 static int udplite_sk_init(struct sock *sk)
 {
 	udp_init_sock(sk);
-	udp_sk(sk)->pcflag = UDPLITE_BIT;
 	pr_warn_once("UDP-Lite is deprecated and scheduled to be removed in 2025, "
 		     "please contact the netdev mailing list\n");
 	return 0;
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index 267d491e9707..a60bec9b14f1 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -17,7 +17,6 @@
 static int udplitev6_sk_init(struct sock *sk)
 {
 	udpv6_init_sock(sk);
-	udp_sk(sk)->pcflag = UDPLITE_BIT;
 	pr_warn_once("UDP-Lite is deprecated and scheduled to be removed in 2025, "
 		     "please contact the netdev mailing list\n");
 	return 0;
-- 
cgit v1.2.3-70-g09d2


From 882af43a0fc37e26d85fb0df0c9edd3bed928de4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 09:17:30 +0000
Subject: udplite: fix various data-races

udp->pcflag, udp->pcslen and udp->pcrlen reads/writes are racy.

Move udp->pcflag to udp->udp_flags for atomicity,
and add READ_ONCE()/WRITE_ONCE() annotations for pcslen and pcrlen.

Fixes: ba4e58eca8aa ("[NET]: Supporting UDP-Lite (RFC 3828) in Linux")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/udp.h   |  6 ++----
 include/net/udplite.h | 14 +++++++++-----
 net/ipv4/udp.c        | 21 +++++++++++----------
 net/ipv6/udp.c        |  9 +++++----
 4 files changed, 27 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 58156edec009..d04188714dca 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -40,6 +40,8 @@ enum {
 	UDP_FLAGS_ACCEPT_FRAGLIST,
 	UDP_FLAGS_ACCEPT_L4,
 	UDP_FLAGS_ENCAP_ENABLED, /* This socket enabled encap */
+	UDP_FLAGS_UDPLITE_SEND_CC, /* set via udplite setsockopt */
+	UDP_FLAGS_UDPLITE_RECV_CC, /* set via udplite setsockopt */
 };
 
 struct udp_sock {
@@ -54,10 +56,6 @@ struct udp_sock {
 	int		 pending;	/* Any pending frames ? */
 	__u8		 encap_type;	/* Is this an Encapsulation socket? */
 
-/* indicator bits used by pcflag: */
-#define UDPLITE_SEND_CC  0x1  		/* set via udplite setsockopt         */
-#define UDPLITE_RECV_CC  0x2		/* set via udplite setsocktopt        */
-	__u8		 pcflag;        /* marks socket as UDP-Lite if > 0    */
 	/*
 	 * Following member retains the information to create a UDP header
 	 * when the socket is uncorked.
diff --git a/include/net/udplite.h b/include/net/udplite.h
index bd33ff2b8f42..786919d29f8d 100644
--- a/include/net/udplite.h
+++ b/include/net/udplite.h
@@ -66,14 +66,18 @@ static inline int udplite_checksum_init(struct sk_buff *skb, struct udphdr *uh)
 /* Fast-path computation of checksum. Socket may not be locked. */
 static inline __wsum udplite_csum(struct sk_buff *skb)
 {
-	const struct udp_sock *up = udp_sk(skb->sk);
 	const int off = skb_transport_offset(skb);
+	const struct sock *sk = skb->sk;
 	int len = skb->len - off;
 
-	if ((up->pcflag & UDPLITE_SEND_CC) && up->pcslen < len) {
-		if (0 < up->pcslen)
-			len = up->pcslen;
-		udp_hdr(skb)->len = htons(up->pcslen);
+	if (udp_test_bit(UDPLITE_SEND_CC, sk)) {
+		u16 pcslen = READ_ONCE(udp_sk(sk)->pcslen);
+
+		if (pcslen < len) {
+			if (pcslen > 0)
+				len = pcslen;
+			udp_hdr(skb)->len = htons(pcslen);
+		}
 	}
 	skb->ip_summed = CHECKSUM_NONE;     /* no HW support for checksumming */
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 2eeab4af17a1..c3ff984b6354 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2120,7 +2120,8 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
 	/*
 	 * 	UDP-Lite specific tests, ignored on UDP sockets
 	 */
-	if ((up->pcflag & UDPLITE_RECV_CC)  &&  UDP_SKB_CB(skb)->partial_cov) {
+	if (udp_test_bit(UDPLITE_RECV_CC, sk) && UDP_SKB_CB(skb)->partial_cov) {
+		u16 pcrlen = READ_ONCE(up->pcrlen);
 
 		/*
 		 * MIB statistics other than incrementing the error count are
@@ -2133,7 +2134,7 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
 		 * delivery of packets with coverage values less than a value
 		 * provided by the application."
 		 */
-		if (up->pcrlen == 0) {          /* full coverage was set  */
+		if (pcrlen == 0) {          /* full coverage was set  */
 			net_dbg_ratelimited("UDPLite: partial coverage %d while full coverage %d requested\n",
 					    UDP_SKB_CB(skb)->cscov, skb->len);
 			goto drop;
@@ -2144,9 +2145,9 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
 		 * that it wants x while sender emits packets of smaller size y.
 		 * Therefore the above ...()->partial_cov statement is essential.
 		 */
-		if (UDP_SKB_CB(skb)->cscov  <  up->pcrlen) {
+		if (UDP_SKB_CB(skb)->cscov < pcrlen) {
 			net_dbg_ratelimited("UDPLite: coverage %d too small, need min %d\n",
-					    UDP_SKB_CB(skb)->cscov, up->pcrlen);
+					    UDP_SKB_CB(skb)->cscov, pcrlen);
 			goto drop;
 		}
 	}
@@ -2729,8 +2730,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 			val = 8;
 		else if (val > USHRT_MAX)
 			val = USHRT_MAX;
-		up->pcslen = val;
-		up->pcflag |= UDPLITE_SEND_CC;
+		WRITE_ONCE(up->pcslen, val);
+		udp_set_bit(UDPLITE_SEND_CC, sk);
 		break;
 
 	/* The receiver specifies a minimum checksum coverage value. To make
@@ -2743,8 +2744,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 			val = 8;
 		else if (val > USHRT_MAX)
 			val = USHRT_MAX;
-		up->pcrlen = val;
-		up->pcflag |= UDPLITE_RECV_CC;
+		WRITE_ONCE(up->pcrlen, val);
+		udp_set_bit(UDPLITE_RECV_CC, sk);
 		break;
 
 	default:
@@ -2808,11 +2809,11 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
 	/* The following two cannot be changed on UDP sockets, the return is
 	 * always 0 (which corresponds to the full checksum coverage of UDP). */
 	case UDPLITE_SEND_CSCOV:
-		val = up->pcslen;
+		val = READ_ONCE(up->pcslen);
 		break;
 
 	case UDPLITE_RECV_CSCOV:
-		val = up->pcrlen;
+		val = READ_ONCE(up->pcrlen);
 		break;
 
 	default:
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 0e79d189613b..f60ba4295435 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -727,16 +727,17 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
 	/*
 	 * UDP-Lite specific tests, ignored on UDP sockets (see net/ipv4/udp.c).
 	 */
-	if ((up->pcflag & UDPLITE_RECV_CC)  &&  UDP_SKB_CB(skb)->partial_cov) {
+	if (udp_test_bit(UDPLITE_RECV_CC, sk) && UDP_SKB_CB(skb)->partial_cov) {
+		u16 pcrlen = READ_ONCE(up->pcrlen);
 
-		if (up->pcrlen == 0) {          /* full coverage was set  */
+		if (pcrlen == 0) {          /* full coverage was set  */
 			net_dbg_ratelimited("UDPLITE6: partial coverage %d while full coverage %d requested\n",
 					    UDP_SKB_CB(skb)->cscov, skb->len);
 			goto drop;
 		}
-		if (UDP_SKB_CB(skb)->cscov  <  up->pcrlen) {
+		if (UDP_SKB_CB(skb)->cscov < pcrlen) {
 			net_dbg_ratelimited("UDPLITE6: coverage %d too small, need min %d\n",
-					    UDP_SKB_CB(skb)->cscov, up->pcrlen);
+					    UDP_SKB_CB(skb)->cscov, pcrlen);
 			goto drop;
 		}
 	}
-- 
cgit v1.2.3-70-g09d2


From b0adfba7ee770fef20b1b6d86706c28f7fccfb07 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:01:59 +0000
Subject: ipv6: lockless IPV6_UNICAST_HOPS implementation

Some np->hop_limit accesses are racy, when socket lock is not held.

Add missing annotations and switch to full lockless implementation.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h     | 12 +-----------
 include/net/ipv6.h       |  2 +-
 net/ipv6/ip6_output.c    |  2 +-
 net/ipv6/ipv6_sockglue.c | 20 +++++++++++---------
 net/ipv6/mcast.c         |  2 +-
 net/ipv6/ndisc.c         |  2 +-
 6 files changed, 16 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index af8a771a053c..c2e087071384 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -213,17 +213,7 @@ struct ipv6_pinfo {
 	__be32			flow_label;
 	__u32			frag_size;
 
-	/*
-	 * Packed in 16bits.
-	 * Omit one shift by putting the signed field at MSB.
-	 */
-#if defined(__BIG_ENDIAN_BITFIELD)
-	__s16			hop_limit:9;
-	__u16			__unused_1:7;
-#else
-	__u16			__unused_1:7;
-	__s16			hop_limit:9;
-#endif
+	s16			hop_limit;
 
 #if defined(__BIG_ENDIAN_BITFIELD)
 	/* Packed in 16bits. */
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index c6932d1a3fa8..2e8e7e31e02e 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -916,7 +916,7 @@ static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6,
 	if (ipv6_addr_is_multicast(&fl6->daddr))
 		hlimit = np->mcast_hops;
 	else
-		hlimit = np->hop_limit;
+		hlimit = READ_ONCE(np->hop_limit);
 	if (hlimit < 0)
 		hlimit = ip6_dst_hoplimit(dst);
 	return hlimit;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 54fc4c711f2c..1e16d56d8c38 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -309,7 +309,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 	 *	Fill in the IPv6 header
 	 */
 	if (np)
-		hlimit = np->hop_limit;
+		hlimit = READ_ONCE(np->hop_limit);
 	if (hlimit < 0)
 		hlimit = ip6_dst_hoplimit(dst);
 
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 0e2a0847b387..f27993a1470d 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -415,6 +415,16 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 	if (ip6_mroute_opt(optname))
 		return ip6_mroute_setsockopt(sk, optname, optval, optlen);
 
+	/* Handle options that can be set without locking the socket. */
+	switch (optname) {
+	case IPV6_UNICAST_HOPS:
+		if (optlen < sizeof(int))
+			return -EINVAL;
+		if (val > 255 || val < -1)
+			return -EINVAL;
+		WRITE_ONCE(np->hop_limit, val);
+		return 0;
+	}
 	if (needs_rtnl)
 		rtnl_lock();
 	sockopt_lock_sock(sk);
@@ -733,14 +743,6 @@ done:
 		}
 		break;
 	}
-	case IPV6_UNICAST_HOPS:
-		if (optlen < sizeof(int))
-			goto e_inval;
-		if (val > 255 || val < -1)
-			goto e_inval;
-		np->hop_limit = val;
-		retv = 0;
-		break;
 
 	case IPV6_MULTICAST_HOPS:
 		if (sk->sk_type == SOCK_STREAM)
@@ -1347,7 +1349,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		struct dst_entry *dst;
 
 		if (optname == IPV6_UNICAST_HOPS)
-			val = np->hop_limit;
+			val = READ_ONCE(np->hop_limit);
 		else
 			val = np->mcast_hops;
 
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 421264a69e97..4a7967623909 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -1716,7 +1716,7 @@ static void ip6_mc_hdr(const struct sock *sk, struct sk_buff *skb,
 
 	hdr->payload_len = htons(len);
 	hdr->nexthdr = proto;
-	hdr->hop_limit = inet6_sk(sk)->hop_limit;
+	hdr->hop_limit = READ_ONCE(inet6_sk(sk)->hop_limit);
 
 	hdr->saddr = *saddr;
 	hdr->daddr = *daddr;
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 553c8664e0a7..b554fd40bdc3 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -500,7 +500,7 @@ void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr,
 					      csum_partial(icmp6h,
 							   skb->len, 0));
 
-	ip6_nd_hdr(skb, saddr, daddr, inet6_sk(sk)->hop_limit, skb->len);
+	ip6_nd_hdr(skb, saddr, daddr, READ_ONCE(inet6_sk(sk)->hop_limit), skb->len);
 
 	rcu_read_lock();
 	idev = __in6_dev_get(dst->dev);
-- 
cgit v1.2.3-70-g09d2


From d986f52124e062753e33b6fe303be5904a997eac Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:02:00 +0000
Subject: ipv6: lockless IPV6_MULTICAST_LOOP implementation

Add inet6_{test|set|clear|assign}_bit() helpers.

Note that I am using bits from inet->inet_flags,
this might change in the future if we need more flags.

While solving data-races accessing np->mc_loop,
this patch also allows to implement lockless accesses
to np->mcast_hops in the following patch.

Also constify sk_mc_loop() argument.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h            | 18 ++++++++++++++----
 include/net/inet_sock.h         |  1 +
 include/net/sock.h              |  2 +-
 net/core/sock.c                 |  4 ++--
 net/ipv6/af_inet6.c             |  2 +-
 net/ipv6/ipv6_sockglue.c        | 18 ++++++++----------
 net/ipv6/ndisc.c                |  2 +-
 net/netfilter/ipvs/ip_vs_sync.c |  8 ++------
 8 files changed, 30 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index c2e087071384..68cf1ca94914 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -218,11 +218,9 @@ struct ipv6_pinfo {
 #if defined(__BIG_ENDIAN_BITFIELD)
 	/* Packed in 16bits. */
 	__s16			mcast_hops:9;
-	__u16			__unused_2:6,
-				mc_loop:1;
+	__u16			__unused_2:7,
 #else
-	__u16			mc_loop:1,
-				__unused_2:6;
+	__u16			__unused_2:7;
 	__s16			mcast_hops:9;
 #endif
 	int			ucast_oif;
@@ -283,6 +281,18 @@ struct ipv6_pinfo {
 	struct inet6_cork	cork;
 };
 
+/* We currently use available bits from inet_sk(sk)->inet_flags,
+ * this could change in the future.
+ */
+#define inet6_test_bit(nr, sk)			\
+	test_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags)
+#define inet6_set_bit(nr, sk)			\
+	set_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags)
+#define inet6_clear_bit(nr, sk)			\
+	clear_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags)
+#define inet6_assign_bit(nr, sk, val)		\
+	assign_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags, val)
+
 /* WARNING: don't change the layout of the members in {raw,udp,tcp}6_sock! */
 struct raw6_sock {
 	/* inet_sock has to be the first member of raw6_sock */
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 2de0e4d4a027..b5a9dca92fb4 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -268,6 +268,7 @@ enum {
 	INET_FLAGS_NODEFRAG	= 17,
 	INET_FLAGS_BIND_ADDRESS_NO_PORT = 18,
 	INET_FLAGS_DEFER_CONNECT = 19,
+	INET_FLAGS_MC6_LOOP	= 20,
 };
 
 /* cmsg flags for inet */
diff --git a/include/net/sock.h b/include/net/sock.h
index 676146e9d181..56ac1abadea5 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2238,7 +2238,7 @@ static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n)
 	}
 }
 
-bool sk_mc_loop(struct sock *sk);
+bool sk_mc_loop(const struct sock *sk);
 
 static inline bool sk_can_gso(const struct sock *sk)
 {
diff --git a/net/core/sock.c b/net/core/sock.c
index bb89b88bc1e8..213a62ac13f2 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -759,7 +759,7 @@ out:
 	return ret;
 }
 
-bool sk_mc_loop(struct sock *sk)
+bool sk_mc_loop(const struct sock *sk)
 {
 	if (dev_recursion_level())
 		return false;
@@ -771,7 +771,7 @@ bool sk_mc_loop(struct sock *sk)
 		return inet_test_bit(MC_LOOP, sk);
 #if IS_ENABLED(CONFIG_IPV6)
 	case AF_INET6:
-		return inet6_sk(sk)->mc_loop;
+		return inet6_test_bit(MC6_LOOP, sk);
 #endif
 	}
 	WARN_ON_ONCE(1);
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 368824fe9719..bbd4aa1b96d0 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -217,7 +217,7 @@ lookup_protocol:
 	inet_sk(sk)->pinet6 = np = inet6_sk_generic(sk);
 	np->hop_limit	= -1;
 	np->mcast_hops	= IPV6_DEFAULT_MCASTHOPS;
-	np->mc_loop	= 1;
+	inet6_set_bit(MC6_LOOP, sk);
 	np->mc_all	= 1;
 	np->pmtudisc	= IPV6_PMTUDISC_WANT;
 	np->repflow	= net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ESTABLISHED;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index f27993a1470d..755fac85a120 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -424,6 +424,13 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 			return -EINVAL;
 		WRITE_ONCE(np->hop_limit, val);
 		return 0;
+	case IPV6_MULTICAST_LOOP:
+		if (optlen < sizeof(int))
+			return -EINVAL;
+		if (val != valbool)
+			return -EINVAL;
+		inet6_assign_bit(MC6_LOOP, sk, valbool);
+		return 0;
 	}
 	if (needs_rtnl)
 		rtnl_lock();
@@ -755,15 +762,6 @@ done:
 		retv = 0;
 		break;
 
-	case IPV6_MULTICAST_LOOP:
-		if (optlen < sizeof(int))
-			goto e_inval;
-		if (val != valbool)
-			goto e_inval;
-		np->mc_loop = valbool;
-		retv = 0;
-		break;
-
 	case IPV6_UNICAST_IF:
 	{
 		struct net_device *dev = NULL;
@@ -1367,7 +1365,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 	}
 
 	case IPV6_MULTICAST_LOOP:
-		val = np->mc_loop;
+		val = inet6_test_bit(MC6_LOOP, sk);
 		break;
 
 	case IPV6_MULTICAST_IF:
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index b554fd40bdc3..679443d7ecb5 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1996,7 +1996,7 @@ static int __net_init ndisc_net_init(struct net *net)
 	np = inet6_sk(sk);
 	np->hop_limit = 255;
 	/* Do not loopback ndisc messages */
-	np->mc_loop = 0;
+	inet6_clear_bit(MC6_LOOP, sk);
 
 	return 0;
 }
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index da5af28ff57b..3c2251cabd04 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1298,17 +1298,13 @@ static void set_sock_size(struct sock *sk, int mode, int val)
 static void set_mcast_loop(struct sock *sk, u_char loop)
 {
 	/* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
-	lock_sock(sk);
 	inet_assign_bit(MC_LOOP, sk, loop);
 #ifdef CONFIG_IP_VS_IPV6
-	if (sk->sk_family == AF_INET6) {
-		struct ipv6_pinfo *np = inet6_sk(sk);
-
+	if (READ_ONCE(sk->sk_family) == AF_INET6) {
 		/* IPV6_MULTICAST_LOOP */
-		np->mc_loop = loop ? 1 : 0;
+		inet6_assign_bit(MC6_LOOP, sk, loop);
 	}
 #endif
-	release_sock(sk);
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 2da23eb07c91241d962f3ff05565065484cd8929 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:02:01 +0000
Subject: ipv6: lockless IPV6_MULTICAST_HOPS implementation

This fixes data-races around np->mcast_hops,
and make IPV6_MULTICAST_HOPS lockless.

Note that np->mcast_hops is never negative,
thus can fit an u8 field instead of s16.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h            |  9 +--------
 include/net/ipv6.h              |  2 +-
 net/dccp/ipv6.c                 |  2 +-
 net/ipv6/ipv6_sockglue.c        | 28 +++++++++++++++-------------
 net/ipv6/tcp_ipv6.c             |  3 ++-
 net/netfilter/ipvs/ip_vs_sync.c |  2 +-
 6 files changed, 21 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 68cf1ca94914..9cc278b5e4f4 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -214,15 +214,8 @@ struct ipv6_pinfo {
 	__u32			frag_size;
 
 	s16			hop_limit;
+	u8			mcast_hops;
 
-#if defined(__BIG_ENDIAN_BITFIELD)
-	/* Packed in 16bits. */
-	__s16			mcast_hops:9;
-	__u16			__unused_2:7,
-#else
-	__u16			__unused_2:7;
-	__s16			mcast_hops:9;
-#endif
 	int			ucast_oif;
 	int			mcast_oif;
 
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 2e8e7e31e02e..8a04a8985336 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -914,7 +914,7 @@ static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6,
 	int hlimit;
 
 	if (ipv6_addr_is_multicast(&fl6->daddr))
-		hlimit = np->mcast_hops;
+		hlimit = READ_ONCE(np->mcast_hops);
 	else
 		hlimit = READ_ONCE(np->hop_limit);
 	if (hlimit < 0)
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 33f6ccf6ba77..83617a16b98e 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -676,7 +676,7 @@ ipv6_pktoptions:
 		if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo)
 			np->mcast_oif = inet6_iif(opt_skb);
 		if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim)
-			np->mcast_hops = ipv6_hdr(opt_skb)->hop_limit;
+			WRITE_ONCE(np->mcast_hops, ipv6_hdr(opt_skb)->hop_limit);
 		if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass)
 			np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb));
 		if (np->repflow)
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 755fac85a120..5fff19a87c75 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -431,6 +431,16 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 			return -EINVAL;
 		inet6_assign_bit(MC6_LOOP, sk, valbool);
 		return 0;
+	case IPV6_MULTICAST_HOPS:
+		if (sk->sk_type == SOCK_STREAM)
+			return retv;
+		if (optlen < sizeof(int))
+			return -EINVAL;
+		if (val > 255 || val < -1)
+			return -EINVAL;
+		WRITE_ONCE(np->mcast_hops,
+			   val == -1 ? IPV6_DEFAULT_MCASTHOPS : val);
+		return 0;
 	}
 	if (needs_rtnl)
 		rtnl_lock();
@@ -751,16 +761,6 @@ done:
 		break;
 	}
 
-	case IPV6_MULTICAST_HOPS:
-		if (sk->sk_type == SOCK_STREAM)
-			break;
-		if (optlen < sizeof(int))
-			goto e_inval;
-		if (val > 255 || val < -1)
-			goto e_inval;
-		np->mcast_hops = (val == -1 ? IPV6_DEFAULT_MCASTHOPS : val);
-		retv = 0;
-		break;
 
 	case IPV6_UNICAST_IF:
 	{
@@ -1180,7 +1180,8 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 				put_cmsg(&msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info);
 			}
 			if (np->rxopt.bits.rxhlim) {
-				int hlim = np->mcast_hops;
+				int hlim = READ_ONCE(np->mcast_hops);
+
 				put_cmsg(&msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim);
 			}
 			if (np->rxopt.bits.rxtclass) {
@@ -1197,7 +1198,8 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 				put_cmsg(&msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info);
 			}
 			if (np->rxopt.bits.rxohlim) {
-				int hlim = np->mcast_hops;
+				int hlim = READ_ONCE(np->mcast_hops);
+
 				put_cmsg(&msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim);
 			}
 			if (np->rxopt.bits.rxflow) {
@@ -1349,7 +1351,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		if (optname == IPV6_UNICAST_HOPS)
 			val = READ_ONCE(np->hop_limit);
 		else
-			val = np->mcast_hops;
+			val = READ_ONCE(np->mcast_hops);
 
 		if (val < 0) {
 			rcu_read_lock();
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 3a88545a265d..54db5fab318b 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1542,7 +1542,8 @@ ipv6_pktoptions:
 		if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo)
 			np->mcast_oif = tcp_v6_iif(opt_skb);
 		if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim)
-			np->mcast_hops = ipv6_hdr(opt_skb)->hop_limit;
+			WRITE_ONCE(np->mcast_hops,
+				   ipv6_hdr(opt_skb)->hop_limit);
 		if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass)
 			np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb));
 		if (np->repflow)
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 3c2251cabd04..df1b33b61059 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1322,7 +1322,7 @@ static void set_mcast_ttl(struct sock *sk, u_char ttl)
 		struct ipv6_pinfo *np = inet6_sk(sk);
 
 		/* IPV6_MULTICAST_HOPS */
-		np->mcast_hops = ttl;
+		WRITE_ONCE(np->mcast_hops, ttl);
 	}
 #endif
 	release_sock(sk);
-- 
cgit v1.2.3-70-g09d2


From dcae74622c051b219ee628669a31716473efda2c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:02:04 +0000
Subject: ipv6: lockless IPV6_RECVERR_RFC4884 implementation

Move np->recverr_rfc4884 to an atomic flag to fix data-races.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h     |  1 -
 include/net/inet_sock.h  |  1 +
 net/ipv6/datagram.c      |  2 +-
 net/ipv6/ipv6_sockglue.c | 17 ++++++++---------
 4 files changed, 10 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 9cc278b5e4f4..0d2b0a1b2dae 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -256,7 +256,6 @@ struct ipv6_pinfo {
 				autoflowlabel:1,
 				autoflowlabel_set:1,
 				mc_all:1,
-				recverr_rfc4884:1,
 				rtalert_isolate:1;
 	__u8			min_hopcount;
 	__u8			tclass;
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index b5a9dca92fb4..8cf1f7b44234 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -269,6 +269,7 @@ enum {
 	INET_FLAGS_BIND_ADDRESS_NO_PORT = 18,
 	INET_FLAGS_DEFER_CONNECT = 19,
 	INET_FLAGS_MC6_LOOP	= 20,
+	INET_FLAGS_RECVERR6_RFC4884 = 21,
 };
 
 /* cmsg flags for inet */
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 41ebc4e57473..e81892814935 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -332,7 +332,7 @@ void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
 
 	__skb_pull(skb, payload - skb->data);
 
-	if (inet6_sk(sk)->recverr_rfc4884)
+	if (inet6_test_bit(RECVERR6_RFC4884, sk))
 		ipv6_icmp_error_rfc4884(skb, &serr->ee.ee_rfc4884);
 
 	skb_reset_transport_header(skb);
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index bbc8a009e05d..b65e73ac2ccd 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -462,6 +462,13 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 		 */
 		WRITE_ONCE(np->min_hopcount, val);
 		return 0;
+	case IPV6_RECVERR_RFC4884:
+		if (optlen < sizeof(int))
+			return -EINVAL;
+		if (val < 0 || val > 1)
+			return -EINVAL;
+		inet6_assign_bit(RECVERR6_RFC4884, sk, valbool);
+		return 0;
 	}
 	if (needs_rtnl)
 		rtnl_lock();
@@ -974,14 +981,6 @@ done:
 		np->rxopt.bits.recvfragsize = valbool;
 		retv = 0;
 		break;
-	case IPV6_RECVERR_RFC4884:
-		if (optlen < sizeof(int))
-			goto e_inval;
-		if (val < 0 || val > 1)
-			goto e_inval;
-		np->recverr_rfc4884 = valbool;
-		retv = 0;
-		break;
 	}
 
 unlock:
@@ -1462,7 +1461,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	case IPV6_RECVERR_RFC4884:
-		val = np->recverr_rfc4884;
+		val = inet6_test_bit(RECVERR6_RFC4884, sk);
 		break;
 
 	default:
-- 
cgit v1.2.3-70-g09d2


From 6559c0ff3bc27d7e4d447d31c1d7e8eae0e959f5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:02:05 +0000
Subject: ipv6: lockless IPV6_MULTICAST_ALL implementation

Move np->mc_all to an atomic flags to fix data-races.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h     |  1 -
 include/net/inet_sock.h  |  1 +
 net/ipv6/af_inet6.c      |  2 +-
 net/ipv6/ipv6_sockglue.c | 14 ++++++--------
 net/ipv6/mcast.c         |  2 +-
 5 files changed, 9 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 0d2b0a1b2dae..d88e91b7f0a3 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -255,7 +255,6 @@ struct ipv6_pinfo {
 				dontfrag:1,
 				autoflowlabel:1,
 				autoflowlabel_set:1,
-				mc_all:1,
 				rtalert_isolate:1;
 	__u8			min_hopcount;
 	__u8			tclass;
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 8cf1f7b44234..97e70a97dae8 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -270,6 +270,7 @@ enum {
 	INET_FLAGS_DEFER_CONNECT = 19,
 	INET_FLAGS_MC6_LOOP	= 20,
 	INET_FLAGS_RECVERR6_RFC4884 = 21,
+	INET_FLAGS_MC6_ALL	= 22,
 };
 
 /* cmsg flags for inet */
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index bbd4aa1b96d0..372fb7b9112c 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -218,7 +218,7 @@ lookup_protocol:
 	np->hop_limit	= -1;
 	np->mcast_hops	= IPV6_DEFAULT_MCASTHOPS;
 	inet6_set_bit(MC6_LOOP, sk);
-	np->mc_all	= 1;
+	inet6_set_bit(MC6_ALL, sk);
 	np->pmtudisc	= IPV6_PMTUDISC_WANT;
 	np->repflow	= net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ESTABLISHED;
 	sk->sk_ipv6only	= net->ipv6.sysctl.bindv6only;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index b65e73ac2ccd..7a181831f226 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -469,6 +469,11 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 			return -EINVAL;
 		inet6_assign_bit(RECVERR6_RFC4884, sk, valbool);
 		return 0;
+	case IPV6_MULTICAST_ALL:
+		if (optlen < sizeof(int))
+			return -EINVAL;
+		inet6_assign_bit(MC6_ALL, sk, valbool);
+		return 0;
 	}
 	if (needs_rtnl)
 		rtnl_lock();
@@ -890,13 +895,6 @@ done:
 			retv = ipv6_sock_ac_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr);
 		break;
 	}
-	case IPV6_MULTICAST_ALL:
-		if (optlen < sizeof(int))
-			goto e_inval;
-		np->mc_all = valbool;
-		retv = 0;
-		break;
-
 	case MCAST_JOIN_GROUP:
 	case MCAST_LEAVE_GROUP:
 		if (in_compat_syscall())
@@ -1372,7 +1370,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	case IPV6_MULTICAST_ALL:
-		val = np->mc_all;
+		val = inet6_test_bit(MC6_ALL, sk);
 		break;
 
 	case IPV6_UNICAST_IF:
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 4a7967623909..99e28b444a4c 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -642,7 +642,7 @@ bool inet6_mc_check(const struct sock *sk, const struct in6_addr *mc_addr,
 	}
 	if (!mc) {
 		rcu_read_unlock();
-		return np->mc_all;
+		return inet6_test_bit(MC6_ALL, sk);
 	}
 	psl = rcu_dereference(mc->sflist);
 	if (!psl) {
-- 
cgit v1.2.3-70-g09d2


From 5121516b0c4736b7977d977b239e36d23ec64401 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:02:06 +0000
Subject: ipv6: lockless IPV6_AUTOFLOWLABEL implementation

Move np->autoflowlabel and np->autoflowlabel_set in inet->inet_flags,
to fix data-races.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h     |  2 --
 include/net/inet_sock.h  |  2 ++
 include/net/ipv6.h       |  2 +-
 net/ipv6/ip6_output.c    | 12 +++++-------
 net/ipv6/ipv6_sockglue.c | 11 +++++------
 5 files changed, 13 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index d88e91b7f0a3..e3be5dc21b7d 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -253,8 +253,6 @@ struct ipv6_pinfo {
 						 * 100: prefer care-of address
 						 */
 				dontfrag:1,
-				autoflowlabel:1,
-				autoflowlabel_set:1,
 				rtalert_isolate:1;
 	__u8			min_hopcount;
 	__u8			tclass;
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 97e70a97dae8..f1af64a40673 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -271,6 +271,8 @@ enum {
 	INET_FLAGS_MC6_LOOP	= 20,
 	INET_FLAGS_RECVERR6_RFC4884 = 21,
 	INET_FLAGS_MC6_ALL	= 22,
+	INET_FLAGS_AUTOFLOWLABEL_SET = 23,
+	INET_FLAGS_AUTOFLOWLABEL = 24,
 };
 
 /* cmsg flags for inet */
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 8a04a8985336..4b6cbec059e2 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -428,7 +428,7 @@ int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq,
 			   int flags);
 int ip6_flowlabel_init(void);
 void ip6_flowlabel_cleanup(void);
-bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np);
+bool ip6_autoflowlabel(struct net *net, const struct sock *sk);
 
 static inline void fl6_sock_release(struct ip6_flowlabel *fl)
 {
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index ab7ede4a731a..47aa42f93ccd 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -232,12 +232,11 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(ip6_output);
 
-bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
+bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
 {
-	if (!np->autoflowlabel_set)
+	if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk))
 		return ip6_default_np_autolabel(net);
-	else
-		return np->autoflowlabel;
+	return inet6_test_bit(AUTOFLOWLABEL, sk);
 }
 
 /*
@@ -314,7 +313,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 		hlimit = ip6_dst_hoplimit(dst);
 
 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
-				ip6_autoflowlabel(net, np), fl6));
+				ip6_autoflowlabel(net, sk), fl6));
 
 	hdr->payload_len = htons(seg_len);
 	hdr->nexthdr = proto;
@@ -1938,7 +1937,6 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
 	struct sk_buff *skb, *tmp_skb;
 	struct sk_buff **tail_skb;
 	struct in6_addr *final_dst;
-	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct net *net = sock_net(sk);
 	struct ipv6hdr *hdr;
 	struct ipv6_txoptions *opt = v6_cork->opt;
@@ -1981,7 +1979,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
 
 	ip6_flow_hdr(hdr, v6_cork->tclass,
 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
-					ip6_autoflowlabel(net, np), fl6));
+					ip6_autoflowlabel(net, sk), fl6));
 	hdr->hop_limit = v6_cork->hop_limit;
 	hdr->nexthdr = proto;
 	hdr->saddr = fl6->saddr;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 7a181831f226..d5d428a695f7 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -474,6 +474,10 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 			return -EINVAL;
 		inet6_assign_bit(MC6_ALL, sk, valbool);
 		return 0;
+	case IPV6_AUTOFLOWLABEL:
+		inet6_assign_bit(AUTOFLOWLABEL, sk, valbool);
+		inet6_set_bit(AUTOFLOWLABEL_SET, sk);
+		return 0;
 	}
 	if (needs_rtnl)
 		rtnl_lock();
@@ -970,11 +974,6 @@ done:
 		np->dontfrag = valbool;
 		retv = 0;
 		break;
-	case IPV6_AUTOFLOWLABEL:
-		np->autoflowlabel = valbool;
-		np->autoflowlabel_set = 1;
-		retv = 0;
-		break;
 	case IPV6_RECVFRAGSIZE:
 		np->rxopt.bits.recvfragsize = valbool;
 		retv = 0;
@@ -1447,7 +1446,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	case IPV6_AUTOFLOWLABEL:
-		val = ip6_autoflowlabel(sock_net(sk), np);
+		val = ip6_autoflowlabel(sock_net(sk), sk);
 		break;
 
 	case IPV6_RECVFRAGSIZE:
-- 
cgit v1.2.3-70-g09d2


From 1086ca7cce292bb498d7f8f85f4593c9ef4902b7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:02:07 +0000
Subject: ipv6: lockless IPV6_DONTFRAG implementation

Move np->dontfrag flag to inet->inet_flags to fix data-races.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h     | 1 -
 include/net/inet_sock.h  | 1 +
 include/net/ipv6.h       | 6 +++---
 include/net/xfrm.h       | 2 +-
 net/ipv6/icmp.c          | 4 ++--
 net/ipv6/ip6_output.c    | 2 +-
 net/ipv6/ipv6_sockglue.c | 9 ++++-----
 net/ipv6/ping.c          | 2 +-
 net/ipv6/raw.c           | 2 +-
 net/ipv6/udp.c           | 2 +-
 net/l2tp/l2tp_ip6.c      | 2 +-
 11 files changed, 16 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index e3be5dc21b7d..57d563f1d4b1 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -252,7 +252,6 @@ struct ipv6_pinfo {
 						 * 010: prefer public address
 						 * 100: prefer care-of address
 						 */
-				dontfrag:1,
 				rtalert_isolate:1;
 	__u8			min_hopcount;
 	__u8			tclass;
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index f1af64a40673..ac75324e9e1e 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -273,6 +273,7 @@ enum {
 	INET_FLAGS_MC6_ALL	= 22,
 	INET_FLAGS_AUTOFLOWLABEL_SET = 23,
 	INET_FLAGS_AUTOFLOWLABEL = 24,
+	INET_FLAGS_DONTFRAG	= 25,
 };
 
 /* cmsg flags for inet */
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 4b6cbec059e2..5a1f2993680d 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -373,12 +373,12 @@ static inline void ipcm6_init(struct ipcm6_cookie *ipc6)
 }
 
 static inline void ipcm6_init_sk(struct ipcm6_cookie *ipc6,
-				 const struct ipv6_pinfo *np)
+				 const struct sock *sk)
 {
 	*ipc6 = (struct ipcm6_cookie) {
 		.hlimit = -1,
-		.tclass = np->tclass,
-		.dontfrag = np->dontfrag,
+		.tclass = inet6_sk(sk)->tclass,
+		.dontfrag = inet6_test_bit(DONTFRAG, sk),
 	};
 }
 
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 363c7d510554..98d7aa78adda 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -2166,7 +2166,7 @@ static inline bool xfrm6_local_dontfrag(const struct sock *sk)
 
 	proto = sk->sk_protocol;
 	if (proto == IPPROTO_UDP || proto == IPPROTO_RAW)
-		return inet6_sk(sk)->dontfrag;
+		return inet6_test_bit(DONTFRAG, sk);
 
 	return false;
 }
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 93a594a901d1..8fb4a791881a 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -588,7 +588,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 	else if (!fl6.flowi6_oif)
 		fl6.flowi6_oif = np->ucast_oif;
 
-	ipcm6_init_sk(&ipc6, np);
+	ipcm6_init_sk(&ipc6, sk);
 	ipc6.sockc.mark = mark;
 	fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
 
@@ -791,7 +791,7 @@ static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb)
 	msg.offset = 0;
 	msg.type = type;
 
-	ipcm6_init_sk(&ipc6, np);
+	ipcm6_init_sk(&ipc6, sk);
 	ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
 	ipc6.tclass = ipv6_get_dsfield(ipv6_hdr(skb));
 	ipc6.sockc.mark = mark;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 47aa42f93ccd..8851fe5d45a0 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -2092,7 +2092,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
 		return ERR_PTR(err);
 	}
 	if (ipc6->dontfrag < 0)
-		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
+		ipc6->dontfrag = inet6_test_bit(DONTFRAG, sk);
 
 	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
 				&current->task_frag, getfrag, from,
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index d5d428a695f7..33dd4dd872e6 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -478,6 +478,9 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 		inet6_assign_bit(AUTOFLOWLABEL, sk, valbool);
 		inet6_set_bit(AUTOFLOWLABEL_SET, sk);
 		return 0;
+	case IPV6_DONTFRAG:
+		inet6_assign_bit(DONTFRAG, sk, valbool);
+		return 0;
 	}
 	if (needs_rtnl)
 		rtnl_lock();
@@ -970,10 +973,6 @@ done:
 			goto e_inval;
 		retv = __ip6_sock_set_addr_preferences(sk, val);
 		break;
-	case IPV6_DONTFRAG:
-		np->dontfrag = valbool;
-		retv = 0;
-		break;
 	case IPV6_RECVFRAGSIZE:
 		np->rxopt.bits.recvfragsize = valbool;
 		retv = 0;
@@ -1442,7 +1441,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	case IPV6_DONTFRAG:
-		val = np->dontfrag;
+		val = inet6_test_bit(DONTFRAG, sk);
 		break;
 
 	case IPV6_AUTOFLOWLABEL:
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 5831aaa53d75..4444b61eb23b 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -118,7 +118,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	     l3mdev_master_ifindex_by_index(sock_net(sk), oif) != sk->sk_bound_dev_if))
 		return -EINVAL;
 
-	ipcm6_init_sk(&ipc6, np);
+	ipcm6_init_sk(&ipc6, sk);
 	ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags);
 	ipc6.sockc.mark = READ_ONCE(sk->sk_mark);
 
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 42fcec3ecf5e..cc9673c1809f 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -898,7 +898,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
 
 	if (ipc6.dontfrag < 0)
-		ipc6.dontfrag = np->dontfrag;
+		ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk);
 
 	if (msg->msg_flags&MSG_CONFIRM)
 		goto do_confirm;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index f60ba4295435..e4301500741a 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1595,7 +1595,7 @@ back_from_confirm:
 
 do_append_data:
 	if (ipc6.dontfrag < 0)
-		ipc6.dontfrag = np->dontfrag;
+		ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk);
 	up->len += ulen;
 	err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr),
 			      &ipc6, fl6, (struct rt6_info *)dst,
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index ed8ebb6f5909..40af2431e73a 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -621,7 +621,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
 
 	if (ipc6.dontfrag < 0)
-		ipc6.dontfrag = np->dontfrag;
+		ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk);
 
 	if (msg->msg_flags & MSG_CONFIRM)
 		goto do_confirm;
-- 
cgit v1.2.3-70-g09d2


From 3fa29971c69519629370b119b0b618ee88ade6b9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:02:08 +0000
Subject: ipv6: lockless IPV6_RECVERR implemetation

np->recverr is moved to inet->inet_flags to fix data-races.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h     |  3 +--
 include/net/inet_sock.h  |  1 +
 include/net/ipv6.h       |  4 +---
 net/dccp/ipv6.c          |  2 +-
 net/ipv4/ping.c          |  2 +-
 net/ipv6/datagram.c      |  6 ++----
 net/ipv6/ipv6_sockglue.c | 17 ++++++++---------
 net/ipv6/raw.c           | 10 +++++-----
 net/ipv6/tcp_ipv6.c      |  2 +-
 net/ipv6/udp.c           |  6 +++---
 net/sctp/ipv6.c          |  4 +---
 11 files changed, 25 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 57d563f1d4b1..53f4f1b97a78 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -243,8 +243,7 @@ struct ipv6_pinfo {
 	} rxopt;
 
 	/* sockopt flags */
-	__u16			recverr:1,
-	                        sndflow:1,
+	__u16			sndflow:1,
 				repflow:1,
 				pmtudisc:3,
 				padding:1,	/* 1 bit hole */
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index ac75324e9e1e..3b79bc759ff4 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -274,6 +274,7 @@ enum {
 	INET_FLAGS_AUTOFLOWLABEL_SET = 23,
 	INET_FLAGS_AUTOFLOWLABEL = 24,
 	INET_FLAGS_DONTFRAG	= 25,
+	INET_FLAGS_RECVERR6	= 26,
 };
 
 /* cmsg flags for inet */
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 5a1f2993680d..bd115980809f 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -1303,9 +1303,7 @@ static inline int ip6_sock_set_v6only(struct sock *sk)
 
 static inline void ip6_sock_set_recverr(struct sock *sk)
 {
-	lock_sock(sk);
-	inet6_sk(sk)->recverr = true;
-	release_sock(sk);
+	inet6_set_bit(RECVERR6, sk);
 }
 
 static inline int __ip6_sock_set_addr_preferences(struct sock *sk, int val)
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 83617a16b98e..e6c3d84c2b9e 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -185,7 +185,7 @@ static int dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		goto out;
 	}
 
-	if (!sock_owned_by_user(sk) && np->recverr) {
+	if (!sock_owned_by_user(sk) && inet6_test_bit(RECVERR6, sk)) {
 		sk->sk_err = err;
 		sk_error_report(sk);
 	} else {
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 75e0aee35eb7..bc01ad5fc01a 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -581,7 +581,7 @@ void ping_err(struct sk_buff *skb, int offset, u32 info)
 	 *	4.1.3.3.
 	 */
 	if ((family == AF_INET && !inet_test_bit(RECVERR, sk)) ||
-	    (family == AF_INET6 && !inet6_sk(sk)->recverr)) {
+	    (family == AF_INET6 && !inet6_test_bit(RECVERR6, sk))) {
 		if (!harderr || sk->sk_state != TCP_ESTABLISHED)
 			goto out;
 	} else {
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index e81892814935..74673a5eff31 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -305,11 +305,10 @@ static void ipv6_icmp_error_rfc4884(const struct sk_buff *skb,
 void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
 		     __be16 port, u32 info, u8 *payload)
 {
-	struct ipv6_pinfo *np  = inet6_sk(sk);
 	struct icmp6hdr *icmph = icmp6_hdr(skb);
 	struct sock_exterr_skb *serr;
 
-	if (!np->recverr)
+	if (!inet6_test_bit(RECVERR6, sk))
 		return;
 
 	skb = skb_clone(skb, GFP_ATOMIC);
@@ -344,12 +343,11 @@ EXPORT_SYMBOL_GPL(ipv6_icmp_error);
 
 void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info)
 {
-	const struct ipv6_pinfo *np = inet6_sk(sk);
 	struct sock_exterr_skb *serr;
 	struct ipv6hdr *iph;
 	struct sk_buff *skb;
 
-	if (!np->recverr)
+	if (!inet6_test_bit(RECVERR6, sk))
 		return;
 
 	skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC);
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 33dd4dd872e6..ec10b45c49c1 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -481,6 +481,13 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 	case IPV6_DONTFRAG:
 		inet6_assign_bit(DONTFRAG, sk, valbool);
 		return 0;
+	case IPV6_RECVERR:
+		if (optlen < sizeof(int))
+			return -EINVAL;
+		inet6_assign_bit(RECVERR6, sk, valbool);
+		if (!val)
+			skb_errqueue_purge(&sk->sk_error_queue);
+		return 0;
 	}
 	if (needs_rtnl)
 		rtnl_lock();
@@ -943,14 +950,6 @@ done:
 		np->pmtudisc = val;
 		retv = 0;
 		break;
-	case IPV6_RECVERR:
-		if (optlen < sizeof(int))
-			goto e_inval;
-		np->recverr = valbool;
-		if (!val)
-			skb_errqueue_purge(&sk->sk_error_queue);
-		retv = 0;
-		break;
 	case IPV6_FLOWINFO_SEND:
 		if (optlen < sizeof(int))
 			goto e_inval;
@@ -1380,7 +1379,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	case IPV6_RECVERR:
-		val = np->recverr;
+		val = inet6_test_bit(RECVERR6, sk);
 		break;
 
 	case IPV6_FLOWINFO_SEND:
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index cc9673c1809f..71f6bdccfa1f 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -291,6 +291,7 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb,
 	       struct inet6_skb_parm *opt,
 	       u8 type, u8 code, int offset, __be32 info)
 {
+	bool recverr = inet6_test_bit(RECVERR6, sk);
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	int err;
 	int harderr;
@@ -300,7 +301,7 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb,
 	   2. Socket is connected (otherwise the error indication
 	      is useless without recverr and error is hard.
 	 */
-	if (!np->recverr && sk->sk_state != TCP_ESTABLISHED)
+	if (!recverr && sk->sk_state != TCP_ESTABLISHED)
 		return;
 
 	harderr = icmpv6_err_convert(type, code, &err);
@@ -312,14 +313,14 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb,
 		ip6_sk_redirect(skb, sk);
 		return;
 	}
-	if (np->recverr) {
+	if (recverr) {
 		u8 *payload = skb->data;
 		if (!inet_test_bit(HDRINCL, sk))
 			payload += offset;
 		ipv6_icmp_error(sk, skb, err, 0, ntohl(info), payload);
 	}
 
-	if (np->recverr || harderr) {
+	if (recverr || harderr) {
 		sk->sk_err = err;
 		sk_error_report(sk);
 	}
@@ -587,7 +588,6 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
 			struct flowi6 *fl6, struct dst_entry **dstp,
 			unsigned int flags, const struct sockcm_cookie *sockc)
 {
-	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct net *net = sock_net(sk);
 	struct ipv6hdr *iph;
 	struct sk_buff *skb;
@@ -668,7 +668,7 @@ out:
 error:
 	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
 error_check:
-	if (err == -ENOBUFS && !np->recverr)
+	if (err == -ENOBUFS && !inet6_test_bit(RECVERR6, sk))
 		err = 0;
 	return err;
 }
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 54db5fab318b..b5954b136b57 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -508,7 +508,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			tcp_ld_RTO_revert(sk, seq);
 	}
 
-	if (!sock_owned_by_user(sk) && np->recverr) {
+	if (!sock_owned_by_user(sk) && inet6_test_bit(RECVERR6, sk)) {
 		WRITE_ONCE(sk->sk_err, err);
 		sk_error_report(sk);
 	} else {
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index e4301500741a..90e873689b88 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -619,7 +619,7 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		goto out;
 	}
 
-	if (!np->recverr) {
+	if (!inet6_test_bit(RECVERR6, sk)) {
 		if (!harderr || sk->sk_state != TCP_ESTABLISHED)
 			goto out;
 	} else {
@@ -1283,7 +1283,7 @@ csum_partial:
 send:
 	err = ip6_send_skb(skb);
 	if (err) {
-		if (err == -ENOBUFS && !inet6_sk(sk)->recverr) {
+		if (err == -ENOBUFS && !inet6_test_bit(RECVERR6, sk)) {
 			UDP6_INC_STATS(sock_net(sk),
 				       UDP_MIB_SNDBUFERRORS, is_udplite);
 			err = 0;
@@ -1608,7 +1608,7 @@ do_append_data:
 		up->pending = 0;
 
 	if (err > 0)
-		err = np->recverr ? net_xmit_errno(err) : 0;
+		err = inet6_test_bit(RECVERR6, sk) ? net_xmit_errno(err) : 0;
 	release_sock(sk);
 
 out:
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 43f2731bf590..42b5b853ea01 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -128,7 +128,6 @@ static void sctp_v6_err_handle(struct sctp_transport *t, struct sk_buff *skb,
 {
 	struct sctp_association *asoc = t->asoc;
 	struct sock *sk = asoc->base.sk;
-	struct ipv6_pinfo *np;
 	int err = 0;
 
 	switch (type) {
@@ -149,9 +148,8 @@ static void sctp_v6_err_handle(struct sctp_transport *t, struct sk_buff *skb,
 		break;
 	}
 
-	np = inet6_sk(sk);
 	icmpv6_err_convert(type, code, &err);
-	if (!sock_owned_by_user(sk) && np->recverr) {
+	if (!sock_owned_by_user(sk) && inet6_test_bit(RECVERR6, sk)) {
 		sk->sk_err = err;
 		sk_error_report(sk);
 	} else {
-- 
cgit v1.2.3-70-g09d2


From 3cccda8db2cf2f2a224d55d5b6e2251d478c58ca Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:02:09 +0000
Subject: ipv6: move np->repflow to atomic flags

Move np->repflow to inet->inet_flags to fix data-races.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h     |  1 -
 include/net/inet_sock.h  |  1 +
 net/dccp/ipv6.c          |  2 +-
 net/ipv6/af_inet6.c      |  3 ++-
 net/ipv6/ip6_flowlabel.c |  8 ++++----
 net/ipv6/tcp_ipv6.c      | 14 ++++++--------
 6 files changed, 14 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 53f4f1b97a78..e62413371ea4 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -244,7 +244,6 @@ struct ipv6_pinfo {
 
 	/* sockopt flags */
 	__u16			sndflow:1,
-				repflow:1,
 				pmtudisc:3,
 				padding:1,	/* 1 bit hole */
 				srcprefs:3,	/* 001: prefer temporary address
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 3b79bc759ff4..5d61c7dc6577 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -275,6 +275,7 @@ enum {
 	INET_FLAGS_AUTOFLOWLABEL = 24,
 	INET_FLAGS_DONTFRAG	= 25,
 	INET_FLAGS_RECVERR6	= 26,
+	INET_FLAGS_REPFLOW	= 27,
 };
 
 /* cmsg flags for inet */
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index e6c3d84c2b9e..d7e63eea705d 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -679,7 +679,7 @@ ipv6_pktoptions:
 			WRITE_ONCE(np->mcast_hops, ipv6_hdr(opt_skb)->hop_limit);
 		if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass)
 			np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb));
-		if (np->repflow)
+		if (inet6_test_bit(REPFLOW, sk))
 			np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb));
 		if (ipv6_opt_accepted(sk, opt_skb,
 				      &DCCP_SKB_CB(opt_skb)->header.h6)) {
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 372fb7b9112c..48737363377f 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -220,7 +220,8 @@ lookup_protocol:
 	inet6_set_bit(MC6_LOOP, sk);
 	inet6_set_bit(MC6_ALL, sk);
 	np->pmtudisc	= IPV6_PMTUDISC_WANT;
-	np->repflow	= net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ESTABLISHED;
+	inet6_assign_bit(REPFLOW, sk, net->ipv6.sysctl.flowlabel_reflect &
+				     FLOWLABEL_REFLECT_ESTABLISHED);
 	sk->sk_ipv6only	= net->ipv6.sysctl.bindv6only;
 	sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash);
 
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index b3ca4beb4405..eca07e10e21f 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -513,7 +513,7 @@ int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq,
 		return 0;
 	}
 
-	if (np->repflow) {
+	if (inet6_test_bit(REPFLOW, sk)) {
 		freq->flr_label = np->flow_label;
 		return 0;
 	}
@@ -551,10 +551,10 @@ static int ipv6_flowlabel_put(struct sock *sk, struct in6_flowlabel_req *freq)
 	if (freq->flr_flags & IPV6_FL_F_REFLECT) {
 		if (sk->sk_protocol != IPPROTO_TCP)
 			return -ENOPROTOOPT;
-		if (!np->repflow)
+		if (!inet6_test_bit(REPFLOW, sk))
 			return -ESRCH;
 		np->flow_label = 0;
-		np->repflow = 0;
+		inet6_clear_bit(REPFLOW, sk);
 		return 0;
 	}
 
@@ -626,7 +626,7 @@ static int ipv6_flowlabel_get(struct sock *sk, struct in6_flowlabel_req *freq,
 
 		if (sk->sk_protocol != IPPROTO_TCP)
 			return -ENOPROTOOPT;
-		np->repflow = 1;
+		inet6_set_bit(REPFLOW, sk);
 		return 0;
 	}
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index b5954b136b57..201caf88bb99 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -548,7 +548,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
 				    &ireq->ir_v6_rmt_addr);
 
 		fl6->daddr = ireq->ir_v6_rmt_addr;
-		if (np->repflow && ireq->pktopts)
+		if (inet6_test_bit(REPFLOW, sk) && ireq->pktopts)
 			fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts));
 
 		tclass = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
@@ -797,7 +797,7 @@ static void tcp_v6_init_req(struct request_sock *req,
 	    (ipv6_opt_accepted(sk_listener, skb, &TCP_SKB_CB(skb)->header.h6) ||
 	     np->rxopt.bits.rxinfo ||
 	     np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim ||
-	     np->rxopt.bits.rxohlim || np->repflow)) {
+	     np->rxopt.bits.rxohlim || inet6_test_bit(REPFLOW, sk_listener))) {
 		refcount_inc(&skb->users);
 		ireq->pktopts = skb;
 	}
@@ -1055,10 +1055,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
 	if (sk) {
 		oif = sk->sk_bound_dev_if;
 		if (sk_fullsock(sk)) {
-			const struct ipv6_pinfo *np = tcp_inet6_sk(sk);
-
 			trace_tcp_send_reset(sk, skb);
-			if (np->repflow)
+			if (inet6_test_bit(REPFLOW, sk))
 				label = ip6_flowlabel(ipv6h);
 			priority = sk->sk_priority;
 			txhash = sk->sk_txhash;
@@ -1247,7 +1245,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
 		newnp->mcast_oif   = inet_iif(skb);
 		newnp->mcast_hops  = ip_hdr(skb)->ttl;
 		newnp->rcv_flowinfo = 0;
-		if (np->repflow)
+		if (inet6_test_bit(REPFLOW, sk))
 			newnp->flow_label = 0;
 
 		/*
@@ -1320,7 +1318,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
 	newnp->mcast_oif  = tcp_v6_iif(skb);
 	newnp->mcast_hops = ipv6_hdr(skb)->hop_limit;
 	newnp->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(skb));
-	if (np->repflow)
+	if (inet6_test_bit(REPFLOW, sk))
 		newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb));
 
 	/* Set ToS of the new socket based upon the value of incoming SYN.
@@ -1546,7 +1544,7 @@ ipv6_pktoptions:
 				   ipv6_hdr(opt_skb)->hop_limit);
 		if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass)
 			np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb));
-		if (np->repflow)
+		if (inet6_test_bit(REPFLOW, sk))
 			np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb));
 		if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) {
 			tcp_v6_restore_cb(opt_skb);
-- 
cgit v1.2.3-70-g09d2


From 83cd5eb654b320c1972254f243531f3f3cebcccf Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:02:10 +0000
Subject: ipv6: lockless IPV6_ROUTER_ALERT_ISOLATE implementation

Reads from np->rtalert_isolate are racy.

Move this flag to inet->inet_flags to fix data-races.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h     |  3 +--
 include/net/inet_sock.h  |  1 +
 net/ipv6/ip6_output.c    |  3 +--
 net/ipv6/ipv6_sockglue.c | 13 ++++++-------
 4 files changed, 9 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index e62413371ea4..f288a35f157f 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -246,11 +246,10 @@ struct ipv6_pinfo {
 	__u16			sndflow:1,
 				pmtudisc:3,
 				padding:1,	/* 1 bit hole */
-				srcprefs:3,	/* 001: prefer temporary address
+				srcprefs:3;	/* 001: prefer temporary address
 						 * 010: prefer public address
 						 * 100: prefer care-of address
 						 */
-				rtalert_isolate:1;
 	__u8			min_hopcount;
 	__u8			tclass;
 	__be32			rcv_flowinfo;
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 5d61c7dc6577..befee0f66c05 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -276,6 +276,7 @@ enum {
 	INET_FLAGS_DONTFRAG	= 25,
 	INET_FLAGS_RECVERR6	= 26,
 	INET_FLAGS_REPFLOW	= 27,
+	INET_FLAGS_RTALERT_ISOLATE = 28,
 };
 
 /* cmsg flags for inet */
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 8851fe5d45a0..f87d8491d7e2 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -368,9 +368,8 @@ static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 		if (sk && ra->sel == sel &&
 		    (!sk->sk_bound_dev_if ||
 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
-			struct ipv6_pinfo *np = inet6_sk(sk);
 
-			if (np && np->rtalert_isolate &&
+			if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
 			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
 				continue;
 			}
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index ec10b45c49c1..c22a492e0536 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -488,6 +488,11 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 		if (!val)
 			skb_errqueue_purge(&sk->sk_error_queue);
 		return 0;
+	case IPV6_ROUTER_ALERT_ISOLATE:
+		if (optlen < sizeof(int))
+			return -EINVAL;
+		inet6_assign_bit(RTALERT_ISOLATE, sk, valbool);
+		return 0;
 	}
 	if (needs_rtnl)
 		rtnl_lock();
@@ -936,12 +941,6 @@ done:
 			goto e_inval;
 		retv = ip6_ra_control(sk, val);
 		break;
-	case IPV6_ROUTER_ALERT_ISOLATE:
-		if (optlen < sizeof(int))
-			goto e_inval;
-		np->rtalert_isolate = valbool;
-		retv = 0;
-		break;
 	case IPV6_MTU_DISCOVER:
 		if (optlen < sizeof(int))
 			goto e_inval;
@@ -1452,7 +1451,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	case IPV6_ROUTER_ALERT_ISOLATE:
-		val = np->rtalert_isolate;
+		val = inet6_test_bit(RTALERT_ISOLATE, sk);
 		break;
 
 	case IPV6_RECVERR_RFC4884:
-- 
cgit v1.2.3-70-g09d2


From 6b724bc4300b431443f3b99520994a5aece347cd Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:02:11 +0000
Subject: ipv6: lockless IPV6_MTU_DISCOVER implementation

Most np->pmtudisc reads are racy.

Move this 3bit field on a full byte, add annotations
and make IPV6_MTU_DISCOVER setsockopt() lockless.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h            |  5 ++---
 include/net/ip6_route.h         | 14 +++++++++-----
 net/ipv6/ip6_output.c           |  4 ++--
 net/ipv6/ipv6_sockglue.c        | 17 ++++++++---------
 net/ipv6/raw.c                  |  2 +-
 net/ipv6/udp.c                  |  2 +-
 net/netfilter/ipvs/ip_vs_sync.c |  2 +-
 7 files changed, 24 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index f288a35f157f..10f521a6a9c8 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -243,13 +243,12 @@ struct ipv6_pinfo {
 	} rxopt;
 
 	/* sockopt flags */
-	__u16			sndflow:1,
-				pmtudisc:3,
-				padding:1,	/* 1 bit hole */
+	__u8			sndflow:1,
 				srcprefs:3;	/* 001: prefer temporary address
 						 * 010: prefer public address
 						 * 100: prefer care-of address
 						 */
+	__u8			pmtudisc;
 	__u8			min_hopcount;
 	__u8			tclass;
 	__be32			rcv_flowinfo;
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index b32539bb0fb0..b1ea49900b4a 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -266,7 +266,7 @@ static inline unsigned int ip6_skb_dst_mtu(const struct sk_buff *skb)
 	const struct dst_entry *dst = skb_dst(skb);
 	unsigned int mtu;
 
-	if (np && np->pmtudisc >= IPV6_PMTUDISC_PROBE) {
+	if (np && READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE) {
 		mtu = READ_ONCE(dst->dev->mtu);
 		mtu -= lwtunnel_headroom(dst->lwtstate, mtu);
 	} else {
@@ -277,14 +277,18 @@ static inline unsigned int ip6_skb_dst_mtu(const struct sk_buff *skb)
 
 static inline bool ip6_sk_accept_pmtu(const struct sock *sk)
 {
-	return inet6_sk(sk)->pmtudisc != IPV6_PMTUDISC_INTERFACE &&
-	       inet6_sk(sk)->pmtudisc != IPV6_PMTUDISC_OMIT;
+	u8 pmtudisc = READ_ONCE(inet6_sk(sk)->pmtudisc);
+
+	return pmtudisc != IPV6_PMTUDISC_INTERFACE &&
+	       pmtudisc != IPV6_PMTUDISC_OMIT;
 }
 
 static inline bool ip6_sk_ignore_df(const struct sock *sk)
 {
-	return inet6_sk(sk)->pmtudisc < IPV6_PMTUDISC_DO ||
-	       inet6_sk(sk)->pmtudisc == IPV6_PMTUDISC_OMIT;
+	u8 pmtudisc = READ_ONCE(inet6_sk(sk)->pmtudisc);
+
+	return pmtudisc < IPV6_PMTUDISC_DO ||
+	       pmtudisc == IPV6_PMTUDISC_OMIT;
 }
 
 static inline const struct in6_addr *rt6_nexthop(const struct rt6_info *rt,
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index f87d8491d7e2..7e5d9eeb990f 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1436,10 +1436,10 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
 	v6_cork->hop_limit = ipc6->hlimit;
 	v6_cork->tclass = ipc6->tclass;
 	if (rt->dst.flags & DST_XFRM_TUNNEL)
-		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
+		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
 	else
-		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
+		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
 
 	frag_size = READ_ONCE(np->frag_size);
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index c22a492e0536..85ea42644dcb 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -493,6 +493,13 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 			return -EINVAL;
 		inet6_assign_bit(RTALERT_ISOLATE, sk, valbool);
 		return 0;
+	case IPV6_MTU_DISCOVER:
+		if (optlen < sizeof(int))
+			return -EINVAL;
+		if (val < IPV6_PMTUDISC_DONT || val > IPV6_PMTUDISC_OMIT)
+			return -EINVAL;
+		WRITE_ONCE(np->pmtudisc, val);
+		return 0;
 	}
 	if (needs_rtnl)
 		rtnl_lock();
@@ -941,14 +948,6 @@ done:
 			goto e_inval;
 		retv = ip6_ra_control(sk, val);
 		break;
-	case IPV6_MTU_DISCOVER:
-		if (optlen < sizeof(int))
-			goto e_inval;
-		if (val < IPV6_PMTUDISC_DONT || val > IPV6_PMTUDISC_OMIT)
-			goto e_inval;
-		np->pmtudisc = val;
-		retv = 0;
-		break;
 	case IPV6_FLOWINFO_SEND:
 		if (optlen < sizeof(int))
 			goto e_inval;
@@ -1374,7 +1373,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	case IPV6_MTU_DISCOVER:
-		val = np->pmtudisc;
+		val = READ_ONCE(np->pmtudisc);
 		break;
 
 	case IPV6_RECVERR:
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 71f6bdccfa1f..47372cceb98f 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -307,7 +307,7 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb,
 	harderr = icmpv6_err_convert(type, code, &err);
 	if (type == ICMPV6_PKT_TOOBIG) {
 		ip6_sk_update_pmtu(skb, sk, info);
-		harderr = (np->pmtudisc == IPV6_PMTUDISC_DO);
+		harderr = (READ_ONCE(np->pmtudisc) == IPV6_PMTUDISC_DO);
 	}
 	if (type == NDISC_REDIRECT) {
 		ip6_sk_redirect(skb, sk);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 90e873689b88..c17e19fece1b 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -598,7 +598,7 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		if (!ip6_sk_accept_pmtu(sk))
 			goto out;
 		ip6_sk_update_pmtu(skb, sk, info);
-		if (np->pmtudisc != IPV6_PMTUDISC_DONT)
+		if (READ_ONCE(np->pmtudisc) != IPV6_PMTUDISC_DONT)
 			harderr = 1;
 	}
 	if (type == NDISC_REDIRECT) {
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index df1b33b61059..5820a8156c47 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1341,7 +1341,7 @@ static void set_mcast_pmtudisc(struct sock *sk, int val)
 		struct ipv6_pinfo *np = inet6_sk(sk);
 
 		/* IPV6_MTU_DISCOVER */
-		np->pmtudisc = val;
+		WRITE_ONCE(np->pmtudisc, val);
 	}
 #endif
 	release_sock(sk);
-- 
cgit v1.2.3-70-g09d2


From 859f8b265fc2a11af0fb0c52b4087e0409250592 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:02:12 +0000
Subject: ipv6: lockless IPV6_FLOWINFO_SEND implementation

np->sndflow reads are racy.

Use one bit ftom atomic inet->inet_flags instead,
IPV6_FLOWINFO_SEND setsockopt() can be lockless.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h     |  3 +--
 include/net/inet_sock.h  |  1 +
 net/dccp/ipv6.c          |  2 +-
 net/ipv4/ping.c          |  3 +--
 net/ipv6/af_inet6.c      |  2 +-
 net/ipv6/datagram.c      |  7 ++++---
 net/ipv6/ipv6_sockglue.c | 13 ++++++-------
 net/ipv6/ping.c          |  2 +-
 net/ipv6/raw.c           |  2 +-
 net/ipv6/tcp_ipv6.c      |  2 +-
 net/ipv6/udp.c           |  2 +-
 net/l2tp/l2tp_ip6.c      |  4 ++--
 net/sctp/ipv6.c          |  3 ++-
 13 files changed, 23 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 10f521a6a9c8..09253825c99c 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -243,8 +243,7 @@ struct ipv6_pinfo {
 	} rxopt;
 
 	/* sockopt flags */
-	__u8			sndflow:1,
-				srcprefs:3;	/* 001: prefer temporary address
+	__u8			srcprefs:3;	/* 001: prefer temporary address
 						 * 010: prefer public address
 						 * 100: prefer care-of address
 						 */
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index befee0f66c05..98e11958cdff 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -277,6 +277,7 @@ enum {
 	INET_FLAGS_RECVERR6	= 26,
 	INET_FLAGS_REPFLOW	= 27,
 	INET_FLAGS_RTALERT_ISOLATE = 28,
+	INET_FLAGS_SNDFLOW	= 29,
 };
 
 /* cmsg flags for inet */
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index d7e63eea705d..4803f0614848 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -844,7 +844,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 
 	memset(&fl6, 0, sizeof(fl6));
 
-	if (np->sndflow) {
+	if (inet6_test_bit(SNDFLOW, sk)) {
 		fl6.flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
 		IP6_ECN_flow_init(fl6.flowlabel);
 		if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) {
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index bc01ad5fc01a..4dd809b7b188 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -899,7 +899,6 @@ int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
 
 #if IS_ENABLED(CONFIG_IPV6)
 	} else if (family == AF_INET6) {
-		struct ipv6_pinfo *np = inet6_sk(sk);
 		struct ipv6hdr *ip6 = ipv6_hdr(skb);
 		DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
 
@@ -908,7 +907,7 @@ int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
 			sin6->sin6_port = 0;
 			sin6->sin6_addr = ip6->saddr;
 			sin6->sin6_flowinfo = 0;
-			if (np->sndflow)
+			if (inet6_test_bit(SNDFLOW, sk))
 				sin6->sin6_flowinfo = ip6_flowinfo(ip6);
 			sin6->sin6_scope_id =
 				ipv6_iface_scope_id(&sin6->sin6_addr,
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 48737363377f..c6ad0d6e99b5 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -537,7 +537,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
 		}
 		sin->sin6_port = inet->inet_dport;
 		sin->sin6_addr = sk->sk_v6_daddr;
-		if (np->sndflow)
+		if (inet6_test_bit(SNDFLOW, sk))
 			sin->sin6_flowinfo = np->flow_label;
 		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin,
 				       CGROUP_INET6_GETPEERNAME);
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 74673a5eff31..cc6a502db39d 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -80,7 +80,8 @@ int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr)
 	struct flowi6 fl6;
 	int err = 0;
 
-	if (np->sndflow && (np->flow_label & IPV6_FLOWLABEL_MASK)) {
+	if (inet6_test_bit(SNDFLOW, sk) &&
+	    (np->flow_label & IPV6_FLOWLABEL_MASK)) {
 		flowlabel = fl6_sock_lookup(sk, np->flow_label);
 		if (IS_ERR(flowlabel))
 			return -EINVAL;
@@ -163,7 +164,7 @@ int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr,
 	if (usin->sin6_family != AF_INET6)
 		return -EAFNOSUPPORT;
 
-	if (np->sndflow)
+	if (inet6_test_bit(SNDFLOW, sk))
 		fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
 
 	if (ipv6_addr_any(&usin->sin6_addr)) {
@@ -491,7 +492,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
 			const struct ipv6hdr *ip6h = container_of((struct in6_addr *)(nh + serr->addr_offset),
 								  struct ipv6hdr, daddr);
 			sin->sin6_addr = ip6h->daddr;
-			if (np->sndflow)
+			if (inet6_test_bit(SNDFLOW, sk))
 				sin->sin6_flowinfo = ip6_flowinfo(ip6h);
 			sin->sin6_scope_id =
 				ipv6_iface_scope_id(&sin->sin6_addr,
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 85ea42644dcb..e9dc6f881bb9 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -500,6 +500,11 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 			return -EINVAL;
 		WRITE_ONCE(np->pmtudisc, val);
 		return 0;
+	case IPV6_FLOWINFO_SEND:
+		if (optlen < sizeof(int))
+			return -EINVAL;
+		inet6_assign_bit(SNDFLOW, sk, valbool);
+		return 0;
 	}
 	if (needs_rtnl)
 		rtnl_lock();
@@ -948,12 +953,6 @@ done:
 			goto e_inval;
 		retv = ip6_ra_control(sk, val);
 		break;
-	case IPV6_FLOWINFO_SEND:
-		if (optlen < sizeof(int))
-			goto e_inval;
-		np->sndflow = valbool;
-		retv = 0;
-		break;
 	case IPV6_FLOWLABEL_MGR:
 		retv = ipv6_flowlabel_opt(sk, optval, optlen);
 		break;
@@ -1381,7 +1380,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	case IPV6_FLOWINFO_SEND:
-		val = np->sndflow;
+		val = inet6_test_bit(SNDFLOW, sk);
 		break;
 
 	case IPV6_FLOWLABEL_MGR:
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 4444b61eb23b..e8fb0d275cc2 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -89,7 +89,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 			return -EAFNOSUPPORT;
 		}
 		daddr = &(u->sin6_addr);
-		if (np->sndflow)
+		if (inet6_test_bit(SNDFLOW, sk))
 			fl6.flowlabel = u->sin6_flowinfo & IPV6_FLOWINFO_MASK;
 		if (__ipv6_addr_needs_scope_id(ipv6_addr_type(daddr)))
 			oif = u->sin6_scope_id;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 47372cceb98f..a2aa54a2baae 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -795,7 +795,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 			return -EINVAL;
 
 		daddr = &sin6->sin6_addr;
-		if (np->sndflow) {
+		if (inet6_test_bit(SNDFLOW, sk)) {
 			fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
 			if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
 				flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 201caf88bb99..94afb8d0f2d0 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -163,7 +163,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 
 	memset(&fl6, 0, sizeof(fl6));
 
-	if (np->sndflow) {
+	if (inet6_test_bit(SNDFLOW, sk)) {
 		fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK;
 		IP6_ECN_flow_init(fl6.flowlabel);
 		if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index c17e19fece1b..5e9312eefed0 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1429,7 +1429,7 @@ do_udp_sendmsg:
 		fl6->fl6_dport = sin6->sin6_port;
 		daddr = &sin6->sin6_addr;
 
-		if (np->sndflow) {
+		if (inet6_test_bit(SNDFLOW, sk)) {
 			fl6->flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
 			if (fl6->flowlabel & IPV6_FLOWLABEL_MASK) {
 				flowlabel = fl6_sock_lookup(sk, fl6->flowlabel);
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 40af2431e73a..44cfb72bbd18 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -431,7 +431,7 @@ static int l2tp_ip6_getname(struct socket *sock, struct sockaddr *uaddr,
 			return -ENOTCONN;
 		lsa->l2tp_conn_id = lsk->peer_conn_id;
 		lsa->l2tp_addr = sk->sk_v6_daddr;
-		if (np->sndflow)
+		if (inet6_test_bit(SNDFLOW, sk))
 			lsa->l2tp_flowinfo = np->flow_label;
 	} else {
 		if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
@@ -529,7 +529,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 			return -EAFNOSUPPORT;
 
 		daddr = &lsa->l2tp_addr;
-		if (np->sndflow) {
+		if (inet6_test_bit(SNDFLOW, sk)) {
 			fl6.flowlabel = lsa->l2tp_flowinfo & IPV6_FLOWINFO_MASK;
 			if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) {
 				flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 42b5b853ea01..5c0ed5909d85 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -296,7 +296,8 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
 	if (t->flowlabel & SCTP_FLOWLABEL_SET_MASK)
 		fl6->flowlabel = htonl(t->flowlabel & SCTP_FLOWLABEL_VAL_MASK);
 
-	if (np->sndflow && (fl6->flowlabel & IPV6_FLOWLABEL_MASK)) {
+	if (inet6_test_bit(SNDFLOW, sk) &&
+	    (fl6->flowlabel & IPV6_FLOWLABEL_MASK)) {
 		struct ip6_flowlabel *flowlabel;
 
 		flowlabel = fl6_sock_lookup(sk, fl6->flowlabel);
-- 
cgit v1.2.3-70-g09d2


From d609f3d228a8efe991f44f11f24146e2a5209755 Mon Sep 17 00:00:00 2001
From: Tirthendu Sarkar <tirthendu.sarkar@intel.com>
Date: Thu, 7 Sep 2023 09:20:32 +0530
Subject: xsk: add multi-buffer support for sockets sharing umem

Userspace applications indicate their multi-buffer capability to xsk
using XSK_USE_SG socket bind flag. For sockets using shared umem the
bind flag may contain XSK_USE_SG only for the first socket. For any
subsequent socket the only option supported is XDP_SHARED_UMEM.

Add option XDP_UMEM_SG_FLAG in umem config flags to store the
multi-buffer handling capability when indicated by XSK_USE_SG option in
bing flag by the first socket. Use this to derive multi-buffer capability
for subsequent sockets in xsk core.

Signed-off-by: Tirthendu Sarkar <tirthendu.sarkar@intel.com>
Fixes: 81470b5c3c66 ("xsk: introduce XSK_USE_SG bind flag for xsk socket")
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://lore.kernel.org/r/20230907035032.2627879-1-tirthendu.sarkar@intel.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/xdp_sock.h  | 2 ++
 net/xdp/xsk.c           | 2 +-
 net/xdp/xsk_buff_pool.c | 3 +++
 3 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 1617af380162..69b472604b86 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -14,6 +14,8 @@
 #include <linux/mm.h>
 #include <net/sock.h>
 
+#define XDP_UMEM_SG_FLAG (1 << 1)
+
 struct net_device;
 struct xsk_queue;
 struct xdp_buff;
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 55f8b9b0e06d..7482d0aca504 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -1228,7 +1228,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 
 	xs->dev = dev;
 	xs->zc = xs->umem->zc;
-	xs->sg = !!(flags & XDP_USE_SG);
+	xs->sg = !!(xs->umem->flags & XDP_UMEM_SG_FLAG);
 	xs->queue_id = qid;
 	xp_add_xsk(xs->pool, xs);
 
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index b3f7b310811e..49cb9f9a09be 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -170,6 +170,9 @@ int xp_assign_dev(struct xsk_buff_pool *pool,
 	if (err)
 		return err;
 
+	if (flags & XDP_USE_SG)
+		pool->umem->flags |= XDP_UMEM_SG_FLAG;
+
 	if (flags & XDP_USE_NEED_WAKEUP)
 		pool->uses_need_wakeup = true;
 	/* Tx needs to be explicitly woken up the first time.  Also
-- 
cgit v1.2.3-70-g09d2


From fc45c5b642dbcac3bb10f4f904e4b863233e5369 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Wed, 13 Sep 2023 10:13:48 -0700
Subject: bpf: make it easier to add new metadata kfunc

No functional changes.

Instead of having hand-crafted code in bpf_dev_bound_resolve_kfunc,
move kfunc <> xmo handler relationship into XDP_METADATA_KFUNC_xxx.
This way, any time new kfunc is added, we don't have to touch
bpf_dev_bound_resolve_kfunc.

Also document XDP_METADATA_KFUNC_xxx arguments since we now have
more than two and it might be confusing what is what.

Cc: netdev@vger.kernel.org
Cc: Willem de Bruijn <willemb@google.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/20230913171350.369987-2-sdf@google.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/net/xdp.h    | 16 ++++++++++++----
 kernel/bpf/offload.c |  9 +++++----
 net/core/xdp.c       |  4 ++--
 3 files changed, 19 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/net/xdp.h b/include/net/xdp.h
index de08c8e0d134..d59e12f8f311 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -383,14 +383,22 @@ void xdp_attachment_setup(struct xdp_attachment_info *info,
 
 #define DEV_MAP_BULK_SIZE XDP_BULK_QUEUE_SIZE
 
+/* Define the relationship between xdp-rx-metadata kfunc and
+ * various other entities:
+ * - xdp_rx_metadata enum
+ * - kfunc name
+ * - xdp_metadata_ops field
+ */
 #define XDP_METADATA_KFUNC_xxx	\
 	XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP, \
-			   bpf_xdp_metadata_rx_timestamp) \
+			   bpf_xdp_metadata_rx_timestamp, \
+			   xmo_rx_timestamp) \
 	XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_HASH, \
-			   bpf_xdp_metadata_rx_hash) \
+			   bpf_xdp_metadata_rx_hash, \
+			   xmo_rx_hash) \
 
-enum {
-#define XDP_METADATA_KFUNC(name, _) name,
+enum xdp_rx_metadata {
+#define XDP_METADATA_KFUNC(name, _, __) name,
 XDP_METADATA_KFUNC_xxx
 #undef XDP_METADATA_KFUNC
 MAX_XDP_METADATA_KFUNC,
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 3e4f2ec1af06..6aa6de8d715d 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -845,10 +845,11 @@ void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id)
 	if (!ops)
 		goto out;
 
-	if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP))
-		p = ops->xmo_rx_timestamp;
-	else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_HASH))
-		p = ops->xmo_rx_hash;
+#define XDP_METADATA_KFUNC(name, _, xmo) \
+	if (func_id == bpf_xdp_metadata_kfunc_id(name)) p = ops->xmo;
+	XDP_METADATA_KFUNC_xxx
+#undef XDP_METADATA_KFUNC
+
 out:
 	up_read(&bpf_devs_lock);
 
diff --git a/net/core/xdp.c b/net/core/xdp.c
index a70670fe9a2d..bab563b2f812 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -741,7 +741,7 @@ __bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash,
 __diag_pop();
 
 BTF_SET8_START(xdp_metadata_kfunc_ids)
-#define XDP_METADATA_KFUNC(_, name) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS)
+#define XDP_METADATA_KFUNC(_, name, __) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS)
 XDP_METADATA_KFUNC_xxx
 #undef XDP_METADATA_KFUNC
 BTF_SET8_END(xdp_metadata_kfunc_ids)
@@ -752,7 +752,7 @@ static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = {
 };
 
 BTF_ID_LIST(xdp_metadata_kfunc_ids_unsorted)
-#define XDP_METADATA_KFUNC(name, str) BTF_ID(func, str)
+#define XDP_METADATA_KFUNC(name, str, _) BTF_ID(func, str)
 XDP_METADATA_KFUNC_xxx
 #undef XDP_METADATA_KFUNC
 
-- 
cgit v1.2.3-70-g09d2


From a9c2a608549bb1a2363d289d63907640afcf22af Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Wed, 13 Sep 2023 10:13:49 -0700
Subject: bpf: expose information about supported xdp metadata kfunc

Add new xdp-rx-metadata-features member to netdev netlink
which exports a bitmask of supported kfuncs. Most of the patch
is autogenerated (headers), the only relevant part is netdev.yaml
and the changes in netdev-genl.c to marshal into netlink.

Example output on veth:

$ ip link add veth0 type veth peer name veth1 # ifndex == 12
$ ./tools/net/ynl/samples/netdev 12

Select ifc ($ifindex; or 0 = dump; or -2 ntf check): 12
   veth1[12]    xdp-features (23): basic redirect rx-sg xdp-rx-metadata-features (3): timestamp hash xdp-zc-max-segs=0

Cc: netdev@vger.kernel.org
Cc: Willem de Bruijn <willemb@google.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/20230913171350.369987-3-sdf@google.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 Documentation/netlink/specs/netdev.yaml      | 21 +++++++++++++++++++++
 Documentation/networking/xdp-rx-metadata.rst |  7 +++++++
 include/net/xdp.h                            |  5 ++++-
 include/uapi/linux/netdev.h                  | 16 ++++++++++++++++
 kernel/bpf/offload.c                         |  2 +-
 net/core/netdev-genl.c                       | 12 +++++++++++-
 net/core/xdp.c                               |  4 ++--
 tools/include/uapi/linux/netdev.h            | 16 ++++++++++++++++
 8 files changed, 78 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index 1c7284fd535b..c46fcc78fc04 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -42,6 +42,19 @@ definitions:
         doc:
           This feature informs if netdev implements non-linear XDP buffer
           support in ndo_xdp_xmit callback.
+  -
+    type: flags
+    name: xdp-rx-metadata
+    render-max: true
+    entries:
+      -
+        name: timestamp
+        doc:
+          Device is capable of exposing receive HW timestamp via bpf_xdp_metadata_rx_timestamp().
+      -
+        name: hash
+        doc:
+          Device is capable of exposing receive packet hash via bpf_xdp_metadata_rx_hash().
 
 attribute-sets:
   -
@@ -68,6 +81,13 @@ attribute-sets:
         type: u32
         checks:
           min: 1
+      -
+        name: xdp-rx-metadata-features
+        doc: Bitmask of supported XDP receive metadata features.
+             See Documentation/networking/xdp-rx-metadata.rst for more details.
+        type: u64
+        enum: xdp-rx-metadata
+        enum-as-flags: true
 
 operations:
   list:
@@ -84,6 +104,7 @@ operations:
             - ifindex
             - xdp-features
             - xdp-zc-max-segs
+            - xdp-rx-metadata-features
       dump:
         reply: *dev-all
     -
diff --git a/Documentation/networking/xdp-rx-metadata.rst b/Documentation/networking/xdp-rx-metadata.rst
index 25ce72af81c2..205696780b78 100644
--- a/Documentation/networking/xdp-rx-metadata.rst
+++ b/Documentation/networking/xdp-rx-metadata.rst
@@ -105,6 +105,13 @@ bpf_tail_call
 Adding programs that access metadata kfuncs to the ``BPF_MAP_TYPE_PROG_ARRAY``
 is currently not supported.
 
+Supported Devices
+=================
+
+It is possible to query which kfunc the particular netdev implements via
+netlink. See ``xdp-rx-metadata-features`` attribute set in
+``Documentation/netlink/specs/netdev.yaml``.
+
 Example
 =======
 
diff --git a/include/net/xdp.h b/include/net/xdp.h
index d59e12f8f311..349c36fb5fd8 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -386,19 +386,22 @@ void xdp_attachment_setup(struct xdp_attachment_info *info,
 /* Define the relationship between xdp-rx-metadata kfunc and
  * various other entities:
  * - xdp_rx_metadata enum
+ * - netdev netlink enum (Documentation/netlink/specs/netdev.yaml)
  * - kfunc name
  * - xdp_metadata_ops field
  */
 #define XDP_METADATA_KFUNC_xxx	\
 	XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP, \
+			   NETDEV_XDP_RX_METADATA_TIMESTAMP, \
 			   bpf_xdp_metadata_rx_timestamp, \
 			   xmo_rx_timestamp) \
 	XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_HASH, \
+			   NETDEV_XDP_RX_METADATA_HASH, \
 			   bpf_xdp_metadata_rx_hash, \
 			   xmo_rx_hash) \
 
 enum xdp_rx_metadata {
-#define XDP_METADATA_KFUNC(name, _, __) name,
+#define XDP_METADATA_KFUNC(name, _, __, ___) name,
 XDP_METADATA_KFUNC_xxx
 #undef XDP_METADATA_KFUNC
 MAX_XDP_METADATA_KFUNC,
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index c1634b95c223..2943a151d4f1 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -38,11 +38,27 @@ enum netdev_xdp_act {
 	NETDEV_XDP_ACT_MASK = 127,
 };
 
+/**
+ * enum netdev_xdp_rx_metadata
+ * @NETDEV_XDP_RX_METADATA_TIMESTAMP: Device is capable of exposing receive HW
+ *   timestamp via bpf_xdp_metadata_rx_timestamp().
+ * @NETDEV_XDP_RX_METADATA_HASH: Device is capable of exposing receive packet
+ *   hash via bpf_xdp_metadata_rx_hash().
+ */
+enum netdev_xdp_rx_metadata {
+	NETDEV_XDP_RX_METADATA_TIMESTAMP = 1,
+	NETDEV_XDP_RX_METADATA_HASH = 2,
+
+	/* private: */
+	NETDEV_XDP_RX_METADATA_MASK = 3,
+};
+
 enum {
 	NETDEV_A_DEV_IFINDEX = 1,
 	NETDEV_A_DEV_PAD,
 	NETDEV_A_DEV_XDP_FEATURES,
 	NETDEV_A_DEV_XDP_ZC_MAX_SEGS,
+	NETDEV_A_DEV_XDP_RX_METADATA_FEATURES,
 
 	__NETDEV_A_DEV_MAX,
 	NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1)
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 6aa6de8d715d..e7a1752b5a09 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -845,7 +845,7 @@ void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id)
 	if (!ops)
 		goto out;
 
-#define XDP_METADATA_KFUNC(name, _, xmo) \
+#define XDP_METADATA_KFUNC(name, _, __, xmo) \
 	if (func_id == bpf_xdp_metadata_kfunc_id(name)) p = ops->xmo;
 	XDP_METADATA_KFUNC_xxx
 #undef XDP_METADATA_KFUNC
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index c1aea8b756b6..fe61f85bcf33 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -5,6 +5,7 @@
 #include <linux/rtnetlink.h>
 #include <net/net_namespace.h>
 #include <net/sock.h>
+#include <net/xdp.h>
 
 #include "netdev-genl-gen.h"
 
@@ -12,15 +13,24 @@ static int
 netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp,
 		   const struct genl_info *info)
 {
+	u64 xdp_rx_meta = 0;
 	void *hdr;
 
 	hdr = genlmsg_iput(rsp, info);
 	if (!hdr)
 		return -EMSGSIZE;
 
+#define XDP_METADATA_KFUNC(_, flag, __, xmo) \
+	if (netdev->xdp_metadata_ops && netdev->xdp_metadata_ops->xmo) \
+		xdp_rx_meta |= flag;
+XDP_METADATA_KFUNC_xxx
+#undef XDP_METADATA_KFUNC
+
 	if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) ||
 	    nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_FEATURES,
-			      netdev->xdp_features, NETDEV_A_DEV_PAD)) {
+			      netdev->xdp_features, NETDEV_A_DEV_PAD) ||
+	    nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES,
+			      xdp_rx_meta, NETDEV_A_DEV_PAD)) {
 		genlmsg_cancel(rsp, hdr);
 		return -EINVAL;
 	}
diff --git a/net/core/xdp.c b/net/core/xdp.c
index bab563b2f812..df4789ab512d 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -741,7 +741,7 @@ __bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash,
 __diag_pop();
 
 BTF_SET8_START(xdp_metadata_kfunc_ids)
-#define XDP_METADATA_KFUNC(_, name, __) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS)
+#define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS)
 XDP_METADATA_KFUNC_xxx
 #undef XDP_METADATA_KFUNC
 BTF_SET8_END(xdp_metadata_kfunc_ids)
@@ -752,7 +752,7 @@ static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = {
 };
 
 BTF_ID_LIST(xdp_metadata_kfunc_ids_unsorted)
-#define XDP_METADATA_KFUNC(name, str, _) BTF_ID(func, str)
+#define XDP_METADATA_KFUNC(name, _, str, __) BTF_ID(func, str)
 XDP_METADATA_KFUNC_xxx
 #undef XDP_METADATA_KFUNC
 
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index c1634b95c223..2943a151d4f1 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -38,11 +38,27 @@ enum netdev_xdp_act {
 	NETDEV_XDP_ACT_MASK = 127,
 };
 
+/**
+ * enum netdev_xdp_rx_metadata
+ * @NETDEV_XDP_RX_METADATA_TIMESTAMP: Device is capable of exposing receive HW
+ *   timestamp via bpf_xdp_metadata_rx_timestamp().
+ * @NETDEV_XDP_RX_METADATA_HASH: Device is capable of exposing receive packet
+ *   hash via bpf_xdp_metadata_rx_hash().
+ */
+enum netdev_xdp_rx_metadata {
+	NETDEV_XDP_RX_METADATA_TIMESTAMP = 1,
+	NETDEV_XDP_RX_METADATA_HASH = 2,
+
+	/* private: */
+	NETDEV_XDP_RX_METADATA_MASK = 3,
+};
+
 enum {
 	NETDEV_A_DEV_IFINDEX = 1,
 	NETDEV_A_DEV_PAD,
 	NETDEV_A_DEV_XDP_FEATURES,
 	NETDEV_A_DEV_XDP_ZC_MAX_SEGS,
+	NETDEV_A_DEV_XDP_RX_METADATA_FEATURES,
 
 	__NETDEV_A_DEV_MAX,
 	NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1)
-- 
cgit v1.2.3-70-g09d2


From daabb2b098e04753fa3d1b1feed13e5a61bef61c Mon Sep 17 00:00:00 2001
From: Puranjay Mohan <puranjay12@gmail.com>
Date: Thu, 7 Sep 2023 23:05:49 +0000
Subject: bpf/tests: add tests for cpuv4 instructions

The BPF JITs now support cpuv4 instructions. Add tests for these new
instructions to the test suite:

1. Sign extended Load
2. Sign extended Mov
3. Unconditional byte swap
4. Unconditional jump with 32-bit offset
5. Signed division and modulo

Signed-off-by: Puranjay Mohan <puranjay12@gmail.com>
Link: https://lore.kernel.org/r/20230907230550.1417590-9-puranjay12@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h |  50 ++++++-
 lib/test_bpf.c         | 371 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 417 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 761af6b3cf2b..0138832ad571 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -117,21 +117,25 @@ struct ctl_table_header;
 
 /* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */
 
-#define BPF_ALU64_IMM(OP, DST, IMM)				\
+#define BPF_ALU64_IMM_OFF(OP, DST, IMM, OFF)			\
 	((struct bpf_insn) {					\
 		.code  = BPF_ALU64 | BPF_OP(OP) | BPF_K,	\
 		.dst_reg = DST,					\
 		.src_reg = 0,					\
-		.off   = 0,					\
+		.off   = OFF,					\
 		.imm   = IMM })
+#define BPF_ALU64_IMM(OP, DST, IMM)				\
+	BPF_ALU64_IMM_OFF(OP, DST, IMM, 0)
 
-#define BPF_ALU32_IMM(OP, DST, IMM)				\
+#define BPF_ALU32_IMM_OFF(OP, DST, IMM, OFF)			\
 	((struct bpf_insn) {					\
 		.code  = BPF_ALU | BPF_OP(OP) | BPF_K,		\
 		.dst_reg = DST,					\
 		.src_reg = 0,					\
-		.off   = 0,					\
+		.off   = OFF,					\
 		.imm   = IMM })
+#define BPF_ALU32_IMM(OP, DST, IMM)				\
+	BPF_ALU32_IMM_OFF(OP, DST, IMM, 0)
 
 /* Endianess conversion, cpu_to_{l,b}e(), {l,b}e_to_cpu() */
 
@@ -143,6 +147,16 @@ struct ctl_table_header;
 		.off   = 0,					\
 		.imm   = LEN })
 
+/* Byte Swap, bswap16/32/64 */
+
+#define BPF_BSWAP(DST, LEN)					\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU64 | BPF_END | BPF_SRC(BPF_TO_LE),	\
+		.dst_reg = DST,					\
+		.src_reg = 0,					\
+		.off   = 0,					\
+		.imm   = LEN })
+
 /* Short form of mov, dst_reg = src_reg */
 
 #define BPF_MOV64_REG(DST, SRC)					\
@@ -179,6 +193,24 @@ struct ctl_table_header;
 		.off   = 0,					\
 		.imm   = IMM })
 
+/* Short form of movsx, dst_reg = (s8,s16,s32)src_reg */
+
+#define BPF_MOVSX64_REG(DST, SRC, OFF)				\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU64 | BPF_MOV | BPF_X,		\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = 0 })
+
+#define BPF_MOVSX32_REG(DST, SRC, OFF)				\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU | BPF_MOV | BPF_X,		\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = 0 })
+
 /* Special form of mov32, used for doing explicit zero extension on dst. */
 #define BPF_ZEXT_REG(DST)					\
 	((struct bpf_insn) {					\
@@ -263,6 +295,16 @@ static inline bool insn_is_zext(const struct bpf_insn *insn)
 		.off   = OFF,					\
 		.imm   = 0 })
 
+/* Memory load, dst_reg = *(signed size *) (src_reg + off16) */
+
+#define BPF_LDX_MEMSX(SIZE, DST, SRC, OFF)			\
+	((struct bpf_insn) {					\
+		.code  = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEMSX,	\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = 0 })
+
 /* Memory store, *(uint *) (dst_reg + off16) = src_reg */
 
 #define BPF_STX_MEM(SIZE, DST, SRC, OFF)			\
diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index ecde4216201e..7916503e6a6a 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -5111,6 +5111,104 @@ static struct bpf_test tests[] = {
 		{ },
 		{ { 0, 0xffffffff } }
 	},
+	/* MOVSX32 */
+	{
+		"ALU_MOVSX | BPF_B",
+		.u.insns_int = {
+			BPF_LD_IMM64(R2, 0x00000000ffffffefLL),
+			BPF_LD_IMM64(R3, 0xdeadbeefdeadbeefLL),
+			BPF_MOVSX32_REG(R1, R3, 8),
+			BPF_JMP_REG(BPF_JEQ, R2, R1, 2),
+			BPF_MOV32_IMM(R0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1 } },
+	},
+	{
+		"ALU_MOVSX | BPF_H",
+		.u.insns_int = {
+			BPF_LD_IMM64(R2, 0x00000000ffffbeefLL),
+			BPF_LD_IMM64(R3, 0xdeadbeefdeadbeefLL),
+			BPF_MOVSX32_REG(R1, R3, 16),
+			BPF_JMP_REG(BPF_JEQ, R2, R1, 2),
+			BPF_MOV32_IMM(R0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1 } },
+	},
+	{
+		"ALU_MOVSX | BPF_W",
+		.u.insns_int = {
+			BPF_LD_IMM64(R2, 0x00000000deadbeefLL),
+			BPF_LD_IMM64(R3, 0xdeadbeefdeadbeefLL),
+			BPF_MOVSX32_REG(R1, R3, 32),
+			BPF_JMP_REG(BPF_JEQ, R2, R1, 2),
+			BPF_MOV32_IMM(R0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1 } },
+	},
+	/* MOVSX64 REG */
+	{
+		"ALU64_MOVSX | BPF_B",
+		.u.insns_int = {
+			BPF_LD_IMM64(R2, 0xffffffffffffffefLL),
+			BPF_LD_IMM64(R3, 0xdeadbeefdeadbeefLL),
+			BPF_MOVSX64_REG(R1, R3, 8),
+			BPF_JMP_REG(BPF_JEQ, R2, R1, 2),
+			BPF_MOV32_IMM(R0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1 } },
+	},
+	{
+		"ALU64_MOVSX | BPF_H",
+		.u.insns_int = {
+			BPF_LD_IMM64(R2, 0xffffffffffffbeefLL),
+			BPF_LD_IMM64(R3, 0xdeadbeefdeadbeefLL),
+			BPF_MOVSX64_REG(R1, R3, 16),
+			BPF_JMP_REG(BPF_JEQ, R2, R1, 2),
+			BPF_MOV32_IMM(R0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1 } },
+	},
+	{
+		"ALU64_MOVSX | BPF_W",
+		.u.insns_int = {
+			BPF_LD_IMM64(R2, 0xffffffffdeadbeefLL),
+			BPF_LD_IMM64(R3, 0xdeadbeefdeadbeefLL),
+			BPF_MOVSX64_REG(R1, R3, 32),
+			BPF_JMP_REG(BPF_JEQ, R2, R1, 2),
+			BPF_MOV32_IMM(R0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1 } },
+	},
 	/* BPF_ALU | BPF_ADD | BPF_X */
 	{
 		"ALU_ADD_X: 1 + 2 = 3",
@@ -6105,6 +6203,106 @@ static struct bpf_test tests[] = {
 		{ },
 		{ { 0, 2 } },
 	},
+	/* BPF_ALU | BPF_DIV | BPF_X off=1 (SDIV) */
+	{
+		"ALU_SDIV_X: -6 / 2 = -3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, -6),
+			BPF_ALU32_IMM(BPF_MOV, R1, 2),
+			BPF_ALU32_REG_OFF(BPF_DIV, R0, R1, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, -3 } },
+	},
+	/* BPF_ALU | BPF_DIV | BPF_K off=1 (SDIV) */
+	{
+		"ALU_SDIV_K: -6 / 2 = -3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, -6),
+			BPF_ALU32_IMM_OFF(BPF_DIV, R0, 2, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, -3 } },
+	},
+	/* BPF_ALU64 | BPF_DIV | BPF_X off=1 (SDIV64) */
+	{
+		"ALU64_SDIV_X: -6 / 2 = -3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, -6),
+			BPF_ALU32_IMM(BPF_MOV, R1, 2),
+			BPF_ALU64_REG_OFF(BPF_DIV, R0, R1, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, -3 } },
+	},
+	/* BPF_ALU64 | BPF_DIV | BPF_K off=1 (SDIV64) */
+	{
+		"ALU64_SDIV_K: -6 / 2 = -3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, -6),
+			BPF_ALU64_IMM_OFF(BPF_DIV, R0, 2, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, -3 } },
+	},
+	/* BPF_ALU | BPF_MOD | BPF_X off=1 (SMOD) */
+	{
+		"ALU_SMOD_X: -7 % 2 = -1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, -7),
+			BPF_ALU32_IMM(BPF_MOV, R1, 2),
+			BPF_ALU32_REG_OFF(BPF_MOD, R0, R1, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, -1 } },
+	},
+	/* BPF_ALU | BPF_MOD | BPF_K off=1 (SMOD) */
+	{
+		"ALU_SMOD_K: -7 % 2 = -1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, -7),
+			BPF_ALU32_IMM_OFF(BPF_MOD, R0, 2, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, -1 } },
+	},
+	/* BPF_ALU64 | BPF_MOD | BPF_X off=1 (SMOD64) */
+	{
+		"ALU64_SMOD_X: -7 % 2 = -1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, -7),
+			BPF_ALU32_IMM(BPF_MOV, R1, 2),
+			BPF_ALU64_REG_OFF(BPF_MOD, R0, R1, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, -1 } },
+	},
+	/* BPF_ALU64 | BPF_MOD | BPF_K off=1 (SMOD64) */
+	{
+		"ALU64_SMOD_X: -7 % 2 = -1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, -7),
+			BPF_ALU64_IMM_OFF(BPF_MOD, R0, 2, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, -1 } },
+	},
 	/* BPF_ALU | BPF_AND | BPF_X */
 	{
 		"ALU_AND_X: 3 & 2 = 2",
@@ -7837,6 +8035,104 @@ static struct bpf_test tests[] = {
 		{ },
 		{ { 0, (u32) (cpu_to_le64(0xfedcba9876543210ULL) >> 32) } },
 	},
+	/* BSWAP */
+	{
+		"BSWAP 16: 0x0123456789abcdef -> 0xefcd",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0x0123456789abcdefLL),
+			BPF_BSWAP(R0, 16),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xefcd } },
+	},
+	{
+		"BSWAP 32: 0x0123456789abcdef -> 0xefcdab89",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0x0123456789abcdefLL),
+			BPF_BSWAP(R0, 32),
+			BPF_ALU64_REG(BPF_MOV, R1, R0),
+			BPF_ALU64_IMM(BPF_RSH, R1, 32),
+			BPF_ALU32_REG(BPF_ADD, R0, R1), /* R1 = 0 */
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xefcdab89 } },
+	},
+	{
+		"BSWAP 64: 0x0123456789abcdef -> 0x67452301",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0x0123456789abcdefLL),
+			BPF_BSWAP(R0, 64),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x67452301 } },
+	},
+	{
+		"BSWAP 64: 0x0123456789abcdef >> 32 -> 0xefcdab89",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0x0123456789abcdefLL),
+			BPF_BSWAP(R0, 64),
+			BPF_ALU64_IMM(BPF_RSH, R0, 32),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xefcdab89 } },
+	},
+	/* BSWAP, reversed */
+	{
+		"BSWAP 16: 0xfedcba9876543210 -> 0x1032",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0xfedcba9876543210ULL),
+			BPF_BSWAP(R0, 16),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1032 } },
+	},
+	{
+		"BSWAP 32: 0xfedcba9876543210 -> 0x10325476",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0xfedcba9876543210ULL),
+			BPF_BSWAP(R0, 32),
+			BPF_ALU64_REG(BPF_MOV, R1, R0),
+			BPF_ALU64_IMM(BPF_RSH, R1, 32),
+			BPF_ALU32_REG(BPF_ADD, R0, R1), /* R1 = 0 */
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x10325476 } },
+	},
+	{
+		"BSWAP 64: 0xfedcba9876543210 -> 0x98badcfe",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0xfedcba9876543210ULL),
+			BPF_BSWAP(R0, 64),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x98badcfe } },
+	},
+	{
+		"BSWAP 64: 0xfedcba9876543210 >> 32 -> 0x10325476",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0xfedcba9876543210ULL),
+			BPF_BSWAP(R0, 64),
+			BPF_ALU64_IMM(BPF_RSH, R0, 32),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x10325476 } },
+	},
 	/* BPF_LDX_MEM B/H/W/DW */
 	{
 		"BPF_LDX_MEM | BPF_B, base",
@@ -8228,6 +8524,67 @@ static struct bpf_test tests[] = {
 		{ { 32, 0 } },
 		.stack_depth = 0,
 	},
+	/* BPF_LDX_MEMSX B/H/W */
+	{
+		"BPF_LDX_MEMSX | BPF_B",
+		.u.insns_int = {
+			BPF_LD_IMM64(R1, 0xdead0000000000f0ULL),
+			BPF_LD_IMM64(R2, 0xfffffffffffffff0ULL),
+			BPF_STX_MEM(BPF_DW, R10, R1, -8),
+#ifdef __BIG_ENDIAN
+			BPF_LDX_MEMSX(BPF_B, R0, R10, -1),
+#else
+			BPF_LDX_MEMSX(BPF_B, R0, R10, -8),
+#endif
+			BPF_JMP_REG(BPF_JNE, R0, R2, 1),
+			BPF_ALU64_IMM(BPF_MOV, R0, 0),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0 } },
+		.stack_depth = 8,
+	},
+	{
+		"BPF_LDX_MEMSX | BPF_H",
+		.u.insns_int = {
+			BPF_LD_IMM64(R1, 0xdead00000000f123ULL),
+			BPF_LD_IMM64(R2, 0xfffffffffffff123ULL),
+			BPF_STX_MEM(BPF_DW, R10, R1, -8),
+#ifdef __BIG_ENDIAN
+			BPF_LDX_MEMSX(BPF_H, R0, R10, -2),
+#else
+			BPF_LDX_MEMSX(BPF_H, R0, R10, -8),
+#endif
+			BPF_JMP_REG(BPF_JNE, R0, R2, 1),
+			BPF_ALU64_IMM(BPF_MOV, R0, 0),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0 } },
+		.stack_depth = 8,
+	},
+	{
+		"BPF_LDX_MEMSX | BPF_W",
+		.u.insns_int = {
+			BPF_LD_IMM64(R1, 0x00000000deadbeefULL),
+			BPF_LD_IMM64(R2, 0xffffffffdeadbeefULL),
+			BPF_STX_MEM(BPF_DW, R10, R1, -8),
+#ifdef __BIG_ENDIAN
+			BPF_LDX_MEMSX(BPF_W, R0, R10, -4),
+#else
+			BPF_LDX_MEMSX(BPF_W, R0, R10, -8),
+#endif
+			BPF_JMP_REG(BPF_JNE, R0, R2, 1),
+			BPF_ALU64_IMM(BPF_MOV, R0, 0),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0 } },
+		.stack_depth = 8,
+	},
 	/* BPF_STX_MEM B/H/W/DW */
 	{
 		"BPF_STX_MEM | BPF_B",
@@ -9474,6 +9831,20 @@ static struct bpf_test tests[] = {
 		{ },
 		{ { 0, 1 } },
 	},
+	/* BPF_JMP32 | BPF_JA */
+	{
+		"JMP32_JA: Unconditional jump: if (true) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_JMP32_IMM(BPF_JA, 0, 1, 0),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
 	/* BPF_JMP | BPF_JSLT | BPF_K */
 	{
 		"JMP_JSLT_K: Signed jump: if (-2 < -1) return 1",
-- 
cgit v1.2.3-70-g09d2


From 3868ab0f192581eff978501a05f3dc2e01541d77 Mon Sep 17 00:00:00 2001
From: Aananth V <aananthv@google.com>
Date: Thu, 14 Sep 2023 14:36:21 +0000
Subject: tcp: new TCP_INFO stats for RTO events

The 2023 SIGCOMM paper "Improving Network Availability with Protective
ReRoute" has indicated Linux TCP's RTO-triggered txhash rehashing can
effectively reduce application disruption during outages. To better
measure the efficacy of this feature, this patch adds three more
detailed stats during RTO recovery and exports via TCP_INFO.
Applications and monitoring systems can leverage this data to measure
the network path diversity and end-to-end repair latency during network
outages to improve their network infrastructure.

The following counters are added to tcp_sock in order to track RTO
events over the lifetime of a TCP socket.

1. u16 total_rto - Counts the total number of RTO timeouts.
2. u16 total_rto_recoveries - Counts the total number of RTO recoveries.
3. u32 total_rto_time - Counts the total time spent (ms) in RTO
                        recoveries. (time spent in CA_Loss and
                        CA_Recovery states)

To compute total_rto_time, we add a new u32 rto_stamp field to
tcp_sock. rto_stamp records the start timestamp (ms) of the last RTO
recovery (CA_Loss).

Corresponding fields are also added to the tcp_info struct.

Signed-off-by: Aananth V <aananthv@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      |  8 ++++++++
 include/uapi/linux/tcp.h | 12 ++++++++++++
 net/ipv4/tcp.c           |  9 +++++++++
 net/ipv4/tcp_input.c     | 15 +++++++++++++++
 net/ipv4/tcp_minisocks.c |  4 ++++
 net/ipv4/tcp_timer.c     | 17 +++++++++++++++--
 6 files changed, 63 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 44d946161d4a..e15452df9804 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -377,6 +377,14 @@ struct tcp_sock {
 				 * Total data bytes retransmitted
 				 */
 	u32	total_retrans;	/* Total retransmits for entire connection */
+	u32	rto_stamp;	/* Start time (ms) of last CA_Loss recovery */
+	u16	total_rto;	/* Total number of RTO timeouts, including
+				 * SYN/SYN-ACK and recurring timeouts.
+				 */
+	u16	total_rto_recoveries;	/* Total number of RTO recoveries,
+					 * including any unfinished recovery.
+					 */
+	u32	total_rto_time;	/* ms spent in (completed) RTO recoveries. */
 
 	u32	urg_seq;	/* Seq of received urgent pointer */
 	unsigned int		keepalive_time;	  /* time before keep alive takes place */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 879eeb0a084b..d1d08da6331a 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -289,6 +289,18 @@ struct tcp_info {
 				      */
 
 	__u32   tcpi_rehash;         /* PLB or timeout triggered rehash attempts */
+
+	__u16	tcpi_total_rto;	/* Total number of RTO timeouts, including
+				 * SYN/SYN-ACK and recurring timeouts.
+				 */
+	__u16	tcpi_total_rto_recoveries;	/* Total number of RTO
+						 * recoveries, including any
+						 * unfinished recovery.
+						 */
+	__u32	tcpi_total_rto_time;	/* Total time spent in RTO recoveries
+					 * in milliseconds, including any
+					 * unfinished recovery.
+					 */
 };
 
 /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0c3040a63ebd..69b8d7073708 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3818,6 +3818,15 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	info->tcpi_rcv_wnd = tp->rcv_wnd;
 	info->tcpi_rehash = tp->plb_rehash + tp->timeout_rehash;
 	info->tcpi_fastopen_client_fail = tp->fastopen_client_fail;
+
+	info->tcpi_total_rto = tp->total_rto;
+	info->tcpi_total_rto_recoveries = tp->total_rto_recoveries;
+	info->tcpi_total_rto_time = tp->total_rto_time;
+	if (tp->rto_stamp) {
+		info->tcpi_total_rto_time += tcp_time_stamp_raw() -
+						tp->rto_stamp;
+	}
+
 	unlock_sock_fast(sk, slow);
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 8d2c91703158..584825ddd0a0 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2088,6 +2088,10 @@ void tcp_clear_retrans(struct tcp_sock *tp)
 	tp->undo_marker = 0;
 	tp->undo_retrans = -1;
 	tp->sacked_out = 0;
+	tp->rto_stamp = 0;
+	tp->total_rto = 0;
+	tp->total_rto_recoveries = 0;
+	tp->total_rto_time = 0;
 }
 
 static inline void tcp_init_undo(struct tcp_sock *tp)
@@ -2825,6 +2829,14 @@ void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 	tcp_set_ca_state(sk, TCP_CA_Recovery);
 }
 
+static void tcp_update_rto_time(struct tcp_sock *tp)
+{
+	if (tp->rto_stamp) {
+		tp->total_rto_time += tcp_time_stamp(tp) - tp->rto_stamp;
+		tp->rto_stamp = 0;
+	}
+}
+
 /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
  * recovered or spurious. Otherwise retransmits more on partial ACKs.
  */
@@ -3029,6 +3041,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 		break;
 	case TCP_CA_Loss:
 		tcp_process_loss(sk, flag, num_dupack, rexmit);
+		if (icsk->icsk_ca_state != TCP_CA_Loss)
+			tcp_update_rto_time(tp);
 		tcp_identify_packet_loss(sk, ack_flag);
 		if (!(icsk->icsk_ca_state == TCP_CA_Open ||
 		      (*ack_flag & FLAG_LOST_RETRANS)))
@@ -6454,6 +6468,7 @@ static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
 		tcp_try_undo_recovery(sk);
 
 	/* Reset rtx states to prevent spurious retransmits_timed_out() */
+	tcp_update_rto_time(tp);
 	tp->retrans_stamp = 0;
 	inet_csk(sk)->icsk_retransmits = 0;
 
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index b98d476f1594..eee8ab1bfa0e 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -565,6 +565,10 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 		newtp->undo_marker = treq->snt_isn;
 		newtp->retrans_stamp = div_u64(treq->snt_synack,
 					       USEC_PER_SEC / TCP_TS_HZ);
+		newtp->total_rto = req->num_timeout;
+		newtp->total_rto_recoveries = 1;
+		newtp->total_rto_time = tcp_time_stamp_raw() -
+						newtp->retrans_stamp;
 	}
 	newtp->tsoffset = treq->ts_off;
 #ifdef CONFIG_TCP_MD5SIG
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index d7d64682b068..3f61c6a70a1f 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -415,6 +415,19 @@ abort:		tcp_write_err(sk);
 	}
 }
 
+static void tcp_update_rto_stats(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!icsk->icsk_retransmits) {
+		tp->total_rto_recoveries++;
+		tp->rto_stamp = tcp_time_stamp(tp);
+	}
+	icsk->icsk_retransmits++;
+	tp->total_rto++;
+}
+
 /*
  *	Timer for Fast Open socket to retransmit SYNACK. Note that the
  *	sk here is the child socket, not the parent (listener) socket.
@@ -447,7 +460,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
 	 */
 	inet_rtx_syn_ack(sk, req);
 	req->num_timeout++;
-	icsk->icsk_retransmits++;
+	tcp_update_rto_stats(sk);
 	if (!tp->retrans_stamp)
 		tp->retrans_stamp = tcp_time_stamp(tp);
 	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
@@ -575,7 +588,7 @@ void tcp_retransmit_timer(struct sock *sk)
 
 	tcp_enter_loss(sk);
 
-	icsk->icsk_retransmits++;
+	tcp_update_rto_stats(sk);
 	if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) {
 		/* Retransmission failed because of local congestion,
 		 * Let senders fight for local resources conservatively.
-- 
cgit v1.2.3-70-g09d2


From 9af27da6313c8f8c6a26c7ea3fe23d6b9664a3a8 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 13 Sep 2023 01:31:58 +0200
Subject: bpf: Use bpf_is_subprog to check for subprogs

We would like to know whether a bpf_prog corresponds to the main prog or
one of the subprogs. The current JIT implementations simply check this
using the func_idx in bpf_prog->aux->func_idx. When the index is 0, it
belongs to the main program, otherwise it corresponds to some
subprogram.

This will also be necessary to halt exception propagation while walking
the stack when an exception is thrown, so we add a simple helper
function to check this, named bpf_is_subprog, and convert existing JIT
implementations to also make use of it.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230912233214.1518551-2-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/arm64/net/bpf_jit_comp.c | 2 +-
 arch/s390/net/bpf_jit_comp.c  | 2 +-
 arch/x86/net/bpf_jit_comp.c   | 2 +-
 include/linux/bpf.h           | 5 +++++
 4 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 150d1c6543f7..7d4af64e3982 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -288,7 +288,7 @@ static bool is_lsi_offset(int offset, int scale)
 static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf)
 {
 	const struct bpf_prog *prog = ctx->prog;
-	const bool is_main_prog = prog->aux->func_idx == 0;
+	const bool is_main_prog = !bpf_is_subprog(prog);
 	const u8 r6 = bpf2a64[BPF_REG_6];
 	const u8 r7 = bpf2a64[BPF_REG_7];
 	const u8 r8 = bpf2a64[BPF_REG_8];
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index de2fb12120d2..eeb42e5cd7d6 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -556,7 +556,7 @@ static void bpf_jit_prologue(struct bpf_jit *jit, struct bpf_prog *fp,
 	EMIT6_PCREL_RILC(0xc0040000, 0, jit->prologue_plt);
 	jit->prologue_plt_ret = jit->prg;
 
-	if (fp->aux->func_idx == 0) {
+	if (!bpf_is_subprog(fp)) {
 		/* Initialize the tail call counter in the main program. */
 		/* xc STK_OFF_TCCNT(4,%r15),STK_OFF_TCCNT(%r15) */
 		_EMIT6(0xd703f000 | STK_OFF_TCCNT, 0xf000 | STK_OFF_TCCNT);
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 2846c21d75bf..a0d03503b3cb 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1049,7 +1049,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 
 	emit_prologue(&prog, bpf_prog->aux->stack_depth,
 		      bpf_prog_was_classic(bpf_prog), tail_call_reachable,
-		      bpf_prog->aux->func_idx != 0);
+		      bpf_is_subprog(bpf_prog));
 	push_callee_regs(&prog, callee_regs_used);
 
 	ilen = prog - temp;
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b9e573159432..9171b0b6a590 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -3194,4 +3194,9 @@ static inline gfp_t bpf_memcg_flags(gfp_t flags)
 	return flags;
 }
 
+static inline bool bpf_is_subprog(const struct bpf_prog *prog)
+{
+	return prog->aux->func_idx != 0;
+}
+
 #endif /* _LINUX_BPF_H */
-- 
cgit v1.2.3-70-g09d2


From fd5d27b70188379bb441d404c29a0afb111e1753 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 13 Sep 2023 01:31:59 +0200
Subject: arch/x86: Implement arch_bpf_stack_walk

The plumbing for offline unwinding when we throw an exception in
programs would require walking the stack, hence introduce a new
arch_bpf_stack_walk function. This is provided when the JIT supports
exceptions, i.e. bpf_jit_supports_exceptions is true. The arch-specific
code is really minimal, hence it should be straightforward to extend
this support to other architectures as well, as it reuses the logic of
arch_stack_walk, but allowing access to unwind_state data.

Once the stack pointer and frame pointer are known for the main subprog
during the unwinding, we know the stack layout and location of any
callee-saved registers which must be restored before we return back to
the kernel. This handling will be added in the subsequent patches.

Note that while we primarily unwind through BPF frames, which are
effectively CONFIG_UNWINDER_FRAME_POINTER, we still need one of this or
CONFIG_UNWINDER_ORC to be able to unwind through the bpf_throw frame
from which we begin walking the stack. We also require both sp and bp
(stack and frame pointers) from the unwind_state structure, which are
only available when one of these two options are enabled.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230912233214.1518551-3-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c | 28 ++++++++++++++++++++++++++++
 include/linux/filter.h      |  2 ++
 kernel/bpf/core.c           |  9 +++++++++
 3 files changed, 39 insertions(+)

(limited to 'include')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index a0d03503b3cb..d0c24b5a6abb 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -16,6 +16,7 @@
 #include <asm/set_memory.h>
 #include <asm/nospec-branch.h>
 #include <asm/text-patching.h>
+#include <asm/unwind.h>
 
 static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
 {
@@ -2933,3 +2934,30 @@ void bpf_jit_free(struct bpf_prog *prog)
 
 	bpf_prog_unlock_free(prog);
 }
+
+bool bpf_jit_supports_exceptions(void)
+{
+	/* We unwind through both kernel frames (starting from within bpf_throw
+	 * call) and BPF frames. Therefore we require one of ORC or FP unwinder
+	 * to be enabled to walk kernel frames and reach BPF frames in the stack
+	 * trace.
+	 */
+	return IS_ENABLED(CONFIG_UNWINDER_ORC) || IS_ENABLED(CONFIG_UNWINDER_FRAME_POINTER);
+}
+
+void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
+{
+#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER)
+	struct unwind_state state;
+	unsigned long addr;
+
+	for (unwind_start(&state, current, NULL, NULL); !unwind_done(&state);
+	     unwind_next_frame(&state)) {
+		addr = unwind_get_return_address(&state);
+		if (!addr || !consume_fn(cookie, (u64)addr, (u64)state.sp, (u64)state.bp))
+			break;
+	}
+	return;
+#endif
+	WARN(1, "verification of programs using bpf_throw should have failed\n");
+}
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 0138832ad571..88874de974cb 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -954,6 +954,8 @@ bool bpf_jit_needs_zext(void);
 bool bpf_jit_supports_subprog_tailcalls(void);
 bool bpf_jit_supports_kfunc_call(void);
 bool bpf_jit_supports_far_kfunc_call(void);
+bool bpf_jit_supports_exceptions(void);
+void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
 bool bpf_helper_changes_pkt_data(void *func);
 
 static inline bool bpf_dump_raw_ok(const struct cred *cred)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 95599df82ee4..c4ac084f2767 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2914,6 +2914,15 @@ int __weak bpf_arch_text_invalidate(void *dst, size_t len)
 	return -ENOTSUPP;
 }
 
+bool __weak bpf_jit_supports_exceptions(void)
+{
+	return false;
+}
+
+void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
+{
+}
+
 #ifdef CONFIG_BPF_SYSCALL
 static int __init bpf_global_ma_init(void)
 {
-- 
cgit v1.2.3-70-g09d2


From 335d1c5b545284d75ef96ee42e461eacefe865bb Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 13 Sep 2023 01:32:00 +0200
Subject: bpf: Implement support for adding hidden subprogs

Introduce support in the verifier for generating a subprogram and
include it as part of a BPF program dynamically after the do_check phase
is complete. The first user will be the next patch which generates
default exception callbacks if none are set for the program. The phase
of invocation will be do_misc_fixups. Note that this is an internal
verifier function, and should be used with instruction blocks which
uphold the invariants stated in check_subprogs.

Since these subprogs are always appended to the end of the instruction
sequence of the program, it becomes relatively inexpensive to do the
related adjustments to the subprog_info of the program. Only the fake
exit subprogram is shifted forward, making room for our new subprog.

This is useful to insert a new subprogram, get it JITed, and obtain its
function pointer. The next patch will use this functionality to insert a
default exception callback which will be invoked after unwinding the
stack.

Note that these added subprograms are invisible to userspace, and never
reported in BPF_OBJ_GET_INFO_BY_ID etc. For now, only a single
subprogram is supported, but more can be easily supported in the future.

To this end, two function counts are introduced now, the existing
func_cnt, and real_func_cnt, the latter including hidden programs. This
allows us to conver the JIT code to use the real_func_cnt for management
of resources while syscall path continues working with existing
func_cnt.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230912233214.1518551-4-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h          |  1 +
 include/linux/bpf_verifier.h |  3 ++-
 kernel/bpf/core.c            | 12 ++++++------
 kernel/bpf/syscall.c         |  2 +-
 kernel/bpf/verifier.c        | 36 +++++++++++++++++++++++++++++++++---
 5 files changed, 43 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9171b0b6a590..c3667e95af59 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1389,6 +1389,7 @@ struct bpf_prog_aux {
 	u32 stack_depth;
 	u32 id;
 	u32 func_cnt; /* used by non-func prog as the number of func progs */
+	u32 real_func_cnt; /* includes hidden progs, only used for JIT and freeing progs */
 	u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */
 	u32 attach_btf_id; /* in-kernel BTF type id to attach to */
 	u32 ctx_arg_info_size;
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index a3236651ec64..3c2a8636ab29 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -588,6 +588,7 @@ struct bpf_verifier_env {
 	u32 used_map_cnt;		/* number of used maps */
 	u32 used_btf_cnt;		/* number of used BTF objects */
 	u32 id_gen;			/* used to generate unique reg IDs */
+	u32 hidden_subprog_cnt;		/* number of hidden subprogs */
 	bool explore_alu_limits;
 	bool allow_ptr_leaks;
 	bool allow_uninit_stack;
@@ -598,7 +599,7 @@ struct bpf_verifier_env {
 	struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
 	const struct bpf_line_info *prev_linfo;
 	struct bpf_verifier_log log;
-	struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1];
+	struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 2]; /* max + 2 for the fake and exception subprogs */
 	union {
 		struct bpf_idmap idmap_scratch;
 		struct bpf_idset idset_scratch;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c4ac084f2767..840ba952702d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -212,7 +212,7 @@ void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
 	const struct bpf_line_info *linfo;
 	void **jited_linfo;
 
-	if (!prog->aux->jited_linfo)
+	if (!prog->aux->jited_linfo || prog->aux->func_idx > prog->aux->func_cnt)
 		/* Userspace did not provide linfo */
 		return;
 
@@ -539,7 +539,7 @@ static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp)
 {
 	int i;
 
-	for (i = 0; i < fp->aux->func_cnt; i++)
+	for (i = 0; i < fp->aux->real_func_cnt; i++)
 		bpf_prog_kallsyms_del(fp->aux->func[i]);
 }
 
@@ -589,7 +589,7 @@ bpf_prog_ksym_set_name(struct bpf_prog *prog)
 	sym  = bin2hex(sym, prog->tag, sizeof(prog->tag));
 
 	/* prog->aux->name will be ignored if full btf name is available */
-	if (prog->aux->func_info_cnt) {
+	if (prog->aux->func_info_cnt && prog->aux->func_idx < prog->aux->func_info_cnt) {
 		type = btf_type_by_id(prog->aux->btf,
 				      prog->aux->func_info[prog->aux->func_idx].type_id);
 		func_name = btf_name_by_offset(prog->aux->btf, type->name_off);
@@ -1208,7 +1208,7 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog,
 		if (!extra_pass)
 			addr = NULL;
 		else if (prog->aux->func &&
-			 off >= 0 && off < prog->aux->func_cnt)
+			 off >= 0 && off < prog->aux->real_func_cnt)
 			addr = (u8 *)prog->aux->func[off]->bpf_func;
 		else
 			return -EINVAL;
@@ -2721,7 +2721,7 @@ static void bpf_prog_free_deferred(struct work_struct *work)
 #endif
 	if (aux->dst_trampoline)
 		bpf_trampoline_put(aux->dst_trampoline);
-	for (i = 0; i < aux->func_cnt; i++) {
+	for (i = 0; i < aux->real_func_cnt; i++) {
 		/* We can just unlink the subprog poke descriptor table as
 		 * it was originally linked to the main program and is also
 		 * released along with it.
@@ -2729,7 +2729,7 @@ static void bpf_prog_free_deferred(struct work_struct *work)
 		aux->func[i]->aux->poke_tab = NULL;
 		bpf_jit_free(aux->func[i]);
 	}
-	if (aux->func_cnt) {
+	if (aux->real_func_cnt) {
 		kfree(aux->func);
 		bpf_prog_unlock_free(aux->prog);
 	} else {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6a692f3bea15..85c1d908f70f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2749,7 +2749,7 @@ free_used_maps:
 	 * period before we can tear down JIT memory since symbols
 	 * are already exposed under kallsyms.
 	 */
-	__bpf_prog_put_noref(prog, prog->aux->func_cnt);
+	__bpf_prog_put_noref(prog, prog->aux->real_func_cnt);
 	return err;
 free_prog_sec:
 	free_uid(prog->aux->user);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 18e673c0ac15..39548e326d53 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -15210,7 +15210,8 @@ static void adjust_btf_func(struct bpf_verifier_env *env)
 	if (!aux->func_info)
 		return;
 
-	for (i = 0; i < env->subprog_cnt; i++)
+	/* func_info is not available for hidden subprogs */
+	for (i = 0; i < env->subprog_cnt - env->hidden_subprog_cnt; i++)
 		aux->func_info[i].insn_off = env->subprog_info[i].start;
 }
 
@@ -18151,7 +18152,8 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		 * the call instruction, as an index for this list
 		 */
 		func[i]->aux->func = func;
-		func[i]->aux->func_cnt = env->subprog_cnt;
+		func[i]->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
+		func[i]->aux->real_func_cnt = env->subprog_cnt;
 	}
 	for (i = 0; i < env->subprog_cnt; i++) {
 		old_bpf_func = func[i]->bpf_func;
@@ -18197,7 +18199,8 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	prog->aux->extable = func[0]->aux->extable;
 	prog->aux->num_exentries = func[0]->aux->num_exentries;
 	prog->aux->func = func;
-	prog->aux->func_cnt = env->subprog_cnt;
+	prog->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
+	prog->aux->real_func_cnt = env->subprog_cnt;
 	bpf_prog_jit_attempt_done(prog);
 	return 0;
 out_free:
@@ -18433,6 +18436,33 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	return 0;
 }
 
+/* The function requires that first instruction in 'patch' is insnsi[prog->len - 1] */
+static __maybe_unused int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *patch, int len)
+{
+	struct bpf_subprog_info *info = env->subprog_info;
+	int cnt = env->subprog_cnt;
+	struct bpf_prog *prog;
+
+	/* We only reserve one slot for hidden subprogs in subprog_info. */
+	if (env->hidden_subprog_cnt) {
+		verbose(env, "verifier internal error: only one hidden subprog supported\n");
+		return -EFAULT;
+	}
+	/* We're not patching any existing instruction, just appending the new
+	 * ones for the hidden subprog. Hence all of the adjustment operations
+	 * in bpf_patch_insn_data are no-ops.
+	 */
+	prog = bpf_patch_insn_data(env, env->prog->len - 1, patch, len);
+	if (!prog)
+		return -ENOMEM;
+	env->prog = prog;
+	info[cnt + 1].start = info[cnt].start;
+	info[cnt].start = prog->len - len + 1;
+	env->subprog_cnt++;
+	env->hidden_subprog_cnt++;
+	return 0;
+}
+
 /* Do various post-verification rewrites in a single program pass.
  * These rewrites simplify JIT and interpreter implementations.
  */
-- 
cgit v1.2.3-70-g09d2


From f18b03fabaa9b7c80e80b72a621f481f0d706ae0 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 13 Sep 2023 01:32:01 +0200
Subject: bpf: Implement BPF exceptions

This patch implements BPF exceptions, and introduces a bpf_throw kfunc
to allow programs to throw exceptions during their execution at runtime.
A bpf_throw invocation is treated as an immediate termination of the
program, returning back to its caller within the kernel, unwinding all
stack frames.

This allows the program to simplify its implementation, by testing for
runtime conditions which the verifier has no visibility into, and assert
that they are true. In case they are not, the program can simply throw
an exception from the other branch.

BPF exceptions are explicitly *NOT* an unlikely slowpath error handling
primitive, and this objective has guided design choices of the
implementation of the them within the kernel (with the bulk of the cost
for unwinding the stack offloaded to the bpf_throw kfunc).

The implementation of this mechanism requires use of add_hidden_subprog
mechanism introduced in the previous patch, which generates a couple of
instructions to move R1 to R0 and exit. The JIT then rewrites the
prologue of this subprog to take the stack pointer and frame pointer as
inputs and reset the stack frame, popping all callee-saved registers
saved by the main subprog. The bpf_throw function then walks the stack
at runtime, and invokes this exception subprog with the stack and frame
pointers as parameters.

Reviewers must take note that currently the main program is made to save
all callee-saved registers on x86_64 during entry into the program. This
is because we must do an equivalent of a lightweight context switch when
unwinding the stack, therefore we need the callee-saved registers of the
caller of the BPF program to be able to return with a sane state.

Note that we have to additionally handle r12, even though it is not used
by the program, because when throwing the exception the program makes an
entry into the kernel which could clobber r12 after saving it on the
stack. To be able to preserve the value we received on program entry, we
push r12 and restore it from the generated subprogram when unwinding the
stack.

For now, bpf_throw invocation fails when lingering resources or locks
exist in that path of the program. In a future followup, bpf_throw will
be extended to perform frame-by-frame unwinding to release lingering
resources for each stack frame, removing this limitation.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230912233214.1518551-5-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c                    |  89 ++++++++++++++++---
 include/linux/bpf.h                            |   3 +
 include/linux/bpf_verifier.h                   |   4 +
 include/linux/filter.h                         |   6 ++
 kernel/bpf/core.c                              |   2 +-
 kernel/bpf/helpers.c                           |  38 ++++++++
 kernel/bpf/verifier.c                          | 116 ++++++++++++++++++++++---
 tools/testing/selftests/bpf/bpf_experimental.h |  16 ++++
 8 files changed, 247 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index d0c24b5a6abb..84005f2114e0 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -18,6 +18,8 @@
 #include <asm/text-patching.h>
 #include <asm/unwind.h>
 
+static bool all_callee_regs_used[4] = {true, true, true, true};
+
 static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
 {
 	if (len == 1)
@@ -256,6 +258,14 @@ struct jit_context {
 /* Number of bytes that will be skipped on tailcall */
 #define X86_TAIL_CALL_OFFSET	(11 + ENDBR_INSN_SIZE)
 
+static void push_r12(u8 **pprog)
+{
+	u8 *prog = *pprog;
+
+	EMIT2(0x41, 0x54);   /* push r12 */
+	*pprog = prog;
+}
+
 static void push_callee_regs(u8 **pprog, bool *callee_regs_used)
 {
 	u8 *prog = *pprog;
@@ -271,6 +281,14 @@ static void push_callee_regs(u8 **pprog, bool *callee_regs_used)
 	*pprog = prog;
 }
 
+static void pop_r12(u8 **pprog)
+{
+	u8 *prog = *pprog;
+
+	EMIT2(0x41, 0x5C);   /* pop r12 */
+	*pprog = prog;
+}
+
 static void pop_callee_regs(u8 **pprog, bool *callee_regs_used)
 {
 	u8 *prog = *pprog;
@@ -292,7 +310,8 @@ static void pop_callee_regs(u8 **pprog, bool *callee_regs_used)
  * while jumping to another program
  */
 static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
-			  bool tail_call_reachable, bool is_subprog)
+			  bool tail_call_reachable, bool is_subprog,
+			  bool is_exception_cb)
 {
 	u8 *prog = *pprog;
 
@@ -312,8 +331,22 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
 			/* Keep the same instruction layout. */
 			EMIT2(0x66, 0x90); /* nop2 */
 	}
-	EMIT1(0x55);             /* push rbp */
-	EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
+	/* Exception callback receives FP as third parameter */
+	if (is_exception_cb) {
+		EMIT3(0x48, 0x89, 0xF4); /* mov rsp, rsi */
+		EMIT3(0x48, 0x89, 0xD5); /* mov rbp, rdx */
+		/* The main frame must have exception_boundary as true, so we
+		 * first restore those callee-saved regs from stack, before
+		 * reusing the stack frame.
+		 */
+		pop_callee_regs(&prog, all_callee_regs_used);
+		pop_r12(&prog);
+		/* Reset the stack frame. */
+		EMIT3(0x48, 0x89, 0xEC); /* mov rsp, rbp */
+	} else {
+		EMIT1(0x55);             /* push rbp */
+		EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
+	}
 
 	/* X86_TAIL_CALL_OFFSET is here */
 	EMIT_ENDBR();
@@ -472,7 +505,8 @@ static void emit_return(u8 **pprog, u8 *ip)
  *   goto *(prog->bpf_func + prologue_size);
  * out:
  */
-static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used,
+static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog,
+					u8 **pprog, bool *callee_regs_used,
 					u32 stack_depth, u8 *ip,
 					struct jit_context *ctx)
 {
@@ -522,7 +556,12 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used,
 	offset = ctx->tail_call_indirect_label - (prog + 2 - start);
 	EMIT2(X86_JE, offset);                    /* je out */
 
-	pop_callee_regs(&prog, callee_regs_used);
+	if (bpf_prog->aux->exception_boundary) {
+		pop_callee_regs(&prog, all_callee_regs_used);
+		pop_r12(&prog);
+	} else {
+		pop_callee_regs(&prog, callee_regs_used);
+	}
 
 	EMIT1(0x58);                              /* pop rax */
 	if (stack_depth)
@@ -546,7 +585,8 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used,
 	*pprog = prog;
 }
 
-static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke,
+static void emit_bpf_tail_call_direct(struct bpf_prog *bpf_prog,
+				      struct bpf_jit_poke_descriptor *poke,
 				      u8 **pprog, u8 *ip,
 				      bool *callee_regs_used, u32 stack_depth,
 				      struct jit_context *ctx)
@@ -575,7 +615,13 @@ static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke,
 	emit_jump(&prog, (u8 *)poke->tailcall_target + X86_PATCH_SIZE,
 		  poke->tailcall_bypass);
 
-	pop_callee_regs(&prog, callee_regs_used);
+	if (bpf_prog->aux->exception_boundary) {
+		pop_callee_regs(&prog, all_callee_regs_used);
+		pop_r12(&prog);
+	} else {
+		pop_callee_regs(&prog, callee_regs_used);
+	}
+
 	EMIT1(0x58);                                  /* pop rax */
 	if (stack_depth)
 		EMIT3_off32(0x48, 0x81, 0xC4, round_up(stack_depth, 8));
@@ -1050,8 +1096,20 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 
 	emit_prologue(&prog, bpf_prog->aux->stack_depth,
 		      bpf_prog_was_classic(bpf_prog), tail_call_reachable,
-		      bpf_is_subprog(bpf_prog));
-	push_callee_regs(&prog, callee_regs_used);
+		      bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb);
+	/* Exception callback will clobber callee regs for its own use, and
+	 * restore the original callee regs from main prog's stack frame.
+	 */
+	if (bpf_prog->aux->exception_boundary) {
+		/* We also need to save r12, which is not mapped to any BPF
+		 * register, as we throw after entry into the kernel, which may
+		 * overwrite r12.
+		 */
+		push_r12(&prog);
+		push_callee_regs(&prog, all_callee_regs_used);
+	} else {
+		push_callee_regs(&prog, callee_regs_used);
+	}
 
 	ilen = prog - temp;
 	if (rw_image)
@@ -1648,13 +1706,15 @@ st:			if (is_imm8(insn->off))
 
 		case BPF_JMP | BPF_TAIL_CALL:
 			if (imm32)
-				emit_bpf_tail_call_direct(&bpf_prog->aux->poke_tab[imm32 - 1],
+				emit_bpf_tail_call_direct(bpf_prog,
+							  &bpf_prog->aux->poke_tab[imm32 - 1],
 							  &prog, image + addrs[i - 1],
 							  callee_regs_used,
 							  bpf_prog->aux->stack_depth,
 							  ctx);
 			else
-				emit_bpf_tail_call_indirect(&prog,
+				emit_bpf_tail_call_indirect(bpf_prog,
+							    &prog,
 							    callee_regs_used,
 							    bpf_prog->aux->stack_depth,
 							    image + addrs[i - 1],
@@ -1907,7 +1967,12 @@ emit_jmp:
 			seen_exit = true;
 			/* Update cleanup_addr */
 			ctx->cleanup_addr = proglen;
-			pop_callee_regs(&prog, callee_regs_used);
+			if (bpf_prog->aux->exception_boundary) {
+				pop_callee_regs(&prog, all_callee_regs_used);
+				pop_r12(&prog);
+			} else {
+				pop_callee_regs(&prog, callee_regs_used);
+			}
 			EMIT1(0xC9);         /* leave */
 			emit_return(&prog, image + addrs[i - 1] + (prog - temp));
 			break;
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c3667e95af59..16740ee82082 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1410,6 +1410,8 @@ struct bpf_prog_aux {
 	bool sleepable;
 	bool tail_call_reachable;
 	bool xdp_has_frags;
+	bool exception_cb;
+	bool exception_boundary;
 	/* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
 	const struct btf_type *attach_func_proto;
 	/* function name for valid attach_btf_id */
@@ -1432,6 +1434,7 @@ struct bpf_prog_aux {
 	int cgroup_atype; /* enum cgroup_bpf_attach_type */
 	struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE];
 	char name[BPF_OBJ_NAME_LEN];
+	unsigned int (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp);
 #ifdef CONFIG_SECURITY
 	void *security;
 #endif
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 3c2a8636ab29..da21a3ec5027 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -541,7 +541,9 @@ struct bpf_subprog_info {
 	bool has_tail_call;
 	bool tail_call_reachable;
 	bool has_ld_abs;
+	bool is_cb;
 	bool is_async_cb;
+	bool is_exception_cb;
 };
 
 struct bpf_verifier_env;
@@ -589,6 +591,7 @@ struct bpf_verifier_env {
 	u32 used_btf_cnt;		/* number of used BTF objects */
 	u32 id_gen;			/* used to generate unique reg IDs */
 	u32 hidden_subprog_cnt;		/* number of hidden subprogs */
+	int exception_callback_subprog;
 	bool explore_alu_limits;
 	bool allow_ptr_leaks;
 	bool allow_uninit_stack;
@@ -596,6 +599,7 @@ struct bpf_verifier_env {
 	bool bypass_spec_v1;
 	bool bypass_spec_v4;
 	bool seen_direct_write;
+	bool seen_exception;
 	struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
 	const struct bpf_line_info *prev_linfo;
 	struct bpf_verifier_log log;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 88874de974cb..27406aee2d40 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1171,6 +1171,7 @@ const char *__bpf_address_lookup(unsigned long addr, unsigned long *size,
 bool is_bpf_text_address(unsigned long addr);
 int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 		    char *sym);
+struct bpf_prog *bpf_prog_ksym_find(unsigned long addr);
 
 static inline const char *
 bpf_address_lookup(unsigned long addr, unsigned long *size,
@@ -1238,6 +1239,11 @@ static inline int bpf_get_kallsym(unsigned int symnum, unsigned long *value,
 	return -ERANGE;
 }
 
+static inline struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
+{
+	return NULL;
+}
+
 static inline const char *
 bpf_address_lookup(unsigned long addr, unsigned long *size,
 		   unsigned long *off, char **modname, char *sym)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 840ba952702d..7849b9cca749 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -733,7 +733,7 @@ bool is_bpf_text_address(unsigned long addr)
 	return ret;
 }
 
-static struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
+struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
 {
 	struct bpf_ksym *ksym = bpf_ksym_find(addr);
 
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index b0a9834f1051..78e8f4de6750 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2449,6 +2449,43 @@ __bpf_kfunc void bpf_rcu_read_unlock(void)
 	rcu_read_unlock();
 }
 
+struct bpf_throw_ctx {
+	struct bpf_prog_aux *aux;
+	u64 sp;
+	u64 bp;
+	int cnt;
+};
+
+static bool bpf_stack_walker(void *cookie, u64 ip, u64 sp, u64 bp)
+{
+	struct bpf_throw_ctx *ctx = cookie;
+	struct bpf_prog *prog;
+
+	if (!is_bpf_text_address(ip))
+		return !ctx->cnt;
+	prog = bpf_prog_ksym_find(ip);
+	ctx->cnt++;
+	if (bpf_is_subprog(prog))
+		return true;
+	ctx->aux = prog->aux;
+	ctx->sp = sp;
+	ctx->bp = bp;
+	return false;
+}
+
+__bpf_kfunc void bpf_throw(u64 cookie)
+{
+	struct bpf_throw_ctx ctx = {};
+
+	arch_bpf_stack_walk(bpf_stack_walker, &ctx);
+	WARN_ON_ONCE(!ctx.aux);
+	if (ctx.aux)
+		WARN_ON_ONCE(!ctx.aux->exception_boundary);
+	WARN_ON_ONCE(!ctx.bp);
+	WARN_ON_ONCE(!ctx.cnt);
+	ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp);
+}
+
 __diag_pop();
 
 BTF_SET8_START(generic_btf_ids)
@@ -2478,6 +2515,7 @@ BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU)
 #endif
 BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_throw)
 BTF_SET8_END(generic_btf_ids)
 
 static const struct btf_kfunc_id_set generic_kfunc_set = {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 39548e326d53..9baa6f187b38 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -543,6 +543,7 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id)
 }
 
 static bool is_callback_calling_kfunc(u32 btf_id);
+static bool is_bpf_throw_kfunc(struct bpf_insn *insn);
 
 static bool is_callback_calling_function(enum bpf_func_id func_id)
 {
@@ -1748,7 +1749,9 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 		return -ENOMEM;
 	dst_state->jmp_history_cnt = src->jmp_history_cnt;
 
-	/* if dst has more stack frames then src frame, free them */
+	/* if dst has more stack frames then src frame, free them, this is also
+	 * necessary in case of exceptional exits using bpf_throw.
+	 */
 	for (i = src->curframe + 1; i <= dst_state->curframe; i++) {
 		free_func_state(dst_state->frame[i]);
 		dst_state->frame[i] = NULL;
@@ -2868,7 +2871,7 @@ next:
 		if (i == subprog_end - 1) {
 			/* to avoid fall-through from one subprog into another
 			 * the last insn of the subprog should be either exit
-			 * or unconditional jump back
+			 * or unconditional jump back or bpf_throw call
 			 */
 			if (code != (BPF_JMP | BPF_EXIT) &&
 			    code != (BPF_JMP32 | BPF_JA) &&
@@ -5661,6 +5664,27 @@ continue_func:
 	for (; i < subprog_end; i++) {
 		int next_insn, sidx;
 
+		if (bpf_pseudo_kfunc_call(insn + i) && !insn[i].off) {
+			bool err = false;
+
+			if (!is_bpf_throw_kfunc(insn + i))
+				continue;
+			if (subprog[idx].is_cb)
+				err = true;
+			for (int c = 0; c < frame && !err; c++) {
+				if (subprog[ret_prog[c]].is_cb) {
+					err = true;
+					break;
+				}
+			}
+			if (!err)
+				continue;
+			verbose(env,
+				"bpf_throw kfunc (insn %d) cannot be called from callback subprog %d\n",
+				i, idx);
+			return -EINVAL;
+		}
+
 		if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
 			continue;
 		/* remember insn and function to return to */
@@ -8919,6 +8943,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 	 * callbacks
 	 */
 	if (set_callee_state_cb != set_callee_state) {
+		env->subprog_info[subprog].is_cb = true;
 		if (bpf_pseudo_kfunc_call(insn) &&
 		    !is_callback_calling_kfunc(insn->imm)) {
 			verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n",
@@ -9308,7 +9333,8 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 		verbose(env, "to caller at %d:\n", *insn_idx);
 		print_verifier_state(env, caller, true);
 	}
-	/* clear everything in the callee */
+	/* clear everything in the callee. In case of exceptional exits using
+	 * bpf_throw, this will be done by copy_verifier_state for extra frames. */
 	free_func_state(callee);
 	state->frame[state->curframe--] = NULL;
 	return 0;
@@ -9432,17 +9458,17 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
 	return 0;
 }
 
-static int check_reference_leak(struct bpf_verifier_env *env)
+static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exit)
 {
 	struct bpf_func_state *state = cur_func(env);
 	bool refs_lingering = false;
 	int i;
 
-	if (state->frameno && !state->in_callback_fn)
+	if (!exception_exit && state->frameno && !state->in_callback_fn)
 		return 0;
 
 	for (i = 0; i < state->acquired_refs; i++) {
-		if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
+		if (!exception_exit && state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
 			continue;
 		verbose(env, "Unreleased reference id=%d alloc_insn=%d\n",
 			state->refs[i].id, state->refs[i].insn_idx);
@@ -9697,7 +9723,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 
 	switch (func_id) {
 	case BPF_FUNC_tail_call:
-		err = check_reference_leak(env);
+		err = check_reference_leak(env, false);
 		if (err) {
 			verbose(env, "tail_call would lead to reference leak\n");
 			return err;
@@ -10332,6 +10358,7 @@ enum special_kfunc_type {
 	KF_bpf_dynptr_clone,
 	KF_bpf_percpu_obj_new_impl,
 	KF_bpf_percpu_obj_drop_impl,
+	KF_bpf_throw,
 };
 
 BTF_SET_START(special_kfunc_set)
@@ -10354,6 +10381,7 @@ BTF_ID(func, bpf_dynptr_slice_rdwr)
 BTF_ID(func, bpf_dynptr_clone)
 BTF_ID(func, bpf_percpu_obj_new_impl)
 BTF_ID(func, bpf_percpu_obj_drop_impl)
+BTF_ID(func, bpf_throw)
 BTF_SET_END(special_kfunc_set)
 
 BTF_ID_LIST(special_kfunc_list)
@@ -10378,6 +10406,7 @@ BTF_ID(func, bpf_dynptr_slice_rdwr)
 BTF_ID(func, bpf_dynptr_clone)
 BTF_ID(func, bpf_percpu_obj_new_impl)
 BTF_ID(func, bpf_percpu_obj_drop_impl)
+BTF_ID(func, bpf_throw)
 
 static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
 {
@@ -10695,6 +10724,12 @@ static bool is_callback_calling_kfunc(u32 btf_id)
 	return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl];
 }
 
+static bool is_bpf_throw_kfunc(struct bpf_insn *insn)
+{
+	return bpf_pseudo_kfunc_call(insn) && insn->off == 0 &&
+	       insn->imm == special_kfunc_list[KF_bpf_throw];
+}
+
 static bool is_rbtree_lock_required_kfunc(u32 btf_id)
 {
 	return is_bpf_rbtree_api_kfunc(btf_id);
@@ -11480,6 +11515,15 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		}
 	}
 
+	if (meta.func_id == special_kfunc_list[KF_bpf_throw]) {
+		if (!bpf_jit_supports_exceptions()) {
+			verbose(env, "JIT does not support calling kfunc %s#%d\n",
+				func_name, meta.func_id);
+			return -ENOTSUPP;
+		}
+		env->seen_exception = true;
+	}
+
 	for (i = 0; i < CALLER_SAVED_REGS; i++)
 		mark_reg_not_init(env, regs, caller_saved[i]);
 
@@ -14525,7 +14569,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	 * gen_ld_abs() may terminate the program at runtime, leading to
 	 * reference leak.
 	 */
-	err = check_reference_leak(env);
+	err = check_reference_leak(env, false);
 	if (err) {
 		verbose(env, "BPF_LD_[ABS|IND] cannot be mixed with socket references\n");
 		return err;
@@ -16539,6 +16583,7 @@ static int do_check(struct bpf_verifier_env *env)
 	int prev_insn_idx = -1;
 
 	for (;;) {
+		bool exception_exit = false;
 		struct bpf_insn *insn;
 		u8 class;
 		int err;
@@ -16753,12 +16798,17 @@ static int do_check(struct bpf_verifier_env *env)
 						return -EINVAL;
 					}
 				}
-				if (insn->src_reg == BPF_PSEUDO_CALL)
+				if (insn->src_reg == BPF_PSEUDO_CALL) {
 					err = check_func_call(env, insn, &env->insn_idx);
-				else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL)
+				} else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
 					err = check_kfunc_call(env, insn, &env->insn_idx);
-				else
+					if (!err && is_bpf_throw_kfunc(insn)) {
+						exception_exit = true;
+						goto process_bpf_exit_full;
+					}
+				} else {
 					err = check_helper_call(env, insn, &env->insn_idx);
+				}
 				if (err)
 					return err;
 
@@ -16788,7 +16838,7 @@ static int do_check(struct bpf_verifier_env *env)
 					verbose(env, "BPF_EXIT uses reserved fields\n");
 					return -EINVAL;
 				}
-
+process_bpf_exit_full:
 				if (env->cur_state->active_lock.ptr &&
 				    !in_rbtree_lock_required_cb(env)) {
 					verbose(env, "bpf_spin_unlock is missing\n");
@@ -16807,10 +16857,23 @@ static int do_check(struct bpf_verifier_env *env)
 				 * function, for which reference_state must
 				 * match caller reference state when it exits.
 				 */
-				err = check_reference_leak(env);
+				err = check_reference_leak(env, exception_exit);
 				if (err)
 					return err;
 
+				/* The side effect of the prepare_func_exit
+				 * which is being skipped is that it frees
+				 * bpf_func_state. Typically, process_bpf_exit
+				 * will only be hit with outermost exit.
+				 * copy_verifier_state in pop_stack will handle
+				 * freeing of any extra bpf_func_state left over
+				 * from not processing all nested function
+				 * exits. We also skip return code checks as
+				 * they are not needed for exceptional exits.
+				 */
+				if (exception_exit)
+					goto process_bpf_exit;
+
 				if (state->curframe) {
 					/* exit from nested function */
 					err = prepare_func_exit(env, &env->insn_idx);
@@ -18113,6 +18176,9 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		}
 		func[i]->aux->num_exentries = num_exentries;
 		func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable;
+		func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb;
+		if (!i)
+			func[i]->aux->exception_boundary = env->seen_exception;
 		func[i] = bpf_int_jit_compile(func[i]);
 		if (!func[i]->jited) {
 			err = -ENOTSUPP;
@@ -18201,6 +18267,8 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	prog->aux->func = func;
 	prog->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
 	prog->aux->real_func_cnt = env->subprog_cnt;
+	prog->aux->bpf_exception_cb = (void *)func[env->exception_callback_subprog]->bpf_func;
+	prog->aux->exception_boundary = func[0]->aux->exception_boundary;
 	bpf_prog_jit_attempt_done(prog);
 	return 0;
 out_free:
@@ -18437,7 +18505,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 }
 
 /* The function requires that first instruction in 'patch' is insnsi[prog->len - 1] */
-static __maybe_unused int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *patch, int len)
+static int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *patch, int len)
 {
 	struct bpf_subprog_info *info = env->subprog_info;
 	int cnt = env->subprog_cnt;
@@ -18481,6 +18549,26 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 	struct bpf_map *map_ptr;
 	int i, ret, cnt, delta = 0;
 
+	if (env->seen_exception && !env->exception_callback_subprog) {
+		struct bpf_insn patch[] = {
+			env->prog->insnsi[insn_cnt - 1],
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		};
+
+		ret = add_hidden_subprog(env, patch, ARRAY_SIZE(patch));
+		if (ret < 0)
+			return ret;
+		prog = env->prog;
+		insn = prog->insnsi;
+
+		env->exception_callback_subprog = env->subprog_cnt - 1;
+		/* Don't update insn_cnt, as add_hidden_subprog always appends insns */
+		env->subprog_info[env->exception_callback_subprog].is_cb = true;
+		env->subprog_info[env->exception_callback_subprog].is_async_cb = true;
+		env->subprog_info[env->exception_callback_subprog].is_exception_cb = true;
+	}
+
 	for (i = 0; i < insn_cnt; i++, insn++) {
 		/* Make divide-by-zero exceptions impossible. */
 		if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h
index 4494eaa9937e..333b54a86e3a 100644
--- a/tools/testing/selftests/bpf/bpf_experimental.h
+++ b/tools/testing/selftests/bpf/bpf_experimental.h
@@ -162,4 +162,20 @@ extern void bpf_percpu_obj_drop_impl(void *kptr, void *meta) __ksym;
 /* Convenience macro to wrap over bpf_obj_drop_impl */
 #define bpf_percpu_obj_drop(kptr) bpf_percpu_obj_drop_impl(kptr, NULL)
 
+/* Description
+ *	Throw a BPF exception from the program, immediately terminating its
+ *	execution and unwinding the stack. The supplied 'cookie' parameter
+ *	will be the return value of the program when an exception is thrown.
+ *
+ *	Note that throwing an exception with lingering resources (locks,
+ *	references, etc.) will lead to a verification error.
+ *
+ *	Note that callbacks *cannot* call this helper.
+ * Returns
+ *	Never.
+ * Throws
+ *	An exception with the specified 'cookie' value.
+ */
+extern void bpf_throw(u64 cookie) __ksym;
+
 #endif
-- 
cgit v1.2.3-70-g09d2


From b9ae0c9dd0aca79bffc17be51c2dc148d1f72708 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 13 Sep 2023 01:32:03 +0200
Subject: bpf: Add support for custom exception callbacks

By default, the subprog generated by the verifier to handle a thrown
exception hardcodes a return value of 0. To allow user-defined logic
and modification of the return value when an exception is thrown,
introduce the 'exception_callback:' declaration tag, which marks a
callback as the default exception handler for the program.

The format of the declaration tag is 'exception_callback:<value>', where
<value> is the name of the exception callback. Each main program can be
tagged using this BTF declaratiion tag to associate it with an exception
callback. In case the tag is absent, the default callback is used.

As such, the exception callback cannot be modified at runtime, only set
during verification.

Allowing modification of the callback for the current program execution
at runtime leads to issues when the programs begin to nest, as any
per-CPU state maintaing this information will have to be saved and
restored. We don't want it to stay in bpf_prog_aux as this takes a
global effect for all programs. An alternative solution is spilling
the callback pointer at a known location on the program stack on entry,
and then passing this location to bpf_throw as a parameter.

However, since exceptions are geared more towards a use case where they
are ideally never invoked, optimizing for this use case and adding to
the complexity has diminishing returns.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230912233214.1518551-7-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h                            |   4 +-
 include/linux/bpf_verifier.h                   |   1 +
 kernel/bpf/btf.c                               |  29 +++++--
 kernel/bpf/verifier.c                          | 113 +++++++++++++++++++++++--
 tools/testing/selftests/bpf/bpf_experimental.h |  31 ++++++-
 5 files changed, 160 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 16740ee82082..30063a760b5a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2422,9 +2422,11 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
 int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
 			   struct bpf_reg_state *regs);
 int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
-			  struct bpf_reg_state *reg);
+			  struct bpf_reg_state *reg, bool is_ex_cb);
 int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
 			 struct btf *btf, const struct btf_type *t);
+const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
+				    int comp_idx, const char *tag_key);
 
 struct bpf_prog *bpf_prog_by_id(u32 id);
 struct bpf_link *bpf_link_by_id(u32 id);
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index da21a3ec5027..94ec766432f5 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -300,6 +300,7 @@ struct bpf_func_state {
 	bool in_callback_fn;
 	struct tnum callback_ret_range;
 	bool in_async_callback_fn;
+	bool in_exception_callback_fn;
 
 	/* The following fields should be last. See copy_func_state() */
 	int acquired_refs;
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 187b57276fec..f93e835d90af 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3310,10 +3310,10 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
 	return BTF_FIELD_FOUND;
 }
 
-static const char *btf_find_decl_tag_value(const struct btf *btf,
-					   const struct btf_type *pt,
-					   int comp_idx, const char *tag_key)
+const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
+				    int comp_idx, const char *tag_key)
 {
+	const char *value = NULL;
 	int i;
 
 	for (i = 1; i < btf_nr_types(btf); i++) {
@@ -3327,9 +3327,14 @@ static const char *btf_find_decl_tag_value(const struct btf *btf,
 			continue;
 		if (strncmp(__btf_name_by_offset(btf, t->name_off), tag_key, len))
 			continue;
-		return __btf_name_by_offset(btf, t->name_off) + len;
+		/* Prevent duplicate entries for same type */
+		if (value)
+			return ERR_PTR(-EEXIST);
+		value = __btf_name_by_offset(btf, t->name_off) + len;
 	}
-	return NULL;
+	if (!value)
+		return ERR_PTR(-ENOENT);
+	return value;
 }
 
 static int
@@ -3347,7 +3352,7 @@ btf_find_graph_root(const struct btf *btf, const struct btf_type *pt,
 	if (t->size != sz)
 		return BTF_FIELD_IGNORE;
 	value_type = btf_find_decl_tag_value(btf, pt, comp_idx, "contains:");
-	if (!value_type)
+	if (IS_ERR(value_type))
 		return -EINVAL;
 	node_field_name = strstr(value_type, ":");
 	if (!node_field_name)
@@ -6954,7 +6959,7 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
  * (either PTR_TO_CTX or SCALAR_VALUE).
  */
 int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
-			  struct bpf_reg_state *regs)
+			  struct bpf_reg_state *regs, bool is_ex_cb)
 {
 	struct bpf_verifier_log *log = &env->log;
 	struct bpf_prog *prog = env->prog;
@@ -7011,7 +7016,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 			tname, nargs, MAX_BPF_FUNC_REG_ARGS);
 		return -EINVAL;
 	}
-	/* check that function returns int */
+	/* check that function returns int, exception cb also requires this */
 	t = btf_type_by_id(btf, t->type);
 	while (btf_type_is_modifier(t))
 		t = btf_type_by_id(btf, t->type);
@@ -7060,6 +7065,14 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 			i, btf_type_str(t), tname);
 		return -EINVAL;
 	}
+	/* We have already ensured that the callback returns an integer, just
+	 * like all global subprogs. We need to determine it only has a single
+	 * scalar argument.
+	 */
+	if (is_ex_cb && (nargs != 1 || regs[BPF_REG_1].type != SCALAR_VALUE)) {
+		bpf_log(log, "exception cb only supports single integer argument\n");
+		return -EINVAL;
+	}
 	return 0;
 }
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ec767ae08c2b..ec3f22312516 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2457,6 +2457,68 @@ static int add_subprog(struct bpf_verifier_env *env, int off)
 	return env->subprog_cnt - 1;
 }
 
+static int bpf_find_exception_callback_insn_off(struct bpf_verifier_env *env)
+{
+	struct bpf_prog_aux *aux = env->prog->aux;
+	struct btf *btf = aux->btf;
+	const struct btf_type *t;
+	u32 main_btf_id, id;
+	const char *name;
+	int ret, i;
+
+	/* Non-zero func_info_cnt implies valid btf */
+	if (!aux->func_info_cnt)
+		return 0;
+	main_btf_id = aux->func_info[0].type_id;
+
+	t = btf_type_by_id(btf, main_btf_id);
+	if (!t) {
+		verbose(env, "invalid btf id for main subprog in func_info\n");
+		return -EINVAL;
+	}
+
+	name = btf_find_decl_tag_value(btf, t, -1, "exception_callback:");
+	if (IS_ERR(name)) {
+		ret = PTR_ERR(name);
+		/* If there is no tag present, there is no exception callback */
+		if (ret == -ENOENT)
+			ret = 0;
+		else if (ret == -EEXIST)
+			verbose(env, "multiple exception callback tags for main subprog\n");
+		return ret;
+	}
+
+	ret = btf_find_by_name_kind(btf, name, BTF_KIND_FUNC);
+	if (ret < 0) {
+		verbose(env, "exception callback '%s' could not be found in BTF\n", name);
+		return ret;
+	}
+	id = ret;
+	t = btf_type_by_id(btf, id);
+	if (btf_func_linkage(t) != BTF_FUNC_GLOBAL) {
+		verbose(env, "exception callback '%s' must have global linkage\n", name);
+		return -EINVAL;
+	}
+	ret = 0;
+	for (i = 0; i < aux->func_info_cnt; i++) {
+		if (aux->func_info[i].type_id != id)
+			continue;
+		ret = aux->func_info[i].insn_off;
+		/* Further func_info and subprog checks will also happen
+		 * later, so assume this is the right insn_off for now.
+		 */
+		if (!ret) {
+			verbose(env, "invalid exception callback insn_off in func_info: 0\n");
+			ret = -EINVAL;
+		}
+	}
+	if (!ret) {
+		verbose(env, "exception callback type id not found in func_info\n");
+		ret = -EINVAL;
+	}
+	return ret;
+}
+
 #define MAX_KFUNC_DESCS 256
 #define MAX_KFUNC_BTFS	256
 
@@ -2796,8 +2858,8 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
 static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
 {
 	struct bpf_subprog_info *subprog = env->subprog_info;
+	int i, ret, insn_cnt = env->prog->len, ex_cb_insn;
 	struct bpf_insn *insn = env->prog->insnsi;
-	int i, ret, insn_cnt = env->prog->len;
 
 	/* Add entry function. */
 	ret = add_subprog(env, 0);
@@ -2823,6 +2885,26 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
 			return ret;
 	}
 
+	ret = bpf_find_exception_callback_insn_off(env);
+	if (ret < 0)
+		return ret;
+	ex_cb_insn = ret;
+
+	/* If ex_cb_insn > 0, this means that the main program has a subprog
+	 * marked using BTF decl tag to serve as the exception callback.
+	 */
+	if (ex_cb_insn) {
+		ret = add_subprog(env, ex_cb_insn);
+		if (ret < 0)
+			return ret;
+		for (i = 1; i < env->subprog_cnt; i++) {
+			if (env->subprog_info[i].start != ex_cb_insn)
+				continue;
+			env->exception_callback_subprog = i;
+			break;
+		}
+	}
+
 	/* Add a fake 'exit' subprog which could simplify subprog iteration
 	 * logic. 'subprog_cnt' should not be increased.
 	 */
@@ -5707,6 +5789,10 @@ continue_func:
 			/* async callbacks don't increase bpf prog stack size unless called directly */
 			if (!bpf_pseudo_call(insn + i))
 				continue;
+			if (subprog[sidx].is_exception_cb) {
+				verbose(env, "insn %d cannot call exception cb directly\n", i);
+				return -EINVAL;
+			}
 		}
 		i = next_insn;
 		idx = sidx;
@@ -5728,8 +5814,13 @@ continue_func:
 	 * tail call counter throughout bpf2bpf calls combined with tailcalls
 	 */
 	if (tail_call_reachable)
-		for (j = 0; j < frame; j++)
+		for (j = 0; j < frame; j++) {
+			if (subprog[ret_prog[j]].is_exception_cb) {
+				verbose(env, "cannot tail call within exception cb\n");
+				return -EINVAL;
+			}
 			subprog[ret_prog[j]].tail_call_reachable = true;
+		}
 	if (subprog[0].tail_call_reachable)
 		env->prog->aux->tail_call_reachable = true;
 
@@ -14630,7 +14721,7 @@ static int check_return_code(struct bpf_verifier_env *env)
 	const bool is_subprog = frame->subprogno;
 
 	/* LSM and struct_ops func-ptr's return type could be "void" */
-	if (!is_subprog) {
+	if (!is_subprog || frame->in_exception_callback_fn) {
 		switch (prog_type) {
 		case BPF_PROG_TYPE_LSM:
 			if (prog->expected_attach_type == BPF_LSM_CGROUP)
@@ -14678,7 +14769,7 @@ static int check_return_code(struct bpf_verifier_env *env)
 		return 0;
 	}
 
-	if (is_subprog) {
+	if (is_subprog && !frame->in_exception_callback_fn) {
 		if (reg->type != SCALAR_VALUE) {
 			verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n",
 				reg_type_str(env, reg->type));
@@ -19334,7 +19425,7 @@ static void free_states(struct bpf_verifier_env *env)
 	}
 }
 
-static int do_check_common(struct bpf_verifier_env *env, int subprog)
+static int do_check_common(struct bpf_verifier_env *env, int subprog, bool is_ex_cb)
 {
 	bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
 	struct bpf_verifier_state *state;
@@ -19365,7 +19456,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
 
 	regs = state->frame[state->curframe]->regs;
 	if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
-		ret = btf_prepare_func_args(env, subprog, regs);
+		ret = btf_prepare_func_args(env, subprog, regs, is_ex_cb);
 		if (ret)
 			goto out;
 		for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
@@ -19381,6 +19472,12 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
 				regs[i].id = ++env->id_gen;
 			}
 		}
+		if (is_ex_cb) {
+			state->frame[0]->in_exception_callback_fn = true;
+			env->subprog_info[subprog].is_cb = true;
+			env->subprog_info[subprog].is_async_cb = true;
+			env->subprog_info[subprog].is_exception_cb = true;
+		}
 	} else {
 		/* 1st arg to a function */
 		regs[BPF_REG_1].type = PTR_TO_CTX;
@@ -19445,7 +19542,7 @@ static int do_check_subprogs(struct bpf_verifier_env *env)
 			continue;
 		env->insn_idx = env->subprog_info[i].start;
 		WARN_ON_ONCE(env->insn_idx == 0);
-		ret = do_check_common(env, i);
+		ret = do_check_common(env, i, env->exception_callback_subprog == i);
 		if (ret) {
 			return ret;
 		} else if (env->log.level & BPF_LOG_LEVEL) {
@@ -19462,7 +19559,7 @@ static int do_check_main(struct bpf_verifier_env *env)
 	int ret;
 
 	env->insn_idx = 0;
-	ret = do_check_common(env, 0);
+	ret = do_check_common(env, 0, false);
 	if (!ret)
 		env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
 	return ret;
diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h
index 333b54a86e3a..9a87170524ce 100644
--- a/tools/testing/selftests/bpf/bpf_experimental.h
+++ b/tools/testing/selftests/bpf/bpf_experimental.h
@@ -165,7 +165,16 @@ extern void bpf_percpu_obj_drop_impl(void *kptr, void *meta) __ksym;
 /* Description
  *	Throw a BPF exception from the program, immediately terminating its
  *	execution and unwinding the stack. The supplied 'cookie' parameter
- *	will be the return value of the program when an exception is thrown.
+ *	will be the return value of the program when an exception is thrown,
+ *	and the default exception callback is used. Otherwise, if an exception
+ *	callback is set using the '__exception_cb(callback)' declaration tag
+ *	on the main program, the 'cookie' parameter will be the callback's only
+ *	input argument.
+ *
+ *	Thus, in case of default exception callback, 'cookie' is subjected to
+ *	constraints on the program's return value (as with R0 on exit).
+ *	Otherwise, the return value of the marked exception callback will be
+ *	subjected to the same checks.
  *
  *	Note that throwing an exception with lingering resources (locks,
  *	references, etc.) will lead to a verification error.
@@ -178,4 +187,24 @@ extern void bpf_percpu_obj_drop_impl(void *kptr, void *meta) __ksym;
  */
 extern void bpf_throw(u64 cookie) __ksym;
 
+/* This macro must be used to mark the exception callback corresponding to the
+ * main program. For example:
+ *
+ * int exception_cb(u64 cookie) {
+ *	return cookie;
+ * }
+ *
+ * SEC("tc")
+ * __exception_cb(exception_cb)
+ * int main_prog(struct __sk_buff *ctx) {
+ *	...
+ *	return TC_ACT_OK;
+ * }
+ *
+ * Here, exception callback for the main program will be 'exception_cb'. Note
+ * that this attribute can only be used once, and multiple exception callbacks
+ * specified for the main program will lead to verification error.
+ */
+#define __exception_cb(name) __attribute__((btf_decl_tag("exception_callback:" #name)))
+
 #endif
-- 
cgit v1.2.3-70-g09d2


From 7ccb84f04cda1dd6f64f352e9795db308e9cdc0c Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 13 Sep 2023 01:32:06 +0200
Subject: mm: kasan: Declare kasan_unpoison_task_stack_below in kasan.h

We require access to this kasan helper in BPF code in the next patch
where we have to unpoison the task stack when we unwind and reset the
stack frame from bpf_throw, and it never really unpoisons the poisoned
stack slots on entry when compiler instrumentation is generated by
CONFIG_KASAN_STACK and inline instrumentation is supported.

Also, remove the declaration from mm/kasan/kasan.h as we put it in the
header file kasan.h.

Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Suggested-by: Andrey Konovalov <andreyknvl@gmail.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Link: https://lore.kernel.org/r/20230912233214.1518551-10-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/kasan.h | 2 ++
 mm/kasan/kasan.h      | 1 -
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 819b6bc8ac08..7a463f814db2 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -283,8 +283,10 @@ static inline bool kasan_check_byte(const void *address)
 
 #if defined(CONFIG_KASAN) && defined(CONFIG_KASAN_STACK)
 void kasan_unpoison_task_stack(struct task_struct *task);
+asmlinkage void kasan_unpoison_task_stack_below(const void *watermark);
 #else
 static inline void kasan_unpoison_task_stack(struct task_struct *task) {}
+static inline void kasan_unpoison_task_stack_below(const void *watermark) {}
 #endif
 
 #ifdef CONFIG_KASAN_GENERIC
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 2e973b36fe07..5eefe202bb8f 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -558,7 +558,6 @@ void kasan_restore_multi_shot(bool enabled);
  * code. Declared here to avoid warnings about missing declarations.
  */
 
-asmlinkage void kasan_unpoison_task_stack_below(const void *watermark);
 void __asan_register_globals(void *globals, ssize_t size);
 void __asan_unregister_globals(void *globals, ssize_t size);
 void __asan_handle_no_return(void);
-- 
cgit v1.2.3-70-g09d2


From 3badff3a25d815e915d89565a0c82dec608a8d2b Mon Sep 17 00:00:00 2001
From: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Date: Wed, 13 Sep 2023 21:49:36 +0100
Subject: dpll: spec: Add Netlink spec in YAML

Add a protocol spec for DPLL.
Add code generated from the spec.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Michal Michalik <michal.michalik@intel.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/netlink/specs/dpll.yaml | 488 ++++++++++++++++++++++++++++++++++
 drivers/dpll/dpll_nl.c                | 162 +++++++++++
 drivers/dpll/dpll_nl.h                |  51 ++++
 include/uapi/linux/dpll.h             | 201 ++++++++++++++
 4 files changed, 902 insertions(+)
 create mode 100644 Documentation/netlink/specs/dpll.yaml
 create mode 100644 drivers/dpll/dpll_nl.c
 create mode 100644 drivers/dpll/dpll_nl.h
 create mode 100644 include/uapi/linux/dpll.h

(limited to 'include')

diff --git a/Documentation/netlink/specs/dpll.yaml b/Documentation/netlink/specs/dpll.yaml
new file mode 100644
index 000000000000..8b86b28b47a6
--- /dev/null
+++ b/Documentation/netlink/specs/dpll.yaml
@@ -0,0 +1,488 @@
+# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+
+name: dpll
+
+doc: DPLL subsystem.
+
+definitions:
+  -
+    type: enum
+    name: mode
+    doc: |
+      working modes a dpll can support, differentiates if and how dpll selects
+      one of its inputs to syntonize with it, valid values for DPLL_A_MODE
+      attribute
+    entries:
+      -
+        name: manual
+        doc: input can be only selected by sending a request to dpll
+        value: 1
+      -
+        name: automatic
+        doc: highest prio input pin auto selected by dpll
+    render-max: true
+  -
+    type: enum
+    name: lock-status
+    doc: |
+      provides information of dpll device lock status, valid values for
+      DPLL_A_LOCK_STATUS attribute
+    entries:
+      -
+        name: unlocked
+        doc: |
+          dpll was not yet locked to any valid input (or forced by setting
+          DPLL_A_MODE to DPLL_MODE_DETACHED)
+        value: 1
+      -
+        name: locked
+        doc: |
+          dpll is locked to a valid signal, but no holdover available
+      -
+        name: locked-ho-acq
+        doc: |
+          dpll is locked and holdover acquired
+      -
+        name: holdover
+        doc: |
+          dpll is in holdover state - lost a valid lock or was forced
+          by disconnecting all the pins (latter possible only
+          when dpll lock-state was already DPLL_LOCK_STATUS_LOCKED_HO_ACQ,
+          if dpll lock-state was not DPLL_LOCK_STATUS_LOCKED_HO_ACQ, the
+          dpll's lock-state shall remain DPLL_LOCK_STATUS_UNLOCKED)
+    render-max: true
+  -
+    type: const
+    name: temp-divider
+    value: 1000
+    doc: |
+      temperature divider allowing userspace to calculate the
+      temperature as float with three digit decimal precision.
+      Value of (DPLL_A_TEMP / DPLL_TEMP_DIVIDER) is integer part of
+      temperature value.
+      Value of (DPLL_A_TEMP % DPLL_TEMP_DIVIDER) is fractional part of
+      temperature value.
+  -
+    type: enum
+    name: type
+    doc: type of dpll, valid values for DPLL_A_TYPE attribute
+    entries:
+      -
+        name: pps
+        doc: dpll produces Pulse-Per-Second signal
+        value: 1
+      -
+        name: eec
+        doc: dpll drives the Ethernet Equipment Clock
+    render-max: true
+  -
+    type: enum
+    name: pin-type
+    doc: |
+      defines possible types of a pin, valid values for DPLL_A_PIN_TYPE
+      attribute
+    entries:
+      -
+        name: mux
+        doc: aggregates another layer of selectable pins
+        value: 1
+      -
+        name: ext
+        doc: external input
+      -
+        name: synce-eth-port
+        doc: ethernet port PHY's recovered clock
+      -
+        name: int-oscillator
+        doc: device internal oscillator
+      -
+        name: gnss
+        doc: GNSS recovered clock
+    render-max: true
+  -
+    type: enum
+    name: pin-direction
+    doc: |
+      defines possible direction of a pin, valid values for
+      DPLL_A_PIN_DIRECTION attribute
+    entries:
+      -
+        name: input
+        doc: pin used as a input of a signal
+        value: 1
+      -
+        name: output
+        doc: pin used to output the signal
+    render-max: true
+  -
+    type: const
+    name: pin-frequency-1-hz
+    value: 1
+  -
+    type: const
+    name: pin-frequency-10-khz
+    value: 10000
+  -
+    type: const
+    name: pin-frequency-77_5-khz
+    value: 77500
+  -
+    type: const
+    name: pin-frequency-10-mhz
+    value: 10000000
+  -
+    type: enum
+    name: pin-state
+    doc: |
+      defines possible states of a pin, valid values for
+      DPLL_A_PIN_STATE attribute
+    entries:
+      -
+        name: connected
+        doc: pin connected, active input of phase locked loop
+        value: 1
+      -
+        name: disconnected
+        doc: pin disconnected, not considered as a valid input
+      -
+        name: selectable
+        doc: pin enabled for automatic input selection
+    render-max: true
+  -
+    type: flags
+    name: pin-capabilities
+    doc: |
+      defines possible capabilities of a pin, valid flags on
+      DPLL_A_PIN_CAPABILITIES attribute
+    entries:
+      -
+        name: direction-can-change
+        doc: pin direction can be changed
+      -
+        name: priority-can-change
+        doc: pin priority can be changed
+      -
+        name: state-can-change
+        doc: pin state can be changed
+
+attribute-sets:
+  -
+    name: dpll
+    enum-name: dpll_a
+    attributes:
+      -
+        name: id
+        type: u32
+      -
+        name: module-name
+        type: string
+      -
+        name: pad
+        type: pad
+      -
+        name: clock-id
+        type: u64
+      -
+        name: mode
+        type: u32
+        enum: mode
+      -
+        name: mode-supported
+        type: u32
+        enum: mode
+        multi-attr: true
+      -
+        name: lock-status
+        type: u32
+        enum: lock-status
+      -
+        name: temp
+        type: s32
+      -
+        name: type
+        type: u32
+        enum: type
+  -
+    name: pin
+    enum-name: dpll_a_pin
+    attributes:
+      -
+        name: id
+        type: u32
+      -
+        name: parent-id
+        type: u32
+      -
+        name: module-name
+        type: string
+      -
+        name: pad
+        type: pad
+      -
+        name: clock-id
+        type: u64
+      -
+        name: board-label
+        type: string
+      -
+        name: panel-label
+        type: string
+      -
+        name: package-label
+        type: string
+      -
+        name: type
+        type: u32
+        enum: pin-type
+      -
+        name: direction
+        type: u32
+        enum: pin-direction
+      -
+        name: frequency
+        type: u64
+      -
+        name: frequency-supported
+        type: nest
+        multi-attr: true
+        nested-attributes: frequency-range
+      -
+        name: frequency-min
+        type: u64
+      -
+        name: frequency-max
+        type: u64
+      -
+        name: prio
+        type: u32
+      -
+        name: state
+        type: u32
+        enum: pin-state
+      -
+        name: capabilities
+        type: u32
+      -
+        name: parent-device
+        type: nest
+        multi-attr: true
+        nested-attributes: pin-parent-device
+      -
+        name: parent-pin
+        type: nest
+        multi-attr: true
+        nested-attributes: pin-parent-pin
+  -
+    name: pin-parent-device
+    subset-of: pin
+    attributes:
+      -
+        name: parent-id
+        type: u32
+      -
+        name: direction
+        type: u32
+      -
+        name: prio
+        type: u32
+      -
+        name: state
+        type: u32
+  -
+    name: pin-parent-pin
+    subset-of: pin
+    attributes:
+      -
+        name: parent-id
+        type: u32
+      -
+        name: state
+        type: u32
+  -
+    name: frequency-range
+    subset-of: pin
+    attributes:
+      -
+        name: frequency-min
+        type: u64
+      -
+        name: frequency-max
+        type: u64
+
+operations:
+  enum-name: dpll_cmd
+  list:
+    -
+      name: device-id-get
+      doc: |
+        Get id of dpll device that matches given attributes
+      attribute-set: dpll
+      flags: [ admin-perm ]
+
+      do:
+        pre: dpll-lock-doit
+        post: dpll-unlock-doit
+        request:
+          attributes:
+            - module-name
+            - clock-id
+            - type
+        reply:
+          attributes:
+            - id
+
+    -
+      name: device-get
+      doc: |
+        Get list of DPLL devices (dump) or attributes of a single dpll device
+      attribute-set: dpll
+      flags: [ admin-perm ]
+
+      do:
+        pre: dpll-pre-doit
+        post: dpll-post-doit
+        request:
+          attributes:
+            - id
+        reply: &dev-attrs
+          attributes:
+            - id
+            - module-name
+            - mode
+            - mode-supported
+            - lock-status
+            - temp
+            - clock-id
+            - type
+
+      dump:
+        pre: dpll-lock-dumpit
+        post: dpll-unlock-dumpit
+        reply: *dev-attrs
+
+    -
+      name: device-set
+      doc: Set attributes for a DPLL device
+      attribute-set: dpll
+      flags: [ admin-perm ]
+
+      do:
+        pre: dpll-pre-doit
+        post: dpll-post-doit
+        request:
+          attributes:
+            - id
+    -
+      name: device-create-ntf
+      doc: Notification about device appearing
+      notify: device-get
+      mcgrp: monitor
+    -
+      name: device-delete-ntf
+      doc: Notification about device disappearing
+      notify: device-get
+      mcgrp: monitor
+    -
+      name: device-change-ntf
+      doc: Notification about device configuration being changed
+      notify: device-get
+      mcgrp: monitor
+    -
+      name: pin-id-get
+      doc: |
+        Get id of a pin that matches given attributes
+      attribute-set: pin
+      flags: [ admin-perm ]
+
+      do:
+        pre: dpll-lock-doit
+        post: dpll-unlock-doit
+        request:
+          attributes:
+            - module-name
+            - clock-id
+            - board-label
+            - panel-label
+            - package-label
+            - type
+        reply:
+          attributes:
+            - id
+
+    -
+      name: pin-get
+      doc: |
+        Get list of pins and its attributes.
+        - dump request without any attributes given - list all the pins in the
+          system
+        - dump request with target dpll - list all the pins registered with
+          a given dpll device
+        - do request with target dpll and target pin - single pin attributes
+      attribute-set: pin
+      flags: [ admin-perm ]
+
+      do:
+        pre: dpll-pin-pre-doit
+        post: dpll-pin-post-doit
+        request:
+          attributes:
+            - id
+        reply: &pin-attrs
+          attributes:
+            - id
+            - board-label
+            - panel-label
+            - package-label
+            - type
+            - frequency
+            - frequency-supported
+            - capabilities
+            - parent-device
+            - parent-pin
+
+      dump:
+        pre: dpll-lock-dumpit
+        post: dpll-unlock-dumpit
+        request:
+          attributes:
+            - id
+        reply: *pin-attrs
+
+    -
+      name: pin-set
+      doc: Set attributes of a target pin
+      attribute-set: pin
+      flags: [ admin-perm ]
+
+      do:
+        pre: dpll-pin-pre-doit
+        post: dpll-pin-post-doit
+        request:
+          attributes:
+            - id
+            - frequency
+            - direction
+            - prio
+            - state
+            - parent-device
+            - parent-pin
+    -
+      name: pin-create-ntf
+      doc: Notification about pin appearing
+      notify: pin-get
+      mcgrp: monitor
+    -
+      name: pin-delete-ntf
+      doc: Notification about pin disappearing
+      notify: pin-get
+      mcgrp: monitor
+    -
+      name: pin-change-ntf
+      doc: Notification about pin configuration being changed
+      notify: pin-get
+      mcgrp: monitor
+
+mcast-groups:
+  list:
+    -
+      name: monitor
diff --git a/drivers/dpll/dpll_nl.c b/drivers/dpll/dpll_nl.c
new file mode 100644
index 000000000000..14064c8c783b
--- /dev/null
+++ b/drivers/dpll/dpll_nl.c
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+/* Do not edit directly, auto-generated from: */
+/*	Documentation/netlink/specs/dpll.yaml */
+/* YNL-GEN kernel source */
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "dpll_nl.h"
+
+#include <uapi/linux/dpll.h>
+
+/* Common nested types */
+const struct nla_policy dpll_pin_parent_device_nl_policy[DPLL_A_PIN_STATE + 1] = {
+	[DPLL_A_PIN_PARENT_ID] = { .type = NLA_U32, },
+	[DPLL_A_PIN_DIRECTION] = NLA_POLICY_RANGE(NLA_U32, 1, 2),
+	[DPLL_A_PIN_PRIO] = { .type = NLA_U32, },
+	[DPLL_A_PIN_STATE] = NLA_POLICY_RANGE(NLA_U32, 1, 3),
+};
+
+const struct nla_policy dpll_pin_parent_pin_nl_policy[DPLL_A_PIN_STATE + 1] = {
+	[DPLL_A_PIN_PARENT_ID] = { .type = NLA_U32, },
+	[DPLL_A_PIN_STATE] = NLA_POLICY_RANGE(NLA_U32, 1, 3),
+};
+
+/* DPLL_CMD_DEVICE_ID_GET - do */
+static const struct nla_policy dpll_device_id_get_nl_policy[DPLL_A_TYPE + 1] = {
+	[DPLL_A_MODULE_NAME] = { .type = NLA_NUL_STRING, },
+	[DPLL_A_CLOCK_ID] = { .type = NLA_U64, },
+	[DPLL_A_TYPE] = NLA_POLICY_RANGE(NLA_U32, 1, 2),
+};
+
+/* DPLL_CMD_DEVICE_GET - do */
+static const struct nla_policy dpll_device_get_nl_policy[DPLL_A_ID + 1] = {
+	[DPLL_A_ID] = { .type = NLA_U32, },
+};
+
+/* DPLL_CMD_DEVICE_SET - do */
+static const struct nla_policy dpll_device_set_nl_policy[DPLL_A_ID + 1] = {
+	[DPLL_A_ID] = { .type = NLA_U32, },
+};
+
+/* DPLL_CMD_PIN_ID_GET - do */
+static const struct nla_policy dpll_pin_id_get_nl_policy[DPLL_A_PIN_TYPE + 1] = {
+	[DPLL_A_PIN_MODULE_NAME] = { .type = NLA_NUL_STRING, },
+	[DPLL_A_PIN_CLOCK_ID] = { .type = NLA_U64, },
+	[DPLL_A_PIN_BOARD_LABEL] = { .type = NLA_NUL_STRING, },
+	[DPLL_A_PIN_PANEL_LABEL] = { .type = NLA_NUL_STRING, },
+	[DPLL_A_PIN_PACKAGE_LABEL] = { .type = NLA_NUL_STRING, },
+	[DPLL_A_PIN_TYPE] = NLA_POLICY_RANGE(NLA_U32, 1, 5),
+};
+
+/* DPLL_CMD_PIN_GET - do */
+static const struct nla_policy dpll_pin_get_do_nl_policy[DPLL_A_PIN_ID + 1] = {
+	[DPLL_A_PIN_ID] = { .type = NLA_U32, },
+};
+
+/* DPLL_CMD_PIN_GET - dump */
+static const struct nla_policy dpll_pin_get_dump_nl_policy[DPLL_A_PIN_ID + 1] = {
+	[DPLL_A_PIN_ID] = { .type = NLA_U32, },
+};
+
+/* DPLL_CMD_PIN_SET - do */
+static const struct nla_policy dpll_pin_set_nl_policy[DPLL_A_PIN_PARENT_PIN + 1] = {
+	[DPLL_A_PIN_ID] = { .type = NLA_U32, },
+	[DPLL_A_PIN_FREQUENCY] = { .type = NLA_U64, },
+	[DPLL_A_PIN_DIRECTION] = NLA_POLICY_RANGE(NLA_U32, 1, 2),
+	[DPLL_A_PIN_PRIO] = { .type = NLA_U32, },
+	[DPLL_A_PIN_STATE] = NLA_POLICY_RANGE(NLA_U32, 1, 3),
+	[DPLL_A_PIN_PARENT_DEVICE] = NLA_POLICY_NESTED(dpll_pin_parent_device_nl_policy),
+	[DPLL_A_PIN_PARENT_PIN] = NLA_POLICY_NESTED(dpll_pin_parent_pin_nl_policy),
+};
+
+/* Ops table for dpll */
+static const struct genl_split_ops dpll_nl_ops[] = {
+	{
+		.cmd		= DPLL_CMD_DEVICE_ID_GET,
+		.pre_doit	= dpll_lock_doit,
+		.doit		= dpll_nl_device_id_get_doit,
+		.post_doit	= dpll_unlock_doit,
+		.policy		= dpll_device_id_get_nl_policy,
+		.maxattr	= DPLL_A_TYPE,
+		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+	},
+	{
+		.cmd		= DPLL_CMD_DEVICE_GET,
+		.pre_doit	= dpll_pre_doit,
+		.doit		= dpll_nl_device_get_doit,
+		.post_doit	= dpll_post_doit,
+		.policy		= dpll_device_get_nl_policy,
+		.maxattr	= DPLL_A_ID,
+		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+	},
+	{
+		.cmd	= DPLL_CMD_DEVICE_GET,
+		.start	= dpll_lock_dumpit,
+		.dumpit	= dpll_nl_device_get_dumpit,
+		.done	= dpll_unlock_dumpit,
+		.flags	= GENL_ADMIN_PERM | GENL_CMD_CAP_DUMP,
+	},
+	{
+		.cmd		= DPLL_CMD_DEVICE_SET,
+		.pre_doit	= dpll_pre_doit,
+		.doit		= dpll_nl_device_set_doit,
+		.post_doit	= dpll_post_doit,
+		.policy		= dpll_device_set_nl_policy,
+		.maxattr	= DPLL_A_ID,
+		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+	},
+	{
+		.cmd		= DPLL_CMD_PIN_ID_GET,
+		.pre_doit	= dpll_lock_doit,
+		.doit		= dpll_nl_pin_id_get_doit,
+		.post_doit	= dpll_unlock_doit,
+		.policy		= dpll_pin_id_get_nl_policy,
+		.maxattr	= DPLL_A_PIN_TYPE,
+		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+	},
+	{
+		.cmd		= DPLL_CMD_PIN_GET,
+		.pre_doit	= dpll_pin_pre_doit,
+		.doit		= dpll_nl_pin_get_doit,
+		.post_doit	= dpll_pin_post_doit,
+		.policy		= dpll_pin_get_do_nl_policy,
+		.maxattr	= DPLL_A_PIN_ID,
+		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+	},
+	{
+		.cmd		= DPLL_CMD_PIN_GET,
+		.start		= dpll_lock_dumpit,
+		.dumpit		= dpll_nl_pin_get_dumpit,
+		.done		= dpll_unlock_dumpit,
+		.policy		= dpll_pin_get_dump_nl_policy,
+		.maxattr	= DPLL_A_PIN_ID,
+		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DUMP,
+	},
+	{
+		.cmd		= DPLL_CMD_PIN_SET,
+		.pre_doit	= dpll_pin_pre_doit,
+		.doit		= dpll_nl_pin_set_doit,
+		.post_doit	= dpll_pin_post_doit,
+		.policy		= dpll_pin_set_nl_policy,
+		.maxattr	= DPLL_A_PIN_PARENT_PIN,
+		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+	},
+};
+
+static const struct genl_multicast_group dpll_nl_mcgrps[] = {
+	[DPLL_NLGRP_MONITOR] = { "monitor", },
+};
+
+struct genl_family dpll_nl_family __ro_after_init = {
+	.name		= DPLL_FAMILY_NAME,
+	.version	= DPLL_FAMILY_VERSION,
+	.netnsok	= true,
+	.parallel_ops	= true,
+	.module		= THIS_MODULE,
+	.split_ops	= dpll_nl_ops,
+	.n_split_ops	= ARRAY_SIZE(dpll_nl_ops),
+	.mcgrps		= dpll_nl_mcgrps,
+	.n_mcgrps	= ARRAY_SIZE(dpll_nl_mcgrps),
+};
diff --git a/drivers/dpll/dpll_nl.h b/drivers/dpll/dpll_nl.h
new file mode 100644
index 000000000000..1f67aaed4742
--- /dev/null
+++ b/drivers/dpll/dpll_nl.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/*	Documentation/netlink/specs/dpll.yaml */
+/* YNL-GEN kernel header */
+
+#ifndef _LINUX_DPLL_GEN_H
+#define _LINUX_DPLL_GEN_H
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include <uapi/linux/dpll.h>
+
+/* Common nested types */
+extern const struct nla_policy dpll_pin_parent_device_nl_policy[DPLL_A_PIN_STATE + 1];
+extern const struct nla_policy dpll_pin_parent_pin_nl_policy[DPLL_A_PIN_STATE + 1];
+
+int dpll_lock_doit(const struct genl_split_ops *ops, struct sk_buff *skb,
+		   struct genl_info *info);
+int dpll_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb,
+		  struct genl_info *info);
+int dpll_pin_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb,
+		      struct genl_info *info);
+void
+dpll_unlock_doit(const struct genl_split_ops *ops, struct sk_buff *skb,
+		 struct genl_info *info);
+void
+dpll_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb,
+	       struct genl_info *info);
+void
+dpll_pin_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb,
+		   struct genl_info *info);
+int dpll_lock_dumpit(struct netlink_callback *cb);
+int dpll_unlock_dumpit(struct netlink_callback *cb);
+
+int dpll_nl_device_id_get_doit(struct sk_buff *skb, struct genl_info *info);
+int dpll_nl_device_get_doit(struct sk_buff *skb, struct genl_info *info);
+int dpll_nl_device_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
+int dpll_nl_device_set_doit(struct sk_buff *skb, struct genl_info *info);
+int dpll_nl_pin_id_get_doit(struct sk_buff *skb, struct genl_info *info);
+int dpll_nl_pin_get_doit(struct sk_buff *skb, struct genl_info *info);
+int dpll_nl_pin_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
+int dpll_nl_pin_set_doit(struct sk_buff *skb, struct genl_info *info);
+
+enum {
+	DPLL_NLGRP_MONITOR,
+};
+
+extern struct genl_family dpll_nl_family;
+
+#endif /* _LINUX_DPLL_GEN_H */
diff --git a/include/uapi/linux/dpll.h b/include/uapi/linux/dpll.h
new file mode 100644
index 000000000000..20ef0718f8dc
--- /dev/null
+++ b/include/uapi/linux/dpll.h
@@ -0,0 +1,201 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/*	Documentation/netlink/specs/dpll.yaml */
+/* YNL-GEN uapi header */
+
+#ifndef _UAPI_LINUX_DPLL_H
+#define _UAPI_LINUX_DPLL_H
+
+#define DPLL_FAMILY_NAME	"dpll"
+#define DPLL_FAMILY_VERSION	1
+
+/**
+ * enum dpll_mode - working modes a dpll can support, differentiates if and how
+ *   dpll selects one of its inputs to syntonize with it, valid values for
+ *   DPLL_A_MODE attribute
+ * @DPLL_MODE_MANUAL: input can be only selected by sending a request to dpll
+ * @DPLL_MODE_AUTOMATIC: highest prio input pin auto selected by dpll
+ */
+enum dpll_mode {
+	DPLL_MODE_MANUAL = 1,
+	DPLL_MODE_AUTOMATIC,
+
+	/* private: */
+	__DPLL_MODE_MAX,
+	DPLL_MODE_MAX = (__DPLL_MODE_MAX - 1)
+};
+
+/**
+ * enum dpll_lock_status - provides information of dpll device lock status,
+ *   valid values for DPLL_A_LOCK_STATUS attribute
+ * @DPLL_LOCK_STATUS_UNLOCKED: dpll was not yet locked to any valid input (or
+ *   forced by setting DPLL_A_MODE to DPLL_MODE_DETACHED)
+ * @DPLL_LOCK_STATUS_LOCKED: dpll is locked to a valid signal, but no holdover
+ *   available
+ * @DPLL_LOCK_STATUS_LOCKED_HO_ACQ: dpll is locked and holdover acquired
+ * @DPLL_LOCK_STATUS_HOLDOVER: dpll is in holdover state - lost a valid lock or
+ *   was forced by disconnecting all the pins (latter possible only when dpll
+ *   lock-state was already DPLL_LOCK_STATUS_LOCKED_HO_ACQ, if dpll lock-state
+ *   was not DPLL_LOCK_STATUS_LOCKED_HO_ACQ, the dpll's lock-state shall remain
+ *   DPLL_LOCK_STATUS_UNLOCKED)
+ */
+enum dpll_lock_status {
+	DPLL_LOCK_STATUS_UNLOCKED = 1,
+	DPLL_LOCK_STATUS_LOCKED,
+	DPLL_LOCK_STATUS_LOCKED_HO_ACQ,
+	DPLL_LOCK_STATUS_HOLDOVER,
+
+	/* private: */
+	__DPLL_LOCK_STATUS_MAX,
+	DPLL_LOCK_STATUS_MAX = (__DPLL_LOCK_STATUS_MAX - 1)
+};
+
+#define DPLL_TEMP_DIVIDER	1000
+
+/**
+ * enum dpll_type - type of dpll, valid values for DPLL_A_TYPE attribute
+ * @DPLL_TYPE_PPS: dpll produces Pulse-Per-Second signal
+ * @DPLL_TYPE_EEC: dpll drives the Ethernet Equipment Clock
+ */
+enum dpll_type {
+	DPLL_TYPE_PPS = 1,
+	DPLL_TYPE_EEC,
+
+	/* private: */
+	__DPLL_TYPE_MAX,
+	DPLL_TYPE_MAX = (__DPLL_TYPE_MAX - 1)
+};
+
+/**
+ * enum dpll_pin_type - defines possible types of a pin, valid values for
+ *   DPLL_A_PIN_TYPE attribute
+ * @DPLL_PIN_TYPE_MUX: aggregates another layer of selectable pins
+ * @DPLL_PIN_TYPE_EXT: external input
+ * @DPLL_PIN_TYPE_SYNCE_ETH_PORT: ethernet port PHY's recovered clock
+ * @DPLL_PIN_TYPE_INT_OSCILLATOR: device internal oscillator
+ * @DPLL_PIN_TYPE_GNSS: GNSS recovered clock
+ */
+enum dpll_pin_type {
+	DPLL_PIN_TYPE_MUX = 1,
+	DPLL_PIN_TYPE_EXT,
+	DPLL_PIN_TYPE_SYNCE_ETH_PORT,
+	DPLL_PIN_TYPE_INT_OSCILLATOR,
+	DPLL_PIN_TYPE_GNSS,
+
+	/* private: */
+	__DPLL_PIN_TYPE_MAX,
+	DPLL_PIN_TYPE_MAX = (__DPLL_PIN_TYPE_MAX - 1)
+};
+
+/**
+ * enum dpll_pin_direction - defines possible direction of a pin, valid values
+ *   for DPLL_A_PIN_DIRECTION attribute
+ * @DPLL_PIN_DIRECTION_INPUT: pin used as a input of a signal
+ * @DPLL_PIN_DIRECTION_OUTPUT: pin used to output the signal
+ */
+enum dpll_pin_direction {
+	DPLL_PIN_DIRECTION_INPUT = 1,
+	DPLL_PIN_DIRECTION_OUTPUT,
+
+	/* private: */
+	__DPLL_PIN_DIRECTION_MAX,
+	DPLL_PIN_DIRECTION_MAX = (__DPLL_PIN_DIRECTION_MAX - 1)
+};
+
+#define DPLL_PIN_FREQUENCY_1_HZ		1
+#define DPLL_PIN_FREQUENCY_10_KHZ	10000
+#define DPLL_PIN_FREQUENCY_77_5_KHZ	77500
+#define DPLL_PIN_FREQUENCY_10_MHZ	10000000
+
+/**
+ * enum dpll_pin_state - defines possible states of a pin, valid values for
+ *   DPLL_A_PIN_STATE attribute
+ * @DPLL_PIN_STATE_CONNECTED: pin connected, active input of phase locked loop
+ * @DPLL_PIN_STATE_DISCONNECTED: pin disconnected, not considered as a valid
+ *   input
+ * @DPLL_PIN_STATE_SELECTABLE: pin enabled for automatic input selection
+ */
+enum dpll_pin_state {
+	DPLL_PIN_STATE_CONNECTED = 1,
+	DPLL_PIN_STATE_DISCONNECTED,
+	DPLL_PIN_STATE_SELECTABLE,
+
+	/* private: */
+	__DPLL_PIN_STATE_MAX,
+	DPLL_PIN_STATE_MAX = (__DPLL_PIN_STATE_MAX - 1)
+};
+
+/**
+ * enum dpll_pin_capabilities - defines possible capabilities of a pin, valid
+ *   flags on DPLL_A_PIN_CAPABILITIES attribute
+ * @DPLL_PIN_CAPABILITIES_DIRECTION_CAN_CHANGE: pin direction can be changed
+ * @DPLL_PIN_CAPABILITIES_PRIORITY_CAN_CHANGE: pin priority can be changed
+ * @DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE: pin state can be changed
+ */
+enum dpll_pin_capabilities {
+	DPLL_PIN_CAPABILITIES_DIRECTION_CAN_CHANGE = 1,
+	DPLL_PIN_CAPABILITIES_PRIORITY_CAN_CHANGE = 2,
+	DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE = 4,
+};
+
+enum dpll_a {
+	DPLL_A_ID = 1,
+	DPLL_A_MODULE_NAME,
+	DPLL_A_PAD,
+	DPLL_A_CLOCK_ID,
+	DPLL_A_MODE,
+	DPLL_A_MODE_SUPPORTED,
+	DPLL_A_LOCK_STATUS,
+	DPLL_A_TEMP,
+	DPLL_A_TYPE,
+
+	__DPLL_A_MAX,
+	DPLL_A_MAX = (__DPLL_A_MAX - 1)
+};
+
+enum dpll_a_pin {
+	DPLL_A_PIN_ID = 1,
+	DPLL_A_PIN_PARENT_ID,
+	DPLL_A_PIN_MODULE_NAME,
+	DPLL_A_PIN_PAD,
+	DPLL_A_PIN_CLOCK_ID,
+	DPLL_A_PIN_BOARD_LABEL,
+	DPLL_A_PIN_PANEL_LABEL,
+	DPLL_A_PIN_PACKAGE_LABEL,
+	DPLL_A_PIN_TYPE,
+	DPLL_A_PIN_DIRECTION,
+	DPLL_A_PIN_FREQUENCY,
+	DPLL_A_PIN_FREQUENCY_SUPPORTED,
+	DPLL_A_PIN_FREQUENCY_MIN,
+	DPLL_A_PIN_FREQUENCY_MAX,
+	DPLL_A_PIN_PRIO,
+	DPLL_A_PIN_STATE,
+	DPLL_A_PIN_CAPABILITIES,
+	DPLL_A_PIN_PARENT_DEVICE,
+	DPLL_A_PIN_PARENT_PIN,
+
+	__DPLL_A_PIN_MAX,
+	DPLL_A_PIN_MAX = (__DPLL_A_PIN_MAX - 1)
+};
+
+enum dpll_cmd {
+	DPLL_CMD_DEVICE_ID_GET = 1,
+	DPLL_CMD_DEVICE_GET,
+	DPLL_CMD_DEVICE_SET,
+	DPLL_CMD_DEVICE_CREATE_NTF,
+	DPLL_CMD_DEVICE_DELETE_NTF,
+	DPLL_CMD_DEVICE_CHANGE_NTF,
+	DPLL_CMD_PIN_ID_GET,
+	DPLL_CMD_PIN_GET,
+	DPLL_CMD_PIN_SET,
+	DPLL_CMD_PIN_CREATE_NTF,
+	DPLL_CMD_PIN_DELETE_NTF,
+	DPLL_CMD_PIN_CHANGE_NTF,
+
+	__DPLL_CMD_MAX,
+	DPLL_CMD_MAX = (__DPLL_CMD_MAX - 1)
+};
+
+#define DPLL_MCGRP_MONITOR	"monitor"
+
+#endif /* _UAPI_LINUX_DPLL_H */
-- 
cgit v1.2.3-70-g09d2


From 9431063ad323ac864750aeba4d304389bc42ca4e Mon Sep 17 00:00:00 2001
From: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Date: Wed, 13 Sep 2023 21:49:37 +0100
Subject: dpll: core: Add DPLL framework base functions

DPLL framework is used to represent and configure DPLL devices
in systems. Each device that has DPLL and can configure inputs
and outputs can use this framework.

Implement core framework functions for further interactions
with device drivers implementing dpll subsystem, as well as for
interactions of DPLL netlink framework part with the subsystem
itself.

Co-developed-by: Milena Olech <milena.olech@intel.com>
Signed-off-by: Milena Olech <milena.olech@intel.com>
Co-developed-by: Michal Michalik <michal.michalik@intel.com>
Signed-off-by: Michal Michalik <michal.michalik@intel.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Co-developed-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS              |  11 +
 drivers/Kconfig          |   2 +
 drivers/Makefile         |   1 +
 drivers/dpll/Kconfig     |   7 +
 drivers/dpll/Makefile    |   9 +
 drivers/dpll/dpll_core.c | 789 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/dpll/dpll_core.h |  89 ++++++
 include/linux/dpll.h     | 133 ++++++++
 8 files changed, 1041 insertions(+)
 create mode 100644 drivers/dpll/Kconfig
 create mode 100644 drivers/dpll/Makefile
 create mode 100644 drivers/dpll/dpll_core.c
 create mode 100644 drivers/dpll/dpll_core.h
 create mode 100644 include/linux/dpll.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 9d0027f389e1..c94489dff2fd 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6333,6 +6333,17 @@ F:	Documentation/networking/device_drivers/ethernet/freescale/dpaa2/switch-drive
 F:	drivers/net/ethernet/freescale/dpaa2/dpaa2-switch*
 F:	drivers/net/ethernet/freescale/dpaa2/dpsw*
 
+DPLL SUBSYSTEM
+M:	Vadim Fedorenko <vadim.fedorenko@linux.dev>
+M:	Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
+M:	Jiri Pirko <jiri@resnulli.us>
+L:	netdev@vger.kernel.org
+S:	Supported
+F:	Documentation/driver-api/dpll.rst
+F:	drivers/dpll/*
+F:	include/net/dpll.h
+F:	include/uapi/linux/dpll.h
+
 DRBD DRIVER
 M:	Philipp Reisner <philipp.reisner@linbit.com>
 M:	Lars Ellenberg <lars.ellenberg@linbit.com>
diff --git a/drivers/Kconfig b/drivers/Kconfig
index efb66e25fa2d..8ba3e8b9ad72 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -243,4 +243,6 @@ source "drivers/hte/Kconfig"
 
 source "drivers/cdx/Kconfig"
 
+source "drivers/dpll/Kconfig"
+
 endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index 1bec7819a837..722d15be0eb7 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -197,5 +197,6 @@ obj-$(CONFIG_PECI)		+= peci/
 obj-$(CONFIG_HTE)		+= hte/
 obj-$(CONFIG_DRM_ACCEL)		+= accel/
 obj-$(CONFIG_CDX_BUS)		+= cdx/
+obj-$(CONFIG_DPLL)		+= dpll/
 
 obj-$(CONFIG_S390)		+= s390/
diff --git a/drivers/dpll/Kconfig b/drivers/dpll/Kconfig
new file mode 100644
index 000000000000..a4cae73f20d3
--- /dev/null
+++ b/drivers/dpll/Kconfig
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Generic DPLL drivers configuration
+#
+
+config DPLL
+  bool
diff --git a/drivers/dpll/Makefile b/drivers/dpll/Makefile
new file mode 100644
index 000000000000..2e5b27850110
--- /dev/null
+++ b/drivers/dpll/Makefile
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for DPLL drivers.
+#
+
+obj-$(CONFIG_DPLL)      += dpll.o
+dpll-y                  += dpll_core.o
+dpll-y                  += dpll_netlink.o
+dpll-y                  += dpll_nl.o
diff --git a/drivers/dpll/dpll_core.c b/drivers/dpll/dpll_core.c
new file mode 100644
index 000000000000..6449ba6a383b
--- /dev/null
+++ b/drivers/dpll/dpll_core.c
@@ -0,0 +1,789 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *  dpll_core.c - DPLL subsystem kernel-space interface implementation.
+ *
+ *  Copyright (c) 2023 Meta Platforms, Inc. and affiliates
+ *  Copyright (c) 2023 Intel Corporation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include "dpll_core.h"
+
+/* Mutex lock to protect DPLL subsystem devices and pins */
+DEFINE_MUTEX(dpll_lock);
+
+DEFINE_XARRAY_FLAGS(dpll_device_xa, XA_FLAGS_ALLOC);
+DEFINE_XARRAY_FLAGS(dpll_pin_xa, XA_FLAGS_ALLOC);
+
+static u32 dpll_xa_id;
+
+#define ASSERT_DPLL_REGISTERED(d)	\
+	WARN_ON_ONCE(!xa_get_mark(&dpll_device_xa, (d)->id, DPLL_REGISTERED))
+#define ASSERT_DPLL_NOT_REGISTERED(d)	\
+	WARN_ON_ONCE(xa_get_mark(&dpll_device_xa, (d)->id, DPLL_REGISTERED))
+#define ASSERT_PIN_REGISTERED(p)	\
+	WARN_ON_ONCE(!xa_get_mark(&dpll_pin_xa, (p)->id, DPLL_REGISTERED))
+
+struct dpll_device_registration {
+	struct list_head list;
+	const struct dpll_device_ops *ops;
+	void *priv;
+};
+
+struct dpll_pin_registration {
+	struct list_head list;
+	const struct dpll_pin_ops *ops;
+	void *priv;
+};
+
+struct dpll_device *dpll_device_get_by_id(int id)
+{
+	if (xa_get_mark(&dpll_device_xa, id, DPLL_REGISTERED))
+		return xa_load(&dpll_device_xa, id);
+
+	return NULL;
+}
+
+static struct dpll_pin_registration *
+dpll_pin_registration_find(struct dpll_pin_ref *ref,
+			   const struct dpll_pin_ops *ops, void *priv)
+{
+	struct dpll_pin_registration *reg;
+
+	list_for_each_entry(reg, &ref->registration_list, list) {
+		if (reg->ops == ops && reg->priv == priv)
+			return reg;
+	}
+	return NULL;
+}
+
+static int
+dpll_xa_ref_pin_add(struct xarray *xa_pins, struct dpll_pin *pin,
+		    const struct dpll_pin_ops *ops, void *priv)
+{
+	struct dpll_pin_registration *reg;
+	struct dpll_pin_ref *ref;
+	bool ref_exists = false;
+	unsigned long i;
+	int ret;
+
+	xa_for_each(xa_pins, i, ref) {
+		if (ref->pin != pin)
+			continue;
+		reg = dpll_pin_registration_find(ref, ops, priv);
+		if (reg) {
+			refcount_inc(&ref->refcount);
+			return 0;
+		}
+		ref_exists = true;
+		break;
+	}
+
+	if (!ref_exists) {
+		ref = kzalloc(sizeof(*ref), GFP_KERNEL);
+		if (!ref)
+			return -ENOMEM;
+		ref->pin = pin;
+		INIT_LIST_HEAD(&ref->registration_list);
+		ret = xa_insert(xa_pins, pin->pin_idx, ref, GFP_KERNEL);
+		if (ret) {
+			kfree(ref);
+			return ret;
+		}
+		refcount_set(&ref->refcount, 1);
+	}
+
+	reg = kzalloc(sizeof(*reg), GFP_KERNEL);
+	if (!reg) {
+		if (!ref_exists) {
+			xa_erase(xa_pins, pin->pin_idx);
+			kfree(ref);
+		}
+		return -ENOMEM;
+	}
+	reg->ops = ops;
+	reg->priv = priv;
+	if (ref_exists)
+		refcount_inc(&ref->refcount);
+	list_add_tail(&reg->list, &ref->registration_list);
+
+	return 0;
+}
+
+static int dpll_xa_ref_pin_del(struct xarray *xa_pins, struct dpll_pin *pin,
+			       const struct dpll_pin_ops *ops, void *priv)
+{
+	struct dpll_pin_registration *reg;
+	struct dpll_pin_ref *ref;
+	unsigned long i;
+
+	xa_for_each(xa_pins, i, ref) {
+		if (ref->pin != pin)
+			continue;
+		reg = dpll_pin_registration_find(ref, ops, priv);
+		if (WARN_ON(!reg))
+			return -EINVAL;
+		if (refcount_dec_and_test(&ref->refcount)) {
+			list_del(&reg->list);
+			kfree(reg);
+			xa_erase(xa_pins, i);
+			WARN_ON(!list_empty(&ref->registration_list));
+			kfree(ref);
+		}
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+static int
+dpll_xa_ref_dpll_add(struct xarray *xa_dplls, struct dpll_device *dpll,
+		     const struct dpll_pin_ops *ops, void *priv)
+{
+	struct dpll_pin_registration *reg;
+	struct dpll_pin_ref *ref;
+	bool ref_exists = false;
+	unsigned long i;
+	int ret;
+
+	xa_for_each(xa_dplls, i, ref) {
+		if (ref->dpll != dpll)
+			continue;
+		reg = dpll_pin_registration_find(ref, ops, priv);
+		if (reg) {
+			refcount_inc(&ref->refcount);
+			return 0;
+		}
+		ref_exists = true;
+		break;
+	}
+
+	if (!ref_exists) {
+		ref = kzalloc(sizeof(*ref), GFP_KERNEL);
+		if (!ref)
+			return -ENOMEM;
+		ref->dpll = dpll;
+		INIT_LIST_HEAD(&ref->registration_list);
+		ret = xa_insert(xa_dplls, dpll->id, ref, GFP_KERNEL);
+		if (ret) {
+			kfree(ref);
+			return ret;
+		}
+		refcount_set(&ref->refcount, 1);
+	}
+
+	reg = kzalloc(sizeof(*reg), GFP_KERNEL);
+	if (!reg) {
+		if (!ref_exists) {
+			xa_erase(xa_dplls, dpll->id);
+			kfree(ref);
+		}
+		return -ENOMEM;
+	}
+	reg->ops = ops;
+	reg->priv = priv;
+	if (ref_exists)
+		refcount_inc(&ref->refcount);
+	list_add_tail(&reg->list, &ref->registration_list);
+
+	return 0;
+}
+
+static void
+dpll_xa_ref_dpll_del(struct xarray *xa_dplls, struct dpll_device *dpll,
+		     const struct dpll_pin_ops *ops, void *priv)
+{
+	struct dpll_pin_registration *reg;
+	struct dpll_pin_ref *ref;
+	unsigned long i;
+
+	xa_for_each(xa_dplls, i, ref) {
+		if (ref->dpll != dpll)
+			continue;
+		reg = dpll_pin_registration_find(ref, ops, priv);
+		if (WARN_ON(!reg))
+			return;
+		if (refcount_dec_and_test(&ref->refcount)) {
+			list_del(&reg->list);
+			kfree(reg);
+			xa_erase(xa_dplls, i);
+			WARN_ON(!list_empty(&ref->registration_list));
+			kfree(ref);
+		}
+		return;
+	}
+}
+
+struct dpll_pin_ref *dpll_xa_ref_dpll_first(struct xarray *xa_refs)
+{
+	struct dpll_pin_ref *ref;
+	unsigned long i = 0;
+
+	ref = xa_find(xa_refs, &i, ULONG_MAX, XA_PRESENT);
+	WARN_ON(!ref);
+	return ref;
+}
+
+static struct dpll_device *
+dpll_device_alloc(const u64 clock_id, u32 device_idx, struct module *module)
+{
+	struct dpll_device *dpll;
+	int ret;
+
+	dpll = kzalloc(sizeof(*dpll), GFP_KERNEL);
+	if (!dpll)
+		return ERR_PTR(-ENOMEM);
+	refcount_set(&dpll->refcount, 1);
+	INIT_LIST_HEAD(&dpll->registration_list);
+	dpll->device_idx = device_idx;
+	dpll->clock_id = clock_id;
+	dpll->module = module;
+	ret = xa_alloc_cyclic(&dpll_device_xa, &dpll->id, dpll, xa_limit_32b,
+			      &dpll_xa_id, GFP_KERNEL);
+	if (ret < 0) {
+		kfree(dpll);
+		return ERR_PTR(ret);
+	}
+	xa_init_flags(&dpll->pin_refs, XA_FLAGS_ALLOC);
+
+	return dpll;
+}
+
+/**
+ * dpll_device_get - find existing or create new dpll device
+ * @clock_id: clock_id of creator
+ * @device_idx: idx given by device driver
+ * @module: reference to registering module
+ *
+ * Get existing object of a dpll device, unique for given arguments.
+ * Create new if doesn't exist yet.
+ *
+ * Context: Acquires a lock (dpll_lock)
+ * Return:
+ * * valid dpll_device struct pointer if succeeded
+ * * ERR_PTR(X) - error
+ */
+struct dpll_device *
+dpll_device_get(u64 clock_id, u32 device_idx, struct module *module)
+{
+	struct dpll_device *dpll, *ret = NULL;
+	unsigned long index;
+
+	mutex_lock(&dpll_lock);
+	xa_for_each(&dpll_device_xa, index, dpll) {
+		if (dpll->clock_id == clock_id &&
+		    dpll->device_idx == device_idx &&
+		    dpll->module == module) {
+			ret = dpll;
+			refcount_inc(&ret->refcount);
+			break;
+		}
+	}
+	if (!ret)
+		ret = dpll_device_alloc(clock_id, device_idx, module);
+	mutex_unlock(&dpll_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dpll_device_get);
+
+/**
+ * dpll_device_put - decrease the refcount and free memory if possible
+ * @dpll: dpll_device struct pointer
+ *
+ * Context: Acquires a lock (dpll_lock)
+ * Drop reference for a dpll device, if all references are gone, delete
+ * dpll device object.
+ */
+void dpll_device_put(struct dpll_device *dpll)
+{
+	mutex_lock(&dpll_lock);
+	if (refcount_dec_and_test(&dpll->refcount)) {
+		ASSERT_DPLL_NOT_REGISTERED(dpll);
+		WARN_ON_ONCE(!xa_empty(&dpll->pin_refs));
+		xa_destroy(&dpll->pin_refs);
+		xa_erase(&dpll_device_xa, dpll->id);
+		WARN_ON(!list_empty(&dpll->registration_list));
+		kfree(dpll);
+	}
+	mutex_unlock(&dpll_lock);
+}
+EXPORT_SYMBOL_GPL(dpll_device_put);
+
+static struct dpll_device_registration *
+dpll_device_registration_find(struct dpll_device *dpll,
+			      const struct dpll_device_ops *ops, void *priv)
+{
+	struct dpll_device_registration *reg;
+
+	list_for_each_entry(reg, &dpll->registration_list, list) {
+		if (reg->ops == ops && reg->priv == priv)
+			return reg;
+	}
+	return NULL;
+}
+
+/**
+ * dpll_device_register - register the dpll device in the subsystem
+ * @dpll: pointer to a dpll
+ * @type: type of a dpll
+ * @ops: ops for a dpll device
+ * @priv: pointer to private information of owner
+ *
+ * Make dpll device available for user space.
+ *
+ * Context: Acquires a lock (dpll_lock)
+ * Return:
+ * * 0 on success
+ * * negative - error value
+ */
+int dpll_device_register(struct dpll_device *dpll, enum dpll_type type,
+			 const struct dpll_device_ops *ops, void *priv)
+{
+	struct dpll_device_registration *reg;
+	bool first_registration = false;
+
+	if (WARN_ON(!ops))
+		return -EINVAL;
+	if (WARN_ON(!ops->mode_get))
+		return -EINVAL;
+	if (WARN_ON(!ops->lock_status_get))
+		return -EINVAL;
+	if (WARN_ON(type < DPLL_TYPE_PPS || type > DPLL_TYPE_MAX))
+		return -EINVAL;
+
+	mutex_lock(&dpll_lock);
+	reg = dpll_device_registration_find(dpll, ops, priv);
+	if (reg) {
+		mutex_unlock(&dpll_lock);
+		return -EEXIST;
+	}
+
+	reg = kzalloc(sizeof(*reg), GFP_KERNEL);
+	if (!reg) {
+		mutex_unlock(&dpll_lock);
+		return -ENOMEM;
+	}
+	reg->ops = ops;
+	reg->priv = priv;
+	dpll->type = type;
+	first_registration = list_empty(&dpll->registration_list);
+	list_add_tail(&reg->list, &dpll->registration_list);
+	if (!first_registration) {
+		mutex_unlock(&dpll_lock);
+		return 0;
+	}
+
+	xa_set_mark(&dpll_device_xa, dpll->id, DPLL_REGISTERED);
+	mutex_unlock(&dpll_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dpll_device_register);
+
+/**
+ * dpll_device_unregister - unregister dpll device
+ * @dpll: registered dpll pointer
+ * @ops: ops for a dpll device
+ * @priv: pointer to private information of owner
+ *
+ * Unregister device, make it unavailable for userspace.
+ * Note: It does not free the memory
+ * Context: Acquires a lock (dpll_lock)
+ */
+void dpll_device_unregister(struct dpll_device *dpll,
+			    const struct dpll_device_ops *ops, void *priv)
+{
+	struct dpll_device_registration *reg;
+
+	mutex_lock(&dpll_lock);
+	ASSERT_DPLL_REGISTERED(dpll);
+	reg = dpll_device_registration_find(dpll, ops, priv);
+	if (WARN_ON(!reg)) {
+		mutex_unlock(&dpll_lock);
+		return;
+	}
+	list_del(&reg->list);
+	kfree(reg);
+
+	if (!list_empty(&dpll->registration_list)) {
+		mutex_unlock(&dpll_lock);
+		return;
+	}
+	xa_clear_mark(&dpll_device_xa, dpll->id, DPLL_REGISTERED);
+	mutex_unlock(&dpll_lock);
+}
+EXPORT_SYMBOL_GPL(dpll_device_unregister);
+
+static struct dpll_pin *
+dpll_pin_alloc(u64 clock_id, u32 pin_idx, struct module *module,
+	       const struct dpll_pin_properties *prop)
+{
+	struct dpll_pin *pin;
+	int ret;
+
+	pin = kzalloc(sizeof(*pin), GFP_KERNEL);
+	if (!pin)
+		return ERR_PTR(-ENOMEM);
+	pin->pin_idx = pin_idx;
+	pin->clock_id = clock_id;
+	pin->module = module;
+	if (WARN_ON(prop->type < DPLL_PIN_TYPE_MUX ||
+		    prop->type > DPLL_PIN_TYPE_MAX)) {
+		ret = -EINVAL;
+		goto err;
+	}
+	pin->prop = prop;
+	refcount_set(&pin->refcount, 1);
+	xa_init_flags(&pin->dpll_refs, XA_FLAGS_ALLOC);
+	xa_init_flags(&pin->parent_refs, XA_FLAGS_ALLOC);
+	ret = xa_alloc(&dpll_pin_xa, &pin->id, pin, xa_limit_16b, GFP_KERNEL);
+	if (ret)
+		goto err;
+	return pin;
+err:
+	xa_destroy(&pin->dpll_refs);
+	xa_destroy(&pin->parent_refs);
+	kfree(pin);
+	return ERR_PTR(ret);
+}
+
+/**
+ * dpll_pin_get - find existing or create new dpll pin
+ * @clock_id: clock_id of creator
+ * @pin_idx: idx given by dev driver
+ * @module: reference to registering module
+ * @prop: dpll pin properties
+ *
+ * Get existing object of a pin (unique for given arguments) or create new
+ * if doesn't exist yet.
+ *
+ * Context: Acquires a lock (dpll_lock)
+ * Return:
+ * * valid allocated dpll_pin struct pointer if succeeded
+ * * ERR_PTR(X) - error
+ */
+struct dpll_pin *
+dpll_pin_get(u64 clock_id, u32 pin_idx, struct module *module,
+	     const struct dpll_pin_properties *prop)
+{
+	struct dpll_pin *pos, *ret = NULL;
+	unsigned long i;
+
+	mutex_lock(&dpll_lock);
+	xa_for_each(&dpll_pin_xa, i, pos) {
+		if (pos->clock_id == clock_id &&
+		    pos->pin_idx == pin_idx &&
+		    pos->module == module) {
+			ret = pos;
+			refcount_inc(&ret->refcount);
+			break;
+		}
+	}
+	if (!ret)
+		ret = dpll_pin_alloc(clock_id, pin_idx, module, prop);
+	mutex_unlock(&dpll_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dpll_pin_get);
+
+/**
+ * dpll_pin_put - decrease the refcount and free memory if possible
+ * @pin: pointer to a pin to be put
+ *
+ * Drop reference for a pin, if all references are gone, delete pin object.
+ *
+ * Context: Acquires a lock (dpll_lock)
+ */
+void dpll_pin_put(struct dpll_pin *pin)
+{
+	mutex_lock(&dpll_lock);
+	if (refcount_dec_and_test(&pin->refcount)) {
+		xa_destroy(&pin->dpll_refs);
+		xa_destroy(&pin->parent_refs);
+		xa_erase(&dpll_pin_xa, pin->id);
+		kfree(pin);
+	}
+	mutex_unlock(&dpll_lock);
+}
+EXPORT_SYMBOL_GPL(dpll_pin_put);
+
+static int
+__dpll_pin_register(struct dpll_device *dpll, struct dpll_pin *pin,
+		    const struct dpll_pin_ops *ops, void *priv)
+{
+	int ret;
+
+	ret = dpll_xa_ref_pin_add(&dpll->pin_refs, pin, ops, priv);
+	if (ret)
+		return ret;
+	ret = dpll_xa_ref_dpll_add(&pin->dpll_refs, dpll, ops, priv);
+	if (ret)
+		goto ref_pin_del;
+	xa_set_mark(&dpll_pin_xa, pin->id, DPLL_REGISTERED);
+
+	return ret;
+
+ref_pin_del:
+	dpll_xa_ref_pin_del(&dpll->pin_refs, pin, ops, priv);
+	return ret;
+}
+
+/**
+ * dpll_pin_register - register the dpll pin in the subsystem
+ * @dpll: pointer to a dpll
+ * @pin: pointer to a dpll pin
+ * @ops: ops for a dpll pin ops
+ * @priv: pointer to private information of owner
+ *
+ * Context: Acquires a lock (dpll_lock)
+ * Return:
+ * * 0 on success
+ * * negative - error value
+ */
+int
+dpll_pin_register(struct dpll_device *dpll, struct dpll_pin *pin,
+		  const struct dpll_pin_ops *ops, void *priv)
+{
+	int ret;
+
+	if (WARN_ON(!ops) ||
+	    WARN_ON(!ops->state_on_dpll_get) ||
+	    WARN_ON(!ops->direction_get))
+		return -EINVAL;
+	if (ASSERT_DPLL_REGISTERED(dpll))
+		return -EINVAL;
+
+	mutex_lock(&dpll_lock);
+	if (WARN_ON(!(dpll->module == pin->module &&
+		      dpll->clock_id == pin->clock_id)))
+		ret = -EINVAL;
+	else
+		ret = __dpll_pin_register(dpll, pin, ops, priv);
+	mutex_unlock(&dpll_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dpll_pin_register);
+
+static void
+__dpll_pin_unregister(struct dpll_device *dpll, struct dpll_pin *pin,
+		      const struct dpll_pin_ops *ops, void *priv)
+{
+	dpll_xa_ref_pin_del(&dpll->pin_refs, pin, ops, priv);
+	dpll_xa_ref_dpll_del(&pin->dpll_refs, dpll, ops, priv);
+	if (xa_empty(&pin->dpll_refs))
+		xa_clear_mark(&dpll_pin_xa, pin->id, DPLL_REGISTERED);
+}
+
+/**
+ * dpll_pin_unregister - unregister dpll pin from dpll device
+ * @dpll: registered dpll pointer
+ * @pin: pointer to a pin
+ * @ops: ops for a dpll pin
+ * @priv: pointer to private information of owner
+ *
+ * Note: It does not free the memory
+ * Context: Acquires a lock (dpll_lock)
+ */
+void dpll_pin_unregister(struct dpll_device *dpll, struct dpll_pin *pin,
+			 const struct dpll_pin_ops *ops, void *priv)
+{
+	if (WARN_ON(xa_empty(&dpll->pin_refs)))
+		return;
+	if (WARN_ON(!xa_empty(&pin->parent_refs)))
+		return;
+
+	mutex_lock(&dpll_lock);
+	__dpll_pin_unregister(dpll, pin, ops, priv);
+	mutex_unlock(&dpll_lock);
+}
+EXPORT_SYMBOL_GPL(dpll_pin_unregister);
+
+/**
+ * dpll_pin_on_pin_register - register a pin with a parent pin
+ * @parent: pointer to a parent pin
+ * @pin: pointer to a pin
+ * @ops: ops for a dpll pin
+ * @priv: pointer to private information of owner
+ *
+ * Register a pin with a parent pin, create references between them and
+ * between newly registered pin and dplls connected with a parent pin.
+ *
+ * Context: Acquires a lock (dpll_lock)
+ * Return:
+ * * 0 on success
+ * * negative - error value
+ */
+int dpll_pin_on_pin_register(struct dpll_pin *parent, struct dpll_pin *pin,
+			     const struct dpll_pin_ops *ops, void *priv)
+{
+	struct dpll_pin_ref *ref;
+	unsigned long i, stop;
+	int ret;
+
+	if (WARN_ON(parent->prop->type != DPLL_PIN_TYPE_MUX))
+		return -EINVAL;
+
+	if (WARN_ON(!ops) ||
+	    WARN_ON(!ops->state_on_pin_get) ||
+	    WARN_ON(!ops->direction_get))
+		return -EINVAL;
+	if (ASSERT_PIN_REGISTERED(parent))
+		return -EINVAL;
+
+	mutex_lock(&dpll_lock);
+	ret = dpll_xa_ref_pin_add(&pin->parent_refs, parent, ops, priv);
+	if (ret)
+		goto unlock;
+	refcount_inc(&pin->refcount);
+	xa_for_each(&parent->dpll_refs, i, ref) {
+		ret = __dpll_pin_register(ref->dpll, pin, ops, priv);
+		if (ret) {
+			stop = i;
+			goto dpll_unregister;
+		}
+	}
+	mutex_unlock(&dpll_lock);
+
+	return ret;
+
+dpll_unregister:
+	xa_for_each(&parent->dpll_refs, i, ref)
+		if (i < stop)
+			__dpll_pin_unregister(ref->dpll, pin, ops, priv);
+	refcount_dec(&pin->refcount);
+	dpll_xa_ref_pin_del(&pin->parent_refs, parent, ops, priv);
+unlock:
+	mutex_unlock(&dpll_lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dpll_pin_on_pin_register);
+
+/**
+ * dpll_pin_on_pin_unregister - unregister dpll pin from a parent pin
+ * @parent: pointer to a parent pin
+ * @pin: pointer to a pin
+ * @ops: ops for a dpll pin
+ * @priv: pointer to private information of owner
+ *
+ * Context: Acquires a lock (dpll_lock)
+ * Note: It does not free the memory
+ */
+void dpll_pin_on_pin_unregister(struct dpll_pin *parent, struct dpll_pin *pin,
+				const struct dpll_pin_ops *ops, void *priv)
+{
+	struct dpll_pin_ref *ref;
+	unsigned long i;
+
+	mutex_lock(&dpll_lock);
+	dpll_xa_ref_pin_del(&pin->parent_refs, parent, ops, priv);
+	refcount_dec(&pin->refcount);
+	xa_for_each(&pin->dpll_refs, i, ref)
+		__dpll_pin_unregister(ref->dpll, pin, ops, priv);
+	mutex_unlock(&dpll_lock);
+}
+EXPORT_SYMBOL_GPL(dpll_pin_on_pin_unregister);
+
+static struct dpll_device_registration *
+dpll_device_registration_first(struct dpll_device *dpll)
+{
+	struct dpll_device_registration *reg;
+
+	reg = list_first_entry_or_null((struct list_head *)&dpll->registration_list,
+				       struct dpll_device_registration, list);
+	WARN_ON(!reg);
+	return reg;
+}
+
+void *dpll_priv(struct dpll_device *dpll)
+{
+	struct dpll_device_registration *reg;
+
+	reg = dpll_device_registration_first(dpll);
+	return reg->priv;
+}
+
+const struct dpll_device_ops *dpll_device_ops(struct dpll_device *dpll)
+{
+	struct dpll_device_registration *reg;
+
+	reg = dpll_device_registration_first(dpll);
+	return reg->ops;
+}
+
+static struct dpll_pin_registration *
+dpll_pin_registration_first(struct dpll_pin_ref *ref)
+{
+	struct dpll_pin_registration *reg;
+
+	reg = list_first_entry_or_null(&ref->registration_list,
+				       struct dpll_pin_registration, list);
+	WARN_ON(!reg);
+	return reg;
+}
+
+void *dpll_pin_on_dpll_priv(struct dpll_device *dpll,
+			    struct dpll_pin *pin)
+{
+	struct dpll_pin_registration *reg;
+	struct dpll_pin_ref *ref;
+
+	ref = xa_load(&dpll->pin_refs, pin->pin_idx);
+	if (!ref)
+		return NULL;
+	reg = dpll_pin_registration_first(ref);
+	return reg->priv;
+}
+
+void *dpll_pin_on_pin_priv(struct dpll_pin *parent,
+			   struct dpll_pin *pin)
+{
+	struct dpll_pin_registration *reg;
+	struct dpll_pin_ref *ref;
+
+	ref = xa_load(&pin->parent_refs, parent->pin_idx);
+	if (!ref)
+		return NULL;
+	reg = dpll_pin_registration_first(ref);
+	return reg->priv;
+}
+
+const struct dpll_pin_ops *dpll_pin_ops(struct dpll_pin_ref *ref)
+{
+	struct dpll_pin_registration *reg;
+
+	reg = dpll_pin_registration_first(ref);
+	return reg->ops;
+}
+
+static int __init dpll_init(void)
+{
+	int ret;
+
+	ret = genl_register_family(&dpll_nl_family);
+	if (ret)
+		goto error;
+
+	return 0;
+
+error:
+	mutex_destroy(&dpll_lock);
+	return ret;
+}
+
+static void __exit dpll_exit(void)
+{
+	genl_unregister_family(&dpll_nl_family);
+	mutex_destroy(&dpll_lock);
+}
+
+subsys_initcall(dpll_init);
+module_exit(dpll_exit);
diff --git a/drivers/dpll/dpll_core.h b/drivers/dpll/dpll_core.h
new file mode 100644
index 000000000000..5585873c5c1b
--- /dev/null
+++ b/drivers/dpll/dpll_core.h
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ *  Copyright (c) 2023 Meta Platforms, Inc. and affiliates
+ *  Copyright (c) 2023 Intel and affiliates
+ */
+
+#ifndef __DPLL_CORE_H__
+#define __DPLL_CORE_H__
+
+#include <linux/dpll.h>
+#include <linux/list.h>
+#include <linux/refcount.h>
+#include "dpll_nl.h"
+
+#define DPLL_REGISTERED		XA_MARK_1
+
+/**
+ * struct dpll_device - stores DPLL device internal data
+ * @id:			unique id number for device given by dpll subsystem
+ * @device_idx:		id given by dev driver
+ * @clock_id:		unique identifier (clock_id) of a dpll
+ * @module:		module of creator
+ * @type:		type of a dpll
+ * @pin_refs:		stores pins registered within a dpll
+ * @refcount:		refcount
+ * @registration_list:	list of registered ops and priv data of dpll owners
+ **/
+struct dpll_device {
+	u32 id;
+	u32 device_idx;
+	u64 clock_id;
+	struct module *module;
+	enum dpll_type type;
+	struct xarray pin_refs;
+	refcount_t refcount;
+	struct list_head registration_list;
+};
+
+/**
+ * struct dpll_pin - structure for a dpll pin
+ * @id:			unique id number for pin given by dpll subsystem
+ * @pin_idx:		index of a pin given by dev driver
+ * @clock_id:		clock_id of creator
+ * @module:		module of creator
+ * @dpll_refs:		hold referencees to dplls pin was registered with
+ * @parent_refs:	hold references to parent pins pin was registered with
+ * @prop:		pointer to pin properties given by registerer
+ * @rclk_dev_name:	holds name of device when pin can recover clock from it
+ * @refcount:		refcount
+ **/
+struct dpll_pin {
+	u32 id;
+	u32 pin_idx;
+	u64 clock_id;
+	struct module *module;
+	struct xarray dpll_refs;
+	struct xarray parent_refs;
+	const struct dpll_pin_properties *prop;
+	refcount_t refcount;
+};
+
+/**
+ * struct dpll_pin_ref - structure for referencing either dpll or pins
+ * @dpll:		pointer to a dpll
+ * @pin:		pointer to a pin
+ * @registration_list:	list of ops and priv data registered with the ref
+ * @refcount:		refcount
+ **/
+struct dpll_pin_ref {
+	union {
+		struct dpll_device *dpll;
+		struct dpll_pin *pin;
+	};
+	struct list_head registration_list;
+	refcount_t refcount;
+};
+
+void *dpll_priv(struct dpll_device *dpll);
+void *dpll_pin_on_dpll_priv(struct dpll_device *dpll, struct dpll_pin *pin);
+void *dpll_pin_on_pin_priv(struct dpll_pin *parent, struct dpll_pin *pin);
+
+const struct dpll_device_ops *dpll_device_ops(struct dpll_device *dpll);
+struct dpll_device *dpll_device_get_by_id(int id);
+const struct dpll_pin_ops *dpll_pin_ops(struct dpll_pin_ref *ref);
+struct dpll_pin_ref *dpll_xa_ref_dpll_first(struct xarray *xa_refs);
+extern struct xarray dpll_device_xa;
+extern struct xarray dpll_pin_xa;
+extern struct mutex dpll_lock;
+#endif
diff --git a/include/linux/dpll.h b/include/linux/dpll.h
new file mode 100644
index 000000000000..b47c3560b937
--- /dev/null
+++ b/include/linux/dpll.h
@@ -0,0 +1,133 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ *  Copyright (c) 2023 Meta Platforms, Inc. and affiliates
+ *  Copyright (c) 2023 Intel and affiliates
+ */
+
+#ifndef __DPLL_H__
+#define __DPLL_H__
+
+#include <uapi/linux/dpll.h>
+#include <linux/device.h>
+#include <linux/netlink.h>
+
+struct dpll_device;
+struct dpll_pin;
+
+struct dpll_device_ops {
+	int (*mode_get)(const struct dpll_device *dpll, void *dpll_priv,
+			enum dpll_mode *mode, struct netlink_ext_ack *extack);
+	bool (*mode_supported)(const struct dpll_device *dpll, void *dpll_priv,
+			       const enum dpll_mode mode,
+			       struct netlink_ext_ack *extack);
+	int (*lock_status_get)(const struct dpll_device *dpll, void *dpll_priv,
+			       enum dpll_lock_status *status,
+			       struct netlink_ext_ack *extack);
+	int (*temp_get)(const struct dpll_device *dpll, void *dpll_priv,
+			s32 *temp, struct netlink_ext_ack *extack);
+};
+
+struct dpll_pin_ops {
+	int (*frequency_set)(const struct dpll_pin *pin, void *pin_priv,
+			     const struct dpll_device *dpll, void *dpll_priv,
+			     const u64 frequency,
+			     struct netlink_ext_ack *extack);
+	int (*frequency_get)(const struct dpll_pin *pin, void *pin_priv,
+			     const struct dpll_device *dpll, void *dpll_priv,
+			     u64 *frequency, struct netlink_ext_ack *extack);
+	int (*direction_set)(const struct dpll_pin *pin, void *pin_priv,
+			     const struct dpll_device *dpll, void *dpll_priv,
+			     const enum dpll_pin_direction direction,
+			     struct netlink_ext_ack *extack);
+	int (*direction_get)(const struct dpll_pin *pin, void *pin_priv,
+			     const struct dpll_device *dpll, void *dpll_priv,
+			     enum dpll_pin_direction *direction,
+			     struct netlink_ext_ack *extack);
+	int (*state_on_pin_get)(const struct dpll_pin *pin, void *pin_priv,
+				const struct dpll_pin *parent_pin,
+				void *parent_pin_priv,
+				enum dpll_pin_state *state,
+				struct netlink_ext_ack *extack);
+	int (*state_on_dpll_get)(const struct dpll_pin *pin, void *pin_priv,
+				 const struct dpll_device *dpll,
+				 void *dpll_priv, enum dpll_pin_state *state,
+				 struct netlink_ext_ack *extack);
+	int (*state_on_pin_set)(const struct dpll_pin *pin, void *pin_priv,
+				const struct dpll_pin *parent_pin,
+				void *parent_pin_priv,
+				const enum dpll_pin_state state,
+				struct netlink_ext_ack *extack);
+	int (*state_on_dpll_set)(const struct dpll_pin *pin, void *pin_priv,
+				 const struct dpll_device *dpll,
+				 void *dpll_priv,
+				 const enum dpll_pin_state state,
+				 struct netlink_ext_ack *extack);
+	int (*prio_get)(const struct dpll_pin *pin,  void *pin_priv,
+			const struct dpll_device *dpll,  void *dpll_priv,
+			u32 *prio, struct netlink_ext_ack *extack);
+	int (*prio_set)(const struct dpll_pin *pin, void *pin_priv,
+			const struct dpll_device *dpll, void *dpll_priv,
+			const u32 prio, struct netlink_ext_ack *extack);
+};
+
+struct dpll_pin_frequency {
+	u64 min;
+	u64 max;
+};
+
+#define DPLL_PIN_FREQUENCY_RANGE(_min, _max)	\
+	{					\
+		.min = _min,			\
+		.max = _max,			\
+	}
+
+#define DPLL_PIN_FREQUENCY(_val) DPLL_PIN_FREQUENCY_RANGE(_val, _val)
+#define DPLL_PIN_FREQUENCY_1PPS \
+	DPLL_PIN_FREQUENCY(DPLL_PIN_FREQUENCY_1_HZ)
+#define DPLL_PIN_FREQUENCY_10MHZ \
+	DPLL_PIN_FREQUENCY(DPLL_PIN_FREQUENCY_10_MHZ)
+#define DPLL_PIN_FREQUENCY_IRIG_B \
+	DPLL_PIN_FREQUENCY(DPLL_PIN_FREQUENCY_10_KHZ)
+#define DPLL_PIN_FREQUENCY_DCF77 \
+	DPLL_PIN_FREQUENCY(DPLL_PIN_FREQUENCY_77_5_KHZ)
+
+struct dpll_pin_properties {
+	const char *board_label;
+	const char *panel_label;
+	const char *package_label;
+	enum dpll_pin_type type;
+	unsigned long capabilities;
+	u32 freq_supported_num;
+	struct dpll_pin_frequency *freq_supported;
+};
+
+struct dpll_device *
+dpll_device_get(u64 clock_id, u32 dev_driver_id, struct module *module);
+
+void dpll_device_put(struct dpll_device *dpll);
+
+int dpll_device_register(struct dpll_device *dpll, enum dpll_type type,
+			 const struct dpll_device_ops *ops, void *priv);
+
+void dpll_device_unregister(struct dpll_device *dpll,
+			    const struct dpll_device_ops *ops, void *priv);
+
+struct dpll_pin *
+dpll_pin_get(u64 clock_id, u32 dev_driver_id, struct module *module,
+	     const struct dpll_pin_properties *prop);
+
+int dpll_pin_register(struct dpll_device *dpll, struct dpll_pin *pin,
+		      const struct dpll_pin_ops *ops, void *priv);
+
+void dpll_pin_unregister(struct dpll_device *dpll, struct dpll_pin *pin,
+			 const struct dpll_pin_ops *ops, void *priv);
+
+void dpll_pin_put(struct dpll_pin *pin);
+
+int dpll_pin_on_pin_register(struct dpll_pin *parent, struct dpll_pin *pin,
+			     const struct dpll_pin_ops *ops, void *priv);
+
+void dpll_pin_on_pin_unregister(struct dpll_pin *parent, struct dpll_pin *pin,
+				const struct dpll_pin_ops *ops, void *priv);
+
+#endif
-- 
cgit v1.2.3-70-g09d2


From 9d71b54b65b1fb6c0d3a6c5c88ba9b915c783fbc Mon Sep 17 00:00:00 2001
From: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Date: Wed, 13 Sep 2023 21:49:38 +0100
Subject: dpll: netlink: Add DPLL framework base functions

DPLL framework is used to represent and configure DPLL devices
in systems. Each device that has DPLL and can configure inputs
and outputs can use this framework.

Implement dpll netlink framework functions for enablement of dpll
subsystem netlink family.

Co-developed-by: Milena Olech <milena.olech@intel.com>
Signed-off-by: Milena Olech <milena.olech@intel.com>
Co-developed-by: Michal Michalik <michal.michalik@intel.com>
Signed-off-by: Michal Michalik <michal.michalik@intel.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Co-developed-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/dpll/dpll_core.c    |   11 +-
 drivers/dpll/dpll_netlink.c | 1241 +++++++++++++++++++++++++++++++++++++++++++
 drivers/dpll/dpll_netlink.h |   13 +
 include/linux/dpll.h        |    4 +
 4 files changed, 1268 insertions(+), 1 deletion(-)
 create mode 100644 drivers/dpll/dpll_netlink.c
 create mode 100644 drivers/dpll/dpll_netlink.h

(limited to 'include')

diff --git a/drivers/dpll/dpll_core.c b/drivers/dpll/dpll_core.c
index 6449ba6a383b..3568149b9562 100644
--- a/drivers/dpll/dpll_core.c
+++ b/drivers/dpll/dpll_core.c
@@ -14,6 +14,7 @@
 #include <linux/string.h>
 
 #include "dpll_core.h"
+#include "dpll_netlink.h"
 
 /* Mutex lock to protect DPLL subsystem devices and pins */
 DEFINE_MUTEX(dpll_lock);
@@ -381,6 +382,7 @@ int dpll_device_register(struct dpll_device *dpll, enum dpll_type type,
 	}
 
 	xa_set_mark(&dpll_device_xa, dpll->id, DPLL_REGISTERED);
+	dpll_device_create_ntf(dpll);
 	mutex_unlock(&dpll_lock);
 
 	return 0;
@@ -404,6 +406,7 @@ void dpll_device_unregister(struct dpll_device *dpll,
 
 	mutex_lock(&dpll_lock);
 	ASSERT_DPLL_REGISTERED(dpll);
+	dpll_device_delete_ntf(dpll);
 	reg = dpll_device_registration_find(dpll, ops, priv);
 	if (WARN_ON(!reg)) {
 		mutex_unlock(&dpll_lock);
@@ -528,6 +531,7 @@ __dpll_pin_register(struct dpll_device *dpll, struct dpll_pin *pin,
 	if (ret)
 		goto ref_pin_del;
 	xa_set_mark(&dpll_pin_xa, pin->id, DPLL_REGISTERED);
+	dpll_pin_create_ntf(pin);
 
 	return ret;
 
@@ -602,6 +606,7 @@ void dpll_pin_unregister(struct dpll_device *dpll, struct dpll_pin *pin,
 		return;
 
 	mutex_lock(&dpll_lock);
+	dpll_pin_delete_ntf(pin);
 	__dpll_pin_unregister(dpll, pin, ops, priv);
 	mutex_unlock(&dpll_lock);
 }
@@ -650,6 +655,7 @@ int dpll_pin_on_pin_register(struct dpll_pin *parent, struct dpll_pin *pin,
 			stop = i;
 			goto dpll_unregister;
 		}
+		dpll_pin_create_ntf(pin);
 	}
 	mutex_unlock(&dpll_lock);
 
@@ -657,8 +663,10 @@ int dpll_pin_on_pin_register(struct dpll_pin *parent, struct dpll_pin *pin,
 
 dpll_unregister:
 	xa_for_each(&parent->dpll_refs, i, ref)
-		if (i < stop)
+		if (i < stop) {
 			__dpll_pin_unregister(ref->dpll, pin, ops, priv);
+			dpll_pin_delete_ntf(pin);
+		}
 	refcount_dec(&pin->refcount);
 	dpll_xa_ref_pin_del(&pin->parent_refs, parent, ops, priv);
 unlock:
@@ -684,6 +692,7 @@ void dpll_pin_on_pin_unregister(struct dpll_pin *parent, struct dpll_pin *pin,
 	unsigned long i;
 
 	mutex_lock(&dpll_lock);
+	dpll_pin_delete_ntf(pin);
 	dpll_xa_ref_pin_del(&pin->parent_refs, parent, ops, priv);
 	refcount_dec(&pin->refcount);
 	xa_for_each(&pin->dpll_refs, i, ref)
diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c
new file mode 100644
index 000000000000..9464a6865977
--- /dev/null
+++ b/drivers/dpll/dpll_netlink.c
@@ -0,0 +1,1241 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generic netlink for DPLL management framework
+ *
+ *  Copyright (c) 2023 Meta Platforms, Inc. and affiliates
+ *  Copyright (c) 2023 Intel and affiliates
+ *
+ */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <net/genetlink.h>
+#include "dpll_core.h"
+#include "dpll_netlink.h"
+#include "dpll_nl.h"
+#include <uapi/linux/dpll.h>
+
+#define ASSERT_NOT_NULL(ptr)	(WARN_ON(!ptr))
+
+#define xa_for_each_marked_start(xa, index, entry, filter, start) \
+	for (index = start, entry = xa_find(xa, &index, ULONG_MAX, filter); \
+	     entry; entry = xa_find_after(xa, &index, ULONG_MAX, filter))
+
+struct dpll_dump_ctx {
+	unsigned long idx;
+};
+
+static struct dpll_dump_ctx *dpll_dump_context(struct netlink_callback *cb)
+{
+	return (struct dpll_dump_ctx *)cb->ctx;
+}
+
+static int
+dpll_msg_add_dev_handle(struct sk_buff *msg, struct dpll_device *dpll)
+{
+	if (nla_put_u32(msg, DPLL_A_ID, dpll->id))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int
+dpll_msg_add_dev_parent_handle(struct sk_buff *msg, u32 id)
+{
+	if (nla_put_u32(msg, DPLL_A_PIN_PARENT_ID, id))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+/**
+ * dpll_msg_add_pin_handle - attach pin handle attribute to a given message
+ * @msg: pointer to sk_buff message to attach a pin handle
+ * @pin: pin pointer
+ *
+ * Return:
+ * * 0 - success
+ * * -EMSGSIZE - no space in message to attach pin handle
+ */
+static int
+dpll_msg_add_pin_handle(struct sk_buff *msg, struct dpll_pin *pin)
+{
+	if (!pin)
+		return 0;
+	if (nla_put_u32(msg, DPLL_A_PIN_ID, pin->id))
+		return -EMSGSIZE;
+	return 0;
+}
+
+static int
+dpll_msg_add_mode(struct sk_buff *msg, struct dpll_device *dpll,
+		  struct netlink_ext_ack *extack)
+{
+	const struct dpll_device_ops *ops = dpll_device_ops(dpll);
+	enum dpll_mode mode;
+	int ret;
+
+	ret = ops->mode_get(dpll, dpll_priv(dpll), &mode, extack);
+	if (ret)
+		return ret;
+	if (nla_put_u32(msg, DPLL_A_MODE, mode))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int
+dpll_msg_add_mode_supported(struct sk_buff *msg, struct dpll_device *dpll,
+			    struct netlink_ext_ack *extack)
+{
+	const struct dpll_device_ops *ops = dpll_device_ops(dpll);
+	enum dpll_mode mode;
+
+	if (!ops->mode_supported)
+		return 0;
+	for (mode = DPLL_MODE_MANUAL; mode <= DPLL_MODE_MAX; mode++)
+		if (ops->mode_supported(dpll, dpll_priv(dpll), mode, extack))
+			if (nla_put_u32(msg, DPLL_A_MODE_SUPPORTED, mode))
+				return -EMSGSIZE;
+
+	return 0;
+}
+
+static int
+dpll_msg_add_lock_status(struct sk_buff *msg, struct dpll_device *dpll,
+			 struct netlink_ext_ack *extack)
+{
+	const struct dpll_device_ops *ops = dpll_device_ops(dpll);
+	enum dpll_lock_status status;
+	int ret;
+
+	ret = ops->lock_status_get(dpll, dpll_priv(dpll), &status, extack);
+	if (ret)
+		return ret;
+	if (nla_put_u32(msg, DPLL_A_LOCK_STATUS, status))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int
+dpll_msg_add_temp(struct sk_buff *msg, struct dpll_device *dpll,
+		  struct netlink_ext_ack *extack)
+{
+	const struct dpll_device_ops *ops = dpll_device_ops(dpll);
+	s32 temp;
+	int ret;
+
+	if (!ops->temp_get)
+		return 0;
+	ret = ops->temp_get(dpll, dpll_priv(dpll), &temp, extack);
+	if (ret)
+		return ret;
+	if (nla_put_s32(msg, DPLL_A_TEMP, temp))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int
+dpll_msg_add_pin_prio(struct sk_buff *msg, struct dpll_pin *pin,
+		      struct dpll_pin_ref *ref,
+		      struct netlink_ext_ack *extack)
+{
+	const struct dpll_pin_ops *ops = dpll_pin_ops(ref);
+	struct dpll_device *dpll = ref->dpll;
+	u32 prio;
+	int ret;
+
+	if (!ops->prio_get)
+		return 0;
+	ret = ops->prio_get(pin, dpll_pin_on_dpll_priv(dpll, pin), dpll,
+			    dpll_priv(dpll), &prio, extack);
+	if (ret)
+		return ret;
+	if (nla_put_u32(msg, DPLL_A_PIN_PRIO, prio))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int
+dpll_msg_add_pin_on_dpll_state(struct sk_buff *msg, struct dpll_pin *pin,
+			       struct dpll_pin_ref *ref,
+			       struct netlink_ext_ack *extack)
+{
+	const struct dpll_pin_ops *ops = dpll_pin_ops(ref);
+	struct dpll_device *dpll = ref->dpll;
+	enum dpll_pin_state state;
+	int ret;
+
+	if (!ops->state_on_dpll_get)
+		return 0;
+	ret = ops->state_on_dpll_get(pin, dpll_pin_on_dpll_priv(dpll, pin),
+				     dpll, dpll_priv(dpll), &state, extack);
+	if (ret)
+		return ret;
+	if (nla_put_u32(msg, DPLL_A_PIN_STATE, state))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int
+dpll_msg_add_pin_direction(struct sk_buff *msg, struct dpll_pin *pin,
+			   struct dpll_pin_ref *ref,
+			   struct netlink_ext_ack *extack)
+{
+	const struct dpll_pin_ops *ops = dpll_pin_ops(ref);
+	struct dpll_device *dpll = ref->dpll;
+	enum dpll_pin_direction direction;
+	int ret;
+
+	ret = ops->direction_get(pin, dpll_pin_on_dpll_priv(dpll, pin), dpll,
+				 dpll_priv(dpll), &direction, extack);
+	if (ret)
+		return ret;
+	if (nla_put_u32(msg, DPLL_A_PIN_DIRECTION, direction))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int
+dpll_msg_add_pin_freq(struct sk_buff *msg, struct dpll_pin *pin,
+		      struct dpll_pin_ref *ref, struct netlink_ext_ack *extack)
+{
+	const struct dpll_pin_ops *ops = dpll_pin_ops(ref);
+	struct dpll_device *dpll = ref->dpll;
+	struct nlattr *nest;
+	int fs, ret;
+	u64 freq;
+
+	if (!ops->frequency_get)
+		return 0;
+	ret = ops->frequency_get(pin, dpll_pin_on_dpll_priv(dpll, pin), dpll,
+				 dpll_priv(dpll), &freq, extack);
+	if (ret)
+		return ret;
+	if (nla_put_64bit(msg, DPLL_A_PIN_FREQUENCY, sizeof(freq), &freq,
+			  DPLL_A_PIN_PAD))
+		return -EMSGSIZE;
+	for (fs = 0; fs < pin->prop->freq_supported_num; fs++) {
+		nest = nla_nest_start(msg, DPLL_A_PIN_FREQUENCY_SUPPORTED);
+		if (!nest)
+			return -EMSGSIZE;
+		freq = pin->prop->freq_supported[fs].min;
+		if (nla_put_64bit(msg, DPLL_A_PIN_FREQUENCY_MIN, sizeof(freq),
+				  &freq, DPLL_A_PIN_PAD)) {
+			nla_nest_cancel(msg, nest);
+			return -EMSGSIZE;
+		}
+		freq = pin->prop->freq_supported[fs].max;
+		if (nla_put_64bit(msg, DPLL_A_PIN_FREQUENCY_MAX, sizeof(freq),
+				  &freq, DPLL_A_PIN_PAD)) {
+			nla_nest_cancel(msg, nest);
+			return -EMSGSIZE;
+		}
+		nla_nest_end(msg, nest);
+	}
+
+	return 0;
+}
+
+static bool dpll_pin_is_freq_supported(struct dpll_pin *pin, u32 freq)
+{
+	int fs;
+
+	for (fs = 0; fs < pin->prop->freq_supported_num; fs++)
+		if (freq >= pin->prop->freq_supported[fs].min &&
+		    freq <= pin->prop->freq_supported[fs].max)
+			return true;
+	return false;
+}
+
+static int
+dpll_msg_add_pin_parents(struct sk_buff *msg, struct dpll_pin *pin,
+			 struct dpll_pin_ref *dpll_ref,
+			 struct netlink_ext_ack *extack)
+{
+	enum dpll_pin_state state;
+	struct dpll_pin_ref *ref;
+	struct dpll_pin *ppin;
+	struct nlattr *nest;
+	unsigned long index;
+	int ret;
+
+	xa_for_each(&pin->parent_refs, index, ref) {
+		const struct dpll_pin_ops *ops = dpll_pin_ops(ref);
+		void *parent_priv;
+
+		ppin = ref->pin;
+		parent_priv = dpll_pin_on_dpll_priv(dpll_ref->dpll, ppin);
+		ret = ops->state_on_pin_get(pin,
+					    dpll_pin_on_pin_priv(ppin, pin),
+					    ppin, parent_priv, &state, extack);
+		if (ret)
+			return ret;
+		nest = nla_nest_start(msg, DPLL_A_PIN_PARENT_PIN);
+		if (!nest)
+			return -EMSGSIZE;
+		ret = dpll_msg_add_dev_parent_handle(msg, ppin->id);
+		if (ret)
+			goto nest_cancel;
+		if (nla_put_u32(msg, DPLL_A_PIN_STATE, state)) {
+			ret = -EMSGSIZE;
+			goto nest_cancel;
+		}
+		nla_nest_end(msg, nest);
+	}
+
+	return 0;
+
+nest_cancel:
+	nla_nest_cancel(msg, nest);
+	return ret;
+}
+
+static int
+dpll_msg_add_pin_dplls(struct sk_buff *msg, struct dpll_pin *pin,
+		       struct netlink_ext_ack *extack)
+{
+	struct dpll_pin_ref *ref;
+	struct nlattr *attr;
+	unsigned long index;
+	int ret;
+
+	xa_for_each(&pin->dpll_refs, index, ref) {
+		attr = nla_nest_start(msg, DPLL_A_PIN_PARENT_DEVICE);
+		if (!attr)
+			return -EMSGSIZE;
+		ret = dpll_msg_add_dev_parent_handle(msg, ref->dpll->id);
+		if (ret)
+			goto nest_cancel;
+		ret = dpll_msg_add_pin_on_dpll_state(msg, pin, ref, extack);
+		if (ret)
+			goto nest_cancel;
+		ret = dpll_msg_add_pin_prio(msg, pin, ref, extack);
+		if (ret)
+			goto nest_cancel;
+		ret = dpll_msg_add_pin_direction(msg, pin, ref, extack);
+		if (ret)
+			goto nest_cancel;
+		nla_nest_end(msg, attr);
+	}
+
+	return 0;
+
+nest_cancel:
+	nla_nest_end(msg, attr);
+	return ret;
+}
+
+static int
+dpll_cmd_pin_get_one(struct sk_buff *msg, struct dpll_pin *pin,
+		     struct netlink_ext_ack *extack)
+{
+	const struct dpll_pin_properties *prop = pin->prop;
+	struct dpll_pin_ref *ref;
+	int ret;
+
+	ref = dpll_xa_ref_dpll_first(&pin->dpll_refs);
+	ASSERT_NOT_NULL(ref);
+
+	ret = dpll_msg_add_pin_handle(msg, pin);
+	if (ret)
+		return ret;
+	if (nla_put_string(msg, DPLL_A_PIN_MODULE_NAME,
+			   module_name(pin->module)))
+		return -EMSGSIZE;
+	if (nla_put_64bit(msg, DPLL_A_PIN_CLOCK_ID, sizeof(pin->clock_id),
+			  &pin->clock_id, DPLL_A_PIN_PAD))
+		return -EMSGSIZE;
+	if (prop->board_label &&
+	    nla_put_string(msg, DPLL_A_PIN_BOARD_LABEL, prop->board_label))
+		return -EMSGSIZE;
+	if (prop->panel_label &&
+	    nla_put_string(msg, DPLL_A_PIN_PANEL_LABEL, prop->panel_label))
+		return -EMSGSIZE;
+	if (prop->package_label &&
+	    nla_put_string(msg, DPLL_A_PIN_PACKAGE_LABEL,
+			   prop->package_label))
+		return -EMSGSIZE;
+	if (nla_put_u32(msg, DPLL_A_PIN_TYPE, prop->type))
+		return -EMSGSIZE;
+	if (nla_put_u32(msg, DPLL_A_PIN_CAPABILITIES, prop->capabilities))
+		return -EMSGSIZE;
+	ret = dpll_msg_add_pin_freq(msg, pin, ref, extack);
+	if (ret)
+		return ret;
+	if (xa_empty(&pin->parent_refs))
+		ret = dpll_msg_add_pin_dplls(msg, pin, extack);
+	else
+		ret = dpll_msg_add_pin_parents(msg, pin, ref, extack);
+
+	return ret;
+}
+
+static int
+dpll_device_get_one(struct dpll_device *dpll, struct sk_buff *msg,
+		    struct netlink_ext_ack *extack)
+{
+	int ret;
+
+	ret = dpll_msg_add_dev_handle(msg, dpll);
+	if (ret)
+		return ret;
+	if (nla_put_string(msg, DPLL_A_MODULE_NAME, module_name(dpll->module)))
+		return -EMSGSIZE;
+	if (nla_put_64bit(msg, DPLL_A_CLOCK_ID, sizeof(dpll->clock_id),
+			  &dpll->clock_id, DPLL_A_PAD))
+		return -EMSGSIZE;
+	ret = dpll_msg_add_temp(msg, dpll, extack);
+	if (ret)
+		return ret;
+	ret = dpll_msg_add_lock_status(msg, dpll, extack);
+	if (ret)
+		return ret;
+	ret = dpll_msg_add_mode(msg, dpll, extack);
+	if (ret)
+		return ret;
+	ret = dpll_msg_add_mode_supported(msg, dpll, extack);
+	if (ret)
+		return ret;
+	if (nla_put_u32(msg, DPLL_A_TYPE, dpll->type))
+		return -EMSGSIZE;
+
+	return ret;
+}
+
+static int
+dpll_device_event_send(enum dpll_cmd event, struct dpll_device *dpll)
+{
+	struct sk_buff *msg;
+	int ret = -ENOMEM;
+	void *hdr;
+
+	if (WARN_ON(!xa_get_mark(&dpll_device_xa, dpll->id, DPLL_REGISTERED)))
+		return -ENODEV;
+	msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+	hdr = genlmsg_put(msg, 0, 0, &dpll_nl_family, 0, event);
+	if (!hdr)
+		goto err_free_msg;
+	ret = dpll_device_get_one(dpll, msg, NULL);
+	if (ret)
+		goto err_cancel_msg;
+	genlmsg_end(msg, hdr);
+	genlmsg_multicast(&dpll_nl_family, msg, 0, 0, GFP_KERNEL);
+
+	return 0;
+
+err_cancel_msg:
+	genlmsg_cancel(msg, hdr);
+err_free_msg:
+	nlmsg_free(msg);
+
+	return ret;
+}
+
+int dpll_device_create_ntf(struct dpll_device *dpll)
+{
+	return dpll_device_event_send(DPLL_CMD_DEVICE_CREATE_NTF, dpll);
+}
+
+int dpll_device_delete_ntf(struct dpll_device *dpll)
+{
+	return dpll_device_event_send(DPLL_CMD_DEVICE_DELETE_NTF, dpll);
+}
+
+static int
+__dpll_device_change_ntf(struct dpll_device *dpll)
+{
+	return dpll_device_event_send(DPLL_CMD_DEVICE_CHANGE_NTF, dpll);
+}
+
+/**
+ * dpll_device_change_ntf - notify that the dpll device has been changed
+ * @dpll: registered dpll pointer
+ *
+ * Context: acquires and holds a dpll_lock.
+ * Return: 0 if succeeds, error code otherwise.
+ */
+int dpll_device_change_ntf(struct dpll_device *dpll)
+{
+	int ret;
+
+	mutex_lock(&dpll_lock);
+	ret = __dpll_device_change_ntf(dpll);
+	mutex_unlock(&dpll_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dpll_device_change_ntf);
+
+static int
+dpll_pin_event_send(enum dpll_cmd event, struct dpll_pin *pin)
+{
+	struct sk_buff *msg;
+	int ret = -ENOMEM;
+	void *hdr;
+
+	if (WARN_ON(!xa_get_mark(&dpll_pin_xa, pin->id, DPLL_REGISTERED)))
+		return -ENODEV;
+
+	msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	hdr = genlmsg_put(msg, 0, 0, &dpll_nl_family, 0, event);
+	if (!hdr)
+		goto err_free_msg;
+	ret = dpll_cmd_pin_get_one(msg, pin, NULL);
+	if (ret)
+		goto err_cancel_msg;
+	genlmsg_end(msg, hdr);
+	genlmsg_multicast(&dpll_nl_family, msg, 0, 0, GFP_KERNEL);
+
+	return 0;
+
+err_cancel_msg:
+	genlmsg_cancel(msg, hdr);
+err_free_msg:
+	nlmsg_free(msg);
+
+	return ret;
+}
+
+int dpll_pin_create_ntf(struct dpll_pin *pin)
+{
+	return dpll_pin_event_send(DPLL_CMD_PIN_CREATE_NTF, pin);
+}
+
+int dpll_pin_delete_ntf(struct dpll_pin *pin)
+{
+	return dpll_pin_event_send(DPLL_CMD_PIN_DELETE_NTF, pin);
+}
+
+static int __dpll_pin_change_ntf(struct dpll_pin *pin)
+{
+	return dpll_pin_event_send(DPLL_CMD_PIN_CHANGE_NTF, pin);
+}
+
+/**
+ * dpll_pin_change_ntf - notify that the pin has been changed
+ * @pin: registered pin pointer
+ *
+ * Context: acquires and holds a dpll_lock.
+ * Return: 0 if succeeds, error code otherwise.
+ */
+int dpll_pin_change_ntf(struct dpll_pin *pin)
+{
+	int ret;
+
+	mutex_lock(&dpll_lock);
+	ret = __dpll_pin_change_ntf(pin);
+	mutex_unlock(&dpll_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dpll_pin_change_ntf);
+
+static int
+dpll_pin_freq_set(struct dpll_pin *pin, struct nlattr *a,
+		  struct netlink_ext_ack *extack)
+{
+	u64 freq = nla_get_u64(a);
+	struct dpll_pin_ref *ref;
+	unsigned long i;
+	int ret;
+
+	if (!dpll_pin_is_freq_supported(pin, freq)) {
+		NL_SET_ERR_MSG_ATTR(extack, a, "frequency is not supported by the device");
+		return -EINVAL;
+	}
+
+	xa_for_each(&pin->dpll_refs, i, ref) {
+		const struct dpll_pin_ops *ops = dpll_pin_ops(ref);
+		struct dpll_device *dpll = ref->dpll;
+
+		if (!ops->frequency_set)
+			return -EOPNOTSUPP;
+		ret = ops->frequency_set(pin, dpll_pin_on_dpll_priv(dpll, pin),
+					 dpll, dpll_priv(dpll), freq, extack);
+		if (ret)
+			return ret;
+	}
+	__dpll_pin_change_ntf(pin);
+
+	return 0;
+}
+
+static int
+dpll_pin_on_pin_state_set(struct dpll_pin *pin, u32 parent_idx,
+			  enum dpll_pin_state state,
+			  struct netlink_ext_ack *extack)
+{
+	struct dpll_pin_ref *parent_ref;
+	const struct dpll_pin_ops *ops;
+	struct dpll_pin_ref *dpll_ref;
+	void *pin_priv, *parent_priv;
+	struct dpll_pin *parent;
+	unsigned long i;
+	int ret;
+
+	if (!(DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE &
+	      pin->prop->capabilities)) {
+		NL_SET_ERR_MSG(extack, "state changing is not allowed");
+		return -EOPNOTSUPP;
+	}
+	parent = xa_load(&dpll_pin_xa, parent_idx);
+	if (!parent)
+		return -EINVAL;
+	parent_ref = xa_load(&pin->parent_refs, parent->pin_idx);
+	if (!parent_ref)
+		return -EINVAL;
+	xa_for_each(&parent->dpll_refs, i, dpll_ref) {
+		ops = dpll_pin_ops(parent_ref);
+		if (!ops->state_on_pin_set)
+			return -EOPNOTSUPP;
+		pin_priv = dpll_pin_on_pin_priv(parent, pin);
+		parent_priv = dpll_pin_on_dpll_priv(dpll_ref->dpll, parent);
+		ret = ops->state_on_pin_set(pin, pin_priv, parent, parent_priv,
+					    state, extack);
+		if (ret)
+			return ret;
+	}
+	__dpll_pin_change_ntf(pin);
+
+	return 0;
+}
+
+static int
+dpll_pin_state_set(struct dpll_device *dpll, struct dpll_pin *pin,
+		   enum dpll_pin_state state,
+		   struct netlink_ext_ack *extack)
+{
+	const struct dpll_pin_ops *ops;
+	struct dpll_pin_ref *ref;
+	int ret;
+
+	if (!(DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE &
+	      pin->prop->capabilities)) {
+		NL_SET_ERR_MSG(extack, "state changing is not allowed");
+		return -EOPNOTSUPP;
+	}
+	ref = xa_load(&pin->dpll_refs, dpll->id);
+	ASSERT_NOT_NULL(ref);
+	ops = dpll_pin_ops(ref);
+	if (!ops->state_on_dpll_set)
+		return -EOPNOTSUPP;
+	ret = ops->state_on_dpll_set(pin, dpll_pin_on_dpll_priv(dpll, pin),
+				     dpll, dpll_priv(dpll), state, extack);
+	if (ret)
+		return ret;
+	__dpll_pin_change_ntf(pin);
+
+	return 0;
+}
+
+static int
+dpll_pin_prio_set(struct dpll_device *dpll, struct dpll_pin *pin,
+		  u32 prio, struct netlink_ext_ack *extack)
+{
+	const struct dpll_pin_ops *ops;
+	struct dpll_pin_ref *ref;
+	int ret;
+
+	if (!(DPLL_PIN_CAPABILITIES_PRIORITY_CAN_CHANGE &
+	      pin->prop->capabilities)) {
+		NL_SET_ERR_MSG(extack, "prio changing is not allowed");
+		return -EOPNOTSUPP;
+	}
+	ref = xa_load(&pin->dpll_refs, dpll->id);
+	ASSERT_NOT_NULL(ref);
+	ops = dpll_pin_ops(ref);
+	if (!ops->prio_set)
+		return -EOPNOTSUPP;
+	ret = ops->prio_set(pin, dpll_pin_on_dpll_priv(dpll, pin), dpll,
+			    dpll_priv(dpll), prio, extack);
+	if (ret)
+		return ret;
+	__dpll_pin_change_ntf(pin);
+
+	return 0;
+}
+
+static int
+dpll_pin_direction_set(struct dpll_pin *pin, struct dpll_device *dpll,
+		       enum dpll_pin_direction direction,
+		       struct netlink_ext_ack *extack)
+{
+	const struct dpll_pin_ops *ops;
+	struct dpll_pin_ref *ref;
+	int ret;
+
+	if (!(DPLL_PIN_CAPABILITIES_DIRECTION_CAN_CHANGE &
+	      pin->prop->capabilities)) {
+		NL_SET_ERR_MSG(extack, "direction changing is not allowed");
+		return -EOPNOTSUPP;
+	}
+	ref = xa_load(&pin->dpll_refs, dpll->id);
+	ASSERT_NOT_NULL(ref);
+	ops = dpll_pin_ops(ref);
+	if (!ops->direction_set)
+		return -EOPNOTSUPP;
+	ret = ops->direction_set(pin, dpll_pin_on_dpll_priv(dpll, pin),
+				 dpll, dpll_priv(dpll), direction, extack);
+	if (ret)
+		return ret;
+	__dpll_pin_change_ntf(pin);
+
+	return 0;
+}
+
+static int
+dpll_pin_parent_device_set(struct dpll_pin *pin, struct nlattr *parent_nest,
+			   struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[DPLL_A_PIN_MAX + 1];
+	enum dpll_pin_direction direction;
+	enum dpll_pin_state state;
+	struct dpll_pin_ref *ref;
+	struct dpll_device *dpll;
+	u32 pdpll_idx, prio;
+	int ret;
+
+	nla_parse_nested(tb, DPLL_A_PIN_MAX, parent_nest,
+			 dpll_pin_parent_device_nl_policy, extack);
+	if (!tb[DPLL_A_PIN_PARENT_ID]) {
+		NL_SET_ERR_MSG(extack, "device parent id expected");
+		return -EINVAL;
+	}
+	pdpll_idx = nla_get_u32(tb[DPLL_A_PIN_PARENT_ID]);
+	dpll = xa_load(&dpll_device_xa, pdpll_idx);
+	if (!dpll) {
+		NL_SET_ERR_MSG(extack, "parent device not found");
+		return -EINVAL;
+	}
+	ref = xa_load(&pin->dpll_refs, dpll->id);
+	if (!ref) {
+		NL_SET_ERR_MSG(extack, "pin not connected to given parent device");
+		return -EINVAL;
+	}
+	if (tb[DPLL_A_PIN_STATE]) {
+		state = nla_get_u32(tb[DPLL_A_PIN_STATE]);
+		ret = dpll_pin_state_set(dpll, pin, state, extack);
+		if (ret)
+			return ret;
+	}
+	if (tb[DPLL_A_PIN_PRIO]) {
+		prio = nla_get_u32(tb[DPLL_A_PIN_PRIO]);
+		ret = dpll_pin_prio_set(dpll, pin, prio, extack);
+		if (ret)
+			return ret;
+	}
+	if (tb[DPLL_A_PIN_DIRECTION]) {
+		direction = nla_get_u32(tb[DPLL_A_PIN_DIRECTION]);
+		ret = dpll_pin_direction_set(pin, dpll, direction, extack);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+static int
+dpll_pin_parent_pin_set(struct dpll_pin *pin, struct nlattr *parent_nest,
+			struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[DPLL_A_PIN_MAX + 1];
+	enum dpll_pin_state state;
+	u32 ppin_idx;
+	int ret;
+
+	nla_parse_nested(tb, DPLL_A_PIN_MAX, parent_nest,
+			 dpll_pin_parent_pin_nl_policy, extack);
+	if (!tb[DPLL_A_PIN_PARENT_ID]) {
+		NL_SET_ERR_MSG(extack, "device parent id expected");
+		return -EINVAL;
+	}
+	ppin_idx = nla_get_u32(tb[DPLL_A_PIN_PARENT_ID]);
+	state = nla_get_u32(tb[DPLL_A_PIN_STATE]);
+	ret = dpll_pin_on_pin_state_set(pin, ppin_idx, state, extack);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static int
+dpll_pin_set_from_nlattr(struct dpll_pin *pin, struct genl_info *info)
+{
+	struct nlattr *a;
+	int rem, ret;
+
+	nla_for_each_attr(a, genlmsg_data(info->genlhdr),
+			  genlmsg_len(info->genlhdr), rem) {
+		switch (nla_type(a)) {
+		case DPLL_A_PIN_FREQUENCY:
+			ret = dpll_pin_freq_set(pin, a, info->extack);
+			if (ret)
+				return ret;
+			break;
+		case DPLL_A_PIN_PARENT_DEVICE:
+			ret = dpll_pin_parent_device_set(pin, a, info->extack);
+			if (ret)
+				return ret;
+			break;
+		case DPLL_A_PIN_PARENT_PIN:
+			ret = dpll_pin_parent_pin_set(pin, a, info->extack);
+			if (ret)
+				return ret;
+			break;
+		}
+	}
+
+	return 0;
+}
+
+static struct dpll_pin *
+dpll_pin_find(u64 clock_id, struct nlattr *mod_name_attr,
+	      enum dpll_pin_type type, struct nlattr *board_label,
+	      struct nlattr *panel_label, struct nlattr *package_label,
+	      struct netlink_ext_ack *extack)
+{
+	bool board_match, panel_match, package_match;
+	struct dpll_pin *pin_match = NULL, *pin;
+	const struct dpll_pin_properties *prop;
+	bool cid_match, mod_match, type_match;
+	unsigned long i;
+
+	xa_for_each_marked(&dpll_pin_xa, i, pin, DPLL_REGISTERED) {
+		prop = pin->prop;
+		cid_match = clock_id ? pin->clock_id == clock_id : true;
+		mod_match = mod_name_attr && module_name(pin->module) ?
+			!nla_strcmp(mod_name_attr,
+				    module_name(pin->module)) : true;
+		type_match = type ? prop->type == type : true;
+		board_match = board_label ? (prop->board_label ?
+			!nla_strcmp(board_label, prop->board_label) : false) :
+			true;
+		panel_match = panel_label ? (prop->panel_label ?
+			!nla_strcmp(panel_label, prop->panel_label) : false) :
+			true;
+		package_match = package_label ? (prop->package_label ?
+			!nla_strcmp(package_label, prop->package_label) :
+			false) : true;
+		if (cid_match && mod_match && type_match && board_match &&
+		    panel_match && package_match) {
+			if (pin_match) {
+				NL_SET_ERR_MSG(extack, "multiple matches");
+				return ERR_PTR(-EINVAL);
+			}
+			pin_match = pin;
+		};
+	}
+	if (!pin_match) {
+		NL_SET_ERR_MSG(extack, "not found");
+		return ERR_PTR(-ENODEV);
+	}
+	return pin_match;
+}
+
+static struct dpll_pin *dpll_pin_find_from_nlattr(struct genl_info *info)
+{
+	struct nlattr *attr, *mod_name_attr = NULL, *board_label_attr = NULL,
+		*panel_label_attr = NULL, *package_label_attr = NULL;
+	enum dpll_pin_type type = 0;
+	u64 clock_id = 0;
+	int rem = 0;
+
+	nla_for_each_attr(attr, genlmsg_data(info->genlhdr),
+			  genlmsg_len(info->genlhdr), rem) {
+		switch (nla_type(attr)) {
+		case DPLL_A_PIN_CLOCK_ID:
+			if (clock_id)
+				goto duplicated_attr;
+			clock_id = nla_get_u64(attr);
+			break;
+		case DPLL_A_PIN_MODULE_NAME:
+			if (mod_name_attr)
+				goto duplicated_attr;
+			mod_name_attr = attr;
+			break;
+		case DPLL_A_PIN_TYPE:
+			if (type)
+				goto duplicated_attr;
+			type = nla_get_u32(attr);
+		break;
+		case DPLL_A_PIN_BOARD_LABEL:
+			if (board_label_attr)
+				goto duplicated_attr;
+			board_label_attr = attr;
+		break;
+		case DPLL_A_PIN_PANEL_LABEL:
+			if (panel_label_attr)
+				goto duplicated_attr;
+			panel_label_attr = attr;
+		break;
+		case DPLL_A_PIN_PACKAGE_LABEL:
+			if (package_label_attr)
+				goto duplicated_attr;
+			package_label_attr = attr;
+		break;
+		default:
+			break;
+		}
+	}
+	if (!(clock_id  || mod_name_attr || board_label_attr ||
+	      panel_label_attr || package_label_attr)) {
+		NL_SET_ERR_MSG(info->extack, "missing attributes");
+		return ERR_PTR(-EINVAL);
+	}
+	return dpll_pin_find(clock_id, mod_name_attr, type, board_label_attr,
+			     panel_label_attr, package_label_attr,
+			     info->extack);
+duplicated_attr:
+	NL_SET_ERR_MSG(info->extack, "duplicated attribute");
+	return ERR_PTR(-EINVAL);
+}
+
+int dpll_nl_pin_id_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct dpll_pin *pin;
+	struct sk_buff *msg;
+	struct nlattr *hdr;
+	int ret;
+
+	msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+	hdr = genlmsg_put_reply(msg, info, &dpll_nl_family, 0,
+				DPLL_CMD_PIN_ID_GET);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	pin = dpll_pin_find_from_nlattr(info);
+	if (!IS_ERR(pin)) {
+		ret = dpll_msg_add_pin_handle(msg, pin);
+		if (ret) {
+			nlmsg_free(msg);
+			return ret;
+		}
+	}
+	genlmsg_end(msg, hdr);
+
+	return genlmsg_reply(msg, info);
+}
+
+int dpll_nl_pin_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct dpll_pin *pin = info->user_ptr[0];
+	struct sk_buff *msg;
+	struct nlattr *hdr;
+	int ret;
+
+	if (!pin)
+		return -ENODEV;
+	msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+	hdr = genlmsg_put_reply(msg, info, &dpll_nl_family, 0,
+				DPLL_CMD_PIN_GET);
+	if (!hdr)
+		return -EMSGSIZE;
+	ret = dpll_cmd_pin_get_one(msg, pin, info->extack);
+	if (ret) {
+		nlmsg_free(msg);
+		return ret;
+	}
+	genlmsg_end(msg, hdr);
+
+	return genlmsg_reply(msg, info);
+}
+
+int dpll_nl_pin_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct dpll_dump_ctx *ctx = dpll_dump_context(cb);
+	struct dpll_pin *pin;
+	struct nlattr *hdr;
+	unsigned long i;
+	int ret = 0;
+
+	xa_for_each_marked_start(&dpll_pin_xa, i, pin, DPLL_REGISTERED,
+				 ctx->idx) {
+		hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+				  cb->nlh->nlmsg_seq,
+				  &dpll_nl_family, NLM_F_MULTI,
+				  DPLL_CMD_PIN_GET);
+		if (!hdr) {
+			ret = -EMSGSIZE;
+			break;
+		}
+		ret = dpll_cmd_pin_get_one(skb, pin, cb->extack);
+		if (ret) {
+			genlmsg_cancel(skb, hdr);
+			break;
+		}
+		genlmsg_end(skb, hdr);
+	}
+	if (ret == -EMSGSIZE) {
+		ctx->idx = i;
+		return skb->len;
+	}
+	return ret;
+}
+
+int dpll_nl_pin_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct dpll_pin *pin = info->user_ptr[0];
+
+	return dpll_pin_set_from_nlattr(pin, info);
+}
+
+static struct dpll_device *
+dpll_device_find(u64 clock_id, struct nlattr *mod_name_attr,
+		 enum dpll_type type, struct netlink_ext_ack *extack)
+{
+	struct dpll_device *dpll_match = NULL, *dpll;
+	bool cid_match, mod_match, type_match;
+	unsigned long i;
+
+	xa_for_each_marked(&dpll_device_xa, i, dpll, DPLL_REGISTERED) {
+		cid_match = clock_id ? dpll->clock_id == clock_id : true;
+		mod_match = mod_name_attr ? (module_name(dpll->module) ?
+			!nla_strcmp(mod_name_attr,
+				    module_name(dpll->module)) : false) : true;
+		type_match = type ? dpll->type == type : true;
+		if (cid_match && mod_match && type_match) {
+			if (dpll_match) {
+				NL_SET_ERR_MSG(extack, "multiple matches");
+				return ERR_PTR(-EINVAL);
+			}
+			dpll_match = dpll;
+		}
+	}
+	if (!dpll_match) {
+		NL_SET_ERR_MSG(extack, "not found");
+		return ERR_PTR(-ENODEV);
+	}
+
+	return dpll_match;
+}
+
+static struct dpll_device *
+dpll_device_find_from_nlattr(struct genl_info *info)
+{
+	struct nlattr *attr, *mod_name_attr = NULL;
+	enum dpll_type type = 0;
+	u64 clock_id = 0;
+	int rem = 0;
+
+	nla_for_each_attr(attr, genlmsg_data(info->genlhdr),
+			  genlmsg_len(info->genlhdr), rem) {
+		switch (nla_type(attr)) {
+		case DPLL_A_CLOCK_ID:
+			if (clock_id)
+				goto duplicated_attr;
+			clock_id = nla_get_u64(attr);
+			break;
+		case DPLL_A_MODULE_NAME:
+			if (mod_name_attr)
+				goto duplicated_attr;
+			mod_name_attr = attr;
+			break;
+		case DPLL_A_TYPE:
+			if (type)
+				goto duplicated_attr;
+			type = nla_get_u32(attr);
+			break;
+		default:
+			break;
+		}
+	}
+	if (!clock_id && !mod_name_attr && !type) {
+		NL_SET_ERR_MSG(info->extack, "missing attributes");
+		return ERR_PTR(-EINVAL);
+	}
+	return dpll_device_find(clock_id, mod_name_attr, type, info->extack);
+duplicated_attr:
+	NL_SET_ERR_MSG(info->extack, "duplicated attribute");
+	return ERR_PTR(-EINVAL);
+}
+
+int dpll_nl_device_id_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct dpll_device *dpll;
+	struct sk_buff *msg;
+	struct nlattr *hdr;
+	int ret;
+
+	msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+	hdr = genlmsg_put_reply(msg, info, &dpll_nl_family, 0,
+				DPLL_CMD_DEVICE_ID_GET);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	dpll = dpll_device_find_from_nlattr(info);
+	if (!IS_ERR(dpll)) {
+		ret = dpll_msg_add_dev_handle(msg, dpll);
+		if (ret) {
+			nlmsg_free(msg);
+			return ret;
+		}
+	}
+	genlmsg_end(msg, hdr);
+
+	return genlmsg_reply(msg, info);
+}
+
+int dpll_nl_device_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct dpll_device *dpll = info->user_ptr[0];
+	struct sk_buff *msg;
+	struct nlattr *hdr;
+	int ret;
+
+	msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+	hdr = genlmsg_put_reply(msg, info, &dpll_nl_family, 0,
+				DPLL_CMD_DEVICE_GET);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	ret = dpll_device_get_one(dpll, msg, info->extack);
+	if (ret) {
+		nlmsg_free(msg);
+		return ret;
+	}
+	genlmsg_end(msg, hdr);
+
+	return genlmsg_reply(msg, info);
+}
+
+int dpll_nl_device_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	/* placeholder for set command */
+	return 0;
+}
+
+int dpll_nl_device_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct dpll_dump_ctx *ctx = dpll_dump_context(cb);
+	struct dpll_device *dpll;
+	struct nlattr *hdr;
+	unsigned long i;
+	int ret = 0;
+
+	xa_for_each_marked_start(&dpll_device_xa, i, dpll, DPLL_REGISTERED,
+				 ctx->idx) {
+		hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+				  cb->nlh->nlmsg_seq, &dpll_nl_family,
+				  NLM_F_MULTI, DPLL_CMD_DEVICE_GET);
+		if (!hdr) {
+			ret = -EMSGSIZE;
+			break;
+		}
+		ret = dpll_device_get_one(dpll, skb, cb->extack);
+		if (ret) {
+			genlmsg_cancel(skb, hdr);
+			break;
+		}
+		genlmsg_end(skb, hdr);
+	}
+	if (ret == -EMSGSIZE) {
+		ctx->idx = i;
+		return skb->len;
+	}
+	return ret;
+}
+
+int dpll_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb,
+		  struct genl_info *info)
+{
+	u32 id;
+
+	if (GENL_REQ_ATTR_CHECK(info, DPLL_A_ID))
+		return -EINVAL;
+
+	mutex_lock(&dpll_lock);
+	id = nla_get_u32(info->attrs[DPLL_A_ID]);
+	info->user_ptr[0] = dpll_device_get_by_id(id);
+	if (!info->user_ptr[0]) {
+		NL_SET_ERR_MSG(info->extack, "device not found");
+		goto unlock;
+	}
+	return 0;
+unlock:
+	mutex_unlock(&dpll_lock);
+	return -ENODEV;
+}
+
+void dpll_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb,
+		    struct genl_info *info)
+{
+	mutex_unlock(&dpll_lock);
+}
+
+int
+dpll_lock_doit(const struct genl_split_ops *ops, struct sk_buff *skb,
+	       struct genl_info *info)
+{
+	mutex_lock(&dpll_lock);
+
+	return 0;
+}
+
+void
+dpll_unlock_doit(const struct genl_split_ops *ops, struct sk_buff *skb,
+		 struct genl_info *info)
+{
+	mutex_unlock(&dpll_lock);
+}
+
+int dpll_lock_dumpit(struct netlink_callback *cb)
+{
+	mutex_lock(&dpll_lock);
+
+	return 0;
+}
+
+int dpll_unlock_dumpit(struct netlink_callback *cb)
+{
+	mutex_unlock(&dpll_lock);
+
+	return 0;
+}
+
+int dpll_pin_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb,
+		      struct genl_info *info)
+{
+	int ret;
+
+	mutex_lock(&dpll_lock);
+	if (GENL_REQ_ATTR_CHECK(info, DPLL_A_PIN_ID)) {
+		ret = -EINVAL;
+		goto unlock_dev;
+	}
+	info->user_ptr[0] = xa_load(&dpll_pin_xa,
+				    nla_get_u32(info->attrs[DPLL_A_PIN_ID]));
+	if (!info->user_ptr[0]) {
+		NL_SET_ERR_MSG(info->extack, "pin not found");
+		ret = -ENODEV;
+		goto unlock_dev;
+	}
+
+	return 0;
+
+unlock_dev:
+	mutex_unlock(&dpll_lock);
+	return ret;
+}
+
+void dpll_pin_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb,
+			struct genl_info *info)
+{
+	mutex_unlock(&dpll_lock);
+}
diff --git a/drivers/dpll/dpll_netlink.h b/drivers/dpll/dpll_netlink.h
new file mode 100644
index 000000000000..a9cfd55f57fc
--- /dev/null
+++ b/drivers/dpll/dpll_netlink.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ *  Copyright (c) 2023 Meta Platforms, Inc. and affiliates
+ *  Copyright (c) 2023 Intel and affiliates
+ */
+
+int dpll_device_create_ntf(struct dpll_device *dpll);
+
+int dpll_device_delete_ntf(struct dpll_device *dpll);
+
+int dpll_pin_create_ntf(struct dpll_pin *pin);
+
+int dpll_pin_delete_ntf(struct dpll_pin *pin);
diff --git a/include/linux/dpll.h b/include/linux/dpll.h
index b47c3560b937..2202310c10cd 100644
--- a/include/linux/dpll.h
+++ b/include/linux/dpll.h
@@ -130,4 +130,8 @@ int dpll_pin_on_pin_register(struct dpll_pin *parent, struct dpll_pin *pin,
 void dpll_pin_on_pin_unregister(struct dpll_pin *parent, struct dpll_pin *pin,
 				const struct dpll_pin_ops *ops, void *priv);
 
+int dpll_device_change_ntf(struct dpll_device *dpll);
+
+int dpll_pin_change_ntf(struct dpll_pin *pin);
+
 #endif
-- 
cgit v1.2.3-70-g09d2


From 5f18426928800c59fb0f9bc8fb0c182bb6f5ee24 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Wed, 13 Sep 2023 21:49:39 +0100
Subject: netdev: expose DPLL pin handle for netdevice

In case netdevice represents a SyncE port, the user needs to understand
the connection between netdevice and associated DPLL pin. There might me
multiple netdevices pointing to the same pin, in case of VF/SF
implementation.

Add a IFLA Netlink attribute to nest the DPLL pin handle, similar to
how it is implemented for devlink port. Add a struct dpll_pin pointer
to netdev and protect access to it by RTNL. Expose netdev_dpll_pin_set()
and netdev_dpll_pin_clear() helpers to the drivers so they can set/clear
the DPLL pin relationship to netdev.

Note that during the lifetime of struct dpll_pin the pin handle does not
change. Therefore it is save to access it lockless. It is drivers
responsibility to call netdev_dpll_pin_clear() before dpll_pin_put().

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/dpll/dpll_netlink.c  | 16 ++++++++++++++--
 include/linux/dpll.h         | 15 +++++++++++++++
 include/linux/netdevice.h    | 21 +++++++++++++++++++++
 include/uapi/linux/if_link.h |  2 +-
 net/core/dev.c               | 22 ++++++++++++++++++++++
 net/core/rtnetlink.c         | 36 ++++++++++++++++++++++++++++++++++++
 6 files changed, 109 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c
index 9464a6865977..764437a0661b 100644
--- a/drivers/dpll/dpll_netlink.c
+++ b/drivers/dpll/dpll_netlink.c
@@ -47,6 +47,18 @@ dpll_msg_add_dev_parent_handle(struct sk_buff *msg, u32 id)
 	return 0;
 }
 
+/**
+ * dpll_msg_pin_handle_size - get size of pin handle attribute for given pin
+ * @pin: pin pointer
+ *
+ * Return: byte size of pin handle attribute for given pin.
+ */
+size_t dpll_msg_pin_handle_size(struct dpll_pin *pin)
+{
+	return pin ? nla_total_size(4) : 0; /* DPLL_A_PIN_ID */
+}
+EXPORT_SYMBOL_GPL(dpll_msg_pin_handle_size);
+
 /**
  * dpll_msg_add_pin_handle - attach pin handle attribute to a given message
  * @msg: pointer to sk_buff message to attach a pin handle
@@ -56,8 +68,7 @@ dpll_msg_add_dev_parent_handle(struct sk_buff *msg, u32 id)
  * * 0 - success
  * * -EMSGSIZE - no space in message to attach pin handle
  */
-static int
-dpll_msg_add_pin_handle(struct sk_buff *msg, struct dpll_pin *pin)
+int dpll_msg_add_pin_handle(struct sk_buff *msg, struct dpll_pin *pin)
 {
 	if (!pin)
 		return 0;
@@ -65,6 +76,7 @@ dpll_msg_add_pin_handle(struct sk_buff *msg, struct dpll_pin *pin)
 		return -EMSGSIZE;
 	return 0;
 }
+EXPORT_SYMBOL_GPL(dpll_msg_add_pin_handle);
 
 static int
 dpll_msg_add_mode(struct sk_buff *msg, struct dpll_device *dpll,
diff --git a/include/linux/dpll.h b/include/linux/dpll.h
index 2202310c10cd..bbc480cd2932 100644
--- a/include/linux/dpll.h
+++ b/include/linux/dpll.h
@@ -101,6 +101,21 @@ struct dpll_pin_properties {
 	struct dpll_pin_frequency *freq_supported;
 };
 
+#if IS_ENABLED(CONFIG_DPLL)
+size_t dpll_msg_pin_handle_size(struct dpll_pin *pin);
+int dpll_msg_add_pin_handle(struct sk_buff *msg, struct dpll_pin *pin);
+#else
+static inline size_t dpll_msg_pin_handle_size(struct dpll_pin *pin)
+{
+	return 0;
+}
+
+static inline int dpll_msg_add_pin_handle(struct sk_buff *msg, struct dpll_pin *pin)
+{
+	return 0;
+}
+#endif
+
 struct dpll_device *
 dpll_device_get(u64 clock_id, u32 dev_driver_id, struct module *module);
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0896aaa91dd7..db3d8429d50d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -79,6 +79,8 @@ struct xdp_buff;
 struct xdp_frame;
 struct xdp_metadata_ops;
 struct xdp_md;
+/* DPLL specific */
+struct dpll_pin;
 
 typedef u32 xdp_features_t;
 
@@ -2049,6 +2051,9 @@ enum netdev_ml_priv_type {
  *			SET_NETDEV_DEVLINK_PORT macro. This pointer is static
  *			during the time netdevice is registered.
  *
+ *	@dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem,
+ *		   where the clock is recovered.
+ *
  *	FIXME: cleanup struct net_device such that network protocol info
  *	moves out.
  */
@@ -2405,6 +2410,10 @@ struct net_device {
 	struct rtnl_hw_stats64	*offload_xstats_l3;
 
 	struct devlink_port	*devlink_port;
+
+#if IS_ENABLED(CONFIG_DPLL)
+	struct dpll_pin		*dpll_pin;
+#endif
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
@@ -3940,6 +3949,18 @@ int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name);
 int dev_get_port_parent_id(struct net_device *dev,
 			   struct netdev_phys_item_id *ppid, bool recurse);
 bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b);
+void netdev_dpll_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin);
+void netdev_dpll_pin_clear(struct net_device *dev);
+
+static inline struct dpll_pin *netdev_dpll_pin(const struct net_device *dev)
+{
+#if IS_ENABLED(CONFIG_DPLL)
+	return dev->dpll_pin;
+#else
+	return NULL;
+#endif
+}
+
 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again);
 struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 				    struct netdev_queue *txq, int *ret);
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index ce3117df9cec..fac351a93aed 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -376,7 +376,7 @@ enum {
 
 	IFLA_GSO_IPV4_MAX_SIZE,
 	IFLA_GRO_IPV4_MAX_SIZE,
-
+	IFLA_DPLL_PIN,
 	__IFLA_MAX
 };
 
diff --git a/net/core/dev.c b/net/core/dev.c
index ccff2b6ef958..cc03a5758d2d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -9023,6 +9023,28 @@ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
 }
 EXPORT_SYMBOL(netdev_port_same_parent_id);
 
+static void netdev_dpll_pin_assign(struct net_device *dev, struct dpll_pin *dpll_pin)
+{
+#if IS_ENABLED(CONFIG_DPLL)
+	rtnl_lock();
+	dev->dpll_pin = dpll_pin;
+	rtnl_unlock();
+#endif
+}
+
+void netdev_dpll_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin)
+{
+	WARN_ON(!dpll_pin);
+	netdev_dpll_pin_assign(dev, dpll_pin);
+}
+EXPORT_SYMBOL(netdev_dpll_pin_set);
+
+void netdev_dpll_pin_clear(struct net_device *dev)
+{
+	netdev_dpll_pin_assign(dev, NULL);
+}
+EXPORT_SYMBOL(netdev_dpll_pin_clear);
+
 /**
  *	dev_change_proto_down - set carrier according to proto_down.
  *
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 4a2ec33bfb51..7452a6d190c5 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -57,6 +57,7 @@
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/addrconf.h>
 #endif
+#include <linux/dpll.h>
 
 #include "dev.h"
 
@@ -1055,6 +1056,15 @@ static size_t rtnl_devlink_port_size(const struct net_device *dev)
 	return size;
 }
 
+static size_t rtnl_dpll_pin_size(const struct net_device *dev)
+{
+	size_t size = nla_total_size(0); /* nest IFLA_DPLL_PIN */
+
+	size += dpll_msg_pin_handle_size(netdev_dpll_pin(dev));
+
+	return size;
+}
+
 static noinline size_t if_nlmsg_size(const struct net_device *dev,
 				     u32 ext_filter_mask)
 {
@@ -1111,6 +1121,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
 	       + rtnl_prop_list_size(dev)
 	       + nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */
 	       + rtnl_devlink_port_size(dev)
+	       + rtnl_dpll_pin_size(dev)
 	       + 0;
 }
 
@@ -1774,6 +1785,28 @@ nest_cancel:
 	return ret;
 }
 
+static int rtnl_fill_dpll_pin(struct sk_buff *skb,
+			      const struct net_device *dev)
+{
+	struct nlattr *dpll_pin_nest;
+	int ret;
+
+	dpll_pin_nest = nla_nest_start(skb, IFLA_DPLL_PIN);
+	if (!dpll_pin_nest)
+		return -EMSGSIZE;
+
+	ret = dpll_msg_add_pin_handle(skb, netdev_dpll_pin(dev));
+	if (ret < 0)
+		goto nest_cancel;
+
+	nla_nest_end(skb, dpll_pin_nest);
+	return 0;
+
+nest_cancel:
+	nla_nest_cancel(skb, dpll_pin_nest);
+	return ret;
+}
+
 static int rtnl_fill_ifinfo(struct sk_buff *skb,
 			    struct net_device *dev, struct net *src_net,
 			    int type, u32 pid, u32 seq, u32 change,
@@ -1916,6 +1949,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
 	if (rtnl_fill_devlink_port(skb, dev))
 		goto nla_put_failure;
 
+	if (rtnl_fill_dpll_pin(skb, dev))
+		goto nla_put_failure;
+
 	nlmsg_end(skb, nlh);
 	return 0;
 
-- 
cgit v1.2.3-70-g09d2


From 496fd0a26bbf73b6b12407ee4fbe5ff49d659a6d Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Wed, 13 Sep 2023 21:49:43 +0100
Subject: mlx5: Implement SyncE support using DPLL infrastructure

Implement SyncE support using newly introduced DPLL support.
Make sure that each PFs/VFs/SFs probed with appropriate capability
will spawn a dpll auxiliary device and register appropriate dpll device
and pin instances.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/Kconfig  |   8 +
 drivers/net/ethernet/mellanox/mlx5/core/Makefile |   3 +
 drivers/net/ethernet/mellanox/mlx5/core/dev.c    |  17 +
 drivers/net/ethernet/mellanox/mlx5/core/dpll.c   | 432 +++++++++++++++++++++++
 include/linux/mlx5/driver.h                      |   2 +
 include/linux/mlx5/mlx5_ifc.h                    |  59 +++-
 6 files changed, 520 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/dpll.c

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
index c4f4de82e29e..685335832a93 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
@@ -189,3 +189,11 @@ config MLX5_SF_MANAGER
 	port is managed through devlink.  A subfunction supports RDMA, netdevice
 	and vdpa device. It is similar to a SRIOV VF but it doesn't require
 	SRIOV support.
+
+config MLX5_DPLL
+	tristate "Mellanox 5th generation network adapters (ConnectX series) DPLL support"
+	depends on NETDEVICES && ETHERNET && PCI && MLX5_CORE
+	select DPLL
+	help
+	  DPLL support in Mellanox Technologies ConnectX NICs.
+
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 7e94caca4888..c44870b175f9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -128,3 +128,6 @@ mlx5_core-$(CONFIG_MLX5_SF) += sf/vhca_event.o sf/dev/dev.o sf/dev/driver.o irq_
 # SF manager
 #
 mlx5_core-$(CONFIG_MLX5_SF_MANAGER) += sf/cmd.o sf/hw_table.o sf/devlink.o
+
+obj-$(CONFIG_MLX5_DPLL) += mlx5_dpll.o
+mlx5_dpll-y :=	dpll.o
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/dev.c
index 7909f378dc93..1fc03480c2ff 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/dev.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/dev.c
@@ -206,6 +206,19 @@ static bool is_ib_enabled(struct mlx5_core_dev *dev)
 	return err ? false : val.vbool;
 }
 
+static bool is_dpll_supported(struct mlx5_core_dev *dev)
+{
+	if (!IS_ENABLED(CONFIG_MLX5_DPLL))
+		return false;
+
+	if (!MLX5_CAP_MCAM_REG2(dev, synce_registers)) {
+		mlx5_core_warn(dev, "Missing SyncE capability\n");
+		return false;
+	}
+
+	return true;
+}
+
 enum {
 	MLX5_INTERFACE_PROTOCOL_ETH,
 	MLX5_INTERFACE_PROTOCOL_ETH_REP,
@@ -215,6 +228,8 @@ enum {
 	MLX5_INTERFACE_PROTOCOL_MPIB,
 
 	MLX5_INTERFACE_PROTOCOL_VNET,
+
+	MLX5_INTERFACE_PROTOCOL_DPLL,
 };
 
 static const struct mlx5_adev_device {
@@ -237,6 +252,8 @@ static const struct mlx5_adev_device {
 					   .is_supported = &is_ib_rep_supported },
 	[MLX5_INTERFACE_PROTOCOL_MPIB] = { .suffix = "multiport",
 					   .is_supported = &is_mp_supported },
+	[MLX5_INTERFACE_PROTOCOL_DPLL] = { .suffix = "dpll",
+					   .is_supported = &is_dpll_supported },
 };
 
 int mlx5_adev_idx_alloc(void)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dpll.c b/drivers/net/ethernet/mellanox/mlx5/core/dpll.c
new file mode 100644
index 000000000000..74f0c7867120
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/dpll.c
@@ -0,0 +1,432 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
+
+#include <linux/dpll.h>
+#include <linux/mlx5/driver.h>
+
+/* This structure represents a reference to DPLL, one is created
+ * per mdev instance.
+ */
+struct mlx5_dpll {
+	struct dpll_device *dpll;
+	struct dpll_pin *dpll_pin;
+	struct mlx5_core_dev *mdev;
+	struct workqueue_struct *wq;
+	struct delayed_work work;
+	struct {
+		bool valid;
+		enum dpll_lock_status lock_status;
+		enum dpll_pin_state pin_state;
+	} last;
+	struct notifier_block mdev_nb;
+	struct net_device *tracking_netdev;
+};
+
+static int mlx5_dpll_clock_id_get(struct mlx5_core_dev *mdev, u64 *clock_id)
+{
+	u32 out[MLX5_ST_SZ_DW(msecq_reg)] = {};
+	u32 in[MLX5_ST_SZ_DW(msecq_reg)] = {};
+	int err;
+
+	err = mlx5_core_access_reg(mdev, in, sizeof(in), out, sizeof(out),
+				   MLX5_REG_MSECQ, 0, 0);
+	if (err)
+		return err;
+	*clock_id = MLX5_GET64(msecq_reg, out, local_clock_identity);
+	return 0;
+}
+
+static int
+mlx5_dpll_synce_status_get(struct mlx5_core_dev *mdev,
+			   enum mlx5_msees_admin_status *admin_status,
+			   enum mlx5_msees_oper_status *oper_status,
+			   bool *ho_acq)
+{
+	u32 out[MLX5_ST_SZ_DW(msees_reg)] = {};
+	u32 in[MLX5_ST_SZ_DW(msees_reg)] = {};
+	int err;
+
+	err = mlx5_core_access_reg(mdev, in, sizeof(in), out, sizeof(out),
+				   MLX5_REG_MSEES, 0, 0);
+	if (err)
+		return err;
+	if (admin_status)
+		*admin_status = MLX5_GET(msees_reg, out, admin_status);
+	*oper_status = MLX5_GET(msees_reg, out, oper_status);
+	if (ho_acq)
+		*ho_acq = MLX5_GET(msees_reg, out, ho_acq);
+	return 0;
+}
+
+static int
+mlx5_dpll_synce_status_set(struct mlx5_core_dev *mdev,
+			   enum mlx5_msees_admin_status admin_status)
+{
+	u32 out[MLX5_ST_SZ_DW(msees_reg)] = {};
+	u32 in[MLX5_ST_SZ_DW(msees_reg)] = {};
+
+	MLX5_SET(msees_reg, in, field_select,
+		 MLX5_MSEES_FIELD_SELECT_ENABLE |
+		 MLX5_MSEES_FIELD_SELECT_ADMIN_STATUS);
+	MLX5_SET(msees_reg, in, admin_status, admin_status);
+	return mlx5_core_access_reg(mdev, in, sizeof(in), out, sizeof(out),
+				    MLX5_REG_MSEES, 0, 1);
+}
+
+static enum dpll_lock_status
+mlx5_dpll_lock_status_get(enum mlx5_msees_oper_status oper_status, bool ho_acq)
+{
+	switch (oper_status) {
+	case MLX5_MSEES_OPER_STATUS_SELF_TRACK:
+		fallthrough;
+	case MLX5_MSEES_OPER_STATUS_OTHER_TRACK:
+		return ho_acq ? DPLL_LOCK_STATUS_LOCKED_HO_ACQ :
+				DPLL_LOCK_STATUS_LOCKED;
+	case MLX5_MSEES_OPER_STATUS_HOLDOVER:
+		fallthrough;
+	case MLX5_MSEES_OPER_STATUS_FAIL_HOLDOVER:
+		return DPLL_LOCK_STATUS_HOLDOVER;
+	default:
+		return DPLL_LOCK_STATUS_UNLOCKED;
+	}
+}
+
+static enum dpll_pin_state
+mlx5_dpll_pin_state_get(enum mlx5_msees_admin_status admin_status,
+			enum mlx5_msees_oper_status oper_status)
+{
+	return (admin_status == MLX5_MSEES_ADMIN_STATUS_TRACK &&
+		(oper_status == MLX5_MSEES_OPER_STATUS_SELF_TRACK ||
+		 oper_status == MLX5_MSEES_OPER_STATUS_OTHER_TRACK)) ?
+	       DPLL_PIN_STATE_CONNECTED : DPLL_PIN_STATE_DISCONNECTED;
+}
+
+static int mlx5_dpll_device_lock_status_get(const struct dpll_device *dpll,
+					    void *priv,
+					    enum dpll_lock_status *status,
+					    struct netlink_ext_ack *extack)
+{
+	enum mlx5_msees_oper_status oper_status;
+	struct mlx5_dpll *mdpll = priv;
+	bool ho_acq;
+	int err;
+
+	err = mlx5_dpll_synce_status_get(mdpll->mdev, NULL,
+					 &oper_status, &ho_acq);
+	if (err)
+		return err;
+
+	*status = mlx5_dpll_lock_status_get(oper_status, ho_acq);
+	return 0;
+}
+
+static int mlx5_dpll_device_mode_get(const struct dpll_device *dpll,
+				     void *priv,
+				     u32 *mode, struct netlink_ext_ack *extack)
+{
+	*mode = DPLL_MODE_MANUAL;
+	return 0;
+}
+
+static bool mlx5_dpll_device_mode_supported(const struct dpll_device *dpll,
+					    void *priv,
+					    enum dpll_mode mode,
+					    struct netlink_ext_ack *extack)
+{
+	return mode == DPLL_MODE_MANUAL;
+}
+
+static const struct dpll_device_ops mlx5_dpll_device_ops = {
+	.lock_status_get = mlx5_dpll_device_lock_status_get,
+	.mode_get = mlx5_dpll_device_mode_get,
+	.mode_supported = mlx5_dpll_device_mode_supported,
+};
+
+static int mlx5_dpll_pin_direction_get(const struct dpll_pin *pin,
+				       void *pin_priv,
+				       const struct dpll_device *dpll,
+				       void *dpll_priv,
+				       enum dpll_pin_direction *direction,
+				       struct netlink_ext_ack *extack)
+{
+	*direction = DPLL_PIN_DIRECTION_INPUT;
+	return 0;
+}
+
+static int mlx5_dpll_state_on_dpll_get(const struct dpll_pin *pin,
+				       void *pin_priv,
+				       const struct dpll_device *dpll,
+				       void *dpll_priv,
+				       enum dpll_pin_state *state,
+				       struct netlink_ext_ack *extack)
+{
+	enum mlx5_msees_admin_status admin_status;
+	enum mlx5_msees_oper_status oper_status;
+	struct mlx5_dpll *mdpll = pin_priv;
+	int err;
+
+	err = mlx5_dpll_synce_status_get(mdpll->mdev, &admin_status,
+					 &oper_status, NULL);
+	if (err)
+		return err;
+	*state = mlx5_dpll_pin_state_get(admin_status, oper_status);
+	return 0;
+}
+
+static int mlx5_dpll_state_on_dpll_set(const struct dpll_pin *pin,
+				       void *pin_priv,
+				       const struct dpll_device *dpll,
+				       void *dpll_priv,
+				       enum dpll_pin_state state,
+				       struct netlink_ext_ack *extack)
+{
+	struct mlx5_dpll *mdpll = pin_priv;
+
+	return mlx5_dpll_synce_status_set(mdpll->mdev,
+					  state == DPLL_PIN_STATE_CONNECTED ?
+					  MLX5_MSEES_ADMIN_STATUS_TRACK :
+					  MLX5_MSEES_ADMIN_STATUS_FREE_RUNNING);
+}
+
+static const struct dpll_pin_ops mlx5_dpll_pins_ops = {
+	.direction_get = mlx5_dpll_pin_direction_get,
+	.state_on_dpll_get = mlx5_dpll_state_on_dpll_get,
+	.state_on_dpll_set = mlx5_dpll_state_on_dpll_set,
+};
+
+static const struct dpll_pin_properties mlx5_dpll_pin_properties = {
+	.type = DPLL_PIN_TYPE_SYNCE_ETH_PORT,
+	.capabilities = DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE,
+};
+
+#define MLX5_DPLL_PERIODIC_WORK_INTERVAL 500 /* ms */
+
+static void mlx5_dpll_periodic_work_queue(struct mlx5_dpll *mdpll)
+{
+	queue_delayed_work(mdpll->wq, &mdpll->work,
+			   msecs_to_jiffies(MLX5_DPLL_PERIODIC_WORK_INTERVAL));
+}
+
+static void mlx5_dpll_periodic_work(struct work_struct *work)
+{
+	struct mlx5_dpll *mdpll = container_of(work, struct mlx5_dpll,
+					       work.work);
+	enum mlx5_msees_admin_status admin_status;
+	enum mlx5_msees_oper_status oper_status;
+	enum dpll_lock_status lock_status;
+	enum dpll_pin_state pin_state;
+	bool ho_acq;
+	int err;
+
+	err = mlx5_dpll_synce_status_get(mdpll->mdev, &admin_status,
+					 &oper_status, &ho_acq);
+	if (err)
+		goto err_out;
+	lock_status = mlx5_dpll_lock_status_get(oper_status, ho_acq);
+	pin_state = mlx5_dpll_pin_state_get(admin_status, oper_status);
+
+	if (!mdpll->last.valid)
+		goto invalid_out;
+
+	if (mdpll->last.lock_status != lock_status)
+		dpll_device_change_ntf(mdpll->dpll);
+	if (mdpll->last.pin_state != pin_state)
+		dpll_pin_change_ntf(mdpll->dpll_pin);
+
+invalid_out:
+	mdpll->last.lock_status = lock_status;
+	mdpll->last.pin_state = pin_state;
+	mdpll->last.valid = true;
+err_out:
+	mlx5_dpll_periodic_work_queue(mdpll);
+}
+
+static void mlx5_dpll_netdev_dpll_pin_set(struct mlx5_dpll *mdpll,
+					  struct net_device *netdev)
+{
+	if (mdpll->tracking_netdev)
+		return;
+	netdev_dpll_pin_set(netdev, mdpll->dpll_pin);
+	mdpll->tracking_netdev = netdev;
+}
+
+static void mlx5_dpll_netdev_dpll_pin_clear(struct mlx5_dpll *mdpll)
+{
+	if (!mdpll->tracking_netdev)
+		return;
+	netdev_dpll_pin_clear(mdpll->tracking_netdev);
+	mdpll->tracking_netdev = NULL;
+}
+
+static int mlx5_dpll_mdev_notifier_event(struct notifier_block *nb,
+					 unsigned long event, void *data)
+{
+	struct mlx5_dpll *mdpll = container_of(nb, struct mlx5_dpll, mdev_nb);
+	struct net_device *netdev = data;
+
+	switch (event) {
+	case MLX5_DRIVER_EVENT_UPLINK_NETDEV:
+		if (netdev)
+			mlx5_dpll_netdev_dpll_pin_set(mdpll, netdev);
+		else
+			mlx5_dpll_netdev_dpll_pin_clear(mdpll);
+		break;
+	default:
+		return NOTIFY_DONE;
+	}
+
+	return NOTIFY_OK;
+}
+
+static void mlx5_dpll_mdev_netdev_track(struct mlx5_dpll *mdpll,
+					struct mlx5_core_dev *mdev)
+{
+	mdpll->mdev_nb.notifier_call = mlx5_dpll_mdev_notifier_event;
+	mlx5_blocking_notifier_register(mdev, &mdpll->mdev_nb);
+	mlx5_core_uplink_netdev_event_replay(mdev);
+}
+
+static void mlx5_dpll_mdev_netdev_untrack(struct mlx5_dpll *mdpll,
+					  struct mlx5_core_dev *mdev)
+{
+	mlx5_blocking_notifier_unregister(mdev, &mdpll->mdev_nb);
+	mlx5_dpll_netdev_dpll_pin_clear(mdpll);
+}
+
+static int mlx5_dpll_probe(struct auxiliary_device *adev,
+			   const struct auxiliary_device_id *id)
+{
+	struct mlx5_adev *edev = container_of(adev, struct mlx5_adev, adev);
+	struct mlx5_core_dev *mdev = edev->mdev;
+	struct mlx5_dpll *mdpll;
+	u64 clock_id;
+	int err;
+
+	err = mlx5_dpll_synce_status_set(mdev,
+					 MLX5_MSEES_ADMIN_STATUS_FREE_RUNNING);
+	if (err)
+		return err;
+
+	err = mlx5_dpll_clock_id_get(mdev, &clock_id);
+	if (err)
+		return err;
+
+	mdpll = kzalloc(sizeof(*mdpll), GFP_KERNEL);
+	if (!mdpll)
+		return -ENOMEM;
+	mdpll->mdev = mdev;
+	auxiliary_set_drvdata(adev, mdpll);
+
+	/* Multiple mdev instances might share one DPLL device. */
+	mdpll->dpll = dpll_device_get(clock_id, 0, THIS_MODULE);
+	if (IS_ERR(mdpll->dpll)) {
+		err = PTR_ERR(mdpll->dpll);
+		goto err_free_mdpll;
+	}
+
+	err = dpll_device_register(mdpll->dpll, DPLL_TYPE_EEC,
+				   &mlx5_dpll_device_ops, mdpll);
+	if (err)
+		goto err_put_dpll_device;
+
+	/* Multiple mdev instances might share one DPLL pin. */
+	mdpll->dpll_pin = dpll_pin_get(clock_id, mlx5_get_dev_index(mdev),
+				       THIS_MODULE, &mlx5_dpll_pin_properties);
+	if (IS_ERR(mdpll->dpll_pin)) {
+		err = PTR_ERR(mdpll->dpll_pin);
+		goto err_unregister_dpll_device;
+	}
+
+	err = dpll_pin_register(mdpll->dpll, mdpll->dpll_pin,
+				&mlx5_dpll_pins_ops, mdpll);
+	if (err)
+		goto err_put_dpll_pin;
+
+	mdpll->wq = create_singlethread_workqueue("mlx5_dpll");
+	if (!mdpll->wq) {
+		err = -ENOMEM;
+		goto err_unregister_dpll_pin;
+	}
+
+	mlx5_dpll_mdev_netdev_track(mdpll, mdev);
+
+	INIT_DELAYED_WORK(&mdpll->work, &mlx5_dpll_periodic_work);
+	mlx5_dpll_periodic_work_queue(mdpll);
+
+	return 0;
+
+err_unregister_dpll_pin:
+	dpll_pin_unregister(mdpll->dpll, mdpll->dpll_pin,
+			    &mlx5_dpll_pins_ops, mdpll);
+err_put_dpll_pin:
+	dpll_pin_put(mdpll->dpll_pin);
+err_unregister_dpll_device:
+	dpll_device_unregister(mdpll->dpll, &mlx5_dpll_device_ops, mdpll);
+err_put_dpll_device:
+	dpll_device_put(mdpll->dpll);
+err_free_mdpll:
+	kfree(mdpll);
+	return err;
+}
+
+static void mlx5_dpll_remove(struct auxiliary_device *adev)
+{
+	struct mlx5_dpll *mdpll = auxiliary_get_drvdata(adev);
+	struct mlx5_core_dev *mdev = mdpll->mdev;
+
+	cancel_delayed_work(&mdpll->work);
+	mlx5_dpll_mdev_netdev_untrack(mdpll, mdev);
+	destroy_workqueue(mdpll->wq);
+	dpll_pin_unregister(mdpll->dpll, mdpll->dpll_pin,
+			    &mlx5_dpll_pins_ops, mdpll);
+	dpll_pin_put(mdpll->dpll_pin);
+	dpll_device_unregister(mdpll->dpll, &mlx5_dpll_device_ops, mdpll);
+	dpll_device_put(mdpll->dpll);
+	kfree(mdpll);
+
+	mlx5_dpll_synce_status_set(mdev,
+				   MLX5_MSEES_ADMIN_STATUS_FREE_RUNNING);
+}
+
+static int mlx5_dpll_suspend(struct auxiliary_device *adev, pm_message_t state)
+{
+	return 0;
+}
+
+static int mlx5_dpll_resume(struct auxiliary_device *adev)
+{
+	return 0;
+}
+
+static const struct auxiliary_device_id mlx5_dpll_id_table[] = {
+	{ .name = MLX5_ADEV_NAME ".dpll", },
+	{},
+};
+
+MODULE_DEVICE_TABLE(auxiliary, mlx5_dpll_id_table);
+
+static struct auxiliary_driver mlx5_dpll_driver = {
+	.name = "dpll",
+	.probe = mlx5_dpll_probe,
+	.remove = mlx5_dpll_remove,
+	.suspend = mlx5_dpll_suspend,
+	.resume = mlx5_dpll_resume,
+	.id_table = mlx5_dpll_id_table,
+};
+
+static int __init mlx5_dpll_init(void)
+{
+	return auxiliary_driver_register(&mlx5_dpll_driver);
+}
+
+static void __exit mlx5_dpll_exit(void)
+{
+	auxiliary_driver_unregister(&mlx5_dpll_driver);
+}
+
+module_init(mlx5_dpll_init);
+module_exit(mlx5_dpll_exit);
+
+MODULE_AUTHOR("Jiri Pirko <jiri@nvidia.com>");
+MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX series) DPLL driver");
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 3033bbaeac81..92434814c855 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -155,6 +155,8 @@ enum {
 	MLX5_REG_MCC		 = 0x9062,
 	MLX5_REG_MCDA		 = 0x9063,
 	MLX5_REG_MCAM		 = 0x907f,
+	MLX5_REG_MSECQ		 = 0x9155,
+	MLX5_REG_MSEES		 = 0x9156,
 	MLX5_REG_MIRC		 = 0x9162,
 	MLX5_REG_SBCAM		 = 0xB01F,
 	MLX5_REG_RESOURCE_DUMP   = 0xC000,
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index fc3db401f8a2..dd8421d021cf 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -10176,7 +10176,9 @@ struct mlx5_ifc_mcam_access_reg_bits2 {
 	u8         mirc[0x1];
 	u8         regs_97_to_96[0x2];
 
-	u8         regs_95_to_64[0x20];
+	u8         regs_95_to_87[0x09];
+	u8         synce_registers[0x2];
+	u8         regs_84_to_64[0x15];
 
 	u8         regs_63_to_32[0x20];
 
@@ -12549,4 +12551,59 @@ struct mlx5_ifc_modify_page_track_obj_in_bits {
 	struct mlx5_ifc_page_track_bits obj_context;
 };
 
+struct mlx5_ifc_msecq_reg_bits {
+	u8         reserved_at_0[0x20];
+
+	u8         reserved_at_20[0x12];
+	u8         network_option[0x2];
+	u8         local_ssm_code[0x4];
+	u8         local_enhanced_ssm_code[0x8];
+
+	u8         local_clock_identity[0x40];
+
+	u8         reserved_at_80[0x180];
+};
+
+enum {
+	MLX5_MSEES_FIELD_SELECT_ENABLE			= BIT(0),
+	MLX5_MSEES_FIELD_SELECT_ADMIN_STATUS		= BIT(1),
+	MLX5_MSEES_FIELD_SELECT_ADMIN_FREQ_MEASURE	= BIT(2),
+};
+
+enum mlx5_msees_admin_status {
+	MLX5_MSEES_ADMIN_STATUS_FREE_RUNNING		= 0x0,
+	MLX5_MSEES_ADMIN_STATUS_TRACK			= 0x1,
+};
+
+enum mlx5_msees_oper_status {
+	MLX5_MSEES_OPER_STATUS_FREE_RUNNING		= 0x0,
+	MLX5_MSEES_OPER_STATUS_SELF_TRACK		= 0x1,
+	MLX5_MSEES_OPER_STATUS_OTHER_TRACK		= 0x2,
+	MLX5_MSEES_OPER_STATUS_HOLDOVER			= 0x3,
+	MLX5_MSEES_OPER_STATUS_FAIL_HOLDOVER		= 0x4,
+	MLX5_MSEES_OPER_STATUS_FAIL_FREE_RUNNING	= 0x5,
+};
+
+struct mlx5_ifc_msees_reg_bits {
+	u8         reserved_at_0[0x8];
+	u8         local_port[0x8];
+	u8         pnat[0x2];
+	u8         lp_msb[0x2];
+	u8         reserved_at_14[0xc];
+
+	u8         field_select[0x20];
+
+	u8         admin_status[0x4];
+	u8         oper_status[0x4];
+	u8         ho_acq[0x1];
+	u8         reserved_at_49[0xc];
+	u8         admin_freq_measure[0x1];
+	u8         oper_freq_measure[0x1];
+	u8         failure_reason[0x9];
+
+	u8         frequency_diff[0x20];
+
+	u8         reserved_at_80[0x180];
+};
+
 #endif /* MLX5_IFC_H */
-- 
cgit v1.2.3-70-g09d2


From 0b7a2721e36c11313f8b0f251a508d25a872cd28 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Wed, 13 Sep 2023 09:12:39 +0200
Subject: devlink: expose peer SF devlink instance

Introduce a new helper devl_port_fn_devlink_set() to be used by driver
assigning a devlink instance to the peer devlink port function.

Expose this to user over new netlink attribute nested under port
function nest to expose devlink handle related to the port function.

This is particularly helpful for user to understand the relationship
between devlink instances created for SFs and the port functions
they belong to.

Note that caller of devlink_port_notify() needs to hold devlink
instance lock, put the assertion to devl_port_fn_devlink_set() to make
this requirement explicit. Also note the limitations that only allow to
make this assignment for registered objects.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h        |  3 +++
 include/uapi/linux/devlink.h |  1 +
 net/devlink/port.c           | 51 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 55 insertions(+)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 29fd1b4ee654..2655ab6101ec 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -150,6 +150,7 @@ struct devlink_port {
 
 	struct devlink_rate *devlink_rate;
 	struct devlink_linecard *linecard;
+	u32 rel_index;
 };
 
 struct devlink_port_new_attrs {
@@ -1697,6 +1698,8 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 contro
 void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port,
 				   u32 controller, u16 pf, u32 sf,
 				   bool external);
+int devl_port_fn_devlink_set(struct devlink_port *devlink_port,
+			     struct devlink *fn_devlink);
 struct devlink_rate *
 devl_rate_node_create(struct devlink *devlink, void *priv, char *node_name,
 		      struct devlink_rate *parent);
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 03875e078be8..cd4b82458d1b 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -680,6 +680,7 @@ enum devlink_port_function_attr {
 	DEVLINK_PORT_FN_ATTR_STATE,	/* u8 */
 	DEVLINK_PORT_FN_ATTR_OPSTATE,	/* u8 */
 	DEVLINK_PORT_FN_ATTR_CAPS,	/* bitfield32 */
+	DEVLINK_PORT_FN_ATTR_DEVLINK,	/* nested */
 
 	__DEVLINK_PORT_FUNCTION_ATTR_MAX,
 	DEVLINK_PORT_FUNCTION_ATTR_MAX = __DEVLINK_PORT_FUNCTION_ATTR_MAX - 1
diff --git a/net/devlink/port.c b/net/devlink/port.c
index 7b300a322ed9..4e9003242448 100644
--- a/net/devlink/port.c
+++ b/net/devlink/port.c
@@ -428,6 +428,13 @@ devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *por
 	if (err)
 		goto out;
 	err = devlink_port_fn_state_fill(port, msg, extack, &msg_updated);
+	if (err)
+		goto out;
+	err = devlink_rel_devlink_handle_put(msg, port->devlink,
+					     port->rel_index,
+					     DEVLINK_PORT_FN_ATTR_DEVLINK,
+					     &msg_updated);
+
 out:
 	if (err || !msg_updated)
 		nla_nest_cancel(msg, function_attr);
@@ -1392,6 +1399,50 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro
 }
 EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set);
 
+static void devlink_port_rel_notify_cb(struct devlink *devlink, u32 port_index)
+{
+	struct devlink_port *devlink_port;
+
+	devlink_port = devlink_port_get_by_index(devlink, port_index);
+	if (!devlink_port)
+		return;
+	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+}
+
+static void devlink_port_rel_cleanup_cb(struct devlink *devlink, u32 port_index,
+					u32 rel_index)
+{
+	struct devlink_port *devlink_port;
+
+	devlink_port = devlink_port_get_by_index(devlink, port_index);
+	if (devlink_port && devlink_port->rel_index == rel_index)
+		devlink_port->rel_index = 0;
+}
+
+/**
+ * devl_port_fn_devlink_set - Attach peer devlink
+ *			      instance to port function.
+ * @devlink_port: devlink port
+ * @fn_devlink: devlink instance to attach
+ */
+int devl_port_fn_devlink_set(struct devlink_port *devlink_port,
+			     struct devlink *fn_devlink)
+{
+	ASSERT_DEVLINK_PORT_REGISTERED(devlink_port);
+
+	if (WARN_ON(devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_SF ||
+		    devlink_port->attrs.pci_sf.external))
+		return -EINVAL;
+
+	return devlink_rel_nested_in_add(&devlink_port->rel_index,
+					 devlink_port->devlink->index,
+					 devlink_port->index,
+					 devlink_port_rel_notify_cb,
+					 devlink_port_rel_cleanup_cb,
+					 fn_devlink);
+}
+EXPORT_SYMBOL_GPL(devl_port_fn_devlink_set);
+
 /**
  *	devlink_port_linecard_set - Link port with a linecard
  *
-- 
cgit v1.2.3-70-g09d2


From ac5f395685bd16ca30c1c834dcbf8b555605ccae Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Wed, 13 Sep 2023 09:12:40 +0200
Subject: net/mlx5: SF, Implement peer devlink set for SF representor devlink
 port

Benefit from the existence of internal mlx5 notifier and extend it by
event MLX5_DRIVER_EVENT_SF_PEER_DEVLINK. Use this event from SF
auxiliary device probe/remove functions to pass the registered SF
devlink instance to the SF representor.

Process the new event in SF representor code and call
devl_port_fn_devlink_set() to do the assignments. Implement this in work
to avoid possible deadlock when probe/remove function of SF may be
called with devlink instance lock held during devlink reload.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/mellanox/mlx5/core/sf/dev/dev.h   |  6 ++++
 .../ethernet/mellanox/mlx5/core/sf/dev/driver.c    | 26 +++++++++++++++++
 .../net/ethernet/mellanox/mlx5/core/sf/devlink.c   | 34 ++++++++++++++++++++++
 include/linux/mlx5/device.h                        |  1 +
 4 files changed, 67 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h
index 2a66a427ef15..b99131e95e37 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h
@@ -19,6 +19,12 @@ struct mlx5_sf_dev {
 	u16 fn_id;
 };
 
+struct mlx5_sf_peer_devlink_event_ctx {
+	u16 fn_id;
+	struct devlink *devlink;
+	int err;
+};
+
 void mlx5_sf_dev_table_create(struct mlx5_core_dev *dev);
 void mlx5_sf_dev_table_destroy(struct mlx5_core_dev *dev);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c
index 8fe82f1191bb..169c2c68ed5c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c
@@ -8,6 +8,20 @@
 #include "dev.h"
 #include "devlink.h"
 
+static int mlx5_core_peer_devlink_set(struct mlx5_sf_dev *sf_dev, struct devlink *devlink)
+{
+	struct mlx5_sf_peer_devlink_event_ctx event_ctx = {
+		.fn_id = sf_dev->fn_id,
+		.devlink = devlink,
+	};
+	int ret;
+
+	ret = mlx5_blocking_notifier_call_chain(sf_dev->parent_mdev,
+						MLX5_DRIVER_EVENT_SF_PEER_DEVLINK,
+						&event_ctx);
+	return ret == NOTIFY_OK ? event_ctx.err : 0;
+}
+
 static int mlx5_sf_dev_probe(struct auxiliary_device *adev, const struct auxiliary_device_id *id)
 {
 	struct mlx5_sf_dev *sf_dev = container_of(adev, struct mlx5_sf_dev, adev);
@@ -54,9 +68,21 @@ static int mlx5_sf_dev_probe(struct auxiliary_device *adev, const struct auxilia
 		mlx5_core_warn(mdev, "mlx5_init_one err=%d\n", err);
 		goto init_one_err;
 	}
+
+	err = mlx5_core_peer_devlink_set(sf_dev, devlink);
+	if (err) {
+		mlx5_core_warn(mdev, "mlx5_core_peer_devlink_set err=%d\n", err);
+		goto peer_devlink_set_err;
+	}
+
 	devlink_register(devlink);
 	return 0;
 
+peer_devlink_set_err:
+	if (mlx5_dev_is_lightweight(sf_dev->mdev))
+		mlx5_uninit_one_light(sf_dev->mdev);
+	else
+		mlx5_uninit_one(sf_dev->mdev);
 init_one_err:
 	iounmap(mdev->iseg);
 remap_err:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
index e34a8f88c518..964a5b1876f3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
@@ -28,6 +28,7 @@ struct mlx5_sf_table {
 	struct mutex sf_state_lock; /* Serializes sf state among user cmds & vhca event handler. */
 	struct notifier_block esw_nb;
 	struct notifier_block vhca_nb;
+	struct notifier_block mdev_nb;
 };
 
 static struct mlx5_sf *
@@ -511,6 +512,35 @@ static int mlx5_sf_esw_event(struct notifier_block *nb, unsigned long event, voi
 	return 0;
 }
 
+static int mlx5_sf_mdev_event(struct notifier_block *nb, unsigned long event, void *data)
+{
+	struct mlx5_sf_table *table = container_of(nb, struct mlx5_sf_table, mdev_nb);
+	struct mlx5_sf_peer_devlink_event_ctx *event_ctx = data;
+	int ret = NOTIFY_DONE;
+	struct mlx5_sf *sf;
+
+	if (event != MLX5_DRIVER_EVENT_SF_PEER_DEVLINK)
+		return NOTIFY_DONE;
+
+	table = mlx5_sf_table_try_get(table->dev);
+	if (!table)
+		return NOTIFY_DONE;
+
+	mutex_lock(&table->sf_state_lock);
+	sf = mlx5_sf_lookup_by_function_id(table, event_ctx->fn_id);
+	if (!sf)
+		goto out;
+
+	event_ctx->err = devl_port_fn_devlink_set(&sf->dl_port.dl_port,
+						  event_ctx->devlink);
+
+	ret = NOTIFY_OK;
+out:
+	mutex_unlock(&table->sf_state_lock);
+	mlx5_sf_table_put(table);
+	return ret;
+}
+
 static bool mlx5_sf_table_supported(const struct mlx5_core_dev *dev)
 {
 	return dev->priv.eswitch && MLX5_ESWITCH_MANAGER(dev) &&
@@ -544,6 +574,9 @@ int mlx5_sf_table_init(struct mlx5_core_dev *dev)
 	if (err)
 		goto vhca_err;
 
+	table->mdev_nb.notifier_call = mlx5_sf_mdev_event;
+	mlx5_blocking_notifier_register(dev, &table->mdev_nb);
+
 	return 0;
 
 vhca_err:
@@ -562,6 +595,7 @@ void mlx5_sf_table_cleanup(struct mlx5_core_dev *dev)
 	if (!table)
 		return;
 
+	mlx5_blocking_notifier_unregister(dev, &table->mdev_nb);
 	mlx5_vhca_event_notifier_unregister(table->dev, &table->vhca_nb);
 	mlx5_esw_event_notifier_unregister(dev->priv.eswitch, &table->esw_nb);
 	WARN_ON(refcount_read(&table->refcount));
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 4d5be378fa8c..8fbe22de16ef 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -366,6 +366,7 @@ enum mlx5_driver_event {
 	MLX5_DRIVER_EVENT_UPLINK_NETDEV,
 	MLX5_DRIVER_EVENT_MACSEC_SA_ADDED,
 	MLX5_DRIVER_EVENT_MACSEC_SA_DELETED,
+	MLX5_DRIVER_EVENT_SF_PEER_DEVLINK,
 };
 
 enum {
-- 
cgit v1.2.3-70-g09d2


From 9473bc0119e7e7630d7c1c7c3816c290a6f3ae19 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Wed, 13 Sep 2023 09:12:41 +0200
Subject: devlink: convert linecard nested devlink to new rel infrastructure

Benefit from the newly introduced rel infrastructure, treat the linecard
nested devlink instances in the same way as port function instances.
Convert the code to use the rel infrastructure.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../ethernet/mellanox/mlxsw/core_linecard_dev.c    |  9 +++--
 include/net/devlink.h                              |  4 +-
 net/devlink/linecard.c                             | 47 ++++++++++++++++------
 3 files changed, 42 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_linecard_dev.c b/drivers/net/ethernet/mellanox/mlxsw/core_linecard_dev.c
index af37e650a8ad..e8d6fe35bf36 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core_linecard_dev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core_linecard_dev.c
@@ -132,6 +132,7 @@ static int mlxsw_linecard_bdev_probe(struct auxiliary_device *adev,
 	struct mlxsw_linecard *linecard = linecard_bdev->linecard;
 	struct mlxsw_linecard_dev *linecard_dev;
 	struct devlink *devlink;
+	int err;
 
 	devlink = devlink_alloc(&mlxsw_linecard_dev_devlink_ops,
 				sizeof(*linecard_dev), &adev->dev);
@@ -141,8 +142,12 @@ static int mlxsw_linecard_bdev_probe(struct auxiliary_device *adev,
 	linecard_dev->linecard = linecard_bdev->linecard;
 	linecard_bdev->linecard_dev = linecard_dev;
 
+	err = devlink_linecard_nested_dl_set(linecard->devlink_linecard, devlink);
+	if (err) {
+		devlink_free(devlink);
+		return err;
+	}
 	devlink_register(devlink);
-	devlink_linecard_nested_dl_set(linecard->devlink_linecard, devlink);
 	return 0;
 }
 
@@ -151,9 +156,7 @@ static void mlxsw_linecard_bdev_remove(struct auxiliary_device *adev)
 	struct mlxsw_linecard_bdev *linecard_bdev =
 			container_of(adev, struct mlxsw_linecard_bdev, adev);
 	struct devlink *devlink = priv_to_devlink(linecard_bdev->linecard_dev);
-	struct mlxsw_linecard *linecard = linecard_bdev->linecard;
 
-	devlink_linecard_nested_dl_set(linecard->devlink_linecard, NULL);
 	devlink_unregister(devlink);
 	devlink_free(devlink);
 }
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 2655ab6101ec..0dfcd7d7fa18 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1720,8 +1720,8 @@ void devlink_linecard_provision_clear(struct devlink_linecard *linecard);
 void devlink_linecard_provision_fail(struct devlink_linecard *linecard);
 void devlink_linecard_activate(struct devlink_linecard *linecard);
 void devlink_linecard_deactivate(struct devlink_linecard *linecard);
-void devlink_linecard_nested_dl_set(struct devlink_linecard *linecard,
-				    struct devlink *nested_devlink);
+int devlink_linecard_nested_dl_set(struct devlink_linecard *linecard,
+				   struct devlink *nested_devlink);
 int devl_sb_register(struct devlink *devlink, unsigned int sb_index,
 		     u32 size, u16 ingress_pools_count,
 		     u16 egress_pools_count, u16 ingress_tc_count,
diff --git a/net/devlink/linecard.c b/net/devlink/linecard.c
index 36170f466878..9ff1813f88c5 100644
--- a/net/devlink/linecard.c
+++ b/net/devlink/linecard.c
@@ -17,7 +17,7 @@ struct devlink_linecard {
 	const char *type;
 	struct devlink_linecard_type *types;
 	unsigned int types_count;
-	struct devlink *nested_devlink;
+	u32 rel_index;
 };
 
 unsigned int devlink_linecard_index(struct devlink_linecard *linecard)
@@ -112,10 +112,10 @@ static int devlink_nl_linecard_fill(struct sk_buff *msg,
 		nla_nest_end(msg, attr);
 	}
 
-	if (linecard->nested_devlink &&
-	    devlink_nl_put_nested_handle(msg, devlink_net(devlink),
-					 linecard->nested_devlink,
-					 DEVLINK_ATTR_NESTED_DEVLINK))
+	if (devlink_rel_devlink_handle_put(msg, devlink,
+					   linecard->rel_index,
+					   DEVLINK_ATTR_NESTED_DEVLINK,
+					   NULL))
 		goto nla_put_failure;
 
 	genlmsg_end(msg, hdr);
@@ -524,7 +524,6 @@ EXPORT_SYMBOL_GPL(devlink_linecard_provision_set);
 void devlink_linecard_provision_clear(struct devlink_linecard *linecard)
 {
 	mutex_lock(&linecard->state_lock);
-	WARN_ON(linecard->nested_devlink);
 	linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED;
 	linecard->type = NULL;
 	devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
@@ -543,7 +542,6 @@ EXPORT_SYMBOL_GPL(devlink_linecard_provision_clear);
 void devlink_linecard_provision_fail(struct devlink_linecard *linecard)
 {
 	mutex_lock(&linecard->state_lock);
-	WARN_ON(linecard->nested_devlink);
 	linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING_FAILED;
 	devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
 	mutex_unlock(&linecard->state_lock);
@@ -591,6 +589,27 @@ void devlink_linecard_deactivate(struct devlink_linecard *linecard)
 }
 EXPORT_SYMBOL_GPL(devlink_linecard_deactivate);
 
+static void devlink_linecard_rel_notify_cb(struct devlink *devlink,
+					   u32 linecard_index)
+{
+	struct devlink_linecard *linecard;
+
+	linecard = devlink_linecard_get_by_index(devlink, linecard_index);
+	if (!linecard)
+		return;
+	devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+}
+
+static void devlink_linecard_rel_cleanup_cb(struct devlink *devlink,
+					    u32 linecard_index, u32 rel_index)
+{
+	struct devlink_linecard *linecard;
+
+	linecard = devlink_linecard_get_by_index(devlink, linecard_index);
+	if (linecard && linecard->rel_index == rel_index)
+		linecard->rel_index = 0;
+}
+
 /**
  *	devlink_linecard_nested_dl_set - Attach/detach nested devlink
  *					 instance to linecard.
@@ -598,12 +617,14 @@ EXPORT_SYMBOL_GPL(devlink_linecard_deactivate);
  *	@linecard: devlink linecard
  *	@nested_devlink: devlink instance to attach or NULL to detach
  */
-void devlink_linecard_nested_dl_set(struct devlink_linecard *linecard,
-				    struct devlink *nested_devlink)
+int devlink_linecard_nested_dl_set(struct devlink_linecard *linecard,
+				   struct devlink *nested_devlink)
 {
-	mutex_lock(&linecard->state_lock);
-	linecard->nested_devlink = nested_devlink;
-	devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
-	mutex_unlock(&linecard->state_lock);
+	return devlink_rel_nested_in_add(&linecard->rel_index,
+					 linecard->devlink->index,
+					 linecard->index,
+					 devlink_linecard_rel_notify_cb,
+					 devlink_linecard_rel_cleanup_cb,
+					 nested_devlink);
 }
 EXPORT_SYMBOL_GPL(devlink_linecard_nested_dl_set);
-- 
cgit v1.2.3-70-g09d2


From c5e1bf8a51cfe5060e91c7533098e329c0118f6d Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Wed, 13 Sep 2023 09:12:42 +0200
Subject: devlink: introduce possibility to expose info about nested devlinks

In mlx5, there is a devlink instance created for PCI device. Also, one
separate devlink instance is created for auxiliary device that
represents the netdev of uplink port. This relation is currently
invisible to the devlink user.

Benefit from the rel infrastructure and allow for nested devlink
instance to set the relationship for the nested-in devlink instance.
Note that there may be many nested instances, therefore use xarray to
hold the list of rel_indexes for individual nested instances.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h       |  2 ++
 net/devlink/core.c          |  2 ++
 net/devlink/dev.c           | 49 +++++++++++++++++++++++++++++++++++++++++++++
 net/devlink/devl_internal.h |  1 +
 4 files changed, 54 insertions(+)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 0dfcd7d7fa18..fad8e36e3d98 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1921,6 +1921,8 @@ devlink_health_reporter_state_update(struct devlink_health_reporter *reporter,
 void
 devlink_health_reporter_recovery_done(struct devlink_health_reporter *reporter);
 
+int devl_nested_devlink_set(struct devlink *devlink,
+			    struct devlink *nested_devlink);
 bool devlink_is_reload_failed(const struct devlink *devlink);
 void devlink_remote_reload_actions_performed(struct devlink *devlink,
 					     enum devlink_reload_limit limit,
diff --git a/net/devlink/core.c b/net/devlink/core.c
index 2a98ff9a2f6b..bcbbb952569f 100644
--- a/net/devlink/core.c
+++ b/net/devlink/core.c
@@ -430,6 +430,7 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops,
 	xa_init_flags(&devlink->ports, XA_FLAGS_ALLOC);
 	xa_init_flags(&devlink->params, XA_FLAGS_ALLOC);
 	xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC);
+	xa_init_flags(&devlink->nested_rels, XA_FLAGS_ALLOC);
 	write_pnet(&devlink->_net, net);
 	INIT_LIST_HEAD(&devlink->rate_list);
 	INIT_LIST_HEAD(&devlink->linecard_list);
@@ -476,6 +477,7 @@ void devlink_free(struct devlink *devlink)
 	WARN_ON(!list_empty(&devlink->linecard_list));
 	WARN_ON(!xa_empty(&devlink->ports));
 
+	xa_destroy(&devlink->nested_rels);
 	xa_destroy(&devlink->snapshot_ids);
 	xa_destroy(&devlink->params);
 	xa_destroy(&devlink->ports);
diff --git a/net/devlink/dev.c b/net/devlink/dev.c
index 3ae26d9088ab..dc8039ca2b38 100644
--- a/net/devlink/dev.c
+++ b/net/devlink/dev.c
@@ -138,6 +138,23 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
+static int devlink_nl_nested_fill(struct sk_buff *msg, struct devlink *devlink)
+{
+	unsigned long rel_index;
+	void *unused;
+	int err;
+
+	xa_for_each(&devlink->nested_rels, rel_index, unused) {
+		err = devlink_rel_devlink_handle_put(msg, devlink,
+						     rel_index,
+						     DEVLINK_ATTR_NESTED_DEVLINK,
+						     NULL);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
 static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink,
 			   enum devlink_command cmd, u32 portid,
 			   u32 seq, int flags)
@@ -164,6 +181,10 @@ static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink,
 		goto dev_stats_nest_cancel;
 
 	nla_nest_end(msg, dev_stats);
+
+	if (devlink_nl_nested_fill(msg, devlink))
+		goto nla_put_failure;
+
 	genlmsg_end(msg, hdr);
 	return 0;
 
@@ -230,6 +251,34 @@ int devlink_nl_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb)
 	return devlink_nl_dumpit(msg, cb, devlink_nl_get_dump_one);
 }
 
+static void devlink_rel_notify_cb(struct devlink *devlink, u32 obj_index)
+{
+	devlink_notify(devlink, DEVLINK_CMD_NEW);
+}
+
+static void devlink_rel_cleanup_cb(struct devlink *devlink, u32 obj_index,
+				   u32 rel_index)
+{
+	xa_erase(&devlink->nested_rels, rel_index);
+}
+
+int devl_nested_devlink_set(struct devlink *devlink,
+			    struct devlink *nested_devlink)
+{
+	u32 rel_index;
+	int err;
+
+	err = devlink_rel_nested_in_add(&rel_index, devlink->index, 0,
+					devlink_rel_notify_cb,
+					devlink_rel_cleanup_cb,
+					nested_devlink);
+	if (err)
+		return err;
+	return xa_insert(&devlink->nested_rels, rel_index,
+			 xa_mk_value(0), GFP_KERNEL);
+}
+EXPORT_SYMBOL_GPL(devl_nested_devlink_set);
+
 void devlink_notify_register(struct devlink *devlink)
 {
 	devlink_notify(devlink, DEVLINK_CMD_NEW);
diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h
index 4cb534aff44d..741d1bf1bec8 100644
--- a/net/devlink/devl_internal.h
+++ b/net/devlink/devl_internal.h
@@ -58,6 +58,7 @@ struct devlink {
 	refcount_t refcount;
 	struct rcu_work rwork;
 	struct devlink_rel *rel;
+	struct xarray nested_rels;
 	char priv[] __aligned(NETDEV_ALIGN);
 };
 
-- 
cgit v1.2.3-70-g09d2


From ddd7f45c899f7524bdbe6a32fe4906cde8b07b9b Mon Sep 17 00:00:00 2001
From: Wen Gong <quic_wgong@quicinc.com>
Date: Thu, 14 Sep 2023 04:20:26 -0400
Subject: wifi: cfg80211: save power spectral density(psd) of regulatory rule

6 GHz regulatory domains introduces Power Spectral Density (PSD).
The PSD value of the regulatory rule should be taken into effect
for the ieee80211_channels falling into that particular regulatory
rule. Save the values in the channel which has PSD value and add
nl80211 attributes accordingly to handle it.

Co-developed-by: Aditya Kumar Singh <quic_adisi@quicinc.com>
Signed-off-by: Aditya Kumar Singh <quic_adisi@quicinc.com>
Signed-off-by: Wen Gong <quic_wgong@quicinc.com>
Link: https://lore.kernel.org/r/20230914082026.3709-1-quic_wgong@quicinc.com
[use hole in chan flags, reword docs]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  6 +++++-
 include/net/regulatory.h     |  1 +
 include/uapi/linux/nl80211.h |  9 +++++++++
 net/wireless/nl80211.c       |  9 +++++++++
 net/wireless/reg.c           | 17 +++++++++++++++++
 5 files changed, 41 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 8fcfe1869424..9af714431b22 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -76,6 +76,8 @@ struct wiphy;
  * @IEEE80211_CHAN_DISABLED: This channel is disabled.
  * @IEEE80211_CHAN_NO_IR: do not initiate radiation, this includes
  *	sending probe requests or beaconing.
+ * @IEEE80211_CHAN_PSD: Power spectral density (in dBm) is set for this
+ *	channel.
  * @IEEE80211_CHAN_RADAR: Radar detection is required on this channel.
  * @IEEE80211_CHAN_NO_HT40PLUS: extension channel above this channel
  *	is not permitted.
@@ -119,7 +121,7 @@ struct wiphy;
 enum ieee80211_channel_flags {
 	IEEE80211_CHAN_DISABLED		= 1<<0,
 	IEEE80211_CHAN_NO_IR		= 1<<1,
-	/* hole at 1<<2 */
+	IEEE80211_CHAN_PSD		= 1<<2,
 	IEEE80211_CHAN_RADAR		= 1<<3,
 	IEEE80211_CHAN_NO_HT40PLUS	= 1<<4,
 	IEEE80211_CHAN_NO_HT40MINUS	= 1<<5,
@@ -171,6 +173,7 @@ enum ieee80211_channel_flags {
  *	on this channel.
  * @dfs_state_entered: timestamp (jiffies) when the dfs state was entered.
  * @dfs_cac_ms: DFS CAC time in milliseconds, this is valid for DFS channels.
+ * @psd: power spectral density (in dBm)
  */
 struct ieee80211_channel {
 	enum nl80211_band band;
@@ -187,6 +190,7 @@ struct ieee80211_channel {
 	enum nl80211_dfs_state dfs_state;
 	unsigned long dfs_state_entered;
 	unsigned int dfs_cac_ms;
+	s8 psd;
 };
 
 /**
diff --git a/include/net/regulatory.h b/include/net/regulatory.h
index b2cb4a9eb04d..ebf9e028d1ef 100644
--- a/include/net/regulatory.h
+++ b/include/net/regulatory.h
@@ -213,6 +213,7 @@ struct ieee80211_reg_rule {
 	u32 flags;
 	u32 dfs_cac_ms;
 	bool has_wmm;
+	s8 psd;
 };
 
 struct ieee80211_regdomain {
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index f797ab7a6547..367e5fbc8930 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -4215,6 +4215,8 @@ enum nl80211_wmm_rule {
  *	as the primary or any of the secondary channels isn't possible
  * @NL80211_FREQUENCY_ATTR_NO_EHT: EHT operation is not allowed on this channel
  *	in current regulatory domain.
+ * @NL80211_FREQUENCY_ATTR_PSD: Power spectral density (in dBm) that
+ *	is allowed on this channel in current regulatory domain.
  * @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number
  *	currently defined
  * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use
@@ -4253,6 +4255,7 @@ enum nl80211_frequency_attr {
 	NL80211_FREQUENCY_ATTR_16MHZ,
 	NL80211_FREQUENCY_ATTR_NO_320MHZ,
 	NL80211_FREQUENCY_ATTR_NO_EHT,
+	NL80211_FREQUENCY_ATTR_PSD,
 
 	/* keep last */
 	__NL80211_FREQUENCY_ATTR_AFTER_LAST,
@@ -4353,6 +4356,8 @@ enum nl80211_reg_type {
  * 	a given frequency range. The value is in mBm (100 * dBm).
  * @NL80211_ATTR_DFS_CAC_TIME: DFS CAC time in milliseconds.
  *	If not present or 0 default CAC time will be used.
+ * @NL80211_ATTR_POWER_RULE_PSD: power spectral density (in dBm).
+ *	This could be negative.
  * @NL80211_REG_RULE_ATTR_MAX: highest regulatory rule attribute number
  *	currently defined
  * @__NL80211_REG_RULE_ATTR_AFTER_LAST: internal use
@@ -4370,6 +4375,8 @@ enum nl80211_reg_rule_attr {
 
 	NL80211_ATTR_DFS_CAC_TIME,
 
+	NL80211_ATTR_POWER_RULE_PSD,
+
 	/* keep last */
 	__NL80211_REG_RULE_ATTR_AFTER_LAST,
 	NL80211_REG_RULE_ATTR_MAX = __NL80211_REG_RULE_ATTR_AFTER_LAST - 1
@@ -4453,6 +4460,7 @@ enum nl80211_sched_scan_match_attr {
  * @NL80211_RRF_NO_HE: HE operation not allowed
  * @NL80211_RRF_NO_320MHZ: 320MHz operation not allowed
  * @NL80211_RRF_NO_EHT: EHT operation not allowed
+ * @NL80211_RRF_PSD: Ruleset has power spectral density value
  */
 enum nl80211_reg_rule_flags {
 	NL80211_RRF_NO_OFDM		= 1<<0,
@@ -4473,6 +4481,7 @@ enum nl80211_reg_rule_flags {
 	NL80211_RRF_NO_HE		= 1<<17,
 	NL80211_RRF_NO_320MHZ		= 1<<18,
 	NL80211_RRF_NO_EHT		= 1<<19,
+	NL80211_RRF_PSD			= 1<<20,
 };
 
 #define NL80211_RRF_PASSIVE_SCAN	NL80211_RRF_NO_IR
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 218093607b29..e64bf2a58b36 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -1115,6 +1115,10 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy,
 	if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_OFFSET, chan->freq_offset))
 		goto nla_put_failure;
 
+	if ((chan->flags & IEEE80211_CHAN_PSD) &&
+	    nla_put_s8(msg, NL80211_FREQUENCY_ATTR_PSD, chan->psd))
+		goto nla_put_failure;
+
 	if ((chan->flags & IEEE80211_CHAN_DISABLED) &&
 	    nla_put_flag(msg, NL80211_FREQUENCY_ATTR_DISABLED))
 		goto nla_put_failure;
@@ -8529,6 +8533,11 @@ static int nl80211_put_regdom(const struct ieee80211_regdomain *regdom,
 				reg_rule->dfs_cac_ms))
 			goto nla_put_failure;
 
+		if ((reg_rule->flags & NL80211_RRF_PSD) &&
+		    nla_put_s8(msg, NL80211_ATTR_POWER_RULE_PSD,
+			       reg_rule->psd))
+			goto nla_put_failure;
+
 		nla_nest_end(msg, nl_reg_rule);
 	}
 
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 33e2570f2bd6..eb2fa97457b4 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -1589,6 +1589,8 @@ static u32 map_regdom_flags(u32 rd_flags)
 		channel_flags |= IEEE80211_CHAN_NO_320MHZ;
 	if (rd_flags & NL80211_RRF_NO_EHT)
 		channel_flags |= IEEE80211_CHAN_NO_EHT;
+	if (rd_flags & NL80211_RRF_PSD)
+		channel_flags |= IEEE80211_CHAN_PSD;
 	return channel_flags;
 }
 
@@ -1795,6 +1797,9 @@ static void handle_channel_single_rule(struct wiphy *wiphy,
 				chan->dfs_cac_ms = reg_rule->dfs_cac_ms;
 		}
 
+		if (chan->flags & IEEE80211_CHAN_PSD)
+			chan->psd = reg_rule->psd;
+
 		return;
 	}
 
@@ -1815,6 +1820,9 @@ static void handle_channel_single_rule(struct wiphy *wiphy,
 			chan->dfs_cac_ms = IEEE80211_DFS_MIN_CAC_TIME_MS;
 	}
 
+	if (chan->flags & IEEE80211_CHAN_PSD)
+		chan->psd = reg_rule->psd;
+
 	if (chan->orig_mpwr) {
 		/*
 		 * Devices that use REGULATORY_COUNTRY_IE_FOLLOW_POWER
@@ -1884,6 +1892,12 @@ static void handle_channel_adjacent_rules(struct wiphy *wiphy,
 							 rrule2->dfs_cac_ms);
 		}
 
+		if ((rrule1->flags & NL80211_RRF_PSD) &&
+		    (rrule2->flags & NL80211_RRF_PSD))
+			chan->psd = min_t(s8, rrule1->psd, rrule2->psd);
+		else
+			chan->flags &= ~NL80211_RRF_PSD;
+
 		return;
 	}
 
@@ -2570,6 +2584,9 @@ static void handle_channel_custom(struct wiphy *wiphy,
 			chan->dfs_cac_ms = IEEE80211_DFS_MIN_CAC_TIME_MS;
 	}
 
+	if (chan->flags & IEEE80211_CHAN_PSD)
+		chan->psd = reg_rule->psd;
+
 	chan->max_power = chan->max_reg_power;
 }
 
-- 
cgit v1.2.3-70-g09d2


From f7b5bd725b737de3f2c4a836e07c82ba156d75df Mon Sep 17 00:00:00 2001
From: Shannon Nelson <shannon.nelson@amd.com>
Date: Thu, 14 Sep 2023 15:31:57 -0700
Subject: pds_core: check health in devcmd wait

Similar to what we do in the AdminQ, check for devcmd health
while waiting for an answer.

Signed-off-by: Shannon Nelson <shannon.nelson@amd.com>
Reviewed-by: Brett Creeley <brett.creeley@amd.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amd/pds_core/dev.c | 11 +++++++++--
 include/linux/pds/pds_core_if.h         |  1 +
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/amd/pds_core/dev.c b/drivers/net/ethernet/amd/pds_core/dev.c
index f77cd9f5a2fd..7c1b965d61a9 100644
--- a/drivers/net/ethernet/amd/pds_core/dev.c
+++ b/drivers/net/ethernet/amd/pds_core/dev.c
@@ -42,6 +42,8 @@ int pdsc_err_to_errno(enum pds_core_status_code code)
 		return -ERANGE;
 	case PDS_RC_BAD_ADDR:
 		return -EFAULT;
+	case PDS_RC_BAD_PCI:
+		return -ENXIO;
 	case PDS_RC_EOPCODE:
 	case PDS_RC_EINTR:
 	case PDS_RC_DEV_CMD:
@@ -62,7 +64,7 @@ bool pdsc_is_fw_running(struct pdsc *pdsc)
 	/* Firmware is useful only if the running bit is set and
 	 * fw_status != 0xff (bad PCI read)
 	 */
-	return (pdsc->fw_status != 0xff) &&
+	return (pdsc->fw_status != PDS_RC_BAD_PCI) &&
 		(pdsc->fw_status & PDS_CORE_FW_STS_F_RUNNING);
 }
 
@@ -128,6 +130,7 @@ static int pdsc_devcmd_wait(struct pdsc *pdsc, u8 opcode, int max_seconds)
 	unsigned long max_wait;
 	unsigned long duration;
 	int timeout = 0;
+	bool running;
 	int done = 0;
 	int err = 0;
 	int status;
@@ -136,6 +139,10 @@ static int pdsc_devcmd_wait(struct pdsc *pdsc, u8 opcode, int max_seconds)
 	max_wait = start_time + (max_seconds * HZ);
 
 	while (!done && !timeout) {
+		running = pdsc_is_fw_running(pdsc);
+		if (!running)
+			break;
+
 		done = pdsc_devcmd_done(pdsc);
 		if (done)
 			break;
@@ -152,7 +159,7 @@ static int pdsc_devcmd_wait(struct pdsc *pdsc, u8 opcode, int max_seconds)
 		dev_dbg(dev, "DEVCMD %d %s after %ld secs\n",
 			opcode, pdsc_devcmd_str(opcode), duration / HZ);
 
-	if (!done || timeout) {
+	if ((!done || timeout) && running) {
 		dev_err(dev, "DEVCMD %d %s timeout, done %d timeout %d max_seconds=%d\n",
 			opcode, pdsc_devcmd_str(opcode), done, timeout,
 			max_seconds);
diff --git a/include/linux/pds/pds_core_if.h b/include/linux/pds/pds_core_if.h
index e838a2b90440..17a87c1a55d7 100644
--- a/include/linux/pds/pds_core_if.h
+++ b/include/linux/pds/pds_core_if.h
@@ -79,6 +79,7 @@ enum pds_core_status_code {
 	PDS_RC_EVFID	= 31,	/* VF ID does not exist */
 	PDS_RC_BAD_FW	= 32,	/* FW file is invalid or corrupted */
 	PDS_RC_ECLIENT	= 33,   /* No such client id */
+	PDS_RC_BAD_PCI	= 255,  /* Broken PCI when reading status */
 };
 
 /**
-- 
cgit v1.2.3-70-g09d2


From 1cb6422ecac8804ebe0b71f4b3440674955fec73 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 15 Sep 2023 13:15:10 -0700
Subject: ceph: Annotate struct ceph_monmap with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct ceph_monmap.
Additionally, since the element count member must be set before accessing
the annotated flexible array member, move its initialization earlier.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Cc: Ilya Dryomov <idryomov@gmail.com>
Cc: Xiubo Li <xiubli@redhat.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: ceph-devel@vger.kernel.org
Cc: netdev@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ceph/mon_client.h | 2 +-
 net/ceph/mon_client.c           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h
index b658961156a0..7a9a40163c0f 100644
--- a/include/linux/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@@ -19,7 +19,7 @@ struct ceph_monmap {
 	struct ceph_fsid fsid;
 	u32 epoch;
 	u32 num_mon;
-	struct ceph_entity_inst mon_inst[];
+	struct ceph_entity_inst mon_inst[] __counted_by(num_mon);
 };
 
 struct ceph_mon_client;
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index faabad6603db..f263f7e91a21 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -1136,6 +1136,7 @@ static int build_initial_monmap(struct ceph_mon_client *monc)
 			       GFP_KERNEL);
 	if (!monc->monmap)
 		return -ENOMEM;
+	monc->monmap->num_mon = num_mon;
 
 	for (i = 0; i < num_mon; i++) {
 		struct ceph_entity_inst *inst = &monc->monmap->mon_inst[i];
@@ -1147,7 +1148,6 @@ static int build_initial_monmap(struct ceph_mon_client *monc)
 		inst->name.type = CEPH_ENTITY_TYPE_MON;
 		inst->name.num = cpu_to_le64(i);
 	}
-	monc->monmap->num_mon = num_mon;
 	return 0;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 8452a05b2c633b708dbe3e742f71b24bf21fe42d Mon Sep 17 00:00:00 2001
From: Rohan G Thomas <rohan.g.thomas@intel.com>
Date: Sat, 16 Sep 2023 14:33:12 +0800
Subject: net: stmmac: Tx coe sw fallback

Add sw fallback of tx checksum calculation for those tx queues that
don't support tx checksum offloading. DW xGMAC IP can be synthesized
such that it can support tx checksum offloading only for a few
initial tx queues. Also as Serge pointed out, for the DW QoS IP, tx
coe can be individually configured for each tx queue.

So when tx coe is enabled, for any tx queue that doesn't support
tx coe with 'coe-unsupported' flag set will have a sw fallback
happen in the driver for tx checksum calculation when any packets to
be transmitted on these tx queues.

Signed-off-by: Rohan G Thomas <rohan.g.thomas@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c     | 10 ++++++++++
 drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c |  3 +++
 include/linux/stmmac.h                                |  1 +
 3 files changed, 14 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 2206789802bf..9201ed778ebc 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -4401,6 +4401,16 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 	WARN_ON(tx_q->tx_skbuff[first_entry]);
 
 	csum_insertion = (skb->ip_summed == CHECKSUM_PARTIAL);
+	/* DWMAC IPs can be synthesized to support tx coe only for a few tx
+	 * queues. In that case, checksum offloading for those queues that don't
+	 * support tx coe needs to fallback to software checksum calculation.
+	 */
+	if (csum_insertion &&
+	    priv->plat->tx_queues_cfg[queue].coe_unsupported) {
+		if (unlikely(skb_checksum_help(skb)))
+			goto dma_map_err;
+		csum_insertion = !csum_insertion;
+	}
 
 	if (likely(priv->extend_desc))
 		desc = (struct dma_desc *)(tx_q->dma_etx + entry);
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 0f28795e581c..a09014c9e7d0 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -276,6 +276,9 @@ static int stmmac_mtl_setup(struct platform_device *pdev,
 			plat->tx_queues_cfg[queue].use_prio = true;
 		}
 
+		plat->tx_queues_cfg[queue].coe_unsupported =
+			of_property_read_bool(q_node, "snps,coe-unsupported");
+
 		queue++;
 	}
 	if (queue != plat->tx_queues_to_use) {
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index ce89cc3e4913..c0079a7574ae 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -139,6 +139,7 @@ struct stmmac_rxq_cfg {
 
 struct stmmac_txq_cfg {
 	u32 weight;
+	bool coe_unsupported;
 	u8 mode_to_use;
 	/* Credit Base Shaper parameters */
 	u32 send_slope;
-- 
cgit v1.2.3-70-g09d2


From 6a23c555f7eb436d6799533675ffa179db3d5834 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Mon, 18 Sep 2023 14:25:36 +0100
Subject: net: phy: fix regression with AX88772A PHY driver

Marek reports that a deadlock occurs with the AX88772A PHY used on the
ASIX USB network driver:

asix 1-1.4:1.0 (unnamed net_device) (uninitialized): PHY [usb-001:003:10] driver [Asix Electronics AX88772A] (irq=POLL)
Asix Electronics AX88772A usb-001:003:10: attached PHY driver(mii_bus:phy_addr=usb-001:003:10, irq=POLL)
asix 1-1.4:1.0 eth0: register 'asix' at usb-12110000.usb-1.4, ASIX AX88772 USB 2.0 Ethernet, a2:99:b6:cd:11:eb
asix 1-1.4:1.0 eth0: configuring for phy/internal link mode

============================================
WARNING: possible recursive locking detected
6.6.0-rc1-00239-g8da77df649c4-dirty #13949 Not tainted
--------------------------------------------
kworker/3:3/71 is trying to acquire lock:
c6c704cc (&dev->lock){+.+.}-{3:3}, at: phy_start_aneg+0x1c/0x38

but task is already holding lock:
c6c704cc (&dev->lock){+.+.}-{3:3}, at: phy_state_machine+0x100/0x2b8

This is because we now consistently call phy_process_state_change()
while holding phydev->lock, but the AX88772A PHY driver then goes on
to call phy_start_aneg() which tries to grab the same lock - causing
deadlock.

Fix this by exporting the unlocked version, and use this in the PHY
driver instead.

Reported-by: Marek Szyprowski <m.szyprowski@samsung.com>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
Fixes: ef113a60d0a9 ("net: phy: call phy_error_precise() while holding the lock")
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://lore.kernel.org/r/E1qiEFs-007g7b-Lq@rmk-PC.armlinux.org.uk
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/phy/ax88796b.c | 2 +-
 drivers/net/phy/phy.c      | 3 ++-
 include/linux/phy.h        | 1 +
 3 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/ax88796b.c b/drivers/net/phy/ax88796b.c
index 0f1e617a26c9..eb74a8cf8df1 100644
--- a/drivers/net/phy/ax88796b.c
+++ b/drivers/net/phy/ax88796b.c
@@ -90,7 +90,7 @@ static void asix_ax88772a_link_change_notify(struct phy_device *phydev)
 	 */
 	if (phydev->state == PHY_NOLINK) {
 		phy_init_hw(phydev);
-		phy_start_aneg(phydev);
+		_phy_start_aneg(phydev);
 	}
 }
 
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 93a8676dd8d8..a5fa077650e8 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -981,7 +981,7 @@ static int phy_check_link_status(struct phy_device *phydev)
  *   If the PHYCONTROL Layer is operating, we change the state to
  *   reflect the beginning of Auto-negotiation or forcing.
  */
-static int _phy_start_aneg(struct phy_device *phydev)
+int _phy_start_aneg(struct phy_device *phydev)
 {
 	int err;
 
@@ -1002,6 +1002,7 @@ static int _phy_start_aneg(struct phy_device *phydev)
 
 	return err;
 }
+EXPORT_SYMBOL(_phy_start_aneg);
 
 /**
  * phy_start_aneg - start auto-negotiation for this PHY device
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 1351b802ffcf..3cc52826f18e 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1736,6 +1736,7 @@ void phy_detach(struct phy_device *phydev);
 void phy_start(struct phy_device *phydev);
 void phy_stop(struct phy_device *phydev);
 int phy_config_aneg(struct phy_device *phydev);
+int _phy_start_aneg(struct phy_device *phydev);
 int phy_start_aneg(struct phy_device *phydev);
 int phy_aneg_done(struct phy_device *phydev);
 int phy_speed_down(struct phy_device *phydev, bool sync);
-- 
cgit v1.2.3-70-g09d2


From fa17a6d8a5bd0cd7565b613cb804242cd0f6b7ab Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 18 Sep 2023 14:23:21 +0000
Subject: ipv6: lockless IPV6_ADDR_PREFERENCES implementation

We have data-races while reading np->srcprefs

Switch the field to a plain byte, add READ_ONCE()
and WRITE_ONCE() annotations where needed,
and IPV6_ADDR_PREFERENCES setsockopt() can now be lockless.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20230918142321.1794107-1-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/ipv6.h     |  2 +-
 include/net/ip6_route.h  |  5 ++---
 include/net/ipv6.h       | 20 +++++++-------------
 net/ipv6/ip6_output.c    |  2 +-
 net/ipv6/ipv6_sockglue.c | 19 ++++++++++---------
 net/ipv6/route.c         |  2 +-
 6 files changed, 22 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 09253825c99c..e400ff757f13 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -243,7 +243,7 @@ struct ipv6_pinfo {
 	} rxopt;
 
 	/* sockopt flags */
-	__u8			srcprefs:3;	/* 001: prefer temporary address
+	__u8			srcprefs;	/* 001: prefer temporary address
 						 * 010: prefer public address
 						 * 100: prefer care-of address
 						 */
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index b1ea49900b4a..28b065790261 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -53,13 +53,12 @@ struct route_info {
  */
 static inline int rt6_srcprefs2flags(unsigned int srcprefs)
 {
-	/* No need to bitmask because srcprefs have only 3 bits. */
-	return srcprefs << 3;
+	return (srcprefs & IPV6_PREFER_SRC_MASK) << 3;
 }
 
 static inline unsigned int rt6_flags2srcprefs(int flags)
 {
-	return (flags >> 3) & 7;
+	return (flags >> 3) & IPV6_PREFER_SRC_MASK;
 }
 
 static inline bool rt6_need_strict(const struct in6_addr *daddr)
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index bd115980809f..b3444c8a6f74 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -1306,10 +1306,13 @@ static inline void ip6_sock_set_recverr(struct sock *sk)
 	inet6_set_bit(RECVERR6, sk);
 }
 
-static inline int __ip6_sock_set_addr_preferences(struct sock *sk, int val)
+#define IPV6_PREFER_SRC_MASK (IPV6_PREFER_SRC_TMP | IPV6_PREFER_SRC_PUBLIC | \
+			      IPV6_PREFER_SRC_COA)
+
+static inline int ip6_sock_set_addr_preferences(struct sock *sk, int val)
 {
+	unsigned int prefmask = ~IPV6_PREFER_SRC_MASK;
 	unsigned int pref = 0;
-	unsigned int prefmask = ~0;
 
 	/* check PUBLIC/TMP/PUBTMP_DEFAULT conflicts */
 	switch (val & (IPV6_PREFER_SRC_PUBLIC |
@@ -1359,20 +1362,11 @@ static inline int __ip6_sock_set_addr_preferences(struct sock *sk, int val)
 		return -EINVAL;
 	}
 
-	inet6_sk(sk)->srcprefs = (inet6_sk(sk)->srcprefs & prefmask) | pref;
+	WRITE_ONCE(inet6_sk(sk)->srcprefs,
+		   (READ_ONCE(inet6_sk(sk)->srcprefs) & prefmask) | pref);
 	return 0;
 }
 
-static inline int ip6_sock_set_addr_preferences(struct sock *sk, int val)
-{
-	int ret;
-
-	lock_sock(sk);
-	ret = __ip6_sock_set_addr_preferences(sk, val);
-	release_sock(sk);
-	return ret;
-}
-
 static inline void ip6_sock_set_recvpktinfo(struct sock *sk)
 {
 	lock_sock(sk);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 7e5d9eeb990f..951ba8089b5b 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1113,7 +1113,7 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
 		rcu_read_lock();
 		from = rt ? rcu_dereference(rt->from) : NULL;
 		err = ip6_route_get_saddr(net, from, &fl6->daddr,
-					  sk ? inet6_sk(sk)->srcprefs : 0,
+					  sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
 					  &fl6->saddr);
 		rcu_read_unlock();
 
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index e9dc6f881bb9..7d661735cb9d 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -505,6 +505,10 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 			return -EINVAL;
 		inet6_assign_bit(SNDFLOW, sk, valbool);
 		return 0;
+	case IPV6_ADDR_PREFERENCES:
+		if (optlen < sizeof(int))
+			return -EINVAL;
+		return ip6_sock_set_addr_preferences(sk, val);
 	}
 	if (needs_rtnl)
 		rtnl_lock();
@@ -964,11 +968,6 @@ done:
 		retv = xfrm_user_policy(sk, optname, optval, optlen);
 		break;
 
-	case IPV6_ADDR_PREFERENCES:
-		if (optlen < sizeof(int))
-			goto e_inval;
-		retv = __ip6_sock_set_addr_preferences(sk, val);
-		break;
 	case IPV6_RECVFRAGSIZE:
 		np->rxopt.bits.recvfragsize = valbool;
 		retv = 0;
@@ -1415,23 +1414,25 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 	}
 
 	case IPV6_ADDR_PREFERENCES:
+		{
+		u8 srcprefs = READ_ONCE(np->srcprefs);
 		val = 0;
 
-		if (np->srcprefs & IPV6_PREFER_SRC_TMP)
+		if (srcprefs & IPV6_PREFER_SRC_TMP)
 			val |= IPV6_PREFER_SRC_TMP;
-		else if (np->srcprefs & IPV6_PREFER_SRC_PUBLIC)
+		else if (srcprefs & IPV6_PREFER_SRC_PUBLIC)
 			val |= IPV6_PREFER_SRC_PUBLIC;
 		else {
 			/* XXX: should we return system default? */
 			val |= IPV6_PREFER_SRC_PUBTMP_DEFAULT;
 		}
 
-		if (np->srcprefs & IPV6_PREFER_SRC_COA)
+		if (srcprefs & IPV6_PREFER_SRC_COA)
 			val |= IPV6_PREFER_SRC_COA;
 		else
 			val |= IPV6_PREFER_SRC_HOME;
 		break;
-
+		}
 	case IPV6_MINHOPCOUNT:
 		val = READ_ONCE(np->min_hopcount);
 		break;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 9d8dfc7423e4..b132feae3393 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2622,7 +2622,7 @@ static struct dst_entry *ip6_route_output_flags_noref(struct net *net,
 	if (!any_src)
 		flags |= RT6_LOOKUP_F_HAS_SADDR;
 	else if (sk)
-		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
+		flags |= rt6_srcprefs2flags(READ_ONCE(inet6_sk(sk)->srcprefs));
 
 	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
 }
-- 
cgit v1.2.3-70-g09d2


From bafd764a8baa87e19e6beeaa58eb85fcbbdd6b20 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Mon, 18 Sep 2023 12:29:07 +0200
Subject: net: ethernet: mtk_wed: rename mtk_rxbm_desc in mtk_wed_bm_desc

Rename mtk_rxbm_desc structure in mtk_wed_bm_desc since it will be used
even on tx side by MT7988 SoC.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/mediatek/mtk_wed.c          | 4 ++--
 drivers/net/wireless/mediatek/mt76/mt7915/mmio.c | 2 +-
 include/linux/soc/mediatek/mtk_wed.h             | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mediatek/mtk_wed.c b/drivers/net/ethernet/mediatek/mtk_wed.c
index 750326b298dc..f166d4f0b793 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed.c
+++ b/drivers/net/ethernet/mediatek/mtk_wed.c
@@ -422,7 +422,7 @@ free_pagelist:
 static int
 mtk_wed_rx_buffer_alloc(struct mtk_wed_device *dev)
 {
-	struct mtk_rxbm_desc *desc;
+	struct mtk_wed_bm_desc *desc;
 	dma_addr_t desc_phys;
 
 	dev->rx_buf_ring.size = dev->wlan.rx_nbuf;
@@ -442,7 +442,7 @@ mtk_wed_rx_buffer_alloc(struct mtk_wed_device *dev)
 static void
 mtk_wed_free_rx_buffer(struct mtk_wed_device *dev)
 {
-	struct mtk_rxbm_desc *desc = dev->rx_buf_ring.desc;
+	struct mtk_wed_bm_desc *desc = dev->rx_buf_ring.desc;
 
 	if (!desc)
 		return;
diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c b/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
index fc7ace638ce8..e7d8e03f826f 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
@@ -591,7 +591,7 @@ static void mt7915_mmio_wed_release_rx_buf(struct mtk_wed_device *wed)
 
 static u32 mt7915_mmio_wed_init_rx_buf(struct mtk_wed_device *wed, int size)
 {
-	struct mtk_rxbm_desc *desc = wed->rx_buf_ring.desc;
+	struct mtk_wed_bm_desc *desc = wed->rx_buf_ring.desc;
 	struct mt76_txwi_cache *t = NULL;
 	struct mt7915_dev *dev;
 	struct mt76_queue *q;
diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h
index b2b28180dff7..c6512c216b27 100644
--- a/include/linux/soc/mediatek/mtk_wed.h
+++ b/include/linux/soc/mediatek/mtk_wed.h
@@ -45,7 +45,7 @@ enum mtk_wed_wo_cmd {
 	MTK_WED_WO_CMD_WED_END
 };
 
-struct mtk_rxbm_desc {
+struct mtk_wed_bm_desc {
 	__le32 buf0;
 	__le32 token;
 } __packed __aligned(4);
@@ -104,7 +104,7 @@ struct mtk_wed_device {
 
 	struct {
 		int size;
-		struct mtk_rxbm_desc *desc;
+		struct mtk_wed_bm_desc *desc;
 		dma_addr_t desc_phys;
 	} rx_buf_ring;
 
-- 
cgit v1.2.3-70-g09d2


From ff0ea57fa30e860d3373acd1383e9d9599144b58 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Mon, 18 Sep 2023 12:29:08 +0200
Subject: net: ethernet: mtk_wed: introduce mtk_wed_buf structure

Introduce mtk_wed_buf structure to store both virtual and physical
addresses allocated in mtk_wed_tx_buffer_alloc() routine. This is a
preliminary patch to add WED support for MT7988 SoC since it relies on a
different dma descriptor layout not storing page dma addresses.

Co-developed-by: Sujuan Chen <sujuan.chen@mediatek.com>
Signed-off-by: Sujuan Chen <sujuan.chen@mediatek.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/mediatek/mtk_wed.c | 12 ++++++------
 include/linux/soc/mediatek/mtk_wed.h    |  7 ++++++-
 2 files changed, 12 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mediatek/mtk_wed.c b/drivers/net/ethernet/mediatek/mtk_wed.c
index f166d4f0b793..592e497984e3 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed.c
+++ b/drivers/net/ethernet/mediatek/mtk_wed.c
@@ -300,9 +300,9 @@ out:
 static int
 mtk_wed_tx_buffer_alloc(struct mtk_wed_device *dev)
 {
+	struct mtk_wed_buf *page_list;
 	struct mtk_wdma_desc *desc;
 	dma_addr_t desc_phys;
-	void **page_list;
 	int token = dev->wlan.token_start;
 	int ring_size;
 	int n_pages;
@@ -343,7 +343,8 @@ mtk_wed_tx_buffer_alloc(struct mtk_wed_device *dev)
 			return -ENOMEM;
 		}
 
-		page_list[page_idx++] = page;
+		page_list[page_idx].p = page;
+		page_list[page_idx++].phy_addr = page_phys;
 		dma_sync_single_for_cpu(dev->hw->dev, page_phys, PAGE_SIZE,
 					DMA_BIDIRECTIONAL);
 
@@ -387,8 +388,8 @@ mtk_wed_tx_buffer_alloc(struct mtk_wed_device *dev)
 static void
 mtk_wed_free_tx_buffer(struct mtk_wed_device *dev)
 {
+	struct mtk_wed_buf *page_list = dev->tx_buf_ring.pages;
 	struct mtk_wdma_desc *desc = dev->tx_buf_ring.desc;
-	void **page_list = dev->tx_buf_ring.pages;
 	int page_idx;
 	int i;
 
@@ -400,13 +401,12 @@ mtk_wed_free_tx_buffer(struct mtk_wed_device *dev)
 
 	for (i = 0, page_idx = 0; i < dev->tx_buf_ring.size;
 	     i += MTK_WED_BUF_PER_PAGE) {
-		void *page = page_list[page_idx++];
-		dma_addr_t buf_addr;
+		dma_addr_t buf_addr = page_list[page_idx].phy_addr;
+		void *page = page_list[page_idx++].p;
 
 		if (!page)
 			break;
 
-		buf_addr = le32_to_cpu(desc[i].buf0);
 		dma_unmap_page(dev->hw->dev, buf_addr, PAGE_SIZE,
 			       DMA_BIDIRECTIONAL);
 		__free_page(page);
diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h
index c6512c216b27..5f00dc26582b 100644
--- a/include/linux/soc/mediatek/mtk_wed.h
+++ b/include/linux/soc/mediatek/mtk_wed.h
@@ -76,6 +76,11 @@ struct mtk_wed_wo_rx_stats {
 	__le32 rx_drop_cnt;
 };
 
+struct mtk_wed_buf {
+	void *p;
+	dma_addr_t phy_addr;
+};
+
 struct mtk_wed_device {
 #ifdef CONFIG_NET_MEDIATEK_SOC_WED
 	const struct mtk_wed_ops *ops;
@@ -97,7 +102,7 @@ struct mtk_wed_device {
 
 	struct {
 		int size;
-		void **pages;
+		struct mtk_wed_buf *pages;
 		struct mtk_wdma_desc *desc;
 		dma_addr_t desc_phys;
 	} tx_buf_ring;
-- 
cgit v1.2.3-70-g09d2


From e2f64db13aa1d08e32621067e4fe16bbc114b375 Mon Sep 17 00:00:00 2001
From: Sujuan Chen <sujuan.chen@mediatek.com>
Date: Mon, 18 Sep 2023 12:29:13 +0200
Subject: net: ethernet: mtk_wed: introduce WED support for MT7988

Similar to MT7986 and MT7622, enable Wireless Ethernet Ditpatcher for
MT7988 in order to offload traffic forwarded from LAN/WLAN to WLAN/LAN

Co-developed-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Sujuan Chen <sujuan.chen@mediatek.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/mediatek/mtk_eth_soc.c     |   1 +
 drivers/net/ethernet/mediatek/mtk_eth_soc.h     |   2 +-
 drivers/net/ethernet/mediatek/mtk_ppe_offload.c |   3 +
 drivers/net/ethernet/mediatek/mtk_wed.c         | 442 +++++++++++++++++-------
 drivers/net/ethernet/mediatek/mtk_wed.h         |  29 ++
 drivers/net/ethernet/mediatek/mtk_wed_mcu.c     |  33 +-
 drivers/net/ethernet/mediatek/mtk_wed_regs.h    | 223 +++++++++++-
 drivers/net/ethernet/mediatek/mtk_wed_wo.h      |   2 +
 include/linux/soc/mediatek/mtk_wed.h            |   8 +-
 9 files changed, 603 insertions(+), 140 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index 3cffd1bd3067..697620c6354b 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -197,6 +197,7 @@ static const struct mtk_reg_map mt7988_reg_map = {
 	.wdma_base = {
 		[0]		= 0x4800,
 		[1]		= 0x4c00,
+		[2]		= 0x5000,
 	},
 	.pse_iq_sta		= 0x0180,
 	.pse_oq_sta		= 0x01a0,
diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.h b/drivers/net/ethernet/mediatek/mtk_eth_soc.h
index 403219d987ef..9ae3b8a71d0e 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h
@@ -1132,7 +1132,7 @@ struct mtk_reg_map {
 	u32	gdm1_cnt;
 	u32	gdma_to_ppe;
 	u32	ppe_base;
-	u32	wdma_base[2];
+	u32	wdma_base[3];
 	u32	pse_iq_sta;
 	u32	pse_oq_sta;
 };
diff --git a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
index ef3980840695..95f76975f258 100644
--- a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
+++ b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
@@ -201,6 +201,9 @@ mtk_flow_set_output_device(struct mtk_eth *eth, struct mtk_foe_entry *foe,
 			case 1:
 				pse_port = PSE_WDMA1_PORT;
 				break;
+			case 2:
+				pse_port = PSE_WDMA2_PORT;
+				break;
 			default:
 				return -EINVAL;
 			}
diff --git a/drivers/net/ethernet/mediatek/mtk_wed.c b/drivers/net/ethernet/mediatek/mtk_wed.c
index c2ff2d6405f6..b6ca12686beb 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed.c
+++ b/drivers/net/ethernet/mediatek/mtk_wed.c
@@ -17,17 +17,19 @@
 #include <net/flow_offload.h>
 #include <net/pkt_cls.h>
 #include "mtk_eth_soc.h"
-#include "mtk_wed_regs.h"
 #include "mtk_wed.h"
 #include "mtk_ppe.h"
 #include "mtk_wed_wo.h"
 
 #define MTK_PCIE_BASE(n)		(0x1a143000 + (n) * 0x2000)
 
-#define MTK_WED_PKT_SIZE		1900
+#define MTK_WED_PKT_SIZE		1920
 #define MTK_WED_BUF_SIZE		2048
+#define MTK_WED_PAGE_BUF_SIZE		128
 #define MTK_WED_BUF_PER_PAGE		(PAGE_SIZE / 2048)
+#define MTK_WED_RX_PAGE_BUF_PER_PAGE	(PAGE_SIZE / 128)
 #define MTK_WED_RX_RING_SIZE		1536
+#define MTK_WED_RX_PG_BM_CNT		8192
 
 #define MTK_WED_TX_RING_SIZE		2048
 #define MTK_WED_WDMA_RING_SIZE		1024
@@ -41,7 +43,10 @@
 #define MTK_WED_RRO_QUE_CNT		8192
 #define MTK_WED_MIOD_ENTRY_CNT		128
 
-static struct mtk_wed_hw *hw_list[2];
+#define MTK_WED_TX_BM_DMA_SIZE		65536
+#define MTK_WED_TX_BM_PKT_CNT		32768
+
+static struct mtk_wed_hw *hw_list[3];
 static DEFINE_MUTEX(hw_lock);
 
 struct mtk_wed_flow_block_priv {
@@ -56,6 +61,7 @@ static const struct mtk_wed_soc_data mt7622_data = {
 		.reset_idx_tx_mask	= GENMASK(3, 0),
 		.reset_idx_rx_mask	= GENMASK(17, 16),
 	},
+	.tx_ring_desc_size = sizeof(struct mtk_wdma_desc),
 	.wdma_desc_size = sizeof(struct mtk_wdma_desc),
 };
 
@@ -66,6 +72,18 @@ static const struct mtk_wed_soc_data mt7986_data = {
 		.reset_idx_tx_mask	= GENMASK(1, 0),
 		.reset_idx_rx_mask	= GENMASK(7, 6),
 	},
+	.tx_ring_desc_size = sizeof(struct mtk_wdma_desc),
+	.wdma_desc_size = 2 * sizeof(struct mtk_wdma_desc),
+};
+
+static const struct mtk_wed_soc_data mt7988_data = {
+	.regmap = {
+		.tx_bm_tkid		= 0x0c8,
+		.wpdma_rx_ring0		= 0x7d0,
+		.reset_idx_tx_mask	= GENMASK(1, 0),
+		.reset_idx_rx_mask	= GENMASK(7, 6),
+	},
+	.tx_ring_desc_size = sizeof(struct mtk_wed_bm_desc),
 	.wdma_desc_size = 2 * sizeof(struct mtk_wdma_desc),
 };
 
@@ -320,33 +338,38 @@ out:
 static int
 mtk_wed_tx_buffer_alloc(struct mtk_wed_device *dev)
 {
+	u32 desc_size = dev->hw->soc->tx_ring_desc_size;
+	int i, page_idx = 0, n_pages, ring_size;
+	int token = dev->wlan.token_start;
 	struct mtk_wed_buf *page_list;
-	struct mtk_wdma_desc *desc;
 	dma_addr_t desc_phys;
-	int token = dev->wlan.token_start;
-	int ring_size;
-	int n_pages;
-	int i, page_idx;
+	void *desc_ptr;
 
-	ring_size = dev->wlan.nbuf & ~(MTK_WED_BUF_PER_PAGE - 1);
-	n_pages = ring_size / MTK_WED_BUF_PER_PAGE;
+	if (!mtk_wed_is_v3_or_greater(dev->hw)) {
+		ring_size = dev->wlan.nbuf & ~(MTK_WED_BUF_PER_PAGE - 1);
+		dev->tx_buf_ring.size = ring_size;
+	} else {
+		dev->tx_buf_ring.size = MTK_WED_TX_BM_DMA_SIZE;
+		ring_size = MTK_WED_TX_BM_PKT_CNT;
+	}
+	n_pages = dev->tx_buf_ring.size / MTK_WED_BUF_PER_PAGE;
 
 	page_list = kcalloc(n_pages, sizeof(*page_list), GFP_KERNEL);
 	if (!page_list)
 		return -ENOMEM;
 
-	dev->tx_buf_ring.size = ring_size;
 	dev->tx_buf_ring.pages = page_list;
 
-	desc = dma_alloc_coherent(dev->hw->dev, ring_size * sizeof(*desc),
-				  &desc_phys, GFP_KERNEL);
-	if (!desc)
+	desc_ptr = dma_alloc_coherent(dev->hw->dev,
+				      dev->tx_buf_ring.size * desc_size,
+				      &desc_phys, GFP_KERNEL);
+	if (!desc_ptr)
 		return -ENOMEM;
 
-	dev->tx_buf_ring.desc = desc;
+	dev->tx_buf_ring.desc = desc_ptr;
 	dev->tx_buf_ring.desc_phys = desc_phys;
 
-	for (i = 0, page_idx = 0; i < ring_size; i += MTK_WED_BUF_PER_PAGE) {
+	for (i = 0; i < ring_size; i += MTK_WED_BUF_PER_PAGE) {
 		dma_addr_t page_phys, buf_phys;
 		struct page *page;
 		void *buf;
@@ -372,28 +395,31 @@ mtk_wed_tx_buffer_alloc(struct mtk_wed_device *dev)
 		buf_phys = page_phys;
 
 		for (s = 0; s < MTK_WED_BUF_PER_PAGE; s++) {
-			u32 txd_size;
-			u32 ctrl;
-
-			txd_size = dev->wlan.init_buf(buf, buf_phys, token++);
+			struct mtk_wdma_desc *desc = desc_ptr;
 
 			desc->buf0 = cpu_to_le32(buf_phys);
-			desc->buf1 = cpu_to_le32(buf_phys + txd_size);
-
-			if (mtk_wed_is_v1(dev->hw))
-				ctrl = FIELD_PREP(MTK_WDMA_DESC_CTRL_LEN0, txd_size) |
-				       FIELD_PREP(MTK_WDMA_DESC_CTRL_LEN1,
-						  MTK_WED_BUF_SIZE - txd_size) |
-				       MTK_WDMA_DESC_CTRL_LAST_SEG1;
-			else
-				ctrl = FIELD_PREP(MTK_WDMA_DESC_CTRL_LEN0, txd_size) |
-				       FIELD_PREP(MTK_WDMA_DESC_CTRL_LEN1_V2,
-						  MTK_WED_BUF_SIZE - txd_size) |
-				       MTK_WDMA_DESC_CTRL_LAST_SEG0;
-			desc->ctrl = cpu_to_le32(ctrl);
-			desc->info = 0;
-			desc++;
-
+			if (!mtk_wed_is_v3_or_greater(dev->hw)) {
+				u32 txd_size, ctrl;
+
+				txd_size = dev->wlan.init_buf(buf, buf_phys,
+							      token++);
+				desc->buf1 = cpu_to_le32(buf_phys + txd_size);
+				ctrl = FIELD_PREP(MTK_WDMA_DESC_CTRL_LEN0, txd_size);
+				if (mtk_wed_is_v1(dev->hw))
+					ctrl |= MTK_WDMA_DESC_CTRL_LAST_SEG1 |
+						FIELD_PREP(MTK_WDMA_DESC_CTRL_LEN1,
+							   MTK_WED_BUF_SIZE - txd_size);
+				else
+					ctrl |= MTK_WDMA_DESC_CTRL_LAST_SEG0 |
+						FIELD_PREP(MTK_WDMA_DESC_CTRL_LEN1_V2,
+							   MTK_WED_BUF_SIZE - txd_size);
+				desc->ctrl = cpu_to_le32(ctrl);
+				desc->info = 0;
+			} else {
+				desc->ctrl = cpu_to_le32(token << 16);
+			}
+
+			desc_ptr += desc_size;
 			buf += MTK_WED_BUF_SIZE;
 			buf_phys += MTK_WED_BUF_SIZE;
 		}
@@ -409,31 +435,31 @@ static void
 mtk_wed_free_tx_buffer(struct mtk_wed_device *dev)
 {
 	struct mtk_wed_buf *page_list = dev->tx_buf_ring.pages;
-	struct mtk_wdma_desc *desc = dev->tx_buf_ring.desc;
-	int page_idx;
-	int i;
+	struct mtk_wed_hw *hw = dev->hw;
+	int i, page_idx = 0;
 
 	if (!page_list)
 		return;
 
-	if (!desc)
+	if (!dev->tx_buf_ring.desc)
 		goto free_pagelist;
 
-	for (i = 0, page_idx = 0; i < dev->tx_buf_ring.size;
-	     i += MTK_WED_BUF_PER_PAGE) {
-		dma_addr_t buf_addr = page_list[page_idx].phy_addr;
+	for (i = 0; i < dev->tx_buf_ring.size; i += MTK_WED_BUF_PER_PAGE) {
+		dma_addr_t page_phy = page_list[page_idx].phy_addr;
 		void *page = page_list[page_idx++].p;
 
 		if (!page)
 			break;
 
-		dma_unmap_page(dev->hw->dev, buf_addr, PAGE_SIZE,
+		dma_unmap_page(dev->hw->dev, page_phy, PAGE_SIZE,
 			       DMA_BIDIRECTIONAL);
 		__free_page(page);
 	}
 
-	dma_free_coherent(dev->hw->dev, dev->tx_buf_ring.size * sizeof(*desc),
-			  desc, dev->tx_buf_ring.desc_phys);
+	dma_free_coherent(dev->hw->dev,
+			  dev->tx_buf_ring.size * hw->soc->tx_ring_desc_size,
+			  dev->tx_buf_ring.desc,
+			  dev->tx_buf_ring.desc_phys);
 
 free_pagelist:
 	kfree(page_list);
@@ -518,13 +544,23 @@ mtk_wed_set_ext_int(struct mtk_wed_device *dev, bool en)
 {
 	u32 mask = MTK_WED_EXT_INT_STATUS_ERROR_MASK;
 
-	if (mtk_wed_is_v1(dev->hw))
+	switch (dev->hw->version) {
+	case 1:
 		mask |= MTK_WED_EXT_INT_STATUS_TX_DRV_R_RESP_ERR;
-	else
+		break;
+	case 2:
 		mask |= MTK_WED_EXT_INT_STATUS_RX_FBUF_LO_TH |
 			MTK_WED_EXT_INT_STATUS_RX_FBUF_HI_TH |
 			MTK_WED_EXT_INT_STATUS_RX_DRV_COHERENT |
 			MTK_WED_EXT_INT_STATUS_TX_DMA_W_RESP_ERR;
+		break;
+	case 3:
+		mask = MTK_WED_EXT_INT_STATUS_RX_DRV_COHERENT |
+		       MTK_WED_EXT_INT_STATUS_TKID_WO_PYLD;
+		break;
+	default:
+		break;
+	}
 
 	if (!dev->hw->num_flows)
 		mask &= ~MTK_WED_EXT_INT_STATUS_TKID_WO_PYLD;
@@ -536,6 +572,9 @@ mtk_wed_set_ext_int(struct mtk_wed_device *dev, bool en)
 static void
 mtk_wed_set_512_support(struct mtk_wed_device *dev, bool enable)
 {
+	if (!mtk_wed_is_v2(dev->hw))
+		return;
+
 	if (enable) {
 		wed_w32(dev, MTK_WED_TXDP_CTRL, MTK_WED_TXDP_DW9_OVERWR);
 		wed_w32(dev, MTK_WED_TXP_DW1,
@@ -610,6 +649,14 @@ mtk_wed_dma_disable(struct mtk_wed_device *dev)
 			MTK_WED_WPDMA_RX_D_RX_DRV_EN);
 		wed_clr(dev, MTK_WED_WDMA_GLO_CFG,
 			MTK_WED_WDMA_GLO_CFG_TX_DDONE_CHK);
+
+		if (mtk_wed_is_v3_or_greater(dev->hw) &&
+		    mtk_wed_get_rx_capa(dev)) {
+			wdma_clr(dev, MTK_WDMA_PREF_TX_CFG,
+				 MTK_WDMA_PREF_TX_CFG_PREF_EN);
+			wdma_clr(dev, MTK_WDMA_PREF_RX_CFG,
+				 MTK_WDMA_PREF_RX_CFG_PREF_EN);
+		}
 	}
 
 	mtk_wed_set_512_support(dev, false);
@@ -652,6 +699,14 @@ mtk_wed_deinit(struct mtk_wed_device *dev)
 		MTK_WED_CTRL_RX_ROUTE_QM_EN |
 		MTK_WED_CTRL_WED_RX_BM_EN |
 		MTK_WED_CTRL_RX_RRO_QM_EN);
+
+	if (mtk_wed_is_v3_or_greater(dev->hw)) {
+		wed_clr(dev, MTK_WED_CTRL, MTK_WED_CTRL_TX_AMSDU_EN);
+		wed_clr(dev, MTK_WED_RESET, MTK_WED_RESET_TX_AMSDU);
+		wed_clr(dev, MTK_WED_PCIE_INT_CTRL,
+			MTK_WED_PCIE_INT_CTRL_MSK_EN_POLA |
+			MTK_WED_PCIE_INT_CTRL_MSK_IRQ_FILTER);
+	}
 }
 
 static void
@@ -701,21 +756,37 @@ mtk_wed_detach(struct mtk_wed_device *dev)
 	mutex_unlock(&hw_lock);
 }
 
-#define PCIE_BASE_ADDR0		0x11280000
 static void
 mtk_wed_bus_init(struct mtk_wed_device *dev)
 {
 	switch (dev->wlan.bus_type) {
 	case MTK_WED_BUS_PCIE: {
 		struct device_node *np = dev->hw->eth->dev->of_node;
-		struct regmap *regs;
 
-		regs = syscon_regmap_lookup_by_phandle(np,
-						       "mediatek,wed-pcie");
-		if (IS_ERR(regs))
-			break;
+		if (mtk_wed_is_v2(dev->hw)) {
+			struct regmap *regs;
+
+			regs = syscon_regmap_lookup_by_phandle(np,
+							       "mediatek,wed-pcie");
+			if (IS_ERR(regs))
+				break;
 
-		regmap_update_bits(regs, 0, BIT(0), BIT(0));
+			regmap_update_bits(regs, 0, BIT(0), BIT(0));
+		}
+
+		if (dev->wlan.msi) {
+			wed_w32(dev, MTK_WED_PCIE_CFG_INTM,
+				dev->hw->pcie_base | 0xc08);
+			wed_w32(dev, MTK_WED_PCIE_CFG_BASE,
+				dev->hw->pcie_base | 0xc04);
+			wed_w32(dev, MTK_WED_PCIE_INT_TRIGGER, BIT(8));
+		} else {
+			wed_w32(dev, MTK_WED_PCIE_CFG_INTM,
+				dev->hw->pcie_base | 0x180);
+			wed_w32(dev, MTK_WED_PCIE_CFG_BASE,
+				dev->hw->pcie_base | 0x184);
+			wed_w32(dev, MTK_WED_PCIE_INT_TRIGGER, BIT(24));
+		}
 
 		wed_w32(dev, MTK_WED_PCIE_INT_CTRL,
 			FIELD_PREP(MTK_WED_PCIE_INT_CTRL_POLL_EN, 2));
@@ -723,19 +794,9 @@ mtk_wed_bus_init(struct mtk_wed_device *dev)
 		/* pcie interrupt control: pola/source selection */
 		wed_set(dev, MTK_WED_PCIE_INT_CTRL,
 			MTK_WED_PCIE_INT_CTRL_MSK_EN_POLA |
-			FIELD_PREP(MTK_WED_PCIE_INT_CTRL_SRC_SEL, 1));
-		wed_r32(dev, MTK_WED_PCIE_INT_CTRL);
-
-		wed_w32(dev, MTK_WED_PCIE_CFG_INTM, PCIE_BASE_ADDR0 | 0x180);
-		wed_w32(dev, MTK_WED_PCIE_CFG_BASE, PCIE_BASE_ADDR0 | 0x184);
-
-		/* pcie interrupt status trigger register */
-		wed_w32(dev, MTK_WED_PCIE_INT_TRIGGER, BIT(24));
-		wed_r32(dev, MTK_WED_PCIE_INT_TRIGGER);
-
-		/* pola setting */
-		wed_set(dev, MTK_WED_PCIE_INT_CTRL,
-			MTK_WED_PCIE_INT_CTRL_MSK_EN_POLA);
+			MTK_WED_PCIE_INT_CTRL_MSK_IRQ_FILTER  |
+			FIELD_PREP(MTK_WED_PCIE_INT_CTRL_SRC_SEL,
+				   dev->hw->index));
 		break;
 	}
 	case MTK_WED_BUS_AXI:
@@ -773,18 +834,19 @@ mtk_wed_set_wpdma(struct mtk_wed_device *dev)
 static void
 mtk_wed_hw_init_early(struct mtk_wed_device *dev)
 {
-	u32 mask, set;
+	u32 set = FIELD_PREP(MTK_WED_WDMA_GLO_CFG_BT_SIZE, 2);
+	u32 mask = MTK_WED_WDMA_GLO_CFG_BT_SIZE;
 
 	mtk_wed_deinit(dev);
 	mtk_wed_reset(dev, MTK_WED_RESET_WED);
 	mtk_wed_set_wpdma(dev);
 
-	mask = MTK_WED_WDMA_GLO_CFG_BT_SIZE |
-	       MTK_WED_WDMA_GLO_CFG_DYNAMIC_DMAD_RECYCLE |
-	       MTK_WED_WDMA_GLO_CFG_RX_DIS_FSM_AUTO_IDLE;
-	set = FIELD_PREP(MTK_WED_WDMA_GLO_CFG_BT_SIZE, 2) |
-	      MTK_WED_WDMA_GLO_CFG_DYNAMIC_SKIP_DMAD_PREP |
-	      MTK_WED_WDMA_GLO_CFG_IDLE_DMAD_SUPPLY;
+	if (!mtk_wed_is_v3_or_greater(dev->hw)) {
+		mask |= MTK_WED_WDMA_GLO_CFG_DYNAMIC_DMAD_RECYCLE |
+			MTK_WED_WDMA_GLO_CFG_RX_DIS_FSM_AUTO_IDLE;
+		set |= MTK_WED_WDMA_GLO_CFG_DYNAMIC_SKIP_DMAD_PREP |
+		       MTK_WED_WDMA_GLO_CFG_IDLE_DMAD_SUPPLY;
+	}
 	wed_m32(dev, MTK_WED_WDMA_GLO_CFG, mask, set);
 
 	if (mtk_wed_is_v1(dev->hw)) {
@@ -932,11 +994,18 @@ mtk_wed_route_qm_hw_init(struct mtk_wed_device *dev)
 	}
 
 	/* configure RX_ROUTE_QM */
-	wed_clr(dev, MTK_WED_RTQM_GLO_CFG, MTK_WED_RTQM_Q_RST);
-	wed_clr(dev, MTK_WED_RTQM_GLO_CFG, MTK_WED_RTQM_TXDMAD_FPORT);
-	wed_set(dev, MTK_WED_RTQM_GLO_CFG,
-		FIELD_PREP(MTK_WED_RTQM_TXDMAD_FPORT, 0x3 + dev->hw->index));
-	wed_clr(dev, MTK_WED_RTQM_GLO_CFG, MTK_WED_RTQM_Q_RST);
+	if (mtk_wed_is_v2(dev->hw)) {
+		wed_clr(dev, MTK_WED_RTQM_GLO_CFG, MTK_WED_RTQM_Q_RST);
+		wed_clr(dev, MTK_WED_RTQM_GLO_CFG, MTK_WED_RTQM_TXDMAD_FPORT);
+		wed_set(dev, MTK_WED_RTQM_GLO_CFG,
+			FIELD_PREP(MTK_WED_RTQM_TXDMAD_FPORT,
+				   0x3 + dev->hw->index));
+		wed_clr(dev, MTK_WED_RTQM_GLO_CFG, MTK_WED_RTQM_Q_RST);
+	} else {
+		wed_set(dev, MTK_WED_RTQM_ENQ_CFG0,
+			FIELD_PREP(MTK_WED_RTQM_ENQ_CFG_TXDMAD_FPORT,
+				   0x3 + dev->hw->index));
+	}
 	/* enable RX_ROUTE_QM */
 	wed_set(dev, MTK_WED_CTRL, MTK_WED_CTRL_RX_ROUTE_QM_EN);
 }
@@ -949,22 +1018,30 @@ mtk_wed_hw_init(struct mtk_wed_device *dev)
 
 	dev->init_done = true;
 	mtk_wed_set_ext_int(dev, false);
-	wed_w32(dev, MTK_WED_TX_BM_CTRL,
-		MTK_WED_TX_BM_CTRL_PAUSE |
-		FIELD_PREP(MTK_WED_TX_BM_CTRL_VLD_GRP_NUM,
-			   dev->tx_buf_ring.size / 128) |
-		FIELD_PREP(MTK_WED_TX_BM_CTRL_RSV_GRP_NUM,
-			   MTK_WED_TX_RING_SIZE / 256));
 
 	wed_w32(dev, MTK_WED_TX_BM_BASE, dev->tx_buf_ring.desc_phys);
-
 	wed_w32(dev, MTK_WED_TX_BM_BUF_LEN, MTK_WED_PKT_SIZE);
 
 	if (mtk_wed_is_v1(dev->hw)) {
+		wed_w32(dev, MTK_WED_TX_BM_CTRL,
+			MTK_WED_TX_BM_CTRL_PAUSE |
+			FIELD_PREP(MTK_WED_TX_BM_CTRL_VLD_GRP_NUM,
+				   dev->tx_buf_ring.size / 128) |
+			FIELD_PREP(MTK_WED_TX_BM_CTRL_RSV_GRP_NUM,
+				   MTK_WED_TX_RING_SIZE / 256));
 		wed_w32(dev, MTK_WED_TX_BM_DYN_THR,
 			FIELD_PREP(MTK_WED_TX_BM_DYN_THR_LO, 1) |
 			MTK_WED_TX_BM_DYN_THR_HI);
-	} else {
+	} else if (mtk_wed_is_v2(dev->hw)) {
+		wed_w32(dev, MTK_WED_TX_BM_CTRL,
+			MTK_WED_TX_BM_CTRL_PAUSE |
+			FIELD_PREP(MTK_WED_TX_BM_CTRL_VLD_GRP_NUM,
+				   dev->tx_buf_ring.size / 128) |
+			FIELD_PREP(MTK_WED_TX_BM_CTRL_RSV_GRP_NUM,
+				   MTK_WED_TX_RING_SIZE / 256));
+		wed_w32(dev, MTK_WED_TX_TKID_DYN_THR,
+			FIELD_PREP(MTK_WED_TX_TKID_DYN_THR_LO, 0) |
+			MTK_WED_TX_TKID_DYN_THR_HI);
 		wed_w32(dev, MTK_WED_TX_BM_DYN_THR,
 			FIELD_PREP(MTK_WED_TX_BM_DYN_THR_LO_V2, 0) |
 			MTK_WED_TX_BM_DYN_THR_HI_V2);
@@ -974,9 +1051,6 @@ mtk_wed_hw_init(struct mtk_wed_device *dev)
 				   dev->tx_buf_ring.size / 128) |
 			FIELD_PREP(MTK_WED_TX_TKID_CTRL_RSV_GRP_NUM,
 				   dev->tx_buf_ring.size / 128));
-		wed_w32(dev, MTK_WED_TX_TKID_DYN_THR,
-			FIELD_PREP(MTK_WED_TX_TKID_DYN_THR_LO, 0) |
-			MTK_WED_TX_TKID_DYN_THR_HI);
 	}
 
 	wed_w32(dev, dev->hw->soc->regmap.tx_bm_tkid,
@@ -986,26 +1060,62 @@ mtk_wed_hw_init(struct mtk_wed_device *dev)
 
 	mtk_wed_reset(dev, MTK_WED_RESET_TX_BM);
 
+	if (mtk_wed_is_v3_or_greater(dev->hw)) {
+		/* switch to new bm architecture */
+		wed_clr(dev, MTK_WED_TX_BM_CTRL,
+			MTK_WED_TX_BM_CTRL_LEGACY_EN);
+
+		wed_w32(dev, MTK_WED_TX_TKID_CTRL,
+			MTK_WED_TX_TKID_CTRL_PAUSE |
+			FIELD_PREP(MTK_WED_TX_TKID_CTRL_VLD_GRP_NUM_V3,
+				   dev->wlan.nbuf / 128) |
+			FIELD_PREP(MTK_WED_TX_TKID_CTRL_RSV_GRP_NUM_V3,
+				   dev->wlan.nbuf / 128));
+		/* return SKBID + SDP back to bm */
+		wed_set(dev, MTK_WED_TX_TKID_CTRL,
+			MTK_WED_TX_TKID_CTRL_FREE_FORMAT);
+
+		wed_w32(dev, MTK_WED_TX_BM_INIT_PTR,
+			MTK_WED_TX_BM_PKT_CNT |
+			MTK_WED_TX_BM_INIT_SW_TAIL_IDX);
+	}
+
 	if (mtk_wed_is_v1(dev->hw)) {
 		wed_set(dev, MTK_WED_CTRL,
 			MTK_WED_CTRL_WED_TX_BM_EN |
 			MTK_WED_CTRL_WED_TX_FREE_AGENT_EN);
-	} else {
-		wed_clr(dev, MTK_WED_TX_TKID_CTRL, MTK_WED_TX_TKID_CTRL_PAUSE);
-		if (mtk_wed_get_rx_capa(dev)) {
-			/* rx hw init */
-			wed_w32(dev, MTK_WED_WPDMA_RX_D_RST_IDX,
-				MTK_WED_WPDMA_RX_D_RST_CRX_IDX |
-				MTK_WED_WPDMA_RX_D_RST_DRV_IDX);
-			wed_w32(dev, MTK_WED_WPDMA_RX_D_RST_IDX, 0);
-
-			mtk_wed_rx_buffer_hw_init(dev);
-			mtk_wed_rro_hw_init(dev);
-			mtk_wed_route_qm_hw_init(dev);
-		}
+	} else if (mtk_wed_get_rx_capa(dev)) {
+		/* rx hw init */
+		wed_w32(dev, MTK_WED_WPDMA_RX_D_RST_IDX,
+			MTK_WED_WPDMA_RX_D_RST_CRX_IDX |
+			MTK_WED_WPDMA_RX_D_RST_DRV_IDX);
+		wed_w32(dev, MTK_WED_WPDMA_RX_D_RST_IDX, 0);
+
+		/* reset prefetch index of ring */
+		wed_set(dev, MTK_WED_WPDMA_RX_D_PREF_RX0_SIDX,
+			MTK_WED_WPDMA_RX_D_PREF_SIDX_IDX_CLR);
+		wed_clr(dev, MTK_WED_WPDMA_RX_D_PREF_RX0_SIDX,
+			MTK_WED_WPDMA_RX_D_PREF_SIDX_IDX_CLR);
+
+		wed_set(dev, MTK_WED_WPDMA_RX_D_PREF_RX1_SIDX,
+			MTK_WED_WPDMA_RX_D_PREF_SIDX_IDX_CLR);
+		wed_clr(dev, MTK_WED_WPDMA_RX_D_PREF_RX1_SIDX,
+			MTK_WED_WPDMA_RX_D_PREF_SIDX_IDX_CLR);
+
+		/* reset prefetch FIFO of ring */
+		wed_set(dev, MTK_WED_WPDMA_RX_D_PREF_FIFO_CFG,
+			MTK_WED_WPDMA_RX_D_PREF_FIFO_CFG_R0_CLR |
+			MTK_WED_WPDMA_RX_D_PREF_FIFO_CFG_R1_CLR);
+		wed_w32(dev, MTK_WED_WPDMA_RX_D_PREF_FIFO_CFG, 0);
+
+		mtk_wed_rx_buffer_hw_init(dev);
+		mtk_wed_rro_hw_init(dev);
+		mtk_wed_route_qm_hw_init(dev);
 	}
 
 	wed_clr(dev, MTK_WED_TX_BM_CTRL, MTK_WED_TX_BM_CTRL_PAUSE);
+	if (!mtk_wed_is_v1(dev->hw))
+		wed_clr(dev, MTK_WED_TX_TKID_CTRL, MTK_WED_TX_TKID_CTRL_PAUSE);
 }
 
 static void
@@ -1303,6 +1413,24 @@ mtk_wed_wdma_tx_ring_setup(struct mtk_wed_device *dev, int idx, int size,
 					 dev->hw->soc->wdma_desc_size, true))
 		return -ENOMEM;
 
+	if (mtk_wed_is_v3_or_greater(dev->hw)) {
+		struct mtk_wdma_desc *desc = wdma->desc;
+		int i;
+
+		for (i = 0; i < MTK_WED_WDMA_RING_SIZE; i++) {
+			desc->buf0 = 0;
+			desc->ctrl = cpu_to_le32(MTK_WDMA_DESC_CTRL_DMA_DONE);
+			desc->buf1 = 0;
+			desc->info = cpu_to_le32(MTK_WDMA_TXD0_DESC_INFO_DMA_DONE);
+			desc++;
+			desc->buf0 = 0;
+			desc->ctrl = cpu_to_le32(MTK_WDMA_DESC_CTRL_DMA_DONE);
+			desc->buf1 = 0;
+			desc->info = cpu_to_le32(MTK_WDMA_TXD1_DESC_INFO_DMA_DONE);
+			desc++;
+		}
+	}
+
 	wdma_w32(dev, MTK_WDMA_RING_TX(idx) + MTK_WED_RING_OFS_BASE,
 		 wdma->desc_phys);
 	wdma_w32(dev, MTK_WDMA_RING_TX(idx) + MTK_WED_RING_OFS_COUNT,
@@ -1368,6 +1496,9 @@ mtk_wed_configure_irq(struct mtk_wed_device *dev, u32 irq_mask)
 
 		wed_clr(dev, MTK_WED_WDMA_INT_CTRL, wdma_mask);
 	} else {
+		if (mtk_wed_is_v3_or_greater(dev->hw))
+			wed_set(dev, MTK_WED_CTRL, MTK_WED_CTRL_TX_TKID_ALI_EN);
+
 		/* initail tx interrupt trigger */
 		wed_w32(dev, MTK_WED_WPDMA_INT_CTRL_TX,
 			MTK_WED_WPDMA_INT_CTRL_TX0_DONE_EN |
@@ -1420,33 +1551,60 @@ mtk_wed_dma_enable(struct mtk_wed_device *dev)
 {
 	int i;
 
-	wed_set(dev, MTK_WED_WPDMA_INT_CTRL, MTK_WED_WPDMA_INT_CTRL_SUBRT_ADV);
+	if (!mtk_wed_is_v3_or_greater(dev->hw)) {
+		wed_set(dev, MTK_WED_WPDMA_INT_CTRL,
+			MTK_WED_WPDMA_INT_CTRL_SUBRT_ADV);
+		wed_set(dev, MTK_WED_WPDMA_GLO_CFG,
+			MTK_WED_WPDMA_GLO_CFG_TX_DRV_EN |
+			MTK_WED_WPDMA_GLO_CFG_RX_DRV_EN);
+		wdma_set(dev, MTK_WDMA_GLO_CFG,
+			 MTK_WDMA_GLO_CFG_TX_DMA_EN |
+			 MTK_WDMA_GLO_CFG_RX_INFO1_PRERES |
+			 MTK_WDMA_GLO_CFG_RX_INFO2_PRERES);
+		wed_set(dev, MTK_WED_WPDMA_CTRL, MTK_WED_WPDMA_CTRL_SDL1_FIXED);
+	} else {
+		wed_set(dev, MTK_WED_WPDMA_GLO_CFG,
+			MTK_WED_WPDMA_GLO_CFG_TX_DRV_EN |
+			MTK_WED_WPDMA_GLO_CFG_RX_DRV_EN |
+			MTK_WED_WPDMA_GLO_CFG_RX_DDONE2_WR);
+		wdma_set(dev, MTK_WDMA_GLO_CFG, MTK_WDMA_GLO_CFG_TX_DMA_EN);
+	}
 
 	wed_set(dev, MTK_WED_GLO_CFG,
 		MTK_WED_GLO_CFG_TX_DMA_EN |
 		MTK_WED_GLO_CFG_RX_DMA_EN);
-	wed_set(dev, MTK_WED_WPDMA_GLO_CFG,
-		MTK_WED_WPDMA_GLO_CFG_TX_DRV_EN |
-		MTK_WED_WPDMA_GLO_CFG_RX_DRV_EN);
+
 	wed_set(dev, MTK_WED_WDMA_GLO_CFG,
 		MTK_WED_WDMA_GLO_CFG_RX_DRV_EN);
 
-	wdma_set(dev, MTK_WDMA_GLO_CFG,
-		 MTK_WDMA_GLO_CFG_TX_DMA_EN |
-		 MTK_WDMA_GLO_CFG_RX_INFO1_PRERES |
-		 MTK_WDMA_GLO_CFG_RX_INFO2_PRERES);
-
 	if (mtk_wed_is_v1(dev->hw)) {
 		wdma_set(dev, MTK_WDMA_GLO_CFG,
 			 MTK_WDMA_GLO_CFG_RX_INFO3_PRERES);
 		return;
 	}
 
-	wed_set(dev, MTK_WED_WPDMA_CTRL,
-		MTK_WED_WPDMA_CTRL_SDL1_FIXED);
 	wed_set(dev, MTK_WED_WPDMA_GLO_CFG,
 		MTK_WED_WPDMA_GLO_CFG_RX_DRV_R0_PKT_PROC |
 		MTK_WED_WPDMA_GLO_CFG_RX_DRV_R0_CRX_SYNC);
+
+	if (mtk_wed_is_v3_or_greater(dev->hw)) {
+		wed_set(dev, MTK_WED_WDMA_RX_PREF_CFG,
+			FIELD_PREP(MTK_WED_WDMA_RX_PREF_BURST_SIZE, 0x10) |
+			FIELD_PREP(MTK_WED_WDMA_RX_PREF_LOW_THRES, 0x8));
+		wed_clr(dev, MTK_WED_WDMA_RX_PREF_CFG,
+			MTK_WED_WDMA_RX_PREF_DDONE2_EN);
+		wed_set(dev, MTK_WED_WDMA_RX_PREF_CFG, MTK_WED_WDMA_RX_PREF_EN);
+
+		wed_clr(dev, MTK_WED_WPDMA_GLO_CFG,
+			MTK_WED_WPDMA_GLO_CFG_TX_DDONE_CHK_LAST);
+		wed_set(dev, MTK_WED_WPDMA_GLO_CFG,
+			MTK_WED_WPDMA_GLO_CFG_TX_DDONE_CHK |
+			MTK_WED_WPDMA_GLO_CFG_RX_DRV_EVENT_PKT_FMT_CHK |
+			MTK_WED_WPDMA_GLO_CFG_RX_DRV_UNS_VER_FORCE_4);
+
+		wdma_set(dev, MTK_WDMA_PREF_RX_CFG, MTK_WDMA_PREF_RX_CFG_PREF_EN);
+	}
+
 	wed_clr(dev, MTK_WED_WPDMA_GLO_CFG,
 		MTK_WED_WPDMA_GLO_CFG_TX_TKID_KEEP |
 		MTK_WED_WPDMA_GLO_CFG_TX_DMAD_DW3_PREV);
@@ -1458,11 +1616,22 @@ mtk_wed_dma_enable(struct mtk_wed_device *dev)
 		MTK_WED_WDMA_GLO_CFG_TX_DRV_EN |
 		MTK_WED_WDMA_GLO_CFG_TX_DDONE_CHK);
 
+	wed_clr(dev, MTK_WED_WPDMA_RX_D_GLO_CFG, MTK_WED_WPDMA_RX_D_RXD_READ_LEN);
 	wed_set(dev, MTK_WED_WPDMA_RX_D_GLO_CFG,
 		MTK_WED_WPDMA_RX_D_RX_DRV_EN |
 		FIELD_PREP(MTK_WED_WPDMA_RX_D_RXD_READ_LEN, 0x18) |
-		FIELD_PREP(MTK_WED_WPDMA_RX_D_INIT_PHASE_RXEN_SEL,
-			   0x2));
+		FIELD_PREP(MTK_WED_WPDMA_RX_D_INIT_PHASE_RXEN_SEL, 0x2));
+
+	if (mtk_wed_is_v3_or_greater(dev->hw)) {
+		wed_set(dev, MTK_WED_WPDMA_RX_D_PREF_CFG,
+			MTK_WED_WPDMA_RX_D_PREF_EN |
+			FIELD_PREP(MTK_WED_WPDMA_RX_D_PREF_BURST_SIZE, 0x10) |
+			FIELD_PREP(MTK_WED_WPDMA_RX_D_PREF_LOW_THRES, 0x8));
+
+		wed_set(dev, MTK_WED_RRO_RX_D_CFG(2), MTK_WED_RRO_RX_D_DRV_EN);
+		wdma_set(dev, MTK_WDMA_PREF_TX_CFG, MTK_WDMA_PREF_TX_CFG_PREF_EN);
+		wdma_set(dev, MTK_WDMA_WRBK_TX_CFG, MTK_WDMA_WRBK_TX_CFG_WRBK_EN);
+	}
 
 	for (i = 0; i < MTK_WED_RX_QUEUES; i++)
 		mtk_wed_check_wfdma_rx_fill(dev, i);
@@ -1502,6 +1671,12 @@ mtk_wed_start(struct mtk_wed_device *dev, u32 irq_mask)
 		wed_r32(dev, MTK_WED_EXT_INT_MASK1);
 		wed_r32(dev, MTK_WED_EXT_INT_MASK2);
 
+		if (mtk_wed_is_v3_or_greater(dev->hw)) {
+			wed_w32(dev, MTK_WED_EXT_INT_MASK3,
+				MTK_WED_EXT_INT_STATUS_WPDMA_MID_RDY);
+			wed_r32(dev, MTK_WED_EXT_INT_MASK3);
+		}
+
 		if (mtk_wed_rro_cfg(dev))
 			return;
 	}
@@ -1553,6 +1728,7 @@ mtk_wed_attach(struct mtk_wed_device *dev)
 	dev->irq = hw->irq;
 	dev->wdma_idx = hw->index;
 	dev->version = hw->version;
+	dev->hw->pcie_base = mtk_wed_get_pcie_base(dev);
 
 	if (hw->eth->dma_dev == hw->eth->dev &&
 	    of_dma_is_coherent(hw->eth->dev->of_node))
@@ -1620,6 +1796,23 @@ mtk_wed_tx_ring_setup(struct mtk_wed_device *dev, int idx, void __iomem *regs,
 	ring->reg_base = MTK_WED_RING_TX(idx);
 	ring->wpdma = regs;
 
+	if (mtk_wed_is_v3_or_greater(dev->hw) && idx == 1) {
+		/* reset prefetch index */
+		wed_set(dev, MTK_WED_WDMA_RX_PREF_CFG,
+			MTK_WED_WDMA_RX_PREF_RX0_SIDX_CLR |
+			MTK_WED_WDMA_RX_PREF_RX1_SIDX_CLR);
+
+		wed_clr(dev, MTK_WED_WDMA_RX_PREF_CFG,
+			MTK_WED_WDMA_RX_PREF_RX0_SIDX_CLR |
+			MTK_WED_WDMA_RX_PREF_RX1_SIDX_CLR);
+
+		/* reset prefetch FIFO */
+		wed_w32(dev, MTK_WED_WDMA_RX_PREF_FIFO_CFG,
+			MTK_WED_WDMA_RX_PREF_FIFO_RX0_CLR |
+			MTK_WED_WDMA_RX_PREF_FIFO_RX1_CLR);
+		wed_w32(dev, MTK_WED_WDMA_RX_PREF_FIFO_CFG, 0);
+	}
+
 	/* WED -> WPDMA */
 	wpdma_tx_w32(dev, idx, MTK_WED_RING_OFS_BASE, ring->desc_phys);
 	wpdma_tx_w32(dev, idx, MTK_WED_RING_OFS_COUNT, MTK_WED_TX_RING_SIZE);
@@ -1694,15 +1887,13 @@ mtk_wed_rx_ring_setup(struct mtk_wed_device *dev, int idx, void __iomem *regs,
 static u32
 mtk_wed_irq_get(struct mtk_wed_device *dev, u32 mask)
 {
-	u32 val, ext_mask = MTK_WED_EXT_INT_STATUS_ERROR_MASK;
+	u32 val, ext_mask;
 
-	if (mtk_wed_is_v1(dev->hw))
-		ext_mask |= MTK_WED_EXT_INT_STATUS_TX_DRV_R_RESP_ERR;
+	if (mtk_wed_is_v3_or_greater(dev->hw))
+		ext_mask = MTK_WED_EXT_INT_STATUS_RX_DRV_COHERENT |
+			   MTK_WED_EXT_INT_STATUS_TKID_WO_PYLD;
 	else
-		ext_mask |= MTK_WED_EXT_INT_STATUS_RX_FBUF_LO_TH |
-			    MTK_WED_EXT_INT_STATUS_RX_FBUF_HI_TH |
-			    MTK_WED_EXT_INT_STATUS_RX_DRV_COHERENT |
-			    MTK_WED_EXT_INT_STATUS_TX_DMA_W_RESP_ERR;
+		ext_mask = MTK_WED_EXT_INT_STATUS_ERROR_MASK;
 
 	val = wed_r32(dev, MTK_WED_EXT_INT_STATUS);
 	wed_w32(dev, MTK_WED_EXT_INT_STATUS, val);
@@ -1943,6 +2134,9 @@ void mtk_wed_add_hw(struct device_node *np, struct mtk_eth *eth,
 	case 2:
 		hw->soc = &mt7986_data;
 		break;
+	case 3:
+		hw->soc = &mt7988_data;
+		break;
 	default:
 	case 1:
 		hw->mirror = syscon_regmap_lookup_by_phandle(eth_np,
diff --git a/drivers/net/ethernet/mediatek/mtk_wed.h b/drivers/net/ethernet/mediatek/mtk_wed.h
index afaf5a46fbb3..27d336db4d4d 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed.h
+++ b/drivers/net/ethernet/mediatek/mtk_wed.h
@@ -9,6 +9,8 @@
 #include <linux/regmap.h>
 #include <linux/netdevice.h>
 
+#include "mtk_wed_regs.h"
+
 struct mtk_eth;
 struct mtk_wed_wo;
 
@@ -19,6 +21,7 @@ struct mtk_wed_soc_data {
 		u32 reset_idx_tx_mask;
 		u32 reset_idx_rx_mask;
 	} regmap;
+	u32 tx_ring_desc_size;
 	u32 wdma_desc_size;
 };
 
@@ -35,6 +38,7 @@ struct mtk_wed_hw {
 	struct dentry *debugfs_dir;
 	struct mtk_wed_device *wed_dev;
 	struct mtk_wed_wo *wed_wo;
+	u32 pcie_base;
 	u32 debugfs_reg;
 	u32 num_flows;
 	u8 version;
@@ -61,6 +65,16 @@ static inline bool mtk_wed_is_v2(struct mtk_wed_hw *hw)
 	return hw->version == 2;
 }
 
+static inline bool mtk_wed_is_v3(struct mtk_wed_hw *hw)
+{
+	return hw->version == 3;
+}
+
+static inline bool mtk_wed_is_v3_or_greater(struct mtk_wed_hw *hw)
+{
+	return hw->version > 2;
+}
+
 static inline void
 wed_w32(struct mtk_wed_device *dev, u32 reg, u32 val)
 {
@@ -143,6 +157,21 @@ wpdma_txfree_w32(struct mtk_wed_device *dev, u32 reg, u32 val)
 	writel(val, dev->txfree_ring.wpdma + reg);
 }
 
+static inline u32 mtk_wed_get_pcie_base(struct mtk_wed_device *dev)
+{
+	if (!mtk_wed_is_v3_or_greater(dev->hw))
+		return MTK_WED_PCIE_BASE;
+
+	switch (dev->hw->index) {
+	case 1:
+		return MTK_WED_PCIE_BASE1;
+	case 2:
+		return MTK_WED_PCIE_BASE2;
+	default:
+		return MTK_WED_PCIE_BASE0;
+	}
+}
+
 void mtk_wed_add_hw(struct device_node *np, struct mtk_eth *eth,
 		    void __iomem *wdma, phys_addr_t wdma_phy,
 		    int index);
diff --git a/drivers/net/ethernet/mediatek/mtk_wed_mcu.c b/drivers/net/ethernet/mediatek/mtk_wed_mcu.c
index e53531252bd9..65a78e274009 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed_mcu.c
+++ b/drivers/net/ethernet/mediatek/mtk_wed_mcu.c
@@ -331,10 +331,22 @@ mtk_wed_mcu_load_firmware(struct mtk_wed_wo *wo)
 		wo->hw->index + 1);
 
 	/* load firmware */
-	if (of_device_is_compatible(wo->hw->node, "mediatek,mt7981-wed"))
-		fw_name = MT7981_FIRMWARE_WO;
-	else
-		fw_name = wo->hw->index ? MT7986_FIRMWARE_WO1 : MT7986_FIRMWARE_WO0;
+	switch (wo->hw->version) {
+	case 2:
+		if (of_device_is_compatible(wo->hw->node,
+					    "mediatek,mt7981-wed"))
+			fw_name = MT7981_FIRMWARE_WO;
+		else
+			fw_name = wo->hw->index ? MT7986_FIRMWARE_WO1
+						: MT7986_FIRMWARE_WO0;
+		break;
+	case 3:
+		fw_name = wo->hw->index ? MT7988_FIRMWARE_WO1
+					: MT7988_FIRMWARE_WO0;
+		break;
+	default:
+		return -EINVAL;
+	}
 
 	ret = request_firmware(&fw, fw_name, wo->hw->dev);
 	if (ret)
@@ -355,15 +367,16 @@ mtk_wed_mcu_load_firmware(struct mtk_wed_wo *wo)
 	}
 
 	/* set the start address */
-	boot_cr = wo->hw->index ? MTK_WO_MCU_CFG_LS_WA_BOOT_ADDR_ADDR
-				: MTK_WO_MCU_CFG_LS_WM_BOOT_ADDR_ADDR;
+	if (!mtk_wed_is_v3_or_greater(wo->hw) && wo->hw->index)
+		boot_cr = MTK_WO_MCU_CFG_LS_WA_BOOT_ADDR_ADDR;
+	else
+		boot_cr = MTK_WO_MCU_CFG_LS_WM_BOOT_ADDR_ADDR;
 	wo_w32(wo, boot_cr, mem_region[MTK_WED_WO_REGION_EMI].phy_addr >> 16);
 	/* wo firmware reset */
 	wo_w32(wo, MTK_WO_MCU_CFG_LS_WF_MCCR_CLR_ADDR, 0xc00);
 
-	val = wo_r32(wo, MTK_WO_MCU_CFG_LS_WF_MCU_CFG_WM_WA_ADDR);
-	val |= wo->hw->index ? MTK_WO_MCU_CFG_LS_WF_WM_WA_WA_CPU_RSTB_MASK
-			     : MTK_WO_MCU_CFG_LS_WF_WM_WA_WM_CPU_RSTB_MASK;
+	val = wo_r32(wo, MTK_WO_MCU_CFG_LS_WF_MCU_CFG_WM_WA_ADDR) |
+	      MTK_WO_MCU_CFG_LS_WF_WM_WA_WM_CPU_RSTB_MASK;
 	wo_w32(wo, MTK_WO_MCU_CFG_LS_WF_MCU_CFG_WM_WA_ADDR, val);
 out:
 	release_firmware(fw);
@@ -398,3 +411,5 @@ int mtk_wed_mcu_init(struct mtk_wed_wo *wo)
 MODULE_FIRMWARE(MT7981_FIRMWARE_WO);
 MODULE_FIRMWARE(MT7986_FIRMWARE_WO0);
 MODULE_FIRMWARE(MT7986_FIRMWARE_WO1);
+MODULE_FIRMWARE(MT7988_FIRMWARE_WO0);
+MODULE_FIRMWARE(MT7988_FIRMWARE_WO1);
diff --git a/drivers/net/ethernet/mediatek/mtk_wed_regs.h b/drivers/net/ethernet/mediatek/mtk_wed_regs.h
index 2253f4eb5bc1..a4d3cf64d090 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed_regs.h
+++ b/drivers/net/ethernet/mediatek/mtk_wed_regs.h
@@ -13,6 +13,9 @@
 #define MTK_WDMA_DESC_CTRL_LAST_SEG0		BIT(30)
 #define MTK_WDMA_DESC_CTRL_DMA_DONE		BIT(31)
 
+#define MTK_WDMA_TXD0_DESC_INFO_DMA_DONE	BIT(29)
+#define MTK_WDMA_TXD1_DESC_INFO_DMA_DONE	BIT(31)
+
 struct mtk_wdma_desc {
 	__le32 buf0;
 	__le32 ctrl;
@@ -37,6 +40,7 @@ struct mtk_wdma_desc {
 #define MTK_WED_RESET_WDMA_INT_AGENT			BIT(19)
 #define MTK_WED_RESET_RX_RRO_QM				BIT(20)
 #define MTK_WED_RESET_RX_ROUTE_QM			BIT(21)
+#define MTK_WED_RESET_TX_AMSDU				BIT(22)
 #define MTK_WED_RESET_WED				BIT(31)
 
 #define MTK_WED_CTRL					0x00c
@@ -44,6 +48,9 @@ struct mtk_wdma_desc {
 #define MTK_WED_CTRL_WPDMA_INT_AGENT_BUSY		BIT(1)
 #define MTK_WED_CTRL_WDMA_INT_AGENT_EN			BIT(2)
 #define MTK_WED_CTRL_WDMA_INT_AGENT_BUSY		BIT(3)
+#define MTK_WED_CTRL_WED_RX_IND_CMD_EN			BIT(5)
+#define MTK_WED_CTRL_WED_RX_PG_BM_EN			BIT(6)
+#define MTK_WED_CTRL_WED_RX_PG_BM_BUSY			BIT(7)
 #define MTK_WED_CTRL_WED_TX_BM_EN			BIT(8)
 #define MTK_WED_CTRL_WED_TX_BM_BUSY			BIT(9)
 #define MTK_WED_CTRL_WED_TX_FREE_AGENT_EN		BIT(10)
@@ -54,9 +61,14 @@ struct mtk_wdma_desc {
 #define MTK_WED_CTRL_RX_RRO_QM_BUSY			BIT(15)
 #define MTK_WED_CTRL_RX_ROUTE_QM_EN			BIT(16)
 #define MTK_WED_CTRL_RX_ROUTE_QM_BUSY			BIT(17)
+#define MTK_WED_CTRL_TX_TKID_ALI_EN			BIT(20)
+#define MTK_WED_CTRL_TX_TKID_ALI_BUSY			BIT(21)
+#define MTK_WED_CTRL_TX_AMSDU_EN			BIT(22)
+#define MTK_WED_CTRL_TX_AMSDU_BUSY			BIT(23)
 #define MTK_WED_CTRL_FINAL_DIDX_READ			BIT(24)
 #define MTK_WED_CTRL_ETH_DMAD_FMT			BIT(25)
 #define MTK_WED_CTRL_MIB_READ_CLEAR			BIT(28)
+#define MTK_WED_CTRL_FLD_MIB_RD_CLR			BIT(28)
 
 #define MTK_WED_EXT_INT_STATUS				0x020
 #define MTK_WED_EXT_INT_STATUS_TF_LEN_ERR		BIT(0)
@@ -89,6 +101,7 @@ struct mtk_wdma_desc {
 #define MTK_WED_EXT_INT_MASK				0x028
 #define MTK_WED_EXT_INT_MASK1				0x02c
 #define MTK_WED_EXT_INT_MASK2				0x030
+#define MTK_WED_EXT_INT_MASK3				0x034
 
 #define MTK_WED_STATUS					0x060
 #define MTK_WED_STATUS_TX				GENMASK(15, 8)
@@ -96,9 +109,14 @@ struct mtk_wdma_desc {
 #define MTK_WED_TX_BM_CTRL				0x080
 #define MTK_WED_TX_BM_CTRL_VLD_GRP_NUM			GENMASK(6, 0)
 #define MTK_WED_TX_BM_CTRL_RSV_GRP_NUM			GENMASK(22, 16)
+#define MTK_WED_TX_BM_CTRL_LEGACY_EN			BIT(26)
+#define MTK_WED_TX_TKID_CTRL_FREE_FORMAT		BIT(27)
 #define MTK_WED_TX_BM_CTRL_PAUSE			BIT(28)
 
 #define MTK_WED_TX_BM_BASE				0x084
+#define MTK_WED_TX_BM_INIT_PTR				0x088
+#define MTK_WED_TX_BM_SW_TAIL_IDX			GENMASK(16, 0)
+#define MTK_WED_TX_BM_INIT_SW_TAIL_IDX			BIT(16)
 
 #define MTK_WED_TX_BM_TKID_START			GENMASK(15, 0)
 #define MTK_WED_TX_BM_TKID_END				GENMASK(31, 16)
@@ -122,6 +140,9 @@ struct mtk_wdma_desc {
 #define MTK_WED_TX_TKID_CTRL_RSV_GRP_NUM		GENMASK(22, 16)
 #define MTK_WED_TX_TKID_CTRL_PAUSE			BIT(28)
 
+#define MTK_WED_TX_TKID_CTRL_VLD_GRP_NUM_V3		GENMASK(7, 0)
+#define MTK_WED_TX_TKID_CTRL_RSV_GRP_NUM_V3		GENMASK(23, 16)
+
 #define MTK_WED_TX_TKID_DYN_THR				0x0e0
 #define MTK_WED_TX_TKID_DYN_THR_LO			GENMASK(6, 0)
 #define MTK_WED_TX_TKID_DYN_THR_HI			GENMASK(22, 16)
@@ -199,12 +220,15 @@ struct mtk_wdma_desc {
 #define MTK_WED_WPDMA_GLO_CFG_RX_DRV_R1_PKT_PROC	BIT(5)
 #define MTK_WED_WPDMA_GLO_CFG_RX_DRV_R0_CRX_SYNC	BIT(6)
 #define MTK_WED_WPDMA_GLO_CFG_RX_DRV_R1_CRX_SYNC	BIT(7)
-#define MTK_WED_WPDMA_GLO_CFG_RX_DRV_EVENT_PKT_FMT_VER	GENMASK(18, 16)
+#define MTK_WED_WPDMA_GLO_CFG_RX_DRV_EVENT_PKT_FMT_VER	GENMASK(15, 12)
+#define MTK_WED_WPDMA_GLO_CFG_RX_DRV_UNS_VER_FORCE_4	BIT(18)
 #define MTK_WED_WPDMA_GLO_CFG_RX_DRV_UNSUPPORT_FMT	BIT(19)
-#define MTK_WED_WPDMA_GLO_CFG_RX_DRV_UEVENT_PKT_FMT_CHK BIT(20)
+#define MTK_WED_WPDMA_GLO_CFG_RX_DRV_EVENT_PKT_FMT_CHK	BIT(20)
 #define MTK_WED_WPDMA_GLO_CFG_RX_DDONE2_WR		BIT(21)
 #define MTK_WED_WPDMA_GLO_CFG_TX_TKID_KEEP		BIT(24)
+#define MTK_WED_WPDMA_GLO_CFG_TX_DDONE_CHK_LAST		BIT(25)
 #define MTK_WED_WPDMA_GLO_CFG_TX_DMAD_DW3_PREV		BIT(28)
+#define MTK_WED_WPDMA_GLO_CFG_TX_DDONE_CHK		BIT(30)
 
 #define MTK_WED_WPDMA_RESET_IDX				0x50c
 #define MTK_WED_WPDMA_RESET_IDX_TX			GENMASK(3, 0)
@@ -250,9 +274,10 @@ struct mtk_wdma_desc {
 #define MTK_WED_PCIE_INT_TRIGGER_STATUS			BIT(16)
 
 #define MTK_WED_PCIE_INT_CTRL				0x57c
-#define MTK_WED_PCIE_INT_CTRL_MSK_EN_POLA		BIT(20)
-#define MTK_WED_PCIE_INT_CTRL_SRC_SEL			GENMASK(17, 16)
 #define MTK_WED_PCIE_INT_CTRL_POLL_EN			GENMASK(13, 12)
+#define MTK_WED_PCIE_INT_CTRL_SRC_SEL			GENMASK(17, 16)
+#define MTK_WED_PCIE_INT_CTRL_MSK_EN_POLA		BIT(20)
+#define MTK_WED_PCIE_INT_CTRL_MSK_IRQ_FILTER		BIT(21)
 
 #define MTK_WED_WPDMA_CFG_BASE				0x580
 #define MTK_WED_WPDMA_CFG_INT_MASK			0x584
@@ -286,6 +311,20 @@ struct mtk_wdma_desc {
 #define MTK_WED_WPDMA_RX_D_PROCESSED_MIB(_n)		(0x784 + (_n) * 4)
 #define MTK_WED_WPDMA_RX_D_COHERENT_MIB			0x78c
 
+#define MTK_WED_WPDMA_RX_D_PREF_CFG			0x7b4
+#define MTK_WED_WPDMA_RX_D_PREF_EN			BIT(0)
+#define MTK_WED_WPDMA_RX_D_PREF_BURST_SIZE		GENMASK(12, 8)
+#define MTK_WED_WPDMA_RX_D_PREF_LOW_THRES		GENMASK(21, 16)
+
+#define MTK_WED_WPDMA_RX_D_PREF_RX0_SIDX		0x7b8
+#define MTK_WED_WPDMA_RX_D_PREF_SIDX_IDX_CLR		BIT(15)
+
+#define MTK_WED_WPDMA_RX_D_PREF_RX1_SIDX		0x7bc
+
+#define MTK_WED_WPDMA_RX_D_PREF_FIFO_CFG		0x7c0
+#define MTK_WED_WPDMA_RX_D_PREF_FIFO_CFG_R0_CLR		BIT(0)
+#define MTK_WED_WPDMA_RX_D_PREF_FIFO_CFG_R1_CLR		BIT(16)
+
 #define MTK_WED_WDMA_RING_TX				0x800
 
 #define MTK_WED_WDMA_TX_MIB				0x810
@@ -293,6 +332,18 @@ struct mtk_wdma_desc {
 #define MTK_WED_WDMA_RING_RX(_n)			(0x900 + (_n) * 0x10)
 #define MTK_WED_WDMA_RX_THRES(_n)			(0x940 + (_n) * 0x4)
 
+#define MTK_WED_WDMA_RX_PREF_CFG			0x950
+#define MTK_WED_WDMA_RX_PREF_EN				BIT(0)
+#define MTK_WED_WDMA_RX_PREF_BURST_SIZE			GENMASK(12, 8)
+#define MTK_WED_WDMA_RX_PREF_LOW_THRES			GENMASK(21, 16)
+#define MTK_WED_WDMA_RX_PREF_RX0_SIDX_CLR		BIT(24)
+#define MTK_WED_WDMA_RX_PREF_RX1_SIDX_CLR		BIT(25)
+#define MTK_WED_WDMA_RX_PREF_DDONE2_EN			BIT(26)
+
+#define MTK_WED_WDMA_RX_PREF_FIFO_CFG			0x95C
+#define MTK_WED_WDMA_RX_PREF_FIFO_RX0_CLR		BIT(0)
+#define MTK_WED_WDMA_RX_PREF_FIFO_RX1_CLR		BIT(16)
+
 #define MTK_WED_WDMA_GLO_CFG				0xa04
 #define MTK_WED_WDMA_GLO_CFG_TX_DRV_EN			BIT(0)
 #define MTK_WED_WDMA_GLO_CFG_TX_DDONE_CHK		BIT(1)
@@ -325,6 +376,7 @@ struct mtk_wdma_desc {
 #define MTK_WED_WDMA_INT_TRIGGER_RX_DONE		GENMASK(17, 16)
 
 #define MTK_WED_WDMA_INT_CTRL				0xa2c
+#define MTK_WED_WDMA_INT_POLL_PRD			GENMASK(7, 0)
 #define MTK_WED_WDMA_INT_CTRL_POLL_SRC_SEL		GENMASK(17, 16)
 
 #define MTK_WED_WDMA_CFG_BASE				0xaa0
@@ -388,6 +440,18 @@ struct mtk_wdma_desc {
 #define MTK_WDMA_INT_GRP1				0x250
 #define MTK_WDMA_INT_GRP2				0x254
 
+#define MTK_WDMA_PREF_TX_CFG				0x2d0
+#define MTK_WDMA_PREF_TX_CFG_PREF_EN			BIT(0)
+
+#define MTK_WDMA_PREF_RX_CFG				0x2dc
+#define MTK_WDMA_PREF_RX_CFG_PREF_EN			BIT(0)
+
+#define MTK_WDMA_WRBK_TX_CFG				0x300
+#define MTK_WDMA_WRBK_TX_CFG_WRBK_EN			BIT(30)
+
+#define MTK_WDMA_WRBK_RX_CFG				0x344
+#define MTK_WDMA_WRBK_RX_CFG_WRBK_EN			BIT(30)
+
 #define MTK_PCIE_MIRROR_MAP(n)				((n) ? 0x4 : 0x0)
 #define MTK_PCIE_MIRROR_MAP_EN				BIT(0)
 #define MTK_PCIE_MIRROR_MAP_WED_ID			BIT(1)
@@ -401,6 +465,30 @@ struct mtk_wdma_desc {
 #define MTK_WED_RTQM_Q_DBG_BYPASS			BIT(5)
 #define MTK_WED_RTQM_TXDMAD_FPORT			GENMASK(23, 20)
 
+#define MTK_WED_RTQM_IGRS0_I2HW_DMAD_CNT		0xb1c
+#define MTK_WED_RTQM_IGRS0_I2H_DMAD_CNT(_n)		(0xb20 + (_n) * 0x4)
+#define	MTK_WED_RTQM_IGRS0_I2HW_PKT_CNT			0xb28
+#define MTK_WED_RTQM_IGRS0_I2H_PKT_CNT(_n)		(0xb2c + (_n) * 0x4)
+#define MTK_WED_RTQM_IGRS0_FDROP_CNT			0xb34
+
+#define MTK_WED_RTQM_IGRS1_I2HW_DMAD_CNT		0xb44
+#define MTK_WED_RTQM_IGRS1_I2H_DMAD_CNT(_n)		(0xb48 + (_n) * 0x4)
+#define MTK_WED_RTQM_IGRS1_I2HW_PKT_CNT			0xb50
+#define MTK_WED_RTQM_IGRS1_I2H_PKT_CNT(_n)		(0xb54 + (_n) * 0x4)
+#define MTK_WED_RTQM_IGRS1_FDROP_CNT			0xb5c
+
+#define MTK_WED_RTQM_IGRS2_I2HW_DMAD_CNT		0xb6c
+#define MTK_WED_RTQM_IGRS2_I2H_DMAD_CNT(_n)		(0xb70 + (_n) * 0x4)
+#define MTK_WED_RTQM_IGRS2_I2HW_PKT_CNT			0xb78
+#define MTK_WED_RTQM_IGRS2_I2H_PKT_CNT(_n)		(0xb7c + (_n) * 0x4)
+#define MTK_WED_RTQM_IGRS2_FDROP_CNT			0xb84
+
+#define MTK_WED_RTQM_IGRS3_I2HW_DMAD_CNT		0xb94
+#define MTK_WED_RTQM_IGRS3_I2H_DMAD_CNT(_n)		(0xb98 + (_n) * 0x4)
+#define MTK_WED_RTQM_IGRS3_I2HW_PKT_CNT			0xba0
+#define MTK_WED_RTQM_IGRS3_I2H_PKT_CNT(_n)		(0xba4 + (_n) * 0x4)
+#define MTK_WED_RTQM_IGRS3_FDROP_CNT			0xbac
+
 #define MTK_WED_RTQM_R2H_MIB(_n)			(0xb70 + (_n) * 0x4)
 #define MTK_WED_RTQM_R2Q_MIB(_n)			(0xb78 + (_n) * 0x4)
 #define MTK_WED_RTQM_Q2N_MIB				0xb80
@@ -409,6 +497,24 @@ struct mtk_wdma_desc {
 #define MTK_WED_RTQM_Q2B_MIB				0xb8c
 #define MTK_WED_RTQM_PFDBK_MIB				0xb90
 
+#define MTK_WED_RTQM_ENQ_CFG0				0xbb8
+#define MTK_WED_RTQM_ENQ_CFG_TXDMAD_FPORT		GENMASK(15, 12)
+
+#define MTK_WED_RTQM_FDROP_MIB				0xb84
+#define MTK_WED_RTQM_ENQ_I2Q_DMAD_CNT			0xbbc
+#define MTK_WED_RTQM_ENQ_I2N_DMAD_CNT			0xbc0
+#define MTK_WED_RTQM_ENQ_I2Q_PKT_CNT			0xbc4
+#define MTK_WED_RTQM_ENQ_I2N_PKT_CNT			0xbc8
+#define MTK_WED_RTQM_ENQ_USED_ENTRY_CNT			0xbcc
+#define MTK_WED_RTQM_ENQ_ERR_CNT			0xbd0
+
+#define MTK_WED_RTQM_DEQ_DMAD_CNT			0xbd8
+#define MTK_WED_RTQM_DEQ_Q2I_DMAD_CNT			0xbdc
+#define MTK_WED_RTQM_DEQ_PKT_CNT			0xbe0
+#define MTK_WED_RTQM_DEQ_Q2I_PKT_CNT			0xbe4
+#define MTK_WED_RTQM_DEQ_USED_PFDBK_CNT			0xbe8
+#define MTK_WED_RTQM_DEQ_ERR_CNT			0xbec
+
 #define MTK_WED_RROQM_GLO_CFG				0xc04
 #define MTK_WED_RROQM_RST_IDX				0xc08
 #define MTK_WED_RROQM_RST_IDX_MIOD			BIT(0)
@@ -458,7 +564,116 @@ struct mtk_wdma_desc {
 #define MTK_WED_RX_BM_INTF				0xd9c
 #define MTK_WED_RX_BM_ERR_STS				0xda8
 
+#define MTK_RRO_IND_CMD_SIGNATURE			0xe00
+#define MTK_RRO_IND_CMD_DMA_IDX				GENMASK(11, 0)
+#define MTK_RRO_IND_CMD_MAGIC_CNT			GENMASK(30, 28)
+
+#define MTK_WED_IND_CMD_RX_CTRL0			0xe04
+#define MTK_WED_IND_CMD_PROC_IDX			GENMASK(11, 0)
+#define MTK_WED_IND_CMD_PREFETCH_FREE_CNT		GENMASK(19, 16)
+#define MTK_WED_IND_CMD_MAGIC_CNT			GENMASK(30, 28)
+
+#define MTK_WED_IND_CMD_RX_CTRL1			0xe08
+#define MTK_WED_IND_CMD_RX_CTRL2			0xe0c
+#define MTK_WED_IND_CMD_MAX_CNT				GENMASK(11, 0)
+#define MTK_WED_IND_CMD_BASE_M				GENMASK(19, 16)
+
+#define MTK_WED_RRO_CFG0				0xe10
+#define MTK_WED_RRO_CFG1				0xe14
+#define MTK_WED_RRO_CFG1_MAX_WIN_SZ			GENMASK(31, 29)
+#define MTK_WED_RRO_CFG1_ACK_SN_BASE_M			GENMASK(19, 16)
+#define MTK_WED_RRO_CFG1_PARTICL_SE_ID			GENMASK(11, 0)
+
+#define MTK_WED_ADDR_ELEM_CFG0				0xe18
+#define MTK_WED_ADDR_ELEM_CFG1				0xe1c
+#define MTK_WED_ADDR_ELEM_PREFETCH_FREE_CNT		GENMASK(19, 16)
+
+#define MTK_WED_ADDR_ELEM_TBL_CFG			0xe20
+#define MTK_WED_ADDR_ELEM_TBL_OFFSET			GENMASK(6, 0)
+#define MTK_WED_ADDR_ELEM_TBL_RD_RDY			BIT(28)
+#define MTK_WED_ADDR_ELEM_TBL_WR_RDY			BIT(29)
+#define MTK_WED_ADDR_ELEM_TBL_RD			BIT(30)
+#define MTK_WED_ADDR_ELEM_TBL_WR			BIT(31)
+
+#define MTK_WED_RADDR_ELEM_TBL_WDATA			0xe24
+#define MTK_WED_RADDR_ELEM_TBL_RDATA			0xe28
+
+#define MTK_WED_PN_CHECK_CFG				0xe30
+#define MTK_WED_PN_CHECK_SE_ID				GENMASK(11, 0)
+#define MTK_WED_PN_CHECK_RD_RDY				BIT(28)
+#define MTK_WED_PN_CHECK_WR_RDY				BIT(29)
+#define MTK_WED_PN_CHECK_RD				BIT(30)
+#define MTK_WED_PN_CHECK_WR				BIT(31)
+
+#define MTK_WED_PN_CHECK_WDATA_M			0xe38
+#define MTK_WED_PN_CHECK_IS_FIRST			BIT(17)
+
+#define MTK_WED_RRO_MSDU_PG_RING_CFG(_n)		(0xe44 + (_n) * 0x8)
+
+#define MTK_WED_RRO_MSDU_PG_RING2_CFG			0xe58
+#define MTK_WED_RRO_MSDU_PG_DRV_CLR			BIT(26)
+#define MTK_WED_RRO_MSDU_PG_DRV_EN			BIT(31)
+
+#define MTK_WED_RRO_MSDU_PG_CTRL0(_n)			(0xe5c + (_n) * 0xc)
+#define MTK_WED_RRO_MSDU_PG_CTRL1(_n)			(0xe60 + (_n) * 0xc)
+#define MTK_WED_RRO_MSDU_PG_CTRL2(_n)			(0xe64 + (_n) * 0xc)
+
+#define MTK_WED_RRO_RX_D_RX(_n)				(0xe80 + (_n) * 0x10)
+
+#define MTK_WED_RRO_RX_MAGIC_CNT			BIT(13)
+
+#define MTK_WED_RRO_RX_D_CFG(_n)			(0xea0 + (_n) * 0x4)
+#define MTK_WED_RRO_RX_D_DRV_CLR			BIT(26)
+#define MTK_WED_RRO_RX_D_DRV_EN				BIT(31)
+
+#define MTK_WED_RRO_PG_BM_RX_DMAM			0xeb0
+#define MTK_WED_RRO_PG_BM_RX_SDL0			GENMASK(13, 0)
+
+#define MTK_WED_RRO_PG_BM_BASE				0xeb4
+#define MTK_WED_RRO_PG_BM_INIT_PTR			0xeb8
+#define MTK_WED_RRO_PG_BM_SW_TAIL_IDX			GENMASK(15, 0)
+#define MTK_WED_RRO_PG_BM_INIT_SW_TAIL_IDX		BIT(16)
+
+#define MTK_WED_WPDMA_INT_CTRL_RRO_RX			0xeec
+#define MTK_WED_WPDMA_INT_CTRL_RRO_RX0_EN		BIT(0)
+#define MTK_WED_WPDMA_INT_CTRL_RRO_RX0_CLR		BIT(1)
+#define MTK_WED_WPDMA_INT_CTRL_RRO_RX0_DONE_TRIG	GENMASK(6, 2)
+#define MTK_WED_WPDMA_INT_CTRL_RRO_RX1_EN		BIT(8)
+#define MTK_WED_WPDMA_INT_CTRL_RRO_RX1_CLR		BIT(9)
+#define MTK_WED_WPDMA_INT_CTRL_RRO_RX1_DONE_TRIG	GENMASK(14, 10)
+
+#define MTK_WED_WPDMA_INT_CTRL_RRO_MSDU_PG		0xef4
+#define MTK_WED_WPDMA_INT_CTRL_RRO_PG0_EN		BIT(0)
+#define MTK_WED_WPDMA_INT_CTRL_RRO_PG0_CLR		BIT(1)
+#define MTK_WED_WPDMA_INT_CTRL_RRO_PG0_DONE_TRIG	GENMASK(6, 2)
+#define MTK_WED_WPDMA_INT_CTRL_RRO_PG1_EN		BIT(8)
+#define MTK_WED_WPDMA_INT_CTRL_RRO_PG1_CLR		BIT(9)
+#define MTK_WED_WPDMA_INT_CTRL_RRO_PG1_DONE_TRIG	GENMASK(14, 10)
+#define MTK_WED_WPDMA_INT_CTRL_RRO_PG2_EN		BIT(16)
+#define MTK_WED_WPDMA_INT_CTRL_RRO_PG2_CLR		BIT(17)
+#define MTK_WED_WPDMA_INT_CTRL_RRO_PG2_DONE_TRIG	GENMASK(22, 18)
+
+#define MTK_WED_RX_IND_CMD_CNT0				0xf20
+#define MTK_WED_RX_IND_CMD_DBG_CNT_EN			BIT(31)
+
+#define MTK_WED_RX_IND_CMD_CNT(_n)			(0xf20 + (_n) * 0x4)
+#define MTK_WED_IND_CMD_MAGIC_CNT_FAIL_CNT		GENMASK(15, 0)
+
+#define MTK_WED_RX_ADDR_ELEM_CNT(_n)			(0xf48 + (_n) * 0x4)
+#define MTK_WED_ADDR_ELEM_SIG_FAIL_CNT			GENMASK(15, 0)
+#define MTK_WED_ADDR_ELEM_FIRST_SIG_FAIL_CNT		GENMASK(31, 16)
+#define MTK_WED_ADDR_ELEM_ACKSN_CNT			GENMASK(27, 0)
+
+#define MTK_WED_RX_MSDU_PG_CNT(_n)			(0xf5c + (_n) * 0x4)
+
+#define MTK_WED_RX_PN_CHK_CNT				0xf70
+#define MTK_WED_PN_CHK_FAIL_CNT				GENMASK(15, 0)
+
 #define MTK_WED_WOCPU_VIEW_MIOD_BASE			0x8000
 #define MTK_WED_PCIE_INT_MASK				0x0
 
+#define MTK_WED_PCIE_BASE			0x11280000
+#define MTK_WED_PCIE_BASE0			0x11300000
+#define MTK_WED_PCIE_BASE1			0x11310000
+#define MTK_WED_PCIE_BASE2			0x11290000
 #endif
diff --git a/drivers/net/ethernet/mediatek/mtk_wed_wo.h b/drivers/net/ethernet/mediatek/mtk_wed_wo.h
index 8ed81761bf10..87a67fa3868d 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed_wo.h
+++ b/drivers/net/ethernet/mediatek/mtk_wed_wo.h
@@ -91,6 +91,8 @@ enum mtk_wed_dummy_cr_idx {
 #define MT7981_FIRMWARE_WO	"mediatek/mt7981_wo.bin"
 #define MT7986_FIRMWARE_WO0	"mediatek/mt7986_wo_0.bin"
 #define MT7986_FIRMWARE_WO1	"mediatek/mt7986_wo_1.bin"
+#define MT7988_FIRMWARE_WO0	"mediatek/mt7988_wo_0.bin"
+#define MT7988_FIRMWARE_WO1	"mediatek/mt7988_wo_1.bin"
 
 #define MTK_WO_MCU_CFG_LS_BASE				0
 #define MTK_WO_MCU_CFG_LS_HW_VER_ADDR			(MTK_WO_MCU_CFG_LS_BASE + 0x000)
diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h
index 5f00dc26582b..5b096f9f1975 100644
--- a/include/linux/soc/mediatek/mtk_wed.h
+++ b/include/linux/soc/mediatek/mtk_wed.h
@@ -138,6 +138,8 @@ struct mtk_wed_device {
 		u32 wpdma_rx;
 
 		bool wcid_512;
+		bool hw_rro;
+		bool msi;
 
 		u16 token_start;
 		unsigned int nbuf;
@@ -211,10 +213,12 @@ mtk_wed_device_attach(struct mtk_wed_device *dev)
 	return ret;
 }
 
-static inline bool
-mtk_wed_get_rx_capa(struct mtk_wed_device *dev)
+static inline bool mtk_wed_get_rx_capa(struct mtk_wed_device *dev)
 {
 #ifdef CONFIG_NET_MEDIATEK_SOC_WED
+	if (dev->version == 3)
+		return dev->wlan.hw_rro;
+
 	return dev->version != 1;
 #else
 	return false;
-- 
cgit v1.2.3-70-g09d2


From b230812b9dda125e69ab0a5a11cda88d9c0d18a9 Mon Sep 17 00:00:00 2001
From: Sujuan Chen <sujuan.chen@mediatek.com>
Date: Mon, 18 Sep 2023 12:29:15 +0200
Subject: net: ethernet: mtk_wed: introduce partial AMSDU offload support for
 MT7988

Introduce partial AMSDU offload support for MT7988 SoC in order to merge
in hw packets belonging to the same AMSDU before passing them to the
WLAN nic.

Co-developed-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Sujuan Chen <sujuan.chen@mediatek.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/mediatek/mtk_ppe.c         |   4 +-
 drivers/net/ethernet/mediatek/mtk_ppe.h         |  19 +--
 drivers/net/ethernet/mediatek/mtk_ppe_offload.c |   3 +-
 drivers/net/ethernet/mediatek/mtk_wed.c         | 154 +++++++++++++++++++++---
 drivers/net/ethernet/mediatek/mtk_wed.h         |   7 ++
 drivers/net/ethernet/mediatek/mtk_wed_regs.h    |  76 ++++++++++++
 include/linux/netdevice.h                       |   1 +
 include/linux/soc/mediatek/mtk_wed.h            |  12 ++
 8 files changed, 248 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mediatek/mtk_ppe.c b/drivers/net/ethernet/mediatek/mtk_ppe.c
index 86f32f486043..b2a5d9c3733d 100644
--- a/drivers/net/ethernet/mediatek/mtk_ppe.c
+++ b/drivers/net/ethernet/mediatek/mtk_ppe.c
@@ -425,7 +425,8 @@ int mtk_foe_entry_set_pppoe(struct mtk_eth *eth, struct mtk_foe_entry *entry,
 }
 
 int mtk_foe_entry_set_wdma(struct mtk_eth *eth, struct mtk_foe_entry *entry,
-			   int wdma_idx, int txq, int bss, int wcid)
+			   int wdma_idx, int txq, int bss, int wcid,
+			   bool amsdu_en)
 {
 	struct mtk_foe_mac_info *l2 = mtk_foe_entry_l2(eth, entry);
 	u32 *ib2 = mtk_foe_entry_ib2(eth, entry);
@@ -437,6 +438,7 @@ int mtk_foe_entry_set_wdma(struct mtk_eth *eth, struct mtk_foe_entry *entry,
 			 MTK_FOE_IB2_WDMA_WINFO_V2;
 		l2->w3info = FIELD_PREP(MTK_FOE_WINFO_WCID_V3, wcid) |
 			     FIELD_PREP(MTK_FOE_WINFO_BSS_V3, bss);
+		l2->amsdu = FIELD_PREP(MTK_FOE_WINFO_AMSDU_EN, amsdu_en);
 		break;
 	case 2:
 		*ib2 &= ~MTK_FOE_IB2_PORT_MG_V2;
diff --git a/drivers/net/ethernet/mediatek/mtk_ppe.h b/drivers/net/ethernet/mediatek/mtk_ppe.h
index e3d0ec72bc69..691806bca372 100644
--- a/drivers/net/ethernet/mediatek/mtk_ppe.h
+++ b/drivers/net/ethernet/mediatek/mtk_ppe.h
@@ -88,13 +88,13 @@ enum {
 #define MTK_FOE_WINFO_BSS_V3		GENMASK(23, 16)
 #define MTK_FOE_WINFO_WCID_V3		GENMASK(15, 0)
 
-#define MTK_FOE_WINFO_PAO_USR_INFO	GENMASK(15, 0)
-#define MTK_FOE_WINFO_PAO_TID		GENMASK(19, 16)
-#define MTK_FOE_WINFO_PAO_IS_FIXEDRATE	BIT(20)
-#define MTK_FOE_WINFO_PAO_IS_PRIOR	BIT(21)
-#define MTK_FOE_WINFO_PAO_IS_SP		BIT(22)
-#define MTK_FOE_WINFO_PAO_HF		BIT(23)
-#define MTK_FOE_WINFO_PAO_AMSDU_EN	BIT(24)
+#define MTK_FOE_WINFO_AMSDU_USR_INFO	GENMASK(15, 0)
+#define MTK_FOE_WINFO_AMSDU_TID		GENMASK(19, 16)
+#define MTK_FOE_WINFO_AMSDU_IS_FIXEDRATE	BIT(20)
+#define MTK_FOE_WINFO_AMSDU_IS_PRIOR	BIT(21)
+#define MTK_FOE_WINFO_AMSDU_IS_SP	BIT(22)
+#define MTK_FOE_WINFO_AMSDU_HF		BIT(23)
+#define MTK_FOE_WINFO_AMSDU_EN		BIT(24)
 
 enum {
 	MTK_FOE_STATE_INVALID,
@@ -123,7 +123,7 @@ struct mtk_foe_mac_info {
 
 	/* netsys_v3 */
 	u32 w3info;
-	u32 wpao;
+	u32 amsdu;
 };
 
 /* software-only entry type */
@@ -392,7 +392,8 @@ int mtk_foe_entry_set_vlan(struct mtk_eth *eth, struct mtk_foe_entry *entry,
 int mtk_foe_entry_set_pppoe(struct mtk_eth *eth, struct mtk_foe_entry *entry,
 			    int sid);
 int mtk_foe_entry_set_wdma(struct mtk_eth *eth, struct mtk_foe_entry *entry,
-			   int wdma_idx, int txq, int bss, int wcid);
+			   int wdma_idx, int txq, int bss, int wcid,
+			   bool amsdu_en);
 int mtk_foe_entry_set_queue(struct mtk_eth *eth, struct mtk_foe_entry *entry,
 			    unsigned int queue);
 int mtk_foe_entry_commit(struct mtk_ppe *ppe, struct mtk_flow_entry *entry);
diff --git a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
index 95f76975f258..e073d2b5542c 100644
--- a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
+++ b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
@@ -111,6 +111,7 @@ mtk_flow_get_wdma_info(struct net_device *dev, const u8 *addr, struct mtk_wdma_i
 	info->queue = path->mtk_wdma.queue;
 	info->bss = path->mtk_wdma.bss;
 	info->wcid = path->mtk_wdma.wcid;
+	info->amsdu = path->mtk_wdma.amsdu;
 
 	return 0;
 }
@@ -192,7 +193,7 @@ mtk_flow_set_output_device(struct mtk_eth *eth, struct mtk_foe_entry *foe,
 
 	if (mtk_flow_get_wdma_info(dev, dest_mac, &info) == 0) {
 		mtk_foe_entry_set_wdma(eth, foe, info.wdma_idx, info.queue,
-				       info.bss, info.wcid);
+				       info.bss, info.wcid, info.amsdu);
 		if (mtk_is_netsys_v2_or_greater(eth)) {
 			switch (info.wdma_idx) {
 			case 0:
diff --git a/drivers/net/ethernet/mediatek/mtk_wed.c b/drivers/net/ethernet/mediatek/mtk_wed.c
index 18cbf028f6ed..d4b41ccfbad5 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed.c
+++ b/drivers/net/ethernet/mediatek/mtk_wed.c
@@ -30,6 +30,8 @@
 #define MTK_WED_RX_PAGE_BUF_PER_PAGE	(PAGE_SIZE / 128)
 #define MTK_WED_RX_RING_SIZE		1536
 #define MTK_WED_RX_PG_BM_CNT		8192
+#define MTK_WED_AMSDU_BUF_SIZE		(PAGE_SIZE << 4)
+#define MTK_WED_AMSDU_NPAGES		32
 
 #define MTK_WED_TX_RING_SIZE		2048
 #define MTK_WED_WDMA_RING_SIZE		1024
@@ -173,6 +175,23 @@ mtk_wdma_rx_reset(struct mtk_wed_device *dev)
 	return ret;
 }
 
+static u32
+mtk_wed_check_busy(struct mtk_wed_device *dev, u32 reg, u32 mask)
+{
+	return !!(wed_r32(dev, reg) & mask);
+}
+
+static int
+mtk_wed_poll_busy(struct mtk_wed_device *dev, u32 reg, u32 mask)
+{
+	int sleep = 15000;
+	int timeout = 100 * sleep;
+	u32 val;
+
+	return read_poll_timeout(mtk_wed_check_busy, val, !val, sleep,
+				 timeout, false, dev, reg, mask);
+}
+
 static void
 mtk_wdma_tx_reset(struct mtk_wed_device *dev)
 {
@@ -335,6 +354,118 @@ out:
 	return hw;
 }
 
+static int
+mtk_wed_amsdu_buffer_alloc(struct mtk_wed_device *dev)
+{
+	struct mtk_wed_hw *hw = dev->hw;
+	struct mtk_wed_amsdu *wed_amsdu;
+	int i;
+
+	if (!mtk_wed_is_v3_or_greater(hw))
+		return 0;
+
+	wed_amsdu = devm_kcalloc(hw->dev, MTK_WED_AMSDU_NPAGES,
+				 sizeof(*wed_amsdu), GFP_KERNEL);
+	if (!wed_amsdu)
+		return -ENOMEM;
+
+	for (i = 0; i < MTK_WED_AMSDU_NPAGES; i++) {
+		void *ptr;
+
+		/* each segment is 64K */
+		ptr = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN |
+					       __GFP_ZERO | __GFP_COMP |
+					       GFP_DMA32,
+					       get_order(MTK_WED_AMSDU_BUF_SIZE));
+		if (!ptr)
+			goto error;
+
+		wed_amsdu[i].txd = ptr;
+		wed_amsdu[i].txd_phy = dma_map_single(hw->dev, ptr,
+						      MTK_WED_AMSDU_BUF_SIZE,
+						      DMA_TO_DEVICE);
+		if (dma_mapping_error(hw->dev, wed_amsdu[i].txd_phy))
+			goto error;
+	}
+	dev->hw->wed_amsdu = wed_amsdu;
+
+	return 0;
+
+error:
+	for (i--; i >= 0; i--)
+		dma_unmap_single(hw->dev, wed_amsdu[i].txd_phy,
+				 MTK_WED_AMSDU_BUF_SIZE, DMA_TO_DEVICE);
+	return -ENOMEM;
+}
+
+static void
+mtk_wed_amsdu_free_buffer(struct mtk_wed_device *dev)
+{
+	struct mtk_wed_amsdu *wed_amsdu = dev->hw->wed_amsdu;
+	int i;
+
+	if (!wed_amsdu)
+		return;
+
+	for (i = 0; i < MTK_WED_AMSDU_NPAGES; i++) {
+		dma_unmap_single(dev->hw->dev, wed_amsdu[i].txd_phy,
+				 MTK_WED_AMSDU_BUF_SIZE, DMA_TO_DEVICE);
+		free_pages((unsigned long)wed_amsdu[i].txd,
+			   get_order(MTK_WED_AMSDU_BUF_SIZE));
+	}
+}
+
+static int
+mtk_wed_amsdu_init(struct mtk_wed_device *dev)
+{
+	struct mtk_wed_amsdu *wed_amsdu = dev->hw->wed_amsdu;
+	int i, ret;
+
+	if (!wed_amsdu)
+		return 0;
+
+	for (i = 0; i < MTK_WED_AMSDU_NPAGES; i++)
+		wed_w32(dev, MTK_WED_AMSDU_HIFTXD_BASE_L(i),
+			wed_amsdu[i].txd_phy);
+
+	/* init all sta parameter */
+	wed_w32(dev, MTK_WED_AMSDU_STA_INFO_INIT, MTK_WED_AMSDU_STA_RMVL |
+		MTK_WED_AMSDU_STA_WTBL_HDRT_MODE |
+		FIELD_PREP(MTK_WED_AMSDU_STA_MAX_AMSDU_LEN,
+			   dev->wlan.amsdu_max_len >> 8) |
+		FIELD_PREP(MTK_WED_AMSDU_STA_MAX_AMSDU_NUM,
+			   dev->wlan.amsdu_max_subframes));
+
+	wed_w32(dev, MTK_WED_AMSDU_STA_INFO, MTK_WED_AMSDU_STA_INFO_DO_INIT);
+
+	ret = mtk_wed_poll_busy(dev, MTK_WED_AMSDU_STA_INFO,
+				MTK_WED_AMSDU_STA_INFO_DO_INIT);
+	if (ret) {
+		dev_err(dev->hw->dev, "amsdu initialization failed\n");
+		return ret;
+	}
+
+	/* init partial amsdu offload txd src */
+	wed_set(dev, MTK_WED_AMSDU_HIFTXD_CFG,
+		FIELD_PREP(MTK_WED_AMSDU_HIFTXD_SRC, dev->hw->index));
+
+	/* init qmem */
+	wed_set(dev, MTK_WED_AMSDU_PSE, MTK_WED_AMSDU_PSE_RESET);
+	ret = mtk_wed_poll_busy(dev, MTK_WED_MON_AMSDU_QMEM_STS1, BIT(29));
+	if (ret) {
+		pr_info("%s: amsdu qmem initialization failed\n", __func__);
+		return ret;
+	}
+
+	/* eagle E1 PCIE1 tx ring 22 flow control issue */
+	if (dev->wlan.id == 0x7991)
+		wed_clr(dev, MTK_WED_AMSDU_FIFO, MTK_WED_AMSDU_IS_PRIOR0_RING);
+
+	wed_set(dev, MTK_WED_CTRL, MTK_WED_CTRL_TX_AMSDU_EN);
+
+	return 0;
+}
+
 static int
 mtk_wed_tx_buffer_alloc(struct mtk_wed_device *dev)
 {
@@ -709,6 +840,7 @@ __mtk_wed_detach(struct mtk_wed_device *dev)
 
 	mtk_wdma_rx_reset(dev);
 	mtk_wed_reset(dev, MTK_WED_RESET_WED);
+	mtk_wed_amsdu_free_buffer(dev);
 	mtk_wed_free_tx_buffer(dev);
 	mtk_wed_free_tx_rings(dev);
 
@@ -1129,23 +1261,6 @@ mtk_wed_ring_reset(struct mtk_wed_ring *ring, int size, bool tx)
 	}
 }
 
-static u32
-mtk_wed_check_busy(struct mtk_wed_device *dev, u32 reg, u32 mask)
-{
-	return !!(wed_r32(dev, reg) & mask);
-}
-
-static int
-mtk_wed_poll_busy(struct mtk_wed_device *dev, u32 reg, u32 mask)
-{
-	int sleep = 15000;
-	int timeout = 100 * sleep;
-	u32 val;
-
-	return read_poll_timeout(mtk_wed_check_busy, val, !val, sleep,
-				 timeout, false, dev, reg, mask);
-}
-
 static int
 mtk_wed_rx_reset(struct mtk_wed_device *dev)
 {
@@ -1692,6 +1807,7 @@ mtk_wed_start(struct mtk_wed_device *dev, u32 irq_mask)
 	}
 
 	mtk_wed_set_512_support(dev, dev->wlan.wcid_512);
+	mtk_wed_amsdu_init(dev);
 
 	mtk_wed_dma_enable(dev);
 	dev->running = true;
@@ -1748,6 +1864,10 @@ mtk_wed_attach(struct mtk_wed_device *dev)
 	if (ret)
 		goto out;
 
+	ret = mtk_wed_amsdu_buffer_alloc(dev);
+	if (ret)
+		goto out;
+
 	if (mtk_wed_get_rx_capa(dev)) {
 		ret = mtk_wed_rro_alloc(dev);
 		if (ret)
diff --git a/drivers/net/ethernet/mediatek/mtk_wed.h b/drivers/net/ethernet/mediatek/mtk_wed.h
index 27d336db4d4d..c1f0479d7a71 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed.h
+++ b/drivers/net/ethernet/mediatek/mtk_wed.h
@@ -25,6 +25,11 @@ struct mtk_wed_soc_data {
 	u32 wdma_desc_size;
 };
 
+struct mtk_wed_amsdu {
+	void *txd;
+	dma_addr_t txd_phy;
+};
+
 struct mtk_wed_hw {
 	const struct mtk_wed_soc_data *soc;
 	struct device_node *node;
@@ -38,6 +43,7 @@ struct mtk_wed_hw {
 	struct dentry *debugfs_dir;
 	struct mtk_wed_device *wed_dev;
 	struct mtk_wed_wo *wed_wo;
+	struct mtk_wed_amsdu *wed_amsdu;
 	u32 pcie_base;
 	u32 debugfs_reg;
 	u32 num_flows;
@@ -52,6 +58,7 @@ struct mtk_wdma_info {
 	u8 queue;
 	u16 wcid;
 	u8 bss;
+	u8 amsdu;
 };
 
 #ifdef CONFIG_NET_MEDIATEK_SOC_WED
diff --git a/drivers/net/ethernet/mediatek/mtk_wed_regs.h b/drivers/net/ethernet/mediatek/mtk_wed_regs.h
index a4d3cf64d090..5a7e4a11a54e 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed_regs.h
+++ b/drivers/net/ethernet/mediatek/mtk_wed_regs.h
@@ -672,6 +672,82 @@ struct mtk_wdma_desc {
 #define MTK_WED_WOCPU_VIEW_MIOD_BASE			0x8000
 #define MTK_WED_PCIE_INT_MASK				0x0
 
+#define MTK_WED_AMSDU_FIFO				0x1800
+#define MTK_WED_AMSDU_IS_PRIOR0_RING			BIT(10)
+
+#define MTK_WED_AMSDU_STA_INFO				0x01810
+#define MTK_WED_AMSDU_STA_INFO_DO_INIT			BIT(0)
+#define MTK_WED_AMSDU_STA_INFO_SET_INIT			BIT(1)
+
+#define MTK_WED_AMSDU_STA_INFO_INIT			0x01814
+#define MTK_WED_AMSDU_STA_WTBL_HDRT_MODE		BIT(0)
+#define MTK_WED_AMSDU_STA_RMVL				BIT(1)
+#define MTK_WED_AMSDU_STA_MAX_AMSDU_LEN			GENMASK(7, 2)
+#define MTK_WED_AMSDU_STA_MAX_AMSDU_NUM			GENMASK(11, 8)
+
+#define MTK_WED_AMSDU_HIFTXD_BASE_L(_n)			(0x1980 + (_n) * 0x4)
+
+#define MTK_WED_AMSDU_PSE				0x1910
+#define MTK_WED_AMSDU_PSE_RESET				BIT(16)
+
+#define MTK_WED_AMSDU_HIFTXD_CFG			0x1968
+#define MTK_WED_AMSDU_HIFTXD_SRC			GENMASK(16, 15)
+
+#define MTK_WED_MON_AMSDU_FIFO_DMAD			0x1a34
+
+#define MTK_WED_MON_AMSDU_ENG_DMAD(_n)			(0x1a80 + (_n) * 0x50)
+#define MTK_WED_MON_AMSDU_ENG_QFPL(_n)			(0x1a84 + (_n) * 0x50)
+#define MTK_WED_MON_AMSDU_ENG_QENI(_n)			(0x1a88 + (_n) * 0x50)
+#define MTK_WED_MON_AMSDU_ENG_QENO(_n)			(0x1a8c + (_n) * 0x50)
+#define MTK_WED_MON_AMSDU_ENG_MERG(_n)			(0x1a90 + (_n) * 0x50)
+
+#define MTK_WED_MON_AMSDU_ENG_CNT8(_n)			(0x1a94 + (_n) * 0x50)
+#define MTK_WED_AMSDU_ENG_MAX_QGPP_CNT			GENMASK(10, 0)
+#define MTK_WED_AMSDU_ENG_MAX_PL_CNT			GENMASK(27, 16)
+
+#define MTK_WED_MON_AMSDU_ENG_CNT9(_n)			(0x1a98 + (_n) * 0x50)
+#define MTK_WED_AMSDU_ENG_CUR_ENTRY			GENMASK(10, 0)
+#define MTK_WED_AMSDU_ENG_MAX_BUF_MERGED		GENMASK(20, 16)
+#define MTK_WED_AMSDU_ENG_MAX_MSDU_MERGED		GENMASK(28, 24)
+
+#define MTK_WED_MON_AMSDU_QMEM_STS1			0x1e04
+
+#define MTK_WED_MON_AMSDU_QMEM_CNT(_n)			(0x1e0c + (_n) * 0x4)
+#define MTK_WED_AMSDU_QMEM_FQ_CNT			GENMASK(27, 16)
+#define MTK_WED_AMSDU_QMEM_SP_QCNT			GENMASK(11, 0)
+#define MTK_WED_AMSDU_QMEM_TID0_QCNT			GENMASK(27, 16)
+#define MTK_WED_AMSDU_QMEM_TID1_QCNT			GENMASK(11, 0)
+#define MTK_WED_AMSDU_QMEM_TID2_QCNT			GENMASK(27, 16)
+#define MTK_WED_AMSDU_QMEM_TID3_QCNT			GENMASK(11, 0)
+#define MTK_WED_AMSDU_QMEM_TID4_QCNT			GENMASK(27, 16)
+#define MTK_WED_AMSDU_QMEM_TID5_QCNT			GENMASK(11, 0)
+#define MTK_WED_AMSDU_QMEM_TID6_QCNT			GENMASK(27, 16)
+#define MTK_WED_AMSDU_QMEM_TID7_QCNT			GENMASK(11, 0)
+
+#define MTK_WED_MON_AMSDU_QMEM_PTR(_n)			(0x1e20 + (_n) * 0x4)
+#define MTK_WED_AMSDU_QMEM_FQ_HEAD			GENMASK(27, 16)
+#define MTK_WED_AMSDU_QMEM_SP_QHEAD			GENMASK(11, 0)
+#define MTK_WED_AMSDU_QMEM_TID0_QHEAD			GENMASK(27, 16)
+#define MTK_WED_AMSDU_QMEM_TID1_QHEAD			GENMASK(11, 0)
+#define MTK_WED_AMSDU_QMEM_TID2_QHEAD			GENMASK(27, 16)
+#define MTK_WED_AMSDU_QMEM_TID3_QHEAD			GENMASK(11, 0)
+#define MTK_WED_AMSDU_QMEM_TID4_QHEAD			GENMASK(27, 16)
+#define MTK_WED_AMSDU_QMEM_TID5_QHEAD			GENMASK(11, 0)
+#define MTK_WED_AMSDU_QMEM_TID6_QHEAD			GENMASK(27, 16)
+#define MTK_WED_AMSDU_QMEM_TID7_QHEAD			GENMASK(11, 0)
+#define MTK_WED_AMSDU_QMEM_FQ_TAIL			GENMASK(27, 16)
+#define MTK_WED_AMSDU_QMEM_SP_QTAIL			GENMASK(11, 0)
+#define MTK_WED_AMSDU_QMEM_TID0_QTAIL			GENMASK(27, 16)
+#define MTK_WED_AMSDU_QMEM_TID1_QTAIL			GENMASK(11, 0)
+#define MTK_WED_AMSDU_QMEM_TID2_QTAIL			GENMASK(27, 16)
+#define MTK_WED_AMSDU_QMEM_TID3_QTAIL			GENMASK(11, 0)
+#define MTK_WED_AMSDU_QMEM_TID4_QTAIL			GENMASK(27, 16)
+#define MTK_WED_AMSDU_QMEM_TID5_QTAIL			GENMASK(11, 0)
+#define MTK_WED_AMSDU_QMEM_TID6_QTAIL			GENMASK(27, 16)
+#define MTK_WED_AMSDU_QMEM_TID7_QTAIL			GENMASK(11, 0)
+
+#define MTK_WED_MON_AMSDU_HIFTXD_FETCH_MSDU(_n)		(0x1ec4 + (_n) * 0x4)
+
 #define MTK_WED_PCIE_BASE			0x11280000
 #define MTK_WED_PCIE_BASE0			0x11300000
 #define MTK_WED_PCIE_BASE1			0x11310000
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index db3d8429d50d..7e520c14eb8c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -919,6 +919,7 @@ struct net_device_path {
 			u8 queue;
 			u16 wcid;
 			u8 bss;
+			u8 amsdu;
 		} mtk_wdma;
 	};
 };
diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h
index 5b096f9f1975..90d9c9ead3bc 100644
--- a/include/linux/soc/mediatek/mtk_wed.h
+++ b/include/linux/soc/mediatek/mtk_wed.h
@@ -128,6 +128,7 @@ struct mtk_wed_device {
 		enum mtk_wed_bus_tye bus_type;
 		void __iomem *base;
 		u32 phy_base;
+		u32 id;
 
 		u32 wpdma_phys;
 		u32 wpdma_int;
@@ -146,10 +147,12 @@ struct mtk_wed_device {
 		unsigned int rx_nbuf;
 		unsigned int rx_npkt;
 		unsigned int rx_size;
+		unsigned int amsdu_max_len;
 
 		u8 tx_tbit[MTK_WED_TX_QUEUES];
 		u8 rx_tbit[MTK_WED_RX_QUEUES];
 		u8 txfree_tbit;
+		u8 amsdu_max_subframes;
 
 		u32 (*init_buf)(void *ptr, dma_addr_t phys, int token_id);
 		int (*offload_enable)(struct mtk_wed_device *wed);
@@ -225,6 +228,15 @@ static inline bool mtk_wed_get_rx_capa(struct mtk_wed_device *dev)
 #endif
 }
 
+static inline bool mtk_wed_is_amsdu_supported(struct mtk_wed_device *dev)
+{
+#ifdef CONFIG_NET_MEDIATEK_SOC_WED
+	return dev->version == 3;
+#else
+	return false;
+#endif
+}
+
 #ifdef CONFIG_NET_MEDIATEK_SOC_WED
 #define mtk_wed_device_active(_dev) !!(_dev)->ops
 #define mtk_wed_device_detach(_dev) (_dev)->ops->detach(_dev)
-- 
cgit v1.2.3-70-g09d2


From 6757d345dd7dba795f5af44d4442d55a83c4b1b4 Mon Sep 17 00:00:00 2001
From: Sujuan Chen <sujuan.chen@mediatek.com>
Date: Mon, 18 Sep 2023 12:29:16 +0200
Subject: net: ethernet: mtk_wed: introduce hw_rro support for MT7988

MT7988 SoC support 802.11 receive reordering offload in hw while
MT7986 SoC implements it through the firmware running on the mcu.

Co-developed-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Sujuan Chen <sujuan.chen@mediatek.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/mediatek/mtk_wed.c | 299 +++++++++++++++++++++++++++++++-
 include/linux/soc/mediatek/mtk_wed.h    |  45 +++++
 2 files changed, 342 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mediatek/mtk_wed.c b/drivers/net/ethernet/mediatek/mtk_wed.c
index d4b41ccfbad5..2a0be1f2d43e 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed.c
+++ b/drivers/net/ethernet/mediatek/mtk_wed.c
@@ -27,7 +27,7 @@
 #define MTK_WED_BUF_SIZE		2048
 #define MTK_WED_PAGE_BUF_SIZE		128
 #define MTK_WED_BUF_PER_PAGE		(PAGE_SIZE / 2048)
-#define MTK_WED_RX_PAGE_BUF_PER_PAGE	(PAGE_SIZE / 128)
+#define MTK_WED_RX_BUF_PER_PAGE		(PAGE_SIZE / MTK_WED_PAGE_BUF_SIZE)
 #define MTK_WED_RX_RING_SIZE		1536
 #define MTK_WED_RX_PG_BM_CNT		8192
 #define MTK_WED_AMSDU_BUF_SIZE		(PAGE_SIZE << 4)
@@ -596,6 +596,68 @@ free_pagelist:
 	kfree(page_list);
 }
 
+static int
+mtk_wed_hwrro_buffer_alloc(struct mtk_wed_device *dev)
+{
+	int n_pages = MTK_WED_RX_PG_BM_CNT / MTK_WED_RX_BUF_PER_PAGE;
+	struct mtk_wed_buf *page_list;
+	struct mtk_wed_bm_desc *desc;
+	dma_addr_t desc_phys;
+	int i, page_idx = 0;
+
+	if (!dev->wlan.hw_rro)
+		return 0;
+
+	page_list = kcalloc(n_pages, sizeof(*page_list), GFP_KERNEL);
+	if (!page_list)
+		return -ENOMEM;
+
+	dev->hw_rro.size = dev->wlan.rx_nbuf & ~(MTK_WED_BUF_PER_PAGE - 1);
+	dev->hw_rro.pages = page_list;
+	desc = dma_alloc_coherent(dev->hw->dev,
+				  dev->wlan.rx_nbuf * sizeof(*desc),
+				  &desc_phys, GFP_KERNEL);
+	if (!desc)
+		return -ENOMEM;
+
+	dev->hw_rro.desc = desc;
+	dev->hw_rro.desc_phys = desc_phys;
+
+	for (i = 0; i < MTK_WED_RX_PG_BM_CNT; i += MTK_WED_RX_BUF_PER_PAGE) {
+		dma_addr_t page_phys, buf_phys;
+		struct page *page;
+		int s;
+
+		page = __dev_alloc_page(GFP_KERNEL);
+		if (!page)
+			return -ENOMEM;
+
+		page_phys = dma_map_page(dev->hw->dev, page, 0, PAGE_SIZE,
+					 DMA_BIDIRECTIONAL);
+		if (dma_mapping_error(dev->hw->dev, page_phys)) {
+			__free_page(page);
+			return -ENOMEM;
+		}
+
+		page_list[page_idx].p = page;
+		page_list[page_idx++].phy_addr = page_phys;
+		dma_sync_single_for_cpu(dev->hw->dev, page_phys, PAGE_SIZE,
+					DMA_BIDIRECTIONAL);
+
+		buf_phys = page_phys;
+		for (s = 0; s < MTK_WED_RX_BUF_PER_PAGE; s++) {
+			desc->buf0 = cpu_to_le32(buf_phys);
+			buf_phys += MTK_WED_PAGE_BUF_SIZE;
+			desc++;
+		}
+
+		dma_sync_single_for_device(dev->hw->dev, page_phys, PAGE_SIZE,
+					   DMA_BIDIRECTIONAL);
+	}
+
+	return 0;
+}
+
 static int
 mtk_wed_rx_buffer_alloc(struct mtk_wed_device *dev)
 {
@@ -613,7 +675,42 @@ mtk_wed_rx_buffer_alloc(struct mtk_wed_device *dev)
 	dev->rx_buf_ring.desc_phys = desc_phys;
 	dev->wlan.init_rx_buf(dev, dev->wlan.rx_npkt);
 
-	return 0;
+	return mtk_wed_hwrro_buffer_alloc(dev);
+}
+
+static void
+mtk_wed_hwrro_free_buffer(struct mtk_wed_device *dev)
+{
+	struct mtk_wed_buf *page_list = dev->hw_rro.pages;
+	struct mtk_wed_bm_desc *desc = dev->hw_rro.desc;
+	int i, page_idx = 0;
+
+	if (!dev->wlan.hw_rro)
+		return;
+
+	if (!page_list)
+		return;
+
+	if (!desc)
+		goto free_pagelist;
+
+	for (i = 0; i < MTK_WED_RX_PG_BM_CNT; i += MTK_WED_RX_BUF_PER_PAGE) {
+		dma_addr_t buf_addr = page_list[page_idx].phy_addr;
+		void *page = page_list[page_idx++].p;
+
+		if (!page)
+			break;
+
+		dma_unmap_page(dev->hw->dev, buf_addr, PAGE_SIZE,
+			       DMA_BIDIRECTIONAL);
+		__free_page(page);
+	}
+
+	dma_free_coherent(dev->hw->dev, dev->hw_rro.size * sizeof(*desc),
+			  desc, dev->hw_rro.desc_phys);
+
+free_pagelist:
+	kfree(page_list);
 }
 
 static void
@@ -627,6 +724,28 @@ mtk_wed_free_rx_buffer(struct mtk_wed_device *dev)
 	dev->wlan.release_rx_buf(dev);
 	dma_free_coherent(dev->hw->dev, dev->rx_buf_ring.size * sizeof(*desc),
 			  desc, dev->rx_buf_ring.desc_phys);
+
+	mtk_wed_hwrro_free_buffer(dev);
+}
+
+static void
+mtk_wed_hwrro_init(struct mtk_wed_device *dev)
+{
+	if (!mtk_wed_get_rx_capa(dev) || !dev->wlan.hw_rro)
+		return;
+
+	wed_set(dev, MTK_WED_RRO_PG_BM_RX_DMAM,
+		FIELD_PREP(MTK_WED_RRO_PG_BM_RX_SDL0, 128));
+
+	wed_w32(dev, MTK_WED_RRO_PG_BM_BASE, dev->hw_rro.desc_phys);
+
+	wed_w32(dev, MTK_WED_RRO_PG_BM_INIT_PTR,
+		MTK_WED_RRO_PG_BM_INIT_SW_TAIL_IDX |
+		FIELD_PREP(MTK_WED_RRO_PG_BM_SW_TAIL_IDX,
+			   MTK_WED_RX_PG_BM_CNT));
+
+	/* enable rx_page_bm to fetch dmad */
+	wed_set(dev, MTK_WED_CTRL, MTK_WED_CTRL_WED_RX_PG_BM_EN);
 }
 
 static void
@@ -640,6 +759,8 @@ mtk_wed_rx_buffer_hw_init(struct mtk_wed_device *dev)
 	wed_w32(dev, MTK_WED_RX_BM_DYN_ALLOC_TH,
 		FIELD_PREP(MTK_WED_RX_BM_DYN_ALLOC_TH_H, 0xffff));
 	wed_set(dev, MTK_WED_CTRL, MTK_WED_CTRL_WED_RX_BM_EN);
+
+	mtk_wed_hwrro_init(dev);
 }
 
 static void
@@ -935,6 +1056,8 @@ mtk_wed_bus_init(struct mtk_wed_device *dev)
 static void
 mtk_wed_set_wpdma(struct mtk_wed_device *dev)
 {
+	int i;
+
 	if (mtk_wed_is_v1(dev->hw)) {
 		wed_w32(dev, MTK_WED_WPDMA_CFG_BASE,  dev->wlan.wpdma_phys);
 		return;
@@ -952,6 +1075,15 @@ mtk_wed_set_wpdma(struct mtk_wed_device *dev)
 
 	wed_w32(dev, MTK_WED_WPDMA_RX_GLO_CFG, dev->wlan.wpdma_rx_glo);
 	wed_w32(dev, dev->hw->soc->regmap.wpdma_rx_ring0, dev->wlan.wpdma_rx);
+
+	if (!dev->wlan.hw_rro)
+		return;
+
+	wed_w32(dev, MTK_WED_RRO_RX_D_CFG(0), dev->wlan.wpdma_rx_rro[0]);
+	wed_w32(dev, MTK_WED_RRO_RX_D_CFG(1), dev->wlan.wpdma_rx_rro[1]);
+	for (i = 0; i < MTK_WED_RX_PAGE_QUEUES; i++)
+		wed_w32(dev, MTK_WED_RRO_MSDU_PG_RING_CFG(i),
+			dev->wlan.wpdma_rx_pg + i * 0x10);
 }
 
 static void
@@ -1762,6 +1894,165 @@ mtk_wed_dma_enable(struct mtk_wed_device *dev)
 	}
 }
 
+static void
+mtk_wed_start_hw_rro(struct mtk_wed_device *dev, u32 irq_mask, bool reset)
+{
+	int i;
+
+	wed_w32(dev, MTK_WED_WPDMA_INT_MASK, irq_mask);
+	wed_w32(dev, MTK_WED_INT_MASK, irq_mask);
+
+	if (!mtk_wed_get_rx_capa(dev) || !dev->wlan.hw_rro)
+		return;
+
+	wed_set(dev, MTK_WED_RRO_RX_D_CFG(2), MTK_WED_RRO_MSDU_PG_DRV_CLR);
+	wed_w32(dev, MTK_WED_RRO_MSDU_PG_RING2_CFG,
+		MTK_WED_RRO_MSDU_PG_DRV_CLR);
+
+	wed_w32(dev, MTK_WED_WPDMA_INT_CTRL_RRO_RX,
+		MTK_WED_WPDMA_INT_CTRL_RRO_RX0_EN |
+		MTK_WED_WPDMA_INT_CTRL_RRO_RX0_CLR |
+		MTK_WED_WPDMA_INT_CTRL_RRO_RX1_EN |
+		MTK_WED_WPDMA_INT_CTRL_RRO_RX1_CLR |
+		FIELD_PREP(MTK_WED_WPDMA_INT_CTRL_RRO_RX0_DONE_TRIG,
+			   dev->wlan.rro_rx_tbit[0]) |
+		FIELD_PREP(MTK_WED_WPDMA_INT_CTRL_RRO_RX1_DONE_TRIG,
+			   dev->wlan.rro_rx_tbit[1]));
+
+	wed_w32(dev, MTK_WED_WPDMA_INT_CTRL_RRO_MSDU_PG,
+		MTK_WED_WPDMA_INT_CTRL_RRO_PG0_EN |
+		MTK_WED_WPDMA_INT_CTRL_RRO_PG0_CLR |
+		MTK_WED_WPDMA_INT_CTRL_RRO_PG1_EN |
+		MTK_WED_WPDMA_INT_CTRL_RRO_PG1_CLR |
+		MTK_WED_WPDMA_INT_CTRL_RRO_PG2_EN |
+		MTK_WED_WPDMA_INT_CTRL_RRO_PG2_CLR |
+		FIELD_PREP(MTK_WED_WPDMA_INT_CTRL_RRO_PG0_DONE_TRIG,
+			   dev->wlan.rx_pg_tbit[0]) |
+		FIELD_PREP(MTK_WED_WPDMA_INT_CTRL_RRO_PG1_DONE_TRIG,
+			   dev->wlan.rx_pg_tbit[1]) |
+		FIELD_PREP(MTK_WED_WPDMA_INT_CTRL_RRO_PG2_DONE_TRIG,
+			   dev->wlan.rx_pg_tbit[2]));
+
+	/* RRO_MSDU_PG_RING2_CFG1_FLD_DRV_EN should be enabled after
+	 * WM FWDL completed, otherwise RRO_MSDU_PG ring may broken
+	 */
+	wed_set(dev, MTK_WED_RRO_MSDU_PG_RING2_CFG,
+		MTK_WED_RRO_MSDU_PG_DRV_EN);
+
+	for (i = 0; i < MTK_WED_RX_QUEUES; i++) {
+		struct mtk_wed_ring *ring = &dev->rx_rro_ring[i];
+
+		if (!(ring->flags & MTK_WED_RING_CONFIGURED))
+			continue;
+
+		if (mtk_wed_check_wfdma_rx_fill(dev, ring))
+			dev_err(dev->hw->dev,
+				"rx_rro_ring(%d) initialization failed\n", i);
+	}
+
+	for (i = 0; i < MTK_WED_RX_PAGE_QUEUES; i++) {
+		struct mtk_wed_ring *ring = &dev->rx_page_ring[i];
+
+		if (!(ring->flags & MTK_WED_RING_CONFIGURED))
+			continue;
+
+		if (mtk_wed_check_wfdma_rx_fill(dev, ring))
+			dev_err(dev->hw->dev,
+				"rx_page_ring(%d) initialization failed\n", i);
+	}
+}
+
+static void
+mtk_wed_rro_rx_ring_setup(struct mtk_wed_device *dev, int idx,
+			  void __iomem *regs)
+{
+	struct mtk_wed_ring *ring = &dev->rx_rro_ring[idx];
+
+	ring->wpdma = regs;
+	wed_w32(dev, MTK_WED_RRO_RX_D_RX(idx) + MTK_WED_RING_OFS_BASE,
+		readl(regs));
+	wed_w32(dev, MTK_WED_RRO_RX_D_RX(idx) + MTK_WED_RING_OFS_COUNT,
+		readl(regs + MTK_WED_RING_OFS_COUNT));
+	ring->flags |= MTK_WED_RING_CONFIGURED;
+}
+
+static void
+mtk_wed_msdu_pg_rx_ring_setup(struct mtk_wed_device *dev, int idx, void __iomem *regs)
+{
+	struct mtk_wed_ring *ring = &dev->rx_page_ring[idx];
+
+	ring->wpdma = regs;
+	wed_w32(dev, MTK_WED_RRO_MSDU_PG_CTRL0(idx) + MTK_WED_RING_OFS_BASE,
+		readl(regs));
+	wed_w32(dev, MTK_WED_RRO_MSDU_PG_CTRL0(idx) + MTK_WED_RING_OFS_COUNT,
+		readl(regs + MTK_WED_RING_OFS_COUNT));
+	ring->flags |= MTK_WED_RING_CONFIGURED;
+}
+
+static int
+mtk_wed_ind_rx_ring_setup(struct mtk_wed_device *dev, void __iomem *regs)
+{
+	struct mtk_wed_ring *ring = &dev->ind_cmd_ring;
+	u32 val = readl(regs + MTK_WED_RING_OFS_COUNT);
+	int i, count = 0;
+
+	ring->wpdma = regs;
+	wed_w32(dev, MTK_WED_IND_CMD_RX_CTRL1 + MTK_WED_RING_OFS_BASE,
+		readl(regs) & 0xfffffff0);
+
+	wed_w32(dev, MTK_WED_IND_CMD_RX_CTRL1 + MTK_WED_RING_OFS_COUNT,
+		readl(regs + MTK_WED_RING_OFS_COUNT));
+
+	/* ack sn cr */
+	wed_w32(dev, MTK_WED_RRO_CFG0, dev->wlan.phy_base +
+		dev->wlan.ind_cmd.ack_sn_addr);
+	wed_w32(dev, MTK_WED_RRO_CFG1,
+		FIELD_PREP(MTK_WED_RRO_CFG1_MAX_WIN_SZ,
+			   dev->wlan.ind_cmd.win_size) |
+		FIELD_PREP(MTK_WED_RRO_CFG1_PARTICL_SE_ID,
+			   dev->wlan.ind_cmd.particular_sid));
+
+	/* particular session addr element */
+	wed_w32(dev, MTK_WED_ADDR_ELEM_CFG0,
+		dev->wlan.ind_cmd.particular_se_phys);
+
+	for (i = 0; i < dev->wlan.ind_cmd.se_group_nums; i++) {
+		wed_w32(dev, MTK_WED_RADDR_ELEM_TBL_WDATA,
+			dev->wlan.ind_cmd.addr_elem_phys[i] >> 4);
+		wed_w32(dev, MTK_WED_ADDR_ELEM_TBL_CFG,
+			MTK_WED_ADDR_ELEM_TBL_WR | (i & 0x7f));
+
+		val = wed_r32(dev, MTK_WED_ADDR_ELEM_TBL_CFG);
+		while (!(val & MTK_WED_ADDR_ELEM_TBL_WR_RDY) && count++ < 100)
+			val = wed_r32(dev, MTK_WED_ADDR_ELEM_TBL_CFG);
+		if (count >= 100)
+			dev_err(dev->hw->dev,
+				"write ba session base failed\n");
+	}
+
+	/* pn check init */
+	for (i = 0; i < dev->wlan.ind_cmd.particular_sid; i++) {
+		wed_w32(dev, MTK_WED_PN_CHECK_WDATA_M,
+			MTK_WED_PN_CHECK_IS_FIRST);
+
+		wed_w32(dev, MTK_WED_PN_CHECK_CFG, MTK_WED_PN_CHECK_WR |
+			FIELD_PREP(MTK_WED_PN_CHECK_SE_ID, i));
+
+		count = 0;
+		val = wed_r32(dev, MTK_WED_PN_CHECK_CFG);
+		while (!(val & MTK_WED_PN_CHECK_WR_RDY) && count++ < 100)
+			val = wed_r32(dev, MTK_WED_PN_CHECK_CFG);
+		if (count >= 100)
+			dev_err(dev->hw->dev,
+				"session(%d) initialization failed\n", i);
+	}
+
+	wed_w32(dev, MTK_WED_RX_IND_CMD_CNT0, MTK_WED_RX_IND_CMD_DBG_CNT_EN);
+	wed_set(dev, MTK_WED_CTRL, MTK_WED_CTRL_WED_RX_IND_CMD_EN);
+
+	return 0;
+}
+
 static void
 mtk_wed_start(struct mtk_wed_device *dev, u32 irq_mask)
 {
@@ -2216,6 +2507,10 @@ void mtk_wed_add_hw(struct device_node *np, struct mtk_eth *eth,
 		.detach = mtk_wed_detach,
 		.ppe_check = mtk_wed_ppe_check,
 		.setup_tc = mtk_wed_setup_tc,
+		.start_hw_rro = mtk_wed_start_hw_rro,
+		.rro_rx_ring_setup = mtk_wed_rro_rx_ring_setup,
+		.msdu_pg_rx_ring_setup = mtk_wed_msdu_pg_rx_ring_setup,
+		.ind_rx_ring_setup = mtk_wed_ind_rx_ring_setup,
 	};
 	struct device_node *eth_np = eth->dev->of_node;
 	struct platform_device *pdev;
diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h
index 90d9c9ead3bc..a476648858a6 100644
--- a/include/linux/soc/mediatek/mtk_wed.h
+++ b/include/linux/soc/mediatek/mtk_wed.h
@@ -10,6 +10,7 @@
 
 #define MTK_WED_TX_QUEUES		2
 #define MTK_WED_RX_QUEUES		2
+#define MTK_WED_RX_PAGE_QUEUES		3
 
 #define WED_WO_STA_REC			0x6
 
@@ -99,6 +100,9 @@ struct mtk_wed_device {
 	struct mtk_wed_ring txfree_ring;
 	struct mtk_wed_ring tx_wdma[MTK_WED_TX_QUEUES];
 	struct mtk_wed_ring rx_wdma[MTK_WED_RX_QUEUES];
+	struct mtk_wed_ring rx_rro_ring[MTK_WED_RX_QUEUES];
+	struct mtk_wed_ring rx_page_ring[MTK_WED_RX_PAGE_QUEUES];
+	struct mtk_wed_ring ind_cmd_ring;
 
 	struct {
 		int size;
@@ -119,6 +123,13 @@ struct mtk_wed_device {
 		dma_addr_t fdbk_phys;
 	} rro;
 
+	struct {
+		int size;
+		struct mtk_wed_buf *pages;
+		struct mtk_wed_bm_desc *desc;
+		dma_addr_t desc_phys;
+	} hw_rro;
+
 	/* filled by driver: */
 	struct {
 		union {
@@ -137,6 +148,8 @@ struct mtk_wed_device {
 		u32 wpdma_txfree;
 		u32 wpdma_rx_glo;
 		u32 wpdma_rx;
+		u32 wpdma_rx_rro[MTK_WED_RX_QUEUES];
+		u32 wpdma_rx_pg;
 
 		bool wcid_512;
 		bool hw_rro;
@@ -151,9 +164,20 @@ struct mtk_wed_device {
 
 		u8 tx_tbit[MTK_WED_TX_QUEUES];
 		u8 rx_tbit[MTK_WED_RX_QUEUES];
+		u8 rro_rx_tbit[MTK_WED_RX_QUEUES];
+		u8 rx_pg_tbit[MTK_WED_RX_PAGE_QUEUES];
 		u8 txfree_tbit;
 		u8 amsdu_max_subframes;
 
+		struct {
+			u8 se_group_nums;
+			u16 win_size;
+			u16 particular_sid;
+			u32 ack_sn_addr;
+			dma_addr_t particular_se_phys;
+			dma_addr_t addr_elem_phys[1024];
+		} ind_cmd;
+
 		u32 (*init_buf)(void *ptr, dma_addr_t phys, int token_id);
 		int (*offload_enable)(struct mtk_wed_device *wed);
 		void (*offload_disable)(struct mtk_wed_device *wed);
@@ -192,6 +216,14 @@ struct mtk_wed_ops {
 	void (*irq_set_mask)(struct mtk_wed_device *dev, u32 mask);
 	int (*setup_tc)(struct mtk_wed_device *wed, struct net_device *dev,
 			enum tc_setup_type type, void *type_data);
+	void (*start_hw_rro)(struct mtk_wed_device *dev, u32 irq_mask,
+			     bool reset);
+	void (*rro_rx_ring_setup)(struct mtk_wed_device *dev, int ring,
+				  void __iomem *regs);
+	void (*msdu_pg_rx_ring_setup)(struct mtk_wed_device *dev, int ring,
+				      void __iomem *regs);
+	int (*ind_rx_ring_setup)(struct mtk_wed_device *dev,
+				 void __iomem *regs);
 };
 
 extern const struct mtk_wed_ops __rcu *mtk_soc_wed_ops;
@@ -263,6 +295,15 @@ static inline bool mtk_wed_is_amsdu_supported(struct mtk_wed_device *dev)
 #define mtk_wed_device_dma_reset(_dev) (_dev)->ops->reset_dma(_dev)
 #define mtk_wed_device_setup_tc(_dev, _netdev, _type, _type_data) \
 	(_dev)->ops->setup_tc(_dev, _netdev, _type, _type_data)
+#define mtk_wed_device_start_hw_rro(_dev, _mask, _reset) \
+	(_dev)->ops->start_hw_rro(_dev, _mask, _reset)
+#define mtk_wed_device_rro_rx_ring_setup(_dev, _ring, _regs) \
+	(_dev)->ops->rro_rx_ring_setup(_dev, _ring, _regs)
+#define mtk_wed_device_msdu_pg_rx_ring_setup(_dev, _ring, _regs) \
+	(_dev)->ops->msdu_pg_rx_ring_setup(_dev, _ring, _regs)
+#define mtk_wed_device_ind_rx_ring_setup(_dev, _regs) \
+	(_dev)->ops->ind_rx_ring_setup(_dev, _regs)
+
 #else
 static inline bool mtk_wed_device_active(struct mtk_wed_device *dev)
 {
@@ -282,6 +323,10 @@ static inline bool mtk_wed_device_active(struct mtk_wed_device *dev)
 #define mtk_wed_device_stop(_dev) do {} while (0)
 #define mtk_wed_device_dma_reset(_dev) do {} while (0)
 #define mtk_wed_device_setup_tc(_dev, _netdev, _type, _type_data) -EOPNOTSUPP
+#define mtk_wed_device_start_hw_rro(_dev, _mask, _reset) do {} while (0)
+#define mtk_wed_device_rro_rx_ring_setup(_dev, _ring, _regs) -ENODEV
+#define mtk_wed_device_msdu_pg_rx_ring_setup(_dev, _ring, _regs) -ENODEV
+#define mtk_wed_device_ind_rx_ring_setup(_dev, _regs) -ENODEV
 #endif
 
 #endif
-- 
cgit v1.2.3-70-g09d2


From 653b7eb9d74426397c95061fd57da3063625af65 Mon Sep 17 00:00:00 2001
From: Erez Shitrit <erezsh@nvidia.com>
Date: Mon, 28 Aug 2023 14:20:00 +0300
Subject: net/mlx5: Bridge, Enable mcast in smfs steering mode

In order to have mcast offloads the driver needs the following:
It should know if that mcast comes from wire port, in addition the flow
should not be marked as any specific source, that way it will give the
flexibility for the driver not to be depended on the way iterator
implemented in the FW.

Signed-off-by: Erez Shitrit <erezsh@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Reviewed-by: Vlad Buslov <vladbu@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c | 11 ++---------
 include/linux/mlx5/fs.h                                    |  1 +
 2 files changed, 3 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c
index 7a01714b3780..a7ed87e9d842 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c
@@ -78,6 +78,8 @@ mlx5_esw_bridge_mdb_flow_create(u16 esw_owner_vhca_id, struct mlx5_esw_bridge_md
 	xa_for_each(&entry->ports, idx, port) {
 		dests[i].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
 		dests[i].ft = port->mcast.ft;
+		if (port->vport_num == MLX5_VPORT_UPLINK)
+			dests[i].ft->flags |= MLX5_FLOW_TABLE_UPLINK_VPORT;
 		i++;
 	}
 
@@ -585,10 +587,6 @@ mlx5_esw_bridge_mcast_vlan_flow_create(u16 vlan_proto, struct mlx5_esw_bridge_po
 	if (!rule_spec)
 		return ERR_PTR(-ENOMEM);
 
-	if (MLX5_CAP_ESW_FLOWTABLE(bridge->br_offloads->esw->dev, flow_source) &&
-	    port->vport_num == MLX5_VPORT_UPLINK)
-		rule_spec->flow_context.flow_source =
-			MLX5_FLOW_CONTEXT_FLOW_SOURCE_LOCAL_VPORT;
 	rule_spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
 
 	flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT;
@@ -660,11 +658,6 @@ mlx5_esw_bridge_mcast_fwd_flow_create(struct mlx5_esw_bridge_port *port)
 	if (!rule_spec)
 		return ERR_PTR(-ENOMEM);
 
-	if (MLX5_CAP_ESW_FLOWTABLE(bridge->br_offloads->esw->dev, flow_source) &&
-	    port->vport_num == MLX5_VPORT_UPLINK)
-		rule_spec->flow_context.flow_source =
-			MLX5_FLOW_CONTEXT_FLOW_SOURCE_LOCAL_VPORT;
-
 	if (MLX5_CAP_ESW(bridge->br_offloads->esw->dev, merged_eswitch)) {
 		dest.vport.flags = MLX5_FLOW_DEST_VPORT_VHCA_ID;
 		dest.vport.vhca_id = port->esw_owner_vhca_id;
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 1e00c2436377..6f7725238abc 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -67,6 +67,7 @@ enum {
 	MLX5_FLOW_TABLE_TERMINATION = BIT(2),
 	MLX5_FLOW_TABLE_UNMANAGED = BIT(3),
 	MLX5_FLOW_TABLE_OTHER_VPORT = BIT(4),
+	MLX5_FLOW_TABLE_UPLINK_VPORT = BIT(5),
 };
 
 #define LEFTOVERS_RULE_NUM	 2
-- 
cgit v1.2.3-70-g09d2


From e0cc92fd945a9c8e43d4268cf1ea985d0e99a90f Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@nvidia.com>
Date: Thu, 3 Aug 2023 16:39:40 +0300
Subject: net/mlx5: Add a health error syndrome for pci data poisoned

Add new health error syndrome to indicate that pci data poisoned error
has been received while fetching device ICM data.

Signed-off-by: Moshe Shemesh <moshe@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/health.c | 2 ++
 include/linux/mlx5/mlx5_ifc.h                    | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 2fb2598b775e..1c220048ae9a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -365,6 +365,8 @@ static const char *hsynd_str(u8 synd)
 		return "FFSER error";
 	case MLX5_INITIAL_SEG_HEALTH_SYNDROME_HIGH_TEMP_ERR:
 		return "High temperature";
+	case MLX5_INITIAL_SEG_HEALTH_SYNDROME_ICM_PCI_POISONED_ERR:
+		return "ICM fetch PCI data poisoned error";
 	default:
 		return "unrecognized error";
 	}
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index dd8421d021cf..b23d8ff286a1 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -10574,6 +10574,7 @@ enum {
 	MLX5_INITIAL_SEG_HEALTH_SYNDROME_EQ_INV                       = 0xe,
 	MLX5_INITIAL_SEG_HEALTH_SYNDROME_FFSER_ERR                    = 0xf,
 	MLX5_INITIAL_SEG_HEALTH_SYNDROME_HIGH_TEMP_ERR                = 0x10,
+	MLX5_INITIAL_SEG_HEALTH_SYNDROME_ICM_PCI_POISONED_ERR         = 0x12,
 };
 
 struct mlx5_ifc_initial_seg_bits {
-- 
cgit v1.2.3-70-g09d2


From 0df7cd3c13e44d01f9f28e29cbce74e2931b00fe Mon Sep 17 00:00:00 2001
From: Arseniy Krasnov <avkrasnov@salutedevices.com>
Date: Sat, 16 Sep 2023 16:09:15 +0300
Subject: vsock/virtio/vhost: read data from non-linear skb

This is preparation patch for MSG_ZEROCOPY support. It adds handling of
non-linear skbs by replacing direct calls of 'memcpy_to_msg()' with
'skb_copy_datagram_iter()'. Main advantage of the second one is that it
can handle paged part of the skb by using 'kmap()' on each page, but if
there are no pages in the skb, it behaves like simple copying to iov
iterator. This patch also adds new field to the control block of skb -
this value shows current offset in the skb to read next portion of data
(it doesn't matter linear it or not). Idea behind this field is that
'skb_copy_datagram_iter()' handles both types of skb internally - it
just needs an offset from which to copy data from the given skb. This
offset is incremented on each read from skb. This approach allows to
simplify handling of both linear and non-linear skbs, because for
linear skb we need to call 'skb_pull()' after reading data from it,
while in non-linear case we need to update 'data_len'.

Signed-off-by: Arseniy Krasnov <avkrasnov@salutedevices.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/vhost/vsock.c                   | 14 +++++++++-----
 include/linux/virtio_vsock.h            |  1 +
 net/vmw_vsock/virtio_transport_common.c | 32 +++++++++++++++++++-------------
 3 files changed, 29 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 817d377a3f36..83711aad855c 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -114,6 +114,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 		struct sk_buff *skb;
 		unsigned out, in;
 		size_t nbytes;
+		u32 offset;
 		int head;
 
 		skb = virtio_vsock_skb_dequeue(&vsock->send_pkt_queue);
@@ -156,7 +157,8 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 		}
 
 		iov_iter_init(&iov_iter, ITER_DEST, &vq->iov[out], in, iov_len);
-		payload_len = skb->len;
+		offset = VIRTIO_VSOCK_SKB_CB(skb)->offset;
+		payload_len = skb->len - offset;
 		hdr = virtio_vsock_hdr(skb);
 
 		/* If the packet is greater than the space available in the
@@ -197,8 +199,10 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 			break;
 		}
 
-		nbytes = copy_to_iter(skb->data, payload_len, &iov_iter);
-		if (nbytes != payload_len) {
+		if (skb_copy_datagram_iter(skb,
+					   offset,
+					   &iov_iter,
+					   payload_len)) {
 			kfree_skb(skb);
 			vq_err(vq, "Faulted on copying pkt buf\n");
 			break;
@@ -212,13 +216,13 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 		vhost_add_used(vq, head, sizeof(*hdr) + payload_len);
 		added = true;
 
-		skb_pull(skb, payload_len);
+		VIRTIO_VSOCK_SKB_CB(skb)->offset += payload_len;
 		total_len += payload_len;
 
 		/* If we didn't send all the payload we can requeue the packet
 		 * to send it with the next available buffer.
 		 */
-		if (skb->len > 0) {
+		if (VIRTIO_VSOCK_SKB_CB(skb)->offset < skb->len) {
 			hdr->flags |= cpu_to_le32(flags_to_restore);
 
 			/* We are queueing the same skb to handle
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index c58453699ee9..a91fbdf233e4 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -12,6 +12,7 @@
 struct virtio_vsock_skb_cb {
 	bool reply;
 	bool tap_delivered;
+	u32 offset;
 };
 
 #define VIRTIO_VSOCK_SKB_CB(skb) ((struct virtio_vsock_skb_cb *)((skb)->cb))
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 352d042b130b..3e08d52a9355 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -364,9 +364,10 @@ virtio_transport_stream_do_peek(struct vsock_sock *vsk,
 		spin_unlock_bh(&vvs->rx_lock);
 
 		/* sk_lock is held by caller so no one else can dequeue.
-		 * Unlock rx_lock since memcpy_to_msg() may sleep.
+		 * Unlock rx_lock since skb_copy_datagram_iter() may sleep.
 		 */
-		err = memcpy_to_msg(msg, skb->data, bytes);
+		err = skb_copy_datagram_iter(skb, VIRTIO_VSOCK_SKB_CB(skb)->offset,
+					     &msg->msg_iter, bytes);
 		if (err)
 			goto out;
 
@@ -410,25 +411,27 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
 	while (total < len && !skb_queue_empty(&vvs->rx_queue)) {
 		skb = skb_peek(&vvs->rx_queue);
 
-		bytes = len - total;
-		if (bytes > skb->len)
-			bytes = skb->len;
+		bytes = min_t(size_t, len - total,
+			      skb->len - VIRTIO_VSOCK_SKB_CB(skb)->offset);
 
 		/* sk_lock is held by caller so no one else can dequeue.
-		 * Unlock rx_lock since memcpy_to_msg() may sleep.
+		 * Unlock rx_lock since skb_copy_datagram_iter() may sleep.
 		 */
 		spin_unlock_bh(&vvs->rx_lock);
 
-		err = memcpy_to_msg(msg, skb->data, bytes);
+		err = skb_copy_datagram_iter(skb,
+					     VIRTIO_VSOCK_SKB_CB(skb)->offset,
+					     &msg->msg_iter, bytes);
 		if (err)
 			goto out;
 
 		spin_lock_bh(&vvs->rx_lock);
 
 		total += bytes;
-		skb_pull(skb, bytes);
 
-		if (skb->len == 0) {
+		VIRTIO_VSOCK_SKB_CB(skb)->offset += bytes;
+
+		if (skb->len == VIRTIO_VSOCK_SKB_CB(skb)->offset) {
 			u32 pkt_len = le32_to_cpu(virtio_vsock_hdr(skb)->len);
 
 			virtio_transport_dec_rx_pkt(vvs, pkt_len);
@@ -492,9 +495,10 @@ virtio_transport_seqpacket_do_peek(struct vsock_sock *vsk,
 			spin_unlock_bh(&vvs->rx_lock);
 
 			/* sk_lock is held by caller so no one else can dequeue.
-			 * Unlock rx_lock since memcpy_to_msg() may sleep.
+			 * Unlock rx_lock since skb_copy_datagram_iter() may sleep.
 			 */
-			err = memcpy_to_msg(msg, skb->data, bytes);
+			err = skb_copy_datagram_iter(skb, VIRTIO_VSOCK_SKB_CB(skb)->offset,
+						     &msg->msg_iter, bytes);
 			if (err)
 				return err;
 
@@ -553,11 +557,13 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk,
 				int err;
 
 				/* sk_lock is held by caller so no one else can dequeue.
-				 * Unlock rx_lock since memcpy_to_msg() may sleep.
+				 * Unlock rx_lock since skb_copy_datagram_iter() may sleep.
 				 */
 				spin_unlock_bh(&vvs->rx_lock);
 
-				err = memcpy_to_msg(msg, skb->data, bytes_to_copy);
+				err = skb_copy_datagram_iter(skb, 0,
+							     &msg->msg_iter,
+							     bytes_to_copy);
 				if (err) {
 					/* Copy of message failed. Rest of
 					 * fragments will be freed without copy.
-- 
cgit v1.2.3-70-g09d2


From 581512a6dc939ef122e49336626ae159f3b8a345 Mon Sep 17 00:00:00 2001
From: Arseniy Krasnov <avkrasnov@salutedevices.com>
Date: Sat, 16 Sep 2023 16:09:18 +0300
Subject: vsock/virtio: MSG_ZEROCOPY flag support

This adds handling of MSG_ZEROCOPY flag on transmission path:

1) If this flag is set and zerocopy transmission is possible (enabled
   in socket options and transport allows zerocopy), then non-linear
   skb will be created and filled with the pages of user's buffer.
   Pages of user's buffer are locked in memory by 'get_user_pages()'.
2) Replaces way of skb owning: instead of 'skb_set_owner_sk_safe()' it
   calls 'skb_set_owner_w()'. Reason of this change is that
   '__zerocopy_sg_from_iter()' increments 'sk_wmem_alloc' of socket, so
   to decrease this field correctly, proper skb destructor is needed:
   'sock_wfree()'. This destructor is set by 'skb_set_owner_w()'.
3) Adds new callback to 'struct virtio_transport': 'can_msgzerocopy'.
   If this callback is set, then transport needs extra check to be able
   to send provided number of buffers in zerocopy mode. Currently, the
   only transport that needs this callback set is virtio, because this
   transport adds new buffers to the virtio queue and we need to check,
   that number of these buffers is less than size of the queue (it is
   required by virtio spec). vhost and loopback transports don't need
   this check.

Signed-off-by: Arseniy Krasnov <avkrasnov@salutedevices.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/virtio_vsock.h                       |   9 +
 .../trace/events/vsock_virtio_transport_common.h   |  12 +-
 net/vmw_vsock/virtio_transport.c                   |  32 +++
 net/vmw_vsock/virtio_transport_common.c            | 250 ++++++++++++++++-----
 4 files changed, 241 insertions(+), 62 deletions(-)

(limited to 'include')

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index a91fbdf233e4..ebb3ce63d64d 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -160,6 +160,15 @@ struct virtio_transport {
 
 	/* Takes ownership of the packet */
 	int (*send_pkt)(struct sk_buff *skb);
+
+	/* Used in MSG_ZEROCOPY mode. Checks, that provided data
+	 * (number of buffers) could be transmitted with zerocopy
+	 * mode. If this callback is not implemented for the current
+	 * transport - this means that this transport doesn't need
+	 * extra checks and can perform zerocopy transmission by
+	 * default.
+	 */
+	bool (*can_msgzerocopy)(int bufs_num);
 };
 
 ssize_t
diff --git a/include/trace/events/vsock_virtio_transport_common.h b/include/trace/events/vsock_virtio_transport_common.h
index d0b3f0ea9ba1..f1ebe36787c3 100644
--- a/include/trace/events/vsock_virtio_transport_common.h
+++ b/include/trace/events/vsock_virtio_transport_common.h
@@ -43,7 +43,8 @@ TRACE_EVENT(virtio_transport_alloc_pkt,
 		 __u32 len,
 		 __u16 type,
 		 __u16 op,
-		 __u32 flags
+		 __u32 flags,
+		 bool zcopy
 	),
 	TP_ARGS(
 		src_cid, src_port,
@@ -51,7 +52,8 @@ TRACE_EVENT(virtio_transport_alloc_pkt,
 		len,
 		type,
 		op,
-		flags
+		flags,
+		zcopy
 	),
 	TP_STRUCT__entry(
 		__field(__u32, src_cid)
@@ -62,6 +64,7 @@ TRACE_EVENT(virtio_transport_alloc_pkt,
 		__field(__u16, type)
 		__field(__u16, op)
 		__field(__u32, flags)
+		__field(bool, zcopy)
 	),
 	TP_fast_assign(
 		__entry->src_cid = src_cid;
@@ -72,14 +75,15 @@ TRACE_EVENT(virtio_transport_alloc_pkt,
 		__entry->type = type;
 		__entry->op = op;
 		__entry->flags = flags;
+		__entry->zcopy = zcopy;
 	),
-	TP_printk("%u:%u -> %u:%u len=%u type=%s op=%s flags=%#x",
+	TP_printk("%u:%u -> %u:%u len=%u type=%s op=%s flags=%#x zcopy=%s",
 		  __entry->src_cid, __entry->src_port,
 		  __entry->dst_cid, __entry->dst_port,
 		  __entry->len,
 		  show_type(__entry->type),
 		  show_op(__entry->op),
-		  __entry->flags)
+		  __entry->flags, __entry->zcopy ? "true" : "false")
 );
 
 TRACE_EVENT(virtio_transport_recv_pkt,
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 73d730156349..09ba3128e759 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -455,6 +455,37 @@ static void virtio_vsock_rx_done(struct virtqueue *vq)
 	queue_work(virtio_vsock_workqueue, &vsock->rx_work);
 }
 
+static bool virtio_transport_can_msgzerocopy(int bufs_num)
+{
+	struct virtio_vsock *vsock;
+	bool res = false;
+
+	rcu_read_lock();
+
+	vsock = rcu_dereference(the_virtio_vsock);
+	if (vsock) {
+		struct virtqueue *vq = vsock->vqs[VSOCK_VQ_TX];
+
+		/* Check that tx queue is large enough to keep whole
+		 * data to send. This is needed, because when there is
+		 * not enough free space in the queue, current skb to
+		 * send will be reinserted to the head of tx list of
+		 * the socket to retry transmission later, so if skb
+		 * is bigger than whole queue, it will be reinserted
+		 * again and again, thus blocking other skbs to be sent.
+		 * Each page of the user provided buffer will be added
+		 * as a single buffer to the tx virtqueue, so compare
+		 * number of pages against maximum capacity of the queue.
+		 */
+		if (bufs_num <= vq->num_max)
+			res = true;
+	}
+
+	rcu_read_unlock();
+
+	return res;
+}
+
 static bool virtio_transport_seqpacket_allow(u32 remote_cid);
 
 static struct virtio_transport virtio_transport = {
@@ -504,6 +535,7 @@ static struct virtio_transport virtio_transport = {
 	},
 
 	.send_pkt = virtio_transport_send_pkt,
+	.can_msgzerocopy = virtio_transport_can_msgzerocopy,
 };
 
 static bool virtio_transport_seqpacket_allow(u32 remote_cid)
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 3a48e48a99ac..e22c81435ef7 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -37,73 +37,99 @@ virtio_transport_get_ops(struct vsock_sock *vsk)
 	return container_of(t, struct virtio_transport, transport);
 }
 
-/* Returns a new packet on success, otherwise returns NULL.
- *
- * If NULL is returned, errp is set to a negative errno.
- */
-static struct sk_buff *
-virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info,
-			   size_t len,
-			   u32 src_cid,
-			   u32 src_port,
-			   u32 dst_cid,
-			   u32 dst_port)
-{
-	const size_t skb_len = VIRTIO_VSOCK_SKB_HEADROOM + len;
-	struct virtio_vsock_hdr *hdr;
-	struct sk_buff *skb;
-	void *payload;
-	int err;
+static bool virtio_transport_can_zcopy(const struct virtio_transport *t_ops,
+				       struct virtio_vsock_pkt_info *info,
+				       size_t pkt_len)
+{
+	struct iov_iter *iov_iter;
 
-	skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL);
-	if (!skb)
-		return NULL;
+	if (!info->msg)
+		return false;
 
-	hdr = virtio_vsock_hdr(skb);
-	hdr->type	= cpu_to_le16(info->type);
-	hdr->op		= cpu_to_le16(info->op);
-	hdr->src_cid	= cpu_to_le64(src_cid);
-	hdr->dst_cid	= cpu_to_le64(dst_cid);
-	hdr->src_port	= cpu_to_le32(src_port);
-	hdr->dst_port	= cpu_to_le32(dst_port);
-	hdr->flags	= cpu_to_le32(info->flags);
-	hdr->len	= cpu_to_le32(len);
+	iov_iter = &info->msg->msg_iter;
 
-	if (info->msg && len > 0) {
-		payload = skb_put(skb, len);
-		err = memcpy_from_msg(payload, info->msg, len);
-		if (err)
-			goto out;
+	if (iov_iter->iov_offset)
+		return false;
 
-		if (msg_data_left(info->msg) == 0 &&
-		    info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) {
-			hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
+	/* We can't send whole iov. */
+	if (iov_iter->count > pkt_len)
+		return false;
 
-			if (info->msg->msg_flags & MSG_EOR)
-				hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
-		}
+	/* Check that transport can send data in zerocopy mode. */
+	t_ops = virtio_transport_get_ops(info->vsk);
+
+	if (t_ops->can_msgzerocopy) {
+		int pages_in_iov = iov_iter_npages(iov_iter, MAX_SKB_FRAGS);
+		int pages_to_send = min(pages_in_iov, MAX_SKB_FRAGS);
+
+		/* +1 is for packet header. */
+		return t_ops->can_msgzerocopy(pages_to_send + 1);
 	}
 
-	if (info->reply)
-		virtio_vsock_skb_set_reply(skb);
+	return true;
+}
 
-	trace_virtio_transport_alloc_pkt(src_cid, src_port,
-					 dst_cid, dst_port,
-					 len,
-					 info->type,
-					 info->op,
-					 info->flags);
+static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk,
+					   struct sk_buff *skb,
+					   struct msghdr *msg,
+					   bool zerocopy)
+{
+	struct ubuf_info *uarg;
 
-	if (info->vsk && !skb_set_owner_sk_safe(skb, sk_vsock(info->vsk))) {
-		WARN_ONCE(1, "failed to allocate skb on vsock socket with sk_refcnt == 0\n");
-		goto out;
+	if (msg->msg_ubuf) {
+		uarg = msg->msg_ubuf;
+		net_zcopy_get(uarg);
+	} else {
+		struct iov_iter *iter = &msg->msg_iter;
+		struct ubuf_info_msgzc *uarg_zc;
+
+		uarg = msg_zerocopy_realloc(sk_vsock(vsk),
+					    iter->count,
+					    NULL);
+		if (!uarg)
+			return -1;
+
+		uarg_zc = uarg_to_msgzc(uarg);
+		uarg_zc->zerocopy = zerocopy ? 1 : 0;
 	}
 
-	return skb;
+	skb_zcopy_init(skb, uarg);
 
-out:
-	kfree_skb(skb);
-	return NULL;
+	return 0;
+}
+
+static int virtio_transport_fill_skb(struct sk_buff *skb,
+				     struct virtio_vsock_pkt_info *info,
+				     size_t len,
+				     bool zcopy)
+{
+	if (zcopy)
+		return __zerocopy_sg_from_iter(info->msg, NULL, skb,
+					       &info->msg->msg_iter,
+					       len);
+
+	return memcpy_from_msg(skb_put(skb, len), info->msg, len);
+}
+
+static void virtio_transport_init_hdr(struct sk_buff *skb,
+				      struct virtio_vsock_pkt_info *info,
+				      size_t payload_len,
+				      u32 src_cid,
+				      u32 src_port,
+				      u32 dst_cid,
+				      u32 dst_port)
+{
+	struct virtio_vsock_hdr *hdr;
+
+	hdr = virtio_vsock_hdr(skb);
+	hdr->type	= cpu_to_le16(info->type);
+	hdr->op		= cpu_to_le16(info->op);
+	hdr->src_cid	= cpu_to_le64(src_cid);
+	hdr->dst_cid	= cpu_to_le64(dst_cid);
+	hdr->src_port	= cpu_to_le32(src_port);
+	hdr->dst_port	= cpu_to_le32(dst_port);
+	hdr->flags	= cpu_to_le32(info->flags);
+	hdr->len	= cpu_to_le32(payload_len);
 }
 
 static void virtio_transport_copy_nonlinear_skb(const struct sk_buff *skb,
@@ -214,6 +240,82 @@ static u16 virtio_transport_get_type(struct sock *sk)
 		return VIRTIO_VSOCK_TYPE_SEQPACKET;
 }
 
+/* Returns new sk_buff on success, otherwise returns NULL. */
+static struct sk_buff *virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info,
+						  size_t payload_len,
+						  bool zcopy,
+						  u32 src_cid,
+						  u32 src_port,
+						  u32 dst_cid,
+						  u32 dst_port)
+{
+	struct vsock_sock *vsk;
+	struct sk_buff *skb;
+	size_t skb_len;
+
+	skb_len = VIRTIO_VSOCK_SKB_HEADROOM;
+
+	if (!zcopy)
+		skb_len += payload_len;
+
+	skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL);
+	if (!skb)
+		return NULL;
+
+	virtio_transport_init_hdr(skb, info, payload_len, src_cid, src_port,
+				  dst_cid, dst_port);
+
+	vsk = info->vsk;
+
+	/* If 'vsk' != NULL then payload is always present, so we
+	 * will never call '__zerocopy_sg_from_iter()' below without
+	 * setting skb owner in 'skb_set_owner_w()'. The only case
+	 * when 'vsk' == NULL is VIRTIO_VSOCK_OP_RST control message
+	 * without payload.
+	 */
+	WARN_ON_ONCE(!(vsk && (info->msg && payload_len)) && zcopy);
+
+	/* Set owner here, because '__zerocopy_sg_from_iter()' uses
+	 * owner of skb without check to update 'sk_wmem_alloc'.
+	 */
+	if (vsk)
+		skb_set_owner_w(skb, sk_vsock(vsk));
+
+	if (info->msg && payload_len > 0) {
+		int err;
+
+		err = virtio_transport_fill_skb(skb, info, payload_len, zcopy);
+		if (err)
+			goto out;
+
+		if (msg_data_left(info->msg) == 0 &&
+		    info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) {
+			struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
+
+			hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
+
+			if (info->msg->msg_flags & MSG_EOR)
+				hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
+		}
+	}
+
+	if (info->reply)
+		virtio_vsock_skb_set_reply(skb);
+
+	trace_virtio_transport_alloc_pkt(src_cid, src_port,
+					 dst_cid, dst_port,
+					 payload_len,
+					 info->type,
+					 info->op,
+					 info->flags,
+					 zcopy);
+
+	return skb;
+out:
+	kfree_skb(skb);
+	return NULL;
+}
+
 /* This function can only be used on connecting/connected sockets,
  * since a socket assigned to a transport is required.
  *
@@ -222,10 +324,12 @@ static u16 virtio_transport_get_type(struct sock *sk)
 static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
 					  struct virtio_vsock_pkt_info *info)
 {
+	u32 max_skb_len = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
 	u32 src_cid, src_port, dst_cid, dst_port;
 	const struct virtio_transport *t_ops;
 	struct virtio_vsock_sock *vvs;
 	u32 pkt_len = info->pkt_len;
+	bool can_zcopy = false;
 	u32 rest_len;
 	int ret;
 
@@ -254,15 +358,30 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
 	if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW)
 		return pkt_len;
 
+	if (info->msg) {
+		/* If zerocopy is not enabled by 'setsockopt()', we behave as
+		 * there is no MSG_ZEROCOPY flag set.
+		 */
+		if (!sock_flag(sk_vsock(vsk), SOCK_ZEROCOPY))
+			info->msg->msg_flags &= ~MSG_ZEROCOPY;
+
+		if (info->msg->msg_flags & MSG_ZEROCOPY)
+			can_zcopy = virtio_transport_can_zcopy(t_ops, info, pkt_len);
+
+		if (can_zcopy)
+			max_skb_len = min_t(u32, VIRTIO_VSOCK_MAX_PKT_BUF_SIZE,
+					    (MAX_SKB_FRAGS * PAGE_SIZE));
+	}
+
 	rest_len = pkt_len;
 
 	do {
 		struct sk_buff *skb;
 		size_t skb_len;
 
-		skb_len = min_t(u32, VIRTIO_VSOCK_MAX_PKT_BUF_SIZE, rest_len);
+		skb_len = min(max_skb_len, rest_len);
 
-		skb = virtio_transport_alloc_skb(info, skb_len,
+		skb = virtio_transport_alloc_skb(info, skb_len, can_zcopy,
 						 src_cid, src_port,
 						 dst_cid, dst_port);
 		if (!skb) {
@@ -270,6 +389,21 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
 			break;
 		}
 
+		/* We process buffer part by part, allocating skb on
+		 * each iteration. If this is last skb for this buffer
+		 * and MSG_ZEROCOPY mode is in use - we must allocate
+		 * completion for the current syscall.
+		 */
+		if (info->msg && info->msg->msg_flags & MSG_ZEROCOPY &&
+		    skb_len == rest_len && info->op == VIRTIO_VSOCK_OP_RW) {
+			if (virtio_transport_init_zcopy_skb(vsk, skb,
+							    info->msg,
+							    can_zcopy)) {
+				ret = -ENOMEM;
+				break;
+			}
+		}
+
 		virtio_transport_inc_tx_pkt(vvs, skb);
 
 		ret = t_ops->send_pkt(skb);
@@ -985,7 +1119,7 @@ static int virtio_transport_reset_no_sock(const struct virtio_transport *t,
 	if (!t)
 		return -ENOTCONN;
 
-	reply = virtio_transport_alloc_skb(&info, 0,
+	reply = virtio_transport_alloc_skb(&info, 0, false,
 					   le64_to_cpu(hdr->dst_cid),
 					   le32_to_cpu(hdr->dst_port),
 					   le64_to_cpu(hdr->src_cid),
-- 
cgit v1.2.3-70-g09d2


From 6b348f6e34ce7dc5eb68066377d5e38780ce4095 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 19 Sep 2023 08:40:52 +0200
Subject: wifi: mac80211: ethtool: always hold wiphy mutex

Drivers should really be able to rely on the wiphy mutex
being held all the time, unless otherwise documented. For
ethtool, that wasn't quite right. Fix and clarify this in
both code and documentation.

Reported-by: syzbot+c12a771b218dcbba32e1@syzkaller.appspotmail.com
Fixes: 0e8185ce1dde ("wifi: mac80211: check wiphy mutex in ops")
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 4 ++++
 net/mac80211/ethtool.c | 9 ++++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 8d993f6ab919..0f6390865fe7 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -4067,11 +4067,15 @@ struct ieee80211_prep_tx_info {
  *	This callback must be atomic.
  *
  * @get_et_sset_count:  Ethtool API to get string-set count.
+ *	Note that the wiphy mutex is not held for this callback since it's
+ *	expected to return a static value.
  *
  * @get_et_stats:  Ethtool API to get a set of u64 stats.
  *
  * @get_et_strings:  Ethtool API to get a set of strings to describe stats
  *	and perhaps other supported types of ethtool data-sets.
+ *	Note that the wiphy mutex is not held for this callback since it's
+ *	expected to return a static value.
  *
  * @mgd_prepare_tx: Prepare for transmitting a management frame for association
  *	before associated. In multi-channel scenarios, a virtual interface is
diff --git a/net/mac80211/ethtool.c b/net/mac80211/ethtool.c
index 9894d2024470..99f6174a9d69 100644
--- a/net/mac80211/ethtool.c
+++ b/net/mac80211/ethtool.c
@@ -19,11 +19,16 @@ static int ieee80211_set_ringparam(struct net_device *dev,
 				   struct netlink_ext_ack *extack)
 {
 	struct ieee80211_local *local = wiphy_priv(dev->ieee80211_ptr->wiphy);
+	int ret;
 
 	if (rp->rx_mini_pending != 0 || rp->rx_jumbo_pending != 0)
 		return -EINVAL;
 
-	return drv_set_ringparam(local, rp->tx_pending, rp->rx_pending);
+	wiphy_lock(local->hw.wiphy);
+	ret = drv_set_ringparam(local, rp->tx_pending, rp->rx_pending);
+	wiphy_unlock(local->hw.wiphy);
+
+	return ret;
 }
 
 static void ieee80211_get_ringparam(struct net_device *dev,
@@ -35,8 +40,10 @@ static void ieee80211_get_ringparam(struct net_device *dev,
 
 	memset(rp, 0, sizeof(*rp));
 
+	wiphy_lock(local->hw.wiphy);
 	drv_get_ringparam(local, &rp->tx_pending, &rp->tx_max_pending,
 			  &rp->rx_pending, &rp->rx_max_pending);
+	wiphy_unlock(local->hw.wiphy);
 }
 
 static const char ieee80211_gstrings_sta_stats[][ETH_GSTRING_LEN] = {
-- 
cgit v1.2.3-70-g09d2


From bb55441c57ccc5cc2eab44e1a97698b9d708871d Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 25 Sep 2023 08:56:38 +0200
Subject: wifi: cfg80211: split struct cfg80211_ap_settings

Using the full struct cfg80211_ap_settings for an update is
misleading, since most settings cannot be updated. Split the
update case off into a new struct cfg80211_ap_update.

Change-Id: I3ba4dd9280938ab41252f145227a7005edf327e4
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath6kl/cfg80211.c            |  2 +-
 drivers/net/wireless/ath/wil6210/cfg80211.c           |  2 +-
 .../wireless/broadcom/brcm80211/brcmfmac/cfg80211.c   |  2 +-
 drivers/net/wireless/marvell/mwifiex/cfg80211.c       |  2 +-
 drivers/net/wireless/microchip/wilc1000/cfg80211.c    |  2 +-
 drivers/net/wireless/quantenna/qtnfmac/cfg80211.c     |  2 +-
 drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c     |  2 +-
 include/net/cfg80211.h                                | 18 +++++++++++++++++-
 net/mac80211/cfg.c                                    |  2 +-
 net/wireless/nl80211.c                                | 19 +++++++++----------
 net/wireless/rdev-ops.h                               |  2 +-
 net/wireless/trace.h                                  |  2 +-
 12 files changed, 36 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c
index eea60e2fca44..e37db4af33de 100644
--- a/drivers/net/wireless/ath/ath6kl/cfg80211.c
+++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c
@@ -2954,7 +2954,7 @@ static int ath6kl_start_ap(struct wiphy *wiphy, struct net_device *dev,
 }
 
 static int ath6kl_change_beacon(struct wiphy *wiphy, struct net_device *dev,
-				struct cfg80211_ap_settings *params)
+				struct cfg80211_ap_update *params)
 {
 	struct ath6kl_vif *vif = netdev_priv(dev);
 
diff --git a/drivers/net/wireless/ath/wil6210/cfg80211.c b/drivers/net/wireless/ath/wil6210/cfg80211.c
index dfbb478ae274..dbe4b3478f03 100644
--- a/drivers/net/wireless/ath/wil6210/cfg80211.c
+++ b/drivers/net/wireless/ath/wil6210/cfg80211.c
@@ -2082,7 +2082,7 @@ void wil_cfg80211_ap_recovery(struct wil6210_priv *wil)
 
 static int wil_cfg80211_change_beacon(struct wiphy *wiphy,
 				      struct net_device *ndev,
-				      struct cfg80211_ap_settings *params)
+				      struct cfg80211_ap_update *params)
 {
 	struct wil6210_priv *wil = wiphy_to_wil(wiphy);
 	struct wireless_dev *wdev = ndev->ieee80211_ptr;
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
index 9012456e1a18..667462369a32 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
@@ -5415,7 +5415,7 @@ static int brcmf_cfg80211_stop_ap(struct wiphy *wiphy, struct net_device *ndev,
 
 static s32
 brcmf_cfg80211_change_beacon(struct wiphy *wiphy, struct net_device *ndev,
-			     struct cfg80211_ap_settings *info)
+			     struct cfg80211_ap_update *info)
 {
 	struct brcmf_if *ifp = netdev_priv(ndev);
 
diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c
index 70473be42d7b..7a15ea8072e6 100644
--- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c
+++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c
@@ -1835,7 +1835,7 @@ static int mwifiex_cfg80211_set_cqm_rssi_config(struct wiphy *wiphy,
  */
 static int mwifiex_cfg80211_change_beacon(struct wiphy *wiphy,
 					  struct net_device *dev,
-					  struct cfg80211_ap_settings *params)
+					  struct cfg80211_ap_update *params)
 {
 	struct mwifiex_private *priv = mwifiex_netdev_get_priv(dev);
 	struct mwifiex_adapter *adapter = priv->adapter;
diff --git a/drivers/net/wireless/microchip/wilc1000/cfg80211.c b/drivers/net/wireless/microchip/wilc1000/cfg80211.c
index 3447470d3d02..da52f91693b5 100644
--- a/drivers/net/wireless/microchip/wilc1000/cfg80211.c
+++ b/drivers/net/wireless/microchip/wilc1000/cfg80211.c
@@ -1441,7 +1441,7 @@ static int start_ap(struct wiphy *wiphy, struct net_device *dev,
 }
 
 static int change_beacon(struct wiphy *wiphy, struct net_device *dev,
-			 struct cfg80211_ap_settings *params)
+			 struct cfg80211_ap_update *params)
 {
 	struct wilc_vif *vif = netdev_priv(dev);
 
diff --git a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
index 9388adcdcac1..663d77770fce 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
+++ b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
@@ -331,7 +331,7 @@ out:
 }
 
 static int qtnf_change_beacon(struct wiphy *wiphy, struct net_device *dev,
-			      struct cfg80211_ap_settings *info)
+			      struct cfg80211_ap_update *info)
 {
 	struct qtnf_vif *vif = qtnf_netdev_get_priv(dev);
 
diff --git a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c
index 5ddc2d9a6060..1e683212027c 100644
--- a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c
+++ b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c
@@ -2319,7 +2319,7 @@ static int cfg80211_rtw_start_ap(struct wiphy *wiphy, struct net_device *ndev,
 }
 
 static int cfg80211_rtw_change_beacon(struct wiphy *wiphy, struct net_device *ndev,
-		struct cfg80211_ap_settings *info)
+		struct cfg80211_ap_update *info)
 {
 	struct adapter *adapter = rtw_netdev_priv(ndev);
 
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 9af714431b22..899e9ffa6048 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1480,6 +1480,22 @@ struct cfg80211_ap_settings {
 	u16 punct_bitmap;
 };
 
+
+/**
+ * struct cfg80211_ap_update - AP configuration update
+ *
+ * Subset of &struct cfg80211_ap_settings, for updating a running AP.
+ *
+ * @beacon: beacon data
+ * @fils_discovery: FILS discovery transmission parameters
+ * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters
+ */
+struct cfg80211_ap_update {
+	struct cfg80211_beacon_data beacon;
+	struct cfg80211_fils_discovery fils_discovery;
+	struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp;
+};
+
 /**
  * struct cfg80211_csa_settings - channel switch settings
  *
@@ -4523,7 +4539,7 @@ struct cfg80211_ops {
 	int	(*start_ap)(struct wiphy *wiphy, struct net_device *dev,
 			    struct cfg80211_ap_settings *settings);
 	int	(*change_beacon)(struct wiphy *wiphy, struct net_device *dev,
-				 struct cfg80211_ap_settings *info);
+				 struct cfg80211_ap_update *info);
 	int	(*stop_ap)(struct wiphy *wiphy, struct net_device *dev,
 			   unsigned int link_id);
 
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index e751d4eba8f5..e1a64a154287 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1477,7 +1477,7 @@ error:
 }
 
 static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev,
-				   struct cfg80211_ap_settings *params)
+				   struct cfg80211_ap_update *params)
 
 {
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index e64bf2a58b36..cbdf635e6025 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -5636,11 +5636,10 @@ static int nl80211_parse_he_obss_pd(struct nlattr *attrs,
 
 static int nl80211_parse_fils_discovery(struct cfg80211_registered_device *rdev,
 					struct nlattr *attrs,
-					struct cfg80211_ap_settings *params)
+					struct cfg80211_fils_discovery *fd)
 {
 	struct nlattr *tb[NL80211_FILS_DISCOVERY_ATTR_MAX + 1];
 	int ret;
-	struct cfg80211_fils_discovery *fd = &params->fils_discovery;
 
 	if (!wiphy_ext_feature_isset(&rdev->wiphy,
 				     NL80211_EXT_FEATURE_FILS_DISCOVERY))
@@ -5674,12 +5673,10 @@ static int nl80211_parse_fils_discovery(struct cfg80211_registered_device *rdev,
 static int
 nl80211_parse_unsol_bcast_probe_resp(struct cfg80211_registered_device *rdev,
 				     struct nlattr *attrs,
-				     struct cfg80211_ap_settings *params)
+				     struct cfg80211_unsol_bcast_probe_resp *presp)
 {
 	struct nlattr *tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_MAX + 1];
 	int ret;
-	struct cfg80211_unsol_bcast_probe_resp *presp =
-					&params->unsol_bcast_probe_resp;
 
 	if (!wiphy_ext_feature_isset(&rdev->wiphy,
 				     NL80211_EXT_FEATURE_UNSOL_BCAST_PROBE_RESP))
@@ -6122,7 +6119,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 	if (info->attrs[NL80211_ATTR_FILS_DISCOVERY]) {
 		err = nl80211_parse_fils_discovery(rdev,
 						   info->attrs[NL80211_ATTR_FILS_DISCOVERY],
-						   params);
+						   &params->fils_discovery);
 		if (err)
 			goto out;
 	}
@@ -6130,7 +6127,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 	if (info->attrs[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP]) {
 		err = nl80211_parse_unsol_bcast_probe_resp(
 			rdev, info->attrs[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP],
-			params);
+			&params->unsol_bcast_probe_resp);
 		if (err)
 			goto out;
 	}
@@ -6202,7 +6199,7 @@ static int nl80211_set_beacon(struct sk_buff *skb, struct genl_info *info)
 	unsigned int link_id = nl80211_link_id(info->attrs);
 	struct net_device *dev = info->user_ptr[1];
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
-	struct cfg80211_ap_settings *params;
+	struct cfg80211_ap_update *params;
 	struct nlattr *attr;
 	int err;
 
@@ -6227,14 +6224,16 @@ static int nl80211_set_beacon(struct sk_buff *skb, struct genl_info *info)
 
 	attr = info->attrs[NL80211_ATTR_FILS_DISCOVERY];
 	if (attr) {
-		err = nl80211_parse_fils_discovery(rdev, attr, params);
+		err = nl80211_parse_fils_discovery(rdev, attr,
+						   &params->fils_discovery);
 		if (err)
 			goto out;
 	}
 
 	attr = info->attrs[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP];
 	if (attr) {
-		err = nl80211_parse_unsol_bcast_probe_resp(rdev, attr, params);
+		err = nl80211_parse_unsol_bcast_probe_resp(rdev, attr,
+							   &params->unsol_bcast_probe_resp);
 		if (err)
 			goto out;
 	}
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index c6a2c07e380b..2214a90cf101 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -173,7 +173,7 @@ static inline int rdev_start_ap(struct cfg80211_registered_device *rdev,
 
 static inline int rdev_change_beacon(struct cfg80211_registered_device *rdev,
 				     struct net_device *dev,
-				     struct cfg80211_ap_settings *info)
+				     struct cfg80211_ap_update *info)
 {
 	int ret;
 	trace_rdev_change_beacon(&rdev->wiphy, dev, info);
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 1557dc1d58e2..da2b73951c32 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -615,7 +615,7 @@ TRACE_EVENT(rdev_start_ap,
 
 TRACE_EVENT(rdev_change_beacon,
 	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
-		 struct cfg80211_ap_settings *info),
+		 struct cfg80211_ap_update *info),
 	TP_ARGS(wiphy, netdev, info),
 	TP_STRUCT__entry(
 		WIPHY_ENTRY
-- 
cgit v1.2.3-70-g09d2


From b3239498353484fd6ddeb513df89c4628cd623d0 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 20 Sep 2023 21:25:12 +0300
Subject: wifi: mac80211: use bandwidth indication element for CSA

In CSA, parse the (EHT) bandwidth indication element and
use it (in fact prefer it if present).

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230920211508.43ef01920556.If4f24a61cd634ab1e50eba43899b9e992bf25602@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h  | 23 +++++++++++++++++++++++
 net/mac80211/ieee80211_i.h |  3 ++-
 net/mac80211/mlme.c        |  5 +++--
 net/mac80211/spectmgmt.c   | 13 +++++++++++--
 net/mac80211/util.c        | 42 +++++++++++++++++++++++++++++-------------
 5 files changed, 68 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 340d7e0f6bf7..f11b7022d9eb 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -3139,6 +3139,28 @@ ieee80211_eht_oper_size_ok(const u8 *data, u8 len)
 	return len >= needed;
 }
 
+#define IEEE80211_BW_IND_DIS_SUBCH_PRESENT	BIT(1)
+
+struct ieee80211_bandwidth_indication {
+	u8 params;
+	struct ieee80211_eht_operation_info info;
+} __packed;
+
+static inline bool
+ieee80211_bandwidth_indication_size_ok(const u8 *data, u8 len)
+{
+	const struct ieee80211_bandwidth_indication *bwi = (const void *)data;
+
+	if (len < sizeof(*bwi))
+		return false;
+
+	if (bwi->params & IEEE80211_BW_IND_DIS_SUBCH_PRESENT &&
+	    len < sizeof(*bwi) + 2)
+		return false;
+
+	return true;
+}
+
 #define LISTEN_INT_USF	GENMASK(15, 14)
 #define LISTEN_INT_UI	GENMASK(13, 0)
 
@@ -3596,6 +3618,7 @@ enum ieee80211_eid_ext {
 	WLAN_EID_EXT_EHT_OPERATION = 106,
 	WLAN_EID_EXT_EHT_MULTI_LINK = 107,
 	WLAN_EID_EXT_EHT_CAPABILITY = 108,
+	WLAN_EID_EXT_BANDWIDTH_INDICATION = 135,
 };
 
 /* Action category code */
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index d5c5f865323c..e7856336b5c6 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1677,6 +1677,7 @@ struct ieee802_11_elems {
 	const struct ieee80211_eht_operation *eht_operation;
 	const struct ieee80211_multi_link_elem *ml_basic;
 	const struct ieee80211_multi_link_elem *ml_reconf;
+	const struct ieee80211_bandwidth_indication *bandwidth_indication;
 
 	/* length of them, respectively */
 	u8 ext_capab_len;
@@ -2463,7 +2464,7 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info,
 				const struct ieee80211_vht_operation *oper,
 				const struct ieee80211_ht_operation *htop,
 				struct cfg80211_chan_def *chandef);
-void ieee80211_chandef_eht_oper(const struct ieee80211_eht_operation *eht_oper,
+void ieee80211_chandef_eht_oper(const struct ieee80211_eht_operation_info *info,
 				bool support_160, bool support_320,
 				struct cfg80211_chan_def *chandef);
 bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_sub_if_data *sdata,
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index e8f16ed235c3..a211f594f25a 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -109,7 +109,8 @@ ieee80211_extract_dis_subch_bmap(const struct ieee80211_eht_operation *eht_oper,
 		return 0;
 
 	/* set 160/320 supported to get the full AP definition */
-	ieee80211_chandef_eht_oper(eht_oper, true, true, &ap_chandef);
+	ieee80211_chandef_eht_oper((const void *)eht_oper->optional,
+				   true, true, &ap_chandef);
 	ap_center_freq = ap_chandef.center_freq1;
 	ap_bw = 20 * BIT(u8_get_bits(info->control,
 				     IEEE80211_EHT_OPER_CHAN_WIDTH));
@@ -387,7 +388,7 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata,
 	if (eht_oper && (eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT)) {
 		struct cfg80211_chan_def eht_chandef = *chandef;
 
-		ieee80211_chandef_eht_oper(eht_oper,
+		ieee80211_chandef_eht_oper((const void *)eht_oper->optional,
 					   eht_chandef.width ==
 					   NL80211_CHAN_WIDTH_160,
 					   false, &eht_chandef);
diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c
index 871cdac2d0f4..55959b0b24c5 100644
--- a/net/mac80211/spectmgmt.c
+++ b/net/mac80211/spectmgmt.c
@@ -9,7 +9,7 @@
  * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
  * Copyright 2007-2008, Intel Corporation
  * Copyright 2008, Johannes Berg <johannes@sipsolutions.net>
- * Copyright (C) 2018, 2020, 2022 Intel Corporation
+ * Copyright (C) 2018, 2020, 2022-2023 Intel Corporation
  */
 
 #include <linux/ieee80211.h>
@@ -33,12 +33,14 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata,
 	struct cfg80211_chan_def new_vht_chandef = {};
 	const struct ieee80211_sec_chan_offs_ie *sec_chan_offs;
 	const struct ieee80211_wide_bw_chansw_ie *wide_bw_chansw_ie;
+	const struct ieee80211_bandwidth_indication *bwi;
 	int secondary_channel_offset = -1;
 
 	memset(csa_ie, 0, sizeof(*csa_ie));
 
 	sec_chan_offs = elems->sec_chan_offs;
 	wide_bw_chansw_ie = elems->wide_bw_chansw_ie;
+	bwi = elems->bandwidth_indication;
 
 	if (conn_flags & (IEEE80211_CONN_DISABLE_HT |
 			  IEEE80211_CONN_DISABLE_40MHZ)) {
@@ -132,7 +134,14 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata,
 		break;
 	}
 
-	if (wide_bw_chansw_ie) {
+	if (bwi) {
+		/* start with the CSA one */
+		new_vht_chandef = csa_ie->chandef;
+		/* and update the width accordingly */
+		/* FIXME: support 160/320 */
+		ieee80211_chandef_eht_oper(&bwi->info, true, true,
+					   &new_vht_chandef);
+	} else if (wide_bw_chansw_ie) {
 		u8 new_seg1 = wide_bw_chansw_ie->new_center_freq_seg1;
 		struct ieee80211_vht_operation vht_oper = {
 			.chan_width =
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 88f714a75862..a1e18938ce52 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -990,6 +990,11 @@ ieee80211_parse_extension_element(u32 *crc,
 			}
 		}
 		break;
+	case WLAN_EID_EXT_BANDWIDTH_INDICATION:
+		if (ieee80211_bandwidth_indication_size_ok(data, len))
+			elems->bandwidth_indication = data;
+		calc_crc = true;
+		break;
 	}
 
 	if (crc && calc_crc)
@@ -1005,11 +1010,11 @@ _ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params,
 	bool calc_crc = params->filter != 0;
 	DECLARE_BITMAP(seen_elems, 256);
 	u32 crc = params->crc;
-	const u8 *ie;
 
 	bitmap_zero(seen_elems, 256);
 
 	for_each_element(elem, params->start, params->len) {
+		const struct element *subelem;
 		bool elem_parse_failed;
 		u8 id = elem->id;
 		u8 elen = elem->datalen;
@@ -1267,15 +1272,27 @@ _ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params,
 			}
 			/*
 			 * This is a bit tricky, but as we only care about
-			 * the wide bandwidth channel switch element, so
-			 * just parse it out manually.
+			 * a few elements, parse them out manually.
 			 */
-			ie = cfg80211_find_ie(WLAN_EID_WIDE_BW_CHANNEL_SWITCH,
-					      pos, elen);
-			if (ie) {
-				if (ie[1] >= sizeof(*elems->wide_bw_chansw_ie))
+			subelem = cfg80211_find_elem(WLAN_EID_WIDE_BW_CHANNEL_SWITCH,
+						     pos, elen);
+			if (subelem) {
+				if (subelem->datalen >= sizeof(*elems->wide_bw_chansw_ie))
 					elems->wide_bw_chansw_ie =
-						(void *)(ie + 2);
+						(void *)subelem->data;
+				else
+					elem_parse_failed = true;
+			}
+
+			subelem = cfg80211_find_ext_elem(WLAN_EID_EXT_BANDWIDTH_INDICATION,
+							 pos, elen);
+			if (subelem) {
+				const void *edata = subelem->data + 1;
+				u8 edatalen = subelem->datalen - 1;
+
+				if (ieee80211_bandwidth_indication_size_ok(edata,
+									   edatalen))
+					elems->bandwidth_indication = edata;
 				else
 					elem_parse_failed = true;
 			}
@@ -3746,12 +3763,10 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info,
 	return true;
 }
 
-void ieee80211_chandef_eht_oper(const struct ieee80211_eht_operation *eht_oper,
+void ieee80211_chandef_eht_oper(const struct ieee80211_eht_operation_info *info,
 				bool support_160, bool support_320,
 				struct cfg80211_chan_def *chandef)
 {
-	struct ieee80211_eht_operation_info *info = (void *)eht_oper->optional;
-
 	chandef->center_freq1 =
 		ieee80211_channel_to_frequency(info->ccfs0,
 					       chandef->chan->band);
@@ -3920,8 +3935,9 @@ bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_sub_if_data *sdata,
 		support_320 =
 			eht_phy_cap & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ;
 
-		ieee80211_chandef_eht_oper(eht_oper, support_160,
-					   support_320, &he_chandef);
+		ieee80211_chandef_eht_oper((const void *)eht_oper->optional,
+					   support_160, support_320,
+					   &he_chandef);
 	}
 
 	if (!cfg80211_chandef_valid(&he_chandef)) {
-- 
cgit v1.2.3-70-g09d2


From 8a58fc1ce487a77dc9c87e12682b5297e4a030fa Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 20 Sep 2023 21:25:15 +0300
Subject: wifi: mac80211: relax RCU check in for_each_vif_active_link()

To iterate the vif links we don't necessarily need to be in an
RCU critical section, it's also possible to hold the sdata/wdev
mutex. Annotate for_each_vif_active_link() accordingly.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230920211508.858921bd2860.I01f456be8ce2a4fbd15e0d44302e2f7d72e91987@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 0f6390865fe7..5b477c35e03b 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1942,7 +1942,7 @@ static inline bool ieee80211_vif_is_mld(const struct ieee80211_vif *vif)
 	for (link_id = 0; link_id < ARRAY_SIZE((vif)->link_conf); link_id++)	\
 		if ((!(vif)->active_links ||					\
 		     (vif)->active_links & BIT(link_id)) &&			\
-		    (link = rcu_dereference((vif)->link_conf[link_id])))
+		    (link = link_conf_dereference_check(vif, link_id)))
 
 static inline bool ieee80211_vif_is_mesh(struct ieee80211_vif *vif)
 {
-- 
cgit v1.2.3-70-g09d2


From e865c827e9da4d4740746b3fa1e321fab5482e12 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 20 Sep 2023 21:25:16 +0300
Subject: wifi: mac80211: allow for_each_sta_active_link() under RCU

Since we only use this to protect the dereference and with
STA mutex, we can also allow this with just RCU.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230920211508.73c3e04985f4.I52ef396d693e0e381a73eade06850137d8900948@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 5b477c35e03b..6dbe775b9e87 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2393,7 +2393,7 @@ static inline bool lockdep_sta_mutex_held(struct ieee80211_sta *pubsta)
 	for (link_id = 0; link_id < ARRAY_SIZE((sta)->link); link_id++)		\
 		if ((!(vif)->active_links ||					\
 		     (vif)->active_links & BIT(link_id)) &&			\
-		    ((link_sta) = link_sta_dereference_protected(sta, link_id)))
+		    ((link_sta) = link_sta_dereference_check(sta, link_id)))
 
 /**
  * enum sta_notify_cmd - sta notify command
-- 
cgit v1.2.3-70-g09d2


From 041a74cbe49048abc3adb6129141256e50b84b6e Mon Sep 17 00:00:00 2001
From: Ilan Peer <ilan.peer@intel.com>
Date: Wed, 20 Sep 2023 21:25:22 +0300
Subject: wifi: mac80211: Notify the low level driver on change in MLO valid
 links

Notify the low level driver when there is change in the valid links.

Signed-off-by: Ilan Peer <ilan.peer@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230920211508.4fc85b0a51b0.I64238e0e892709a2bd4764b3bca93cdcf021e2fd@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 2 ++
 net/mac80211/main.c    | 3 ++-
 net/mac80211/mlme.c    | 2 ++
 3 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 6dbe775b9e87..d4ef2a605cb4 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -341,6 +341,7 @@ struct ieee80211_vif_chanctx_switch {
  * @BSS_CHANGED_UNSOL_BCAST_PROBE_RESP: Unsolicited broadcast probe response
  *	status changed.
  * @BSS_CHANGED_EHT_PUNCTURING: The channel puncturing bitmap changed.
+ * @BSS_CHANGED_MLD_VALID_LINKS: MLD valid links status changed.
  */
 enum ieee80211_bss_change {
 	BSS_CHANGED_ASSOC		= 1<<0,
@@ -376,6 +377,7 @@ enum ieee80211_bss_change {
 	BSS_CHANGED_FILS_DISCOVERY      = 1<<30,
 	BSS_CHANGED_UNSOL_BCAST_PROBE_RESP = 1<<31,
 	BSS_CHANGED_EHT_PUNCTURING	= BIT_ULL(32),
+	BSS_CHANGED_MLD_VALID_LINKS	= BIT_ULL(33),
 
 	/* when adding here, make sure to change ieee80211_reconfig */
 };
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index bf8f72c412ee..b46f4d733c5d 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -207,7 +207,8 @@ int ieee80211_hw_config(struct ieee80211_local *local, u32 changed)
 				   BSS_CHANGED_PS |\
 				   BSS_CHANGED_IBSS |\
 				   BSS_CHANGED_ARP_FILTER |\
-				   BSS_CHANGED_SSID)
+				   BSS_CHANGED_SSID |\
+				   BSS_CHANGED_MLD_VALID_LINKS)
 
 void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
 				      u64 changed)
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index a85873f305bf..085efae8e23c 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -5713,6 +5713,8 @@ static void ieee80211_ml_reconf_work(struct wiphy *wiphy,
 	if (ret)
 		sdata_info(sdata, "Failed setting valid links\n");
 
+	ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_MLD_VALID_LINKS);
+
 out:
 	if (!ret)
 		cfg80211_links_removed(sdata->dev, sdata->u.mgd.removed_links);
-- 
cgit v1.2.3-70-g09d2


From 62e9c64eedfeb697ba28081ccaac59a45f9a96e1 Mon Sep 17 00:00:00 2001
From: Ayala Beker <ayala.beker@intel.com>
Date: Wed, 20 Sep 2023 21:25:24 +0300
Subject: wifi: mac80211: add support for parsing TID to Link mapping element

Add the relevant definitions for TID to Link mapping element
according to the P802.11be_D4.0.

Signed-off-by: Ayala Beker <ayala.beker@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230920211508.9ea9b0b4412a.I2281ab2c70e8b43a39032dc115db6a80f1f0b3f4@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h  | 58 ++++++++++++++++++++++++++++++++++++++++++++++
 net/mac80211/ieee80211_i.h |  3 +++
 net/mac80211/util.c        |  8 +++++++
 3 files changed, 69 insertions(+)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index f11b7022d9eb..f2965ff3d7c1 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1246,6 +1246,30 @@ struct ieee80211_twt_setup {
 	u8 params[];
 } __packed;
 
+#define IEEE80211_TTLM_MAX_CNT				2
+#define IEEE80211_TTLM_CONTROL_DIRECTION		0x03
+#define IEEE80211_TTLM_CONTROL_DEF_LINK_MAP		0x04
+#define IEEE80211_TTLM_CONTROL_SWITCH_TIME_PRESENT	0x08
+#define IEEE80211_TTLM_CONTROL_EXPECTED_DUR_PRESENT	0x10
+#define IEEE80211_TTLM_CONTROL_LINK_MAP_SIZE		0x20
+
+#define IEEE80211_TTLM_DIRECTION_DOWN		0
+#define IEEE80211_TTLM_DIRECTION_UP		1
+#define IEEE80211_TTLM_DIRECTION_BOTH		2
+
+/**
+ * struct ieee80211_ttlm_elem - TID-To-Link Mapping element
+ *
+ * Defined in section 9.4.2.314 in P802.11be_D4
+ *
+ * @control: the first part of control field
+ * @optional: the second part of control field
+ */
+struct ieee80211_ttlm_elem {
+	u8 control;
+	u8 optional[];
+} __packed;
+
 struct ieee80211_mgmt {
 	__le16 frame_control;
 	__le16 duration;
@@ -3618,6 +3642,7 @@ enum ieee80211_eid_ext {
 	WLAN_EID_EXT_EHT_OPERATION = 106,
 	WLAN_EID_EXT_EHT_MULTI_LINK = 107,
 	WLAN_EID_EXT_EHT_CAPABILITY = 108,
+	WLAN_EID_EXT_TID_TO_LINK_MAPPING = 109,
 	WLAN_EID_EXT_BANDWIDTH_INDICATION = 135,
 };
 
@@ -5155,6 +5180,39 @@ static inline bool ieee80211_mle_reconf_sta_prof_size_ok(const u8 *data,
 	       fixed + prof->sta_info_len - 1 <= len;
 }
 
+static inline bool ieee80211_tid_to_link_map_size_ok(const u8 *data, size_t len)
+{
+	const struct ieee80211_ttlm_elem *t2l = (const void *)data;
+	u8 control, fixed = sizeof(*t2l), elem_len = 0;
+
+	if (len < fixed)
+		return false;
+
+	control = t2l->control;
+
+	if (control & IEEE80211_TTLM_CONTROL_SWITCH_TIME_PRESENT)
+		elem_len += 2;
+	if (control & IEEE80211_TTLM_CONTROL_EXPECTED_DUR_PRESENT)
+		elem_len += 3;
+
+	if (!(control & IEEE80211_TTLM_CONTROL_DEF_LINK_MAP)) {
+		u8 bm_size;
+
+		elem_len += 1;
+		if (len < fixed + elem_len)
+			return false;
+
+		if (control & IEEE80211_TTLM_CONTROL_LINK_MAP_SIZE)
+			bm_size = 1;
+		else
+			bm_size = 2;
+
+		elem_len += hweight8(t2l->optional[0]) * bm_size;
+	}
+
+	return len >= fixed + elem_len;
+}
+
 #define for_each_mle_subelement(_elem, _data, _len)			\
 	if (ieee80211_mle_size_ok(_data, _len))				\
 		for_each_element(_elem,					\
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index e7856336b5c6..d1a73095c914 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1678,6 +1678,7 @@ struct ieee802_11_elems {
 	const struct ieee80211_multi_link_elem *ml_basic;
 	const struct ieee80211_multi_link_elem *ml_reconf;
 	const struct ieee80211_bandwidth_indication *bandwidth_indication;
+	const struct ieee80211_ttlm_elem *ttlm[IEEE80211_TTLM_MAX_CNT];
 
 	/* length of them, respectively */
 	u8 ext_capab_len;
@@ -1711,6 +1712,8 @@ struct ieee802_11_elems {
 	/* The reconfiguration Multi-Link element in the original IEs */
 	const struct element *ml_reconf_elem;
 
+	u8 ttlm_num;
+
 	/*
 	 * store the per station profile pointer and length in case that the
 	 * parsing also handled Multi-Link element parsing for a specific link
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 97c5823da0eb..98a3bffc6991 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -995,6 +995,14 @@ ieee80211_parse_extension_element(u32 *crc,
 			elems->bandwidth_indication = data;
 		calc_crc = true;
 		break;
+	case WLAN_EID_EXT_TID_TO_LINK_MAPPING:
+		calc_crc = true;
+		if (ieee80211_tid_to_link_map_size_ok(data, len) &&
+		    elems->ttlm_num < ARRAY_SIZE(elems->ttlm)) {
+			elems->ttlm[elems->ttlm_num] = (void *)data;
+			elems->ttlm_num++;
+		}
+		break;
 	}
 
 	if (crc && calc_crc)
-- 
cgit v1.2.3-70-g09d2


From ef246a1480cc484cd2aeda75737cb0848616ddf3 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 20 Sep 2023 21:25:26 +0300
Subject: wifi: mac80211: support antenna control in injection

Support antenna control for injection by parsing the antenna
radiotap field (which may be presented multiple times) and
telling the driver about the resulting antenna bitmap. Of
course there's no guarantee the driver will actually honour
this, just like any other injection control.

If misconfigured, i.e. the injected HT/VHT MCS needs more
chains than antennas are configured, the bitmap is reset to
zero, indicating no selection.

For now this is only set up for two anntenas so we keep more
free bits, but that can be trivially extended if any driver
implements support for it that can deal with hardware with
more antennas.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230920211508.f71001aa4da9.I00ccb762a806ea62bc3d728fa3a0d29f4f285eeb@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h |  2 ++
 include/net/mac80211.h    |  6 +++++-
 net/mac80211/tx.c         | 14 ++++++++++++++
 3 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index f2965ff3d7c1..3b02f038d509 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1705,6 +1705,8 @@ struct ieee80211_mcs_info {
 #define		IEEE80211_HT_MCS_TX_MAX_STREAMS	4
 #define IEEE80211_HT_MCS_TX_UNEQUAL_MODULATION	0x10
 
+#define IEEE80211_HT_MCS_CHAINS(mcs) ((mcs) == 32 ? 1 : (1 + ((mcs) >> 3)))
+
 /*
  * 802.11n D5.0 20.3.5 / 20.6 says:
  * - indices 0 to 7 and 32 are single spatial stream
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index d4ef2a605cb4..72375eceb786 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1178,7 +1178,11 @@ struct ieee80211_tx_info {
 					u8 use_cts_prot:1;
 					u8 short_preamble:1;
 					u8 skip_table:1;
-					/* 2 bytes free */
+
+					/* for injection only (bitmap) */
+					u8 antennas:2;
+
+					/* 14 bits free */
 				};
 				/* only needed before rate control */
 				unsigned long jiffies;
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 932516f8cc13..a984fc54644e 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -2162,6 +2162,11 @@ bool ieee80211_parse_tx_radiotap(struct sk_buff *skb,
 			rate_found = true;
 			break;
 
+		case IEEE80211_RADIOTAP_ANTENNA:
+			/* this can appear multiple times, keep a bitmap */
+			info->control.antennas |= BIT(*iterator.this_arg);
+			break;
+
 		case IEEE80211_RADIOTAP_DATA_RETRIES:
 			rate_retries = *iterator.this_arg;
 			break;
@@ -2256,8 +2261,17 @@ bool ieee80211_parse_tx_radiotap(struct sk_buff *skb,
 		}
 
 		if (rate_flags & IEEE80211_TX_RC_MCS) {
+			/* reset antennas if not enough */
+			if (IEEE80211_HT_MCS_CHAINS(rate) >
+					hweight8(info->control.antennas))
+				info->control.antennas = 0;
+
 			info->control.rates[0].idx = rate;
 		} else if (rate_flags & IEEE80211_TX_RC_VHT_MCS) {
+			/* reset antennas if not enough */
+			if (vht_nss > hweight8(info->control.antennas))
+				info->control.antennas = 0;
+
 			ieee80211_rate_set_vht(info->control.rates, vht_mcs,
 					       vht_nss);
 		} else if (sband) {
-- 
cgit v1.2.3-70-g09d2


From a7b2cc591d556eca044c823e4e92d8c1cb430ef7 Mon Sep 17 00:00:00 2001
From: Benjamin Berg <benjamin.berg@intel.com>
Date: Wed, 20 Sep 2023 21:25:27 +0300
Subject: wifi: cfg80211: report per-link errors during association

When one of the links (other than the assoc_link) is misconfigured
and cannot work the association will fail. However, userspace was not
able to tell that the operation only failed because of a problem with
one of the links. Fix this, by allowing the driver to set a per-link
error code and reporting the (first) offending link by setting the
bad_attr accordingly.

This only allows us to report the first error, but that is sufficient
for userspace to e.g. remove the offending link and retry.

Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230920211508.ebe63c0bd513.I40799998f02bf987acee1501a2522dc98bb6eb5a@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h |  3 +++
 net/wireless/nl80211.c | 50 ++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 47 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 899e9ffa6048..34c50f7273d1 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2980,12 +2980,15 @@ struct cfg80211_auth_request {
  * @elems_len: length of the elements
  * @disabled: If set this link should be included during association etc. but it
  *	should not be used until enabled by the AP MLD.
+ * @error: per-link error code, must be <= 0. If there is an error, then the
+ *	operation as a whole must fail.
  */
 struct cfg80211_assoc_link {
 	struct cfg80211_bss *bss;
 	const u8 *elems;
 	size_t elems_len;
 	bool disabled;
+	int error;
 };
 
 /**
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index cbdf635e6025..87b21c0c0f25 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -10941,8 +10941,9 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
 
 		if (cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE,
 					   req.ie, req.ie_len)) {
-			GENL_SET_ERR_MSG(info,
-					 "non-inheritance makes no sense");
+			NL_SET_ERR_MSG_ATTR(info->extack,
+					    info->attrs[NL80211_ATTR_IE],
+					    "non-inheritance makes no sense");
 			return -EINVAL;
 		}
 	}
@@ -11067,6 +11068,7 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
 
 			if (!attrs[NL80211_ATTR_MLO_LINK_ID]) {
 				err = -EINVAL;
+				NL_SET_BAD_ATTR(info->extack, link);
 				goto free;
 			}
 
@@ -11074,6 +11076,7 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
 			/* cannot use the same link ID again */
 			if (req.links[link_id].bss) {
 				err = -EINVAL;
+				NL_SET_BAD_ATTR(info->extack, link);
 				goto free;
 			}
 			req.links[link_id].bss =
@@ -11081,6 +11084,8 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
 			if (IS_ERR(req.links[link_id].bss)) {
 				err = PTR_ERR(req.links[link_id].bss);
 				req.links[link_id].bss = NULL;
+				NL_SET_ERR_MSG_ATTR(info->extack,
+						    link, "Error fetching BSS for link");
 				goto free;
 			}
 
@@ -11093,8 +11098,9 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
 				if (cfg80211_find_elem(WLAN_EID_FRAGMENT,
 						       req.links[link_id].elems,
 						       req.links[link_id].elems_len)) {
-					GENL_SET_ERR_MSG(info,
-							 "cannot deal with fragmentation");
+					NL_SET_ERR_MSG_ATTR(info->extack,
+							    attrs[NL80211_ATTR_IE],
+							    "cannot deal with fragmentation");
 					err = -EINVAL;
 					goto free;
 				}
@@ -11102,8 +11108,9 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
 				if (cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE,
 							   req.links[link_id].elems,
 							   req.links[link_id].elems_len)) {
-					GENL_SET_ERR_MSG(info,
-							 "cannot deal with non-inheritance");
+					NL_SET_ERR_MSG_ATTR(info->extack,
+							    attrs[NL80211_ATTR_IE],
+							    "cannot deal with non-inheritance");
 					err = -EINVAL;
 					goto free;
 				}
@@ -11146,6 +11153,9 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
 
 	err = nl80211_crypto_settings(rdev, info, &req.crypto, 1);
 	if (!err) {
+		struct nlattr *link;
+		int rem = 0;
+
 		err = cfg80211_mlme_assoc(rdev, dev, &req);
 
 		if (!err && info->attrs[NL80211_ATTR_SOCKET_OWNER]) {
@@ -11154,6 +11164,34 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
 			memcpy(dev->ieee80211_ptr->disconnect_bssid,
 			       ap_addr, ETH_ALEN);
 		}
+
+		/* Report error from first problematic link */
+		if (info->attrs[NL80211_ATTR_MLO_LINKS]) {
+			nla_for_each_nested(link,
+					    info->attrs[NL80211_ATTR_MLO_LINKS],
+					    rem) {
+				struct nlattr *link_id_attr =
+					nla_find_nested(link, NL80211_ATTR_MLO_LINK_ID);
+
+				if (!link_id_attr)
+					continue;
+
+				link_id = nla_get_u8(link_id_attr);
+
+				if (link_id == req.link_id)
+					continue;
+
+				if (!req.links[link_id].error ||
+				    WARN_ON(req.links[link_id].error > 0))
+					continue;
+
+				WARN_ON(err >= 0);
+
+				NL_SET_BAD_ATTR(info->extack, link);
+				err = req.links[link_id].error;
+				break;
+			}
+		}
 	}
 
 free:
-- 
cgit v1.2.3-70-g09d2


From 823a0258912b1a4f627e455bee7e78d349c29c5b Mon Sep 17 00:00:00 2001
From: Po-Hao Huang <phhuang@realtek.com>
Date: Mon, 25 Sep 2023 16:08:59 +0800
Subject: wifi: ieee80211: add UL-bandwidth definition of trigger frame

Define UL-bandwidth values of trigger frame according to 802.11 std.

Signed-off-by: Po-Hao Huang <phhuang@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://lore.kernel.org/r/20230925080902.51449-2-pkshih@realtek.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 3b02f038d509..62b4469c6866 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -307,6 +307,13 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2)
 #define IEEE80211_TRIGGER_TYPE_BQRP		0x6
 #define IEEE80211_TRIGGER_TYPE_NFRP		0x7
 
+/* UL-bandwidth within common_info of trigger frame */
+#define IEEE80211_TRIGGER_ULBW_MASK		0xc0000
+#define IEEE80211_TRIGGER_ULBW_20MHZ		0x0
+#define IEEE80211_TRIGGER_ULBW_40MHZ		0x1
+#define IEEE80211_TRIGGER_ULBW_80MHZ		0x2
+#define IEEE80211_TRIGGER_ULBW_160_80P80MHZ	0x3
+
 struct ieee80211_hdr {
 	__le16 frame_control;
 	__le16 duration_id;
-- 
cgit v1.2.3-70-g09d2


From 5482c0a28b2634e7a7d8ddaca7feac183e74b528 Mon Sep 17 00:00:00 2001
From: Vinayak Yadawad <vinayak.yadawad@broadcom.com>
Date: Fri, 22 Sep 2023 14:55:51 +0530
Subject: wifi: cfg80211: OWE DH IE handling offload

Introduce new feature flags for OWE offload that driver can
advertise to indicate kernel/application space to avoid DH IE
handling. When this flag is advertised, the driver/device will
take care of DH IE inclusion and processing of peer DH IE to
generate PMK.

Signed-off-by: Vinayak Yadawad <vinayak.yadawad@broadcom.com>
Link: https://lore.kernel.org/r/f891cce4b52c939dfc6b71bb2f73e560e8cad287.1695374530.git.vinayak.yadawad@broadcom.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 367e5fbc8930..0c64994b118e 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -334,6 +334,15 @@
  * use %NL80211_CMD_START_AP or similar functions.
  */
 
+/**
+ * DOC: OWE DH IE handling offload
+ *
+ * By setting @NL80211_EXT_FEATURE_OWE_OFFLOAD flag, drivers can indicate
+ * kernel/application space to avoid DH IE handling. When this flag is
+ * advertised, the driver/device will take care of DH IE inclusion and
+ * processing of peer DH IE to generate PMK.
+ */
+
 /**
  * enum nl80211_commands - supported nl80211 commands
  *
@@ -6411,6 +6420,12 @@ enum nl80211_feature_flags {
  *	in authentication and deauthentication frames sent to unassociated peer
  *	using @NL80211_CMD_FRAME.
  *
+ * @NL80211_EXT_FEATURE_OWE_OFFLOAD: Driver/Device wants to do OWE DH IE
+ *	handling in station mode.
+ *
+ * @NL80211_EXT_FEATURE_OWE_OFFLOAD_AP: Driver/Device wants to do OWE DH IE
+ *	handling in AP mode.
+ *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
  */
@@ -6482,6 +6497,8 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_PUNCT,
 	NL80211_EXT_FEATURE_SECURE_NAN,
 	NL80211_EXT_FEATURE_AUTH_AND_DEAUTH_RANDOM_TA,
+	NL80211_EXT_FEATURE_OWE_OFFLOAD,
+	NL80211_EXT_FEATURE_OWE_OFFLOAD_AP,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
-- 
cgit v1.2.3-70-g09d2


From e2b2cd592adbd303bcc02451d32fedd511000fb0 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 20 Sep 2023 23:31:38 +0200
Subject: bpf: Add missed value to kprobe_multi link info

Add missed value to kprobe_multi link info to hold the stats of missed
kprobe_multi probe.

The missed counter gets incremented when fprobe fails the recursion
check or there's no rethook available for return probe. In either
case the attached bpf program is not executed.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Song Liu <song@kernel.org>
Reviewed-by: Song Liu <song@kernel.org>
Acked-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/bpf/20230920213145.1941596-3-jolsa@kernel.org
---
 include/uapi/linux/bpf.h       | 1 +
 kernel/trace/bpf_trace.c       | 1 +
 tools/include/uapi/linux/bpf.h | 1 +
 3 files changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 5f13db15a3c7..1da5b1bcce71 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6532,6 +6532,7 @@ struct bpf_link_info {
 			__aligned_u64 addrs;
 			__u32 count; /* in/out: kprobe_multi function count */
 			__u32 flags;
+			__u64 missed;
 		} kprobe_multi;
 		struct {
 			__u32 type; /* enum bpf_perf_event_type */
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 54827d04c9a6..6aec6e7d612a 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2614,6 +2614,7 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
 	kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
 	info->kprobe_multi.count = kmulti_link->cnt;
 	info->kprobe_multi.flags = kmulti_link->flags;
+	info->kprobe_multi.missed = kmulti_link->fp.nmissed;
 
 	if (!uaddrs)
 		return 0;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 5f13db15a3c7..1da5b1bcce71 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -6532,6 +6532,7 @@ struct bpf_link_info {
 			__aligned_u64 addrs;
 			__u32 count; /* in/out: kprobe_multi function count */
 			__u32 flags;
+			__u64 missed;
 		} kprobe_multi;
 		struct {
 			__u32 type; /* enum bpf_perf_event_type */
-- 
cgit v1.2.3-70-g09d2


From 3acf8ace68230e9558cf916847f1cc9f208abdf1 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 20 Sep 2023 23:31:39 +0200
Subject: bpf: Add missed value to kprobe perf link info

Add missed value to kprobe attached through perf link info to
hold the stats of missed kprobe handler execution.

The kprobe's missed counter gets incremented when kprobe handler
is not executed due to another kprobe running on the same cpu.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230920213145.1941596-4-jolsa@kernel.org
---
 include/linux/trace_events.h   |  6 ++++--
 include/uapi/linux/bpf.h       |  1 +
 kernel/bpf/syscall.c           | 14 ++++++++------
 kernel/trace/bpf_trace.c       |  5 +++--
 kernel/trace/trace_kprobe.c    | 14 +++++++++++---
 tools/include/uapi/linux/bpf.h |  1 +
 6 files changed, 28 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 21ae37e49319..5eb88a66eb68 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -761,7 +761,8 @@ struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name);
 void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp);
 int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
 			    u32 *fd_type, const char **buf,
-			    u64 *probe_offset, u64 *probe_addr);
+			    u64 *probe_offset, u64 *probe_addr,
+			    unsigned long *missed);
 int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
 int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
 #else
@@ -801,7 +802,7 @@ static inline void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
 static inline int bpf_get_perf_event_info(const struct perf_event *event,
 					  u32 *prog_id, u32 *fd_type,
 					  const char **buf, u64 *probe_offset,
-					  u64 *probe_addr)
+					  u64 *probe_addr, unsigned long *missed)
 {
 	return -EOPNOTSUPP;
 }
@@ -877,6 +878,7 @@ extern void perf_kprobe_destroy(struct perf_event *event);
 extern int bpf_get_kprobe_info(const struct perf_event *event,
 			       u32 *fd_type, const char **symbol,
 			       u64 *probe_offset, u64 *probe_addr,
+			       unsigned long *missed,
 			       bool perf_type_tracepoint);
 #endif
 #ifdef CONFIG_UPROBE_EVENTS
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1da5b1bcce71..70bfa997e896 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6548,6 +6548,7 @@ struct bpf_link_info {
 					__u32 name_len;
 					__u32 offset; /* offset from func_name */
 					__u64 addr;
+					__u64 missed;
 				} kprobe; /* BPF_PERF_EVENT_KPROBE, BPF_PERF_EVENT_KRETPROBE */
 				struct {
 					__aligned_u64 tp_name;   /* in/out */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 85c1d908f70f..6b5280f14a53 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3374,7 +3374,7 @@ static void bpf_perf_link_dealloc(struct bpf_link *link)
 static int bpf_perf_link_fill_common(const struct perf_event *event,
 				     char __user *uname, u32 ulen,
 				     u64 *probe_offset, u64 *probe_addr,
-				     u32 *fd_type)
+				     u32 *fd_type, unsigned long *missed)
 {
 	const char *buf;
 	u32 prog_id;
@@ -3385,7 +3385,7 @@ static int bpf_perf_link_fill_common(const struct perf_event *event,
 		return -EINVAL;
 
 	err = bpf_get_perf_event_info(event, &prog_id, fd_type, &buf,
-				      probe_offset, probe_addr);
+				      probe_offset, probe_addr, missed);
 	if (err)
 		return err;
 	if (!uname)
@@ -3408,6 +3408,7 @@ static int bpf_perf_link_fill_common(const struct perf_event *event,
 static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
 				     struct bpf_link_info *info)
 {
+	unsigned long missed;
 	char __user *uname;
 	u64 addr, offset;
 	u32 ulen, type;
@@ -3416,7 +3417,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
 	uname = u64_to_user_ptr(info->perf_event.kprobe.func_name);
 	ulen = info->perf_event.kprobe.name_len;
 	err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr,
-					&type);
+					&type, &missed);
 	if (err)
 		return err;
 	if (type == BPF_FD_TYPE_KRETPROBE)
@@ -3425,6 +3426,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
 		info->perf_event.type = BPF_PERF_EVENT_KPROBE;
 
 	info->perf_event.kprobe.offset = offset;
+	info->perf_event.kprobe.missed = missed;
 	if (!kallsyms_show_value(current_cred()))
 		addr = 0;
 	info->perf_event.kprobe.addr = addr;
@@ -3444,7 +3446,7 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event,
 	uname = u64_to_user_ptr(info->perf_event.uprobe.file_name);
 	ulen = info->perf_event.uprobe.name_len;
 	err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr,
-					&type);
+					&type, NULL);
 	if (err)
 		return err;
 
@@ -3480,7 +3482,7 @@ static int bpf_perf_link_fill_tracepoint(const struct perf_event *event,
 	uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name);
 	ulen = info->perf_event.tracepoint.name_len;
 	info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT;
-	return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL);
+	return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL, NULL);
 }
 
 static int bpf_perf_link_fill_perf_event(const struct perf_event *event,
@@ -4813,7 +4815,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
 
 		err = bpf_get_perf_event_info(event, &prog_id, &fd_type,
 					      &buf, &probe_offset,
-					      &probe_addr);
+					      &probe_addr, NULL);
 		if (!err)
 			err = bpf_task_fd_query_copy(attr, uattr, prog_id,
 						     fd_type, buf,
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 6aec6e7d612a..f6a7d2524949 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2384,7 +2384,8 @@ int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
 
 int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
 			    u32 *fd_type, const char **buf,
-			    u64 *probe_offset, u64 *probe_addr)
+			    u64 *probe_offset, u64 *probe_addr,
+			    unsigned long *missed)
 {
 	bool is_tracepoint, is_syscall_tp;
 	struct bpf_prog *prog;
@@ -2419,7 +2420,7 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
 #ifdef CONFIG_KPROBE_EVENTS
 		if (flags & TRACE_EVENT_FL_KPROBE)
 			err = bpf_get_kprobe_info(event, fd_type, buf,
-						  probe_offset, probe_addr,
+						  probe_offset, probe_addr, missed,
 						  event->attr.type == PERF_TYPE_TRACEPOINT);
 #endif
 #ifdef CONFIG_UPROBE_EVENTS
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 3d7a180a8427..961a78ffd6d2 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1189,6 +1189,12 @@ static const struct file_operations kprobe_events_ops = {
 	.write		= probes_write,
 };
 
+static unsigned long trace_kprobe_missed(struct trace_kprobe *tk)
+{
+	return trace_kprobe_is_return(tk) ?
+		tk->rp.kp.nmissed + tk->rp.nmissed : tk->rp.kp.nmissed;
+}
+
 /* Probes profiling interfaces */
 static int probes_profile_seq_show(struct seq_file *m, void *v)
 {
@@ -1200,8 +1206,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v)
 		return 0;
 
 	tk = to_trace_kprobe(ev);
-	nmissed = trace_kprobe_is_return(tk) ?
-		tk->rp.kp.nmissed + tk->rp.nmissed : tk->rp.kp.nmissed;
+	nmissed = trace_kprobe_missed(tk);
 	seq_printf(m, "  %-44s %15lu %15lu\n",
 		   trace_probe_name(&tk->tp),
 		   trace_kprobe_nhit(tk),
@@ -1547,7 +1552,8 @@ NOKPROBE_SYMBOL(kretprobe_perf_func);
 
 int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type,
 			const char **symbol, u64 *probe_offset,
-			u64 *probe_addr, bool perf_type_tracepoint)
+			u64 *probe_addr, unsigned long *missed,
+			bool perf_type_tracepoint)
 {
 	const char *pevent = trace_event_name(event->tp_event);
 	const char *group = event->tp_event->class->system;
@@ -1566,6 +1572,8 @@ int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type,
 	*probe_addr = kallsyms_show_value(current_cred()) ?
 		      (unsigned long)tk->rp.kp.addr : 0;
 	*symbol = tk->symbol;
+	if (missed)
+		*missed = trace_kprobe_missed(tk);
 	return 0;
 }
 #endif	/* CONFIG_PERF_EVENTS */
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 1da5b1bcce71..70bfa997e896 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -6548,6 +6548,7 @@ struct bpf_link_info {
 					__u32 name_len;
 					__u32 offset; /* offset from func_name */
 					__u64 addr;
+					__u64 missed;
 				} kprobe; /* BPF_PERF_EVENT_KPROBE, BPF_PERF_EVENT_KRETPROBE */
 				struct {
 					__aligned_u64 tp_name;   /* in/out */
-- 
cgit v1.2.3-70-g09d2


From dd8657894c11b03c6eb0fd53fe9d7fec2072d18b Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 20 Sep 2023 23:31:40 +0200
Subject: bpf: Count missed stats in trace_call_bpf

Increase misses stats in case bpf array execution is skipped
because of recursion check in trace_call_bpf.

Adding bpf_prog_inc_misses_counters that increase misses
counts for all bpf programs in bpf_prog_array.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Song Liu <song@kernel.org>
Reviewed-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/bpf/20230920213145.1941596-5-jolsa@kernel.org
---
 include/linux/bpf.h      | 16 ++++++++++++++++
 kernel/trace/bpf_trace.c |  3 +++
 2 files changed, 19 insertions(+)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 30063a760b5a..a82efd34b741 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2922,6 +2922,22 @@ static inline int sock_map_bpf_prog_query(const union bpf_attr *attr,
 #endif /* CONFIG_BPF_SYSCALL */
 #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
 
+static __always_inline void
+bpf_prog_inc_misses_counters(const struct bpf_prog_array *array)
+{
+	const struct bpf_prog_array_item *item;
+	struct bpf_prog *prog;
+
+	if (unlikely(!array))
+		return;
+
+	item = &array->items[0];
+	while ((prog = READ_ONCE(item->prog))) {
+		bpf_prog_inc_misses_counter(prog);
+		item++;
+	}
+}
+
 #if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
 void bpf_sk_reuseport_detach(struct sock *sk);
 int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index f6a7d2524949..df697c74d519 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -117,6 +117,9 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 		 * and don't send kprobe event into ring-buffer,
 		 * so return zero here
 		 */
+		rcu_read_lock();
+		bpf_prog_inc_misses_counters(rcu_dereference(call->prog_array));
+		rcu_read_unlock();
 		ret = 0;
 		goto out;
 	}
-- 
cgit v1.2.3-70-g09d2


From 5aa4c9608d2d5fea29e211a80c29696f7d94e9f7 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@nvidia.com>
Date: Mon, 11 Sep 2023 12:38:48 +0300
Subject: net/mlx5: Introduce ifc bits for migration in a chunk mode

Introduce ifc related stuff to enable migration in a chunk mode.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Link: https://lore.kernel.org/r/20230911093856.81910-2-yishaih@nvidia.com
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/mlx5_ifc.h | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index fc3db401f8a2..3265bfcb3156 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1948,7 +1948,9 @@ struct mlx5_ifc_cmd_hca_cap_2_bits {
 	u8	   reserved_at_c0[0x8];
 	u8	   migration_multi_load[0x1];
 	u8	   migration_tracking_state[0x1];
-	u8	   reserved_at_ca[0x16];
+	u8	   reserved_at_ca[0x6];
+	u8	   migration_in_chunks[0x1];
+	u8	   reserved_at_d1[0xf];
 
 	u8	   reserved_at_e0[0xc0];
 
@@ -12392,7 +12394,8 @@ struct mlx5_ifc_query_vhca_migration_state_in_bits {
 	u8         op_mod[0x10];
 
 	u8         incremental[0x1];
-	u8         reserved_at_41[0xf];
+	u8         chunk[0x1];
+	u8         reserved_at_42[0xe];
 	u8         vhca_id[0x10];
 
 	u8         reserved_at_60[0x20];
@@ -12408,7 +12411,11 @@ struct mlx5_ifc_query_vhca_migration_state_out_bits {
 
 	u8         required_umem_size[0x20];
 
-	u8         reserved_at_a0[0x160];
+	u8         reserved_at_a0[0x20];
+
+	u8         remaining_total_size[0x40];
+
+	u8         reserved_at_100[0x100];
 };
 
 struct mlx5_ifc_save_vhca_state_in_bits {
@@ -12440,7 +12447,7 @@ struct mlx5_ifc_save_vhca_state_out_bits {
 
 	u8         actual_image_size[0x20];
 
-	u8         reserved_at_60[0x20];
+	u8         next_required_umem_size[0x20];
 };
 
 struct mlx5_ifc_load_vhca_state_in_bits {
-- 
cgit v1.2.3-70-g09d2


From fce9c967820a72f600abbf061d7077861685a14d Mon Sep 17 00:00:00 2001
From: Ingo Rohloff <lundril@gmx.de>
Date: Sat, 26 Aug 2023 22:02:41 +0200
Subject: wifi: mt76: mt7921e: Support MT7992 IP in Xiaomi Redmibook 15 Pro
 (2023)

In the Xiaomi Redmibook 15 Pro (2023) laptop I have got, a wifi chip is
used, which according to its PCI Vendor ID is from "ITTIM Technology".

This chip works flawlessly with the mt7921e module.  The driver doesn't
bind to this PCI device, because the Vendor ID from "ITTIM Technology" is
not recognized.

This patch adds the PCI Vendor ID from "ITTIM Technology" to the list of
PCI Vendor IDs and lets the mt7921e driver bind to the mentioned wifi
chip.

Signed-off-by: Ingo Rohloff <lundril@gmx.de>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
 drivers/net/wireless/mediatek/mt76/mt7921/pci.c | 2 ++
 include/linux/pci_ids.h                         | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'include')

diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/pci.c b/drivers/net/wireless/mediatek/mt76/mt7921/pci.c
index 3dda84a93717..f04e7095e181 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/pci.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/pci.c
@@ -17,6 +17,8 @@ static const struct pci_device_id mt7921_pci_device_table[] = {
 		.driver_data = (kernel_ulong_t)MT7921_FIRMWARE_WM },
 	{ PCI_DEVICE(PCI_VENDOR_ID_MEDIATEK, 0x7922),
 		.driver_data = (kernel_ulong_t)MT7922_FIRMWARE_WM },
+	{ PCI_DEVICE(PCI_VENDOR_ID_ITTIM, 0x7922),
+		.driver_data = (kernel_ulong_t)MT7922_FIRMWARE_WM },
 	{ PCI_DEVICE(PCI_VENDOR_ID_MEDIATEK, 0x0608),
 		.driver_data = (kernel_ulong_t)MT7921_FIRMWARE_WM },
 	{ PCI_DEVICE(PCI_VENDOR_ID_MEDIATEK, 0x0616),
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 2dc75df1437f..6ae1803bcd2f 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -180,6 +180,8 @@
 #define PCI_DEVICE_ID_BERKOM_A4T		0xffa4
 #define PCI_DEVICE_ID_BERKOM_SCITEL_QUADRO	0xffa8
 
+#define PCI_VENDOR_ID_ITTIM		0x0b48
+
 #define PCI_VENDOR_ID_COMPAQ		0x0e11
 #define PCI_DEVICE_ID_COMPAQ_TOKENRING	0x0508
 #define PCI_DEVICE_ID_COMPAQ_TACHYON	0xa0fc
-- 
cgit v1.2.3-70-g09d2


From 5033f58d5feed1040eebeadb0c5efc95b8bf5720 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 20 Sep 2023 17:29:41 +0000
Subject: net: constify sk_dst_get() and __sk_dst_get() argument

Both helpers only read fields from their socket argument.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 56ac1abadea5..2800d587b08d 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2141,14 +2141,14 @@ static inline bool sk_rethink_txhash(struct sock *sk)
 }
 
 static inline struct dst_entry *
-__sk_dst_get(struct sock *sk)
+__sk_dst_get(const struct sock *sk)
 {
 	return rcu_dereference_check(sk->sk_dst_cache,
 				     lockdep_sock_is_held(sk));
 }
 
 static inline struct dst_entry *
-sk_dst_get(struct sock *sk)
+sk_dst_get(const struct sock *sk)
 {
 	struct dst_entry *dst;
 
-- 
cgit v1.2.3-70-g09d2


From f68a181fcd3b6a22236794813d5c9e5a62efd2b4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 20 Sep 2023 17:29:42 +0000
Subject: tcp: constify tcp_rto_min() and tcp_rto_min_us() argument

Make clear these functions do not change any field from TCP socket.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 91688d0dadcd..a8db7d43fb62 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -719,7 +719,7 @@ static inline void tcp_fast_path_check(struct sock *sk)
 }
 
 /* Compute the actual rto_min value */
-static inline u32 tcp_rto_min(struct sock *sk)
+static inline u32 tcp_rto_min(const struct sock *sk)
 {
 	const struct dst_entry *dst = __sk_dst_get(sk);
 	u32 rto_min = inet_csk(sk)->icsk_rto_min;
@@ -729,7 +729,7 @@ static inline u32 tcp_rto_min(struct sock *sk)
 	return rto_min;
 }
 
-static inline u32 tcp_rto_min_us(struct sock *sk)
+static inline u32 tcp_rto_min_us(const struct sock *sk)
 {
 	return jiffies_to_usecs(tcp_rto_min(sk));
 }
-- 
cgit v1.2.3-70-g09d2


From bbf80d713fe75cfbecda26e7c03a9a8d22af2f4f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 20 Sep 2023 17:29:43 +0000
Subject: tcp: derive delack_max from rto_min

While BPF allows to set icsk->->icsk_delack_max
and/or icsk->icsk_rto_min, we have an ip route
attribute (RTAX_RTO_MIN) to be able to tune rto_min,
but nothing to consequently adjust max delayed ack,
which vary from 40ms to 200 ms (TCP_DELACK_{MIN|MAX}).

This makes RTAX_RTO_MIN of almost no practical use,
unless customers are in big trouble.

Modern days datacenter communications want to set
rto_min to ~5 ms, and the max delayed ack one jiffie
smaller to avoid spurious retransmits.

After this patch, an "rto_min 5" route attribute will
effectively lower max delayed ack timers to 4 ms.

Note in the following ss output, "rto:6 ... ato:4"

$ ss -temoi dst XXXXXX
State Recv-Q Send-Q           Local Address:Port       Peer Address:Port  Process
ESTAB 0      0        [2002:a05:6608:295::]:52950   [2002:a05:6608:297::]:41597
     ino:255134 sk:1001 <->
         skmem:(r0,rb1707063,t872,tb262144,f0,w0,o0,bl0,d0) ts sack
 cubic wscale:8,8 rto:6 rtt:0.02/0.002 ato:4 mss:4096 pmtu:4500
 rcvmss:536 advmss:4096 cwnd:10 bytes_sent:54823160 bytes_acked:54823121
 bytes_received:54823120 segs_out:1370582 segs_in:1370580
 data_segs_out:1370579 data_segs_in:1370578 send 16.4Gbps
 pacing_rate 32.6Gbps delivery_rate 1.72Gbps delivered:1370579
 busy:26920ms unacked:1 rcv_rtt:34.615 rcv_space:65920
 rcv_ssthresh:65535 minrtt:0.015 snd_wnd:65536

While we could argue this patch fixes a bug with RTAX_RTO_MIN,
I do not add a Fixes: tag, so that we can soak it a bit before
asking backports to stable branches.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     |  2 ++
 net/ipv4/tcp.c        |  3 ++-
 net/ipv4/tcp_output.c | 16 +++++++++++++++-
 3 files changed, 19 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index a8db7d43fb62..af9cb37fbe53 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -718,6 +718,8 @@ static inline void tcp_fast_path_check(struct sock *sk)
 		tcp_fast_path_on(tp);
 }
 
+u32 tcp_delack_max(const struct sock *sk);
+
 /* Compute the actual rto_min value */
 static inline u32 tcp_rto_min(const struct sock *sk)
 {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 69b8d7073708..e54f91eb943b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3762,7 +3762,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 		info->tcpi_options |= TCPI_OPT_SYN_DATA;
 
 	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
-	info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
+	info->tcpi_ato = jiffies_to_usecs(min(icsk->icsk_ack.ato,
+					      tcp_delack_max(sk)));
 	info->tcpi_snd_mss = tp->mss_cache;
 	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 1fc1f879cfd6..2d1e4b5ac1ca 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3977,6 +3977,20 @@ int tcp_connect(struct sock *sk)
 }
 EXPORT_SYMBOL(tcp_connect);
 
+u32 tcp_delack_max(const struct sock *sk)
+{
+	const struct dst_entry *dst = __sk_dst_get(sk);
+	u32 delack_max = inet_csk(sk)->icsk_delack_max;
+
+	if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) {
+		u32 rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN);
+		u32 delack_from_rto_min = max_t(int, 1, rto_min - 1);
+
+		delack_max = min_t(u32, delack_max, delack_from_rto_min);
+	}
+	return delack_max;
+}
+
 /* Send out a delayed ack, the caller does the policy checking
  * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
  * for details.
@@ -4012,7 +4026,7 @@ void tcp_send_delayed_ack(struct sock *sk)
 		ato = min(ato, max_ato);
 	}
 
-	ato = min_t(u32, ato, inet_csk(sk)->icsk_delack_max);
+	ato = min_t(u32, ato, tcp_delack_max(sk));
 
 	/* Stay within the limit we were given */
 	timeout = jiffies + ato;
-- 
cgit v1.2.3-70-g09d2


From 1add90738cf58def972356ecf402e36397187db7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 20 Sep 2023 20:17:11 +0000
Subject: net_sched: constify qdisc_priv()

In order to propagate const qualifiers, we change qdisc_priv()
to accept a possibly const argument.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_sched.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index 15960564e0c3..9fa1d0794dfa 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -20,10 +20,10 @@ struct qdisc_walker {
 	int	(*fn)(struct Qdisc *, unsigned long cl, struct qdisc_walker *);
 };
 
-static inline void *qdisc_priv(struct Qdisc *q)
-{
-	return &q->privdata;
-}
+#define qdisc_priv(q)							\
+	_Generic(q,							\
+		 const struct Qdisc * : (const void *)&q->privdata,	\
+		 struct Qdisc * : (void *)&q->privdata)
 
 static inline struct Qdisc *qdisc_from_priv(void *priv)
 {
-- 
cgit v1.2.3-70-g09d2


From 076433bd78d719b34d465c1e69eef512036b534c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 20 Sep 2023 20:17:14 +0000
Subject: net_sched: sch_fq: add fast path for mostly idle qdisc

TCQ_F_CAN_BYPASS can be used by few qdiscs.

Idea is that if we queue a packet to an empty qdisc,
following dequeue() would pick it immediately.

FQ can not use the generic TCQ_F_CAN_BYPASS code,
because some additional checks need to be performed.

This patch adds a similar fast path to FQ.

Most of the time, qdisc is not throttled,
and many packets can avoid bringing/touching
at least four cache lines, and consuming 128bytes
of memory to store the state of a flow.

After this patch, netperf can send UDP packets about 13 % faster,
and pktgen goes 30 % faster (when FQ is in the way), on a fast NIC.

TCP traffic is also improved, thanks to a reduction of cache line misses.
I have measured a 5 % increase of throughput on a tcp_rr intensive workload.

tc -s -d qd sh dev eth1
...
qdisc fq 8004: parent 1:2 limit 10000p flow_limit 100p buckets 1024
   orphan_mask 1023 quantum 3028b initial_quantum 15140b low_rate_threshold 550Kbit
   refill_delay 40ms timer_slack 10us horizon 10s horizon_drop
 Sent 5646784384 bytes 1985161 pkt (dropped 0, overlimits 0 requeues 0)
 backlog 0b 0p requeues 0
  flows 122 (inactive 122 throttled 0)
  gc 0 highprio 0 fastpath 659990 throttled 27762 latency 8.57us

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h |   1 +
 net/sched/sch_fq.c             | 128 +++++++++++++++++++++++++++++------------
 2 files changed, 92 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 3f85ae578056..579f641846b8 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -962,6 +962,7 @@ struct tc_fq_qd_stats {
 	__u64	ce_mark;		/* packets above ce_threshold */
 	__u64	horizon_drops;
 	__u64	horizon_caps;
+	__u64	fastpath_packets;
 };
 
 /* Heavy-Hitter Filter */
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 4af43a401dbb..5cf3b50a24d5 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -2,7 +2,7 @@
 /*
  * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
  *
- *  Copyright (C) 2013-2015 Eric Dumazet <edumazet@google.com>
+ *  Copyright (C) 2013-2023 Eric Dumazet <edumazet@google.com>
  *
  *  Meant to be mostly used for locally generated traffic :
  *  Fast classification depends on skb->sk being set before reaching us.
@@ -73,7 +73,13 @@ struct fq_flow {
 		struct sk_buff *tail;	/* last skb in the list */
 		unsigned long  age;	/* (jiffies | 1UL) when flow was emptied, for gc */
 	};
-	struct rb_node	fq_node;	/* anchor in fq_root[] trees */
+	union {
+		struct rb_node	fq_node;	/* anchor in fq_root[] trees */
+		/* Following field is only used for q->internal,
+		 * because q->internal is not hashed in fq_root[]
+		 */
+		u64		stat_fastpath_packets;
+	};
 	struct sock	*sk;
 	u32		socket_hash;	/* sk_hash */
 	int		qlen;		/* number of packets in flow queue */
@@ -134,7 +140,7 @@ struct fq_sched_data {
 
 /* Seldom used fields. */
 
-	u64		stat_internal_packets;
+	u64		stat_internal_packets; /* aka highprio */
 	u64		stat_ce_mark;
 	u64		stat_horizon_drops;
 	u64		stat_horizon_caps;
@@ -266,17 +272,64 @@ static void fq_gc(struct fq_sched_data *q,
 	kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree);
 }
 
-static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
+/* Fast path can be used if :
+ * 1) Packet tstamp is in the past.
+ * 2) FQ qlen == 0   OR
+ *   (no flow is currently eligible for transmit,
+ *    AND fast path queue has less than 8 packets)
+ * 3) No SO_MAX_PACING_RATE on the socket (if any).
+ * 4) No @maxrate attribute on this qdisc,
+ *
+ * FQ can not use generic TCQ_F_CAN_BYPASS infrastructure.
+ */
+static bool fq_fastpath_check(const struct Qdisc *sch, struct sk_buff *skb)
+{
+	const struct fq_sched_data *q = qdisc_priv(sch);
+	const struct sock *sk;
+
+	if (fq_skb_cb(skb)->time_to_send > q->ktime_cache)
+		return false;
+
+	if (sch->q.qlen != 0) {
+		/* Even if some packets are stored in this qdisc,
+		 * we can still enable fast path if all of them are
+		 * scheduled in the future (ie no flows are eligible)
+		 * or in the fast path queue.
+		 */
+		if (q->flows != q->inactive_flows + q->throttled_flows)
+			return false;
+
+		/* Do not allow fast path queue to explode, we want Fair Queue mode
+		 * under pressure.
+		 */
+		if (q->internal.qlen >= 8)
+			return false;
+	}
+
+	sk = skb->sk;
+	if (sk && sk_fullsock(sk) && !sk_is_tcp(sk) &&
+	    sk->sk_max_pacing_rate != ~0UL)
+		return false;
+
+	if (q->flow_max_rate != ~0UL)
+		return false;
+
+	return true;
+}
+
+static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb)
 {
+	struct fq_sched_data *q = qdisc_priv(sch);
 	struct rb_node **p, *parent;
 	struct sock *sk = skb->sk;
 	struct rb_root *root;
 	struct fq_flow *f;
 
 	/* warning: no starvation prevention... */
-	if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
+	if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL)) {
+		q->stat_internal_packets++; /* highprio packet */
 		return &q->internal;
-
+	}
 	/* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket
 	 * or a listener (SYNCOOKIE mode)
 	 * 1) request sockets are not full blown,
@@ -307,6 +360,11 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
 		sk = (struct sock *)((hash << 1) | 1UL);
 	}
 
+	if (fq_fastpath_check(sch, skb)) {
+		q->internal.stat_fastpath_packets++;
+		return &q->internal;
+	}
+
 	root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];
 
 	if (q->flows >= (2U << q->fq_trees_log) &&
@@ -402,12 +460,8 @@ static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
 static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow,
 			   struct sk_buff *skb)
 {
-	struct fq_sched_data *q = qdisc_priv(sch);
-
 	fq_erase_head(sch, flow, skb);
 	skb_mark_not_on_list(skb);
-	if (--flow->qlen == 0)
-		q->inactive_flows++;
 	qdisc_qstats_backlog_dec(sch, skb);
 	sch->q.qlen--;
 }
@@ -459,49 +513,45 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	if (unlikely(sch->q.qlen >= sch->limit))
 		return qdisc_drop(skb, sch, to_free);
 
+	q->ktime_cache = ktime_get_ns();
 	if (!skb->tstamp) {
-		fq_skb_cb(skb)->time_to_send = q->ktime_cache = ktime_get_ns();
+		fq_skb_cb(skb)->time_to_send = q->ktime_cache;
 	} else {
-		/* Check if packet timestamp is too far in the future.
-		 * Try first if our cached value, to avoid ktime_get_ns()
-		 * cost in most cases.
-		 */
+		/* Check if packet timestamp is too far in the future. */
 		if (fq_packet_beyond_horizon(skb, q)) {
-			/* Refresh our cache and check another time */
-			q->ktime_cache = ktime_get_ns();
-			if (fq_packet_beyond_horizon(skb, q)) {
-				if (q->horizon_drop) {
+			if (q->horizon_drop) {
 					q->stat_horizon_drops++;
 					return qdisc_drop(skb, sch, to_free);
-				}
-				q->stat_horizon_caps++;
-				skb->tstamp = q->ktime_cache + q->horizon;
 			}
+			q->stat_horizon_caps++;
+			skb->tstamp = q->ktime_cache + q->horizon;
 		}
 		fq_skb_cb(skb)->time_to_send = skb->tstamp;
 	}
 
-	f = fq_classify(skb, q);
-	if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
-		q->stat_flows_plimit++;
-		return qdisc_drop(skb, sch, to_free);
-	}
+	f = fq_classify(sch, skb);
 
-	if (f->qlen++ == 0)
-		q->inactive_flows--;
-	qdisc_qstats_backlog_inc(sch, skb);
-	if (fq_flow_is_detached(f)) {
-		fq_flow_add_tail(&q->new_flows, f);
-		if (time_after(jiffies, f->age + q->flow_refill_delay))
-			f->credit = max_t(u32, f->credit, q->quantum);
+	if (f != &q->internal) {
+		if (unlikely(f->qlen >= q->flow_plimit)) {
+			q->stat_flows_plimit++;
+			return qdisc_drop(skb, sch, to_free);
+		}
+
+		if (fq_flow_is_detached(f)) {
+			fq_flow_add_tail(&q->new_flows, f);
+			if (time_after(jiffies, f->age + q->flow_refill_delay))
+				f->credit = max_t(u32, f->credit, q->quantum);
+		}
+
+		if (f->qlen == 0)
+			q->inactive_flows--;
 	}
 
+	f->qlen++;
 	/* Note: this overwrites f->age */
 	flow_queue_add(f, skb);
 
-	if (unlikely(f == &q->internal)) {
-		q->stat_internal_packets++;
-	}
+	qdisc_qstats_backlog_inc(sch, skb);
 	sch->q.qlen++;
 
 	return NET_XMIT_SUCCESS;
@@ -549,6 +599,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 
 	skb = fq_peek(&q->internal);
 	if (unlikely(skb)) {
+		q->internal.qlen--;
 		fq_dequeue_skb(sch, &q->internal, skb);
 		goto out;
 	}
@@ -592,6 +643,8 @@ begin:
 			INET_ECN_set_ce(skb);
 			q->stat_ce_mark++;
 		}
+		if (--f->qlen == 0)
+			q->inactive_flows++;
 		fq_dequeue_skb(sch, f, skb);
 	} else {
 		head->first = f->next;
@@ -1024,6 +1077,7 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 
 	st.gc_flows		  = q->stat_gc_flows;
 	st.highprio_packets	  = q->stat_internal_packets;
+	st.fastpath_packets	  = q->internal.stat_fastpath_packets;
 	st.tcp_retrans		  = 0;
 	st.throttled		  = q->stat_throttled;
 	st.flows_plimit		  = q->stat_flows_plimit;
-- 
cgit v1.2.3-70-g09d2


From 0b068c714ca9479d2783cc333fff5bc2d4a6d45c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 21 Sep 2023 08:52:16 +0000
Subject: net: add DEV_STATS_READ() helper

Companion of DEV_STATS_INC() & DEV_STATS_ADD().

This is going to be used in the series.

Use it in macsec_get_stats64().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/macsec.c      | 6 +++---
 include/linux/netdevice.h | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c
index b7e151439c48..7a44e1cbe305 100644
--- a/drivers/net/macsec.c
+++ b/drivers/net/macsec.c
@@ -3655,9 +3655,9 @@ static void macsec_get_stats64(struct net_device *dev,
 
 	dev_fetch_sw_netstats(s, dev->tstats);
 
-	s->rx_dropped = atomic_long_read(&dev->stats.__rx_dropped);
-	s->tx_dropped = atomic_long_read(&dev->stats.__tx_dropped);
-	s->rx_errors = atomic_long_read(&dev->stats.__rx_errors);
+	s->rx_dropped = DEV_STATS_READ(dev, rx_dropped);
+	s->tx_dropped = DEV_STATS_READ(dev, tx_dropped);
+	s->rx_errors = DEV_STATS_READ(dev, rx_errors);
 }
 
 static int macsec_get_iflink(const struct net_device *dev)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7e520c14eb8c..e070a4540fba 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -5236,5 +5236,6 @@ extern struct net_device *blackhole_netdev;
 #define DEV_STATS_INC(DEV, FIELD) atomic_long_inc(&(DEV)->stats.__##FIELD)
 #define DEV_STATS_ADD(DEV, FIELD, VAL) 	\
 		atomic_long_add((VAL), &(DEV)->stats.__##FIELD)
+#define DEV_STATS_READ(DEV, FIELD) atomic_long_read(&(DEV)->stats.__##FIELD)
 
 #endif	/* _LINUX_NETDEVICE_H */
-- 
cgit v1.2.3-70-g09d2


From 10bbf1652c1cca9819e98d56f3432c56d7a2d229 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 21 Sep 2023 20:28:11 +0000
Subject: net: implement lockless SO_PRIORITY

This is a followup of 8bf43be799d4 ("net: annotate data-races
around sk->sk_priority").

sk->sk_priority can be read and written without holding the socket lock.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Wenjia Zhang <wenjia@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ppp/pppoe.c           |  2 +-
 include/net/bluetooth/bluetooth.h |  2 +-
 net/appletalk/aarp.c              |  2 +-
 net/ax25/af_ax25.c                |  2 +-
 net/bluetooth/l2cap_sock.c        |  2 +-
 net/can/j1939/socket.c            |  2 +-
 net/can/raw.c                     |  2 +-
 net/core/sock.c                   | 23 ++++++++++++-----------
 net/dccp/ipv6.c                   |  2 +-
 net/ipv4/inet_diag.c              |  2 +-
 net/ipv4/ip_output.c              |  2 +-
 net/ipv4/tcp_ipv4.c               |  2 +-
 net/ipv4/tcp_minisocks.c          |  2 +-
 net/ipv6/inet6_connection_sock.c  |  2 +-
 net/ipv6/ip6_output.c             |  2 +-
 net/ipv6/tcp_ipv6.c               |  4 ++--
 net/mptcp/sockopt.c               |  2 +-
 net/netrom/af_netrom.c            |  2 +-
 net/rose/af_rose.c                |  2 +-
 net/sched/em_meta.c               |  2 +-
 net/sctp/ipv6.c                   |  2 +-
 net/smc/af_smc.c                  |  2 +-
 net/x25/af_x25.c                  |  2 +-
 net/xdp/xsk.c                     |  2 +-
 24 files changed, 36 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c
index ba8b6bd8233c..8e7238e97d0a 100644
--- a/drivers/net/ppp/pppoe.c
+++ b/drivers/net/ppp/pppoe.c
@@ -877,7 +877,7 @@ static int pppoe_sendmsg(struct socket *sock, struct msghdr *m,
 
 	skb->dev = dev;
 
-	skb->priority = sk->sk_priority;
+	skb->priority = READ_ONCE(sk->sk_priority);
 	skb->protocol = cpu_to_be16(ETH_P_PPP_SES);
 
 	ph = skb_put(skb, total_len + sizeof(struct pppoe_hdr));
diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index aa90adc3b2a4..7ffa8c192c3f 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -541,7 +541,7 @@ static inline struct sk_buff *bt_skb_sendmsg(struct sock *sk,
 		return ERR_PTR(-EFAULT);
 	}
 
-	skb->priority = sk->sk_priority;
+	skb->priority = READ_ONCE(sk->sk_priority);
 
 	return skb;
 }
diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c
index c7236daa2415..9fa0b246902b 100644
--- a/net/appletalk/aarp.c
+++ b/net/appletalk/aarp.c
@@ -664,7 +664,7 @@ out_unlock:
 
 sendit:
 	if (skb->sk)
-		skb->priority = skb->sk->sk_priority;
+		skb->priority = READ_ONCE(skb->sk->sk_priority);
 	if (dev_queue_xmit(skb))
 		goto drop;
 sent:
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 5db805d5f74d..558e158c98d0 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -939,7 +939,7 @@ struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev)
 	sock_init_data(NULL, sk);
 
 	sk->sk_type     = osk->sk_type;
-	sk->sk_priority = osk->sk_priority;
+	sk->sk_priority = READ_ONCE(osk->sk_priority);
 	sk->sk_protocol = osk->sk_protocol;
 	sk->sk_rcvbuf   = osk->sk_rcvbuf;
 	sk->sk_sndbuf   = osk->sk_sndbuf;
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index 3bdfc3f1e73d..e50d3d102078 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -1615,7 +1615,7 @@ static struct sk_buff *l2cap_sock_alloc_skb_cb(struct l2cap_chan *chan,
 		return ERR_PTR(-ENOTCONN);
 	}
 
-	skb->priority = sk->sk_priority;
+	skb->priority = READ_ONCE(sk->sk_priority);
 
 	bt_cb(skb)->l2cap.chan = chan;
 
diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c
index b28c976f52a0..14c431663233 100644
--- a/net/can/j1939/socket.c
+++ b/net/can/j1939/socket.c
@@ -884,7 +884,7 @@ static struct sk_buff *j1939_sk_alloc_skb(struct net_device *ndev,
 	skcb = j1939_skb_to_cb(skb);
 	memset(skcb, 0, sizeof(*skcb));
 	skcb->addr = jsk->addr;
-	skcb->priority = j1939_prio(sk->sk_priority);
+	skcb->priority = j1939_prio(READ_ONCE(sk->sk_priority));
 
 	if (msg->msg_name) {
 		struct sockaddr_can *addr = msg->msg_name;
diff --git a/net/can/raw.c b/net/can/raw.c
index d50c3f3d892f..73468d2ebd51 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -881,7 +881,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
 	}
 
 	skb->dev = dev;
-	skb->priority = sk->sk_priority;
+	skb->priority = READ_ONCE(sk->sk_priority);
 	skb->mark = READ_ONCE(sk->sk_mark);
 	skb->tstamp = sockc.transmit_time;
 
diff --git a/net/core/sock.c b/net/core/sock.c
index a5995750c5c5..1fdc0a0d8ff2 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -806,9 +806,7 @@ EXPORT_SYMBOL(sock_no_linger);
 
 void sock_set_priority(struct sock *sk, u32 priority)
 {
-	lock_sock(sk);
 	WRITE_ONCE(sk->sk_priority, priority);
-	release_sock(sk);
 }
 EXPORT_SYMBOL(sock_set_priority);
 
@@ -1118,6 +1116,18 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
 
 	valbool = val ? 1 : 0;
 
+	/* handle options which do not require locking the socket. */
+	switch (optname) {
+	case SO_PRIORITY:
+		if ((val >= 0 && val <= 6) ||
+		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
+		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+			sock_set_priority(sk, val);
+			return 0;
+		}
+		return -EPERM;
+	}
+
 	sockopt_lock_sock(sk);
 
 	switch (optname) {
@@ -1213,15 +1223,6 @@ set_sndbuf:
 		sk->sk_no_check_tx = valbool;
 		break;
 
-	case SO_PRIORITY:
-		if ((val >= 0 && val <= 6) ||
-		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
-		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
-			WRITE_ONCE(sk->sk_priority, val);
-		else
-			ret = -EPERM;
-		break;
-
 	case SO_LINGER:
 		if (optlen < sizeof(ling)) {
 			ret = -EINVAL;	/* 1003.1g */
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 80b956b39252..8d344b219f84 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -239,7 +239,7 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req
 		if (!opt)
 			opt = rcu_dereference(np->opt);
 		err = ip6_xmit(sk, skb, &fl6, READ_ONCE(sk->sk_mark), opt,
-			       np->tclass, sk->sk_priority);
+			       np->tclass, READ_ONCE(sk->sk_priority));
 		rcu_read_unlock();
 		err = net_xmit_eval(err);
 	}
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index e13a84433413..9f0bd518901a 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -165,7 +165,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
 		 * For cgroup2 classid is always zero.
 		 */
 		if (!classid)
-			classid = sk->sk_priority;
+			classid = READ_ONCE(sk->sk_priority);
 
 		if (nla_put_u32(skb, INET_DIAG_CLASS_ID, classid))
 			goto errout;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 4ab877cf6d35..6b14097e80ad 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1449,7 +1449,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
 		ip_options_build(skb, opt, cork->addr, rt);
 	}
 
-	skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;
+	skb->priority = (cork->tos != -1) ? cork->priority: READ_ONCE(sk->sk_priority);
 	skb->mark = cork->mark;
 	skb->tstamp = cork->transmit_time;
 	/*
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f13eb7e23d03..95e972be0c05 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -828,7 +828,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
-				   inet_twsk(sk)->tw_priority : sk->sk_priority;
+				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
 		transmit_time = tcp_transmit_time(sk);
 		xfrm_sk_clone_policy(ctl_sk, sk);
 		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index eee8ab1bfa0e..3f87611077ef 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -292,7 +292,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 
 		tw->tw_transparent	= inet_test_bit(TRANSPARENT, sk);
 		tw->tw_mark		= sk->sk_mark;
-		tw->tw_priority		= sk->sk_priority;
+		tw->tw_priority		= READ_ONCE(sk->sk_priority);
 		tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale;
 		tcptw->tw_rcv_nxt	= tp->rcv_nxt;
 		tcptw->tw_snd_nxt	= tp->snd_nxt;
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 0c50dcd35fe8..80043e46117c 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -133,7 +133,7 @@ int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused
 	fl6.daddr = sk->sk_v6_daddr;
 
 	res = ip6_xmit(sk, skb, &fl6, sk->sk_mark, rcu_dereference(np->opt),
-		       np->tclass,  sk->sk_priority);
+		       np->tclass, READ_ONCE(sk->sk_priority));
 	rcu_read_unlock();
 	return res;
 }
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 951ba8089b5b..cdaa9275e990 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1984,7 +1984,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
 	hdr->saddr = fl6->saddr;
 	hdr->daddr = *final_dst;
 
-	skb->priority = sk->sk_priority;
+	skb->priority = READ_ONCE(sk->sk_priority);
 	skb->mark = cork->base.mark;
 	skb->tstamp = cork->base.transmit_time;
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 94afb8d0f2d0..8a6e2e97f673 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -565,7 +565,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
 		if (!opt)
 			opt = rcu_dereference(np->opt);
 		err = ip6_xmit(sk, skb, fl6, skb->mark ? : READ_ONCE(sk->sk_mark),
-			       opt, tclass, sk->sk_priority);
+			       opt, tclass, READ_ONCE(sk->sk_priority));
 		rcu_read_unlock();
 		err = net_xmit_eval(err);
 	}
@@ -1058,7 +1058,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
 			trace_tcp_send_reset(sk, skb);
 			if (inet6_test_bit(REPFLOW, sk))
 				label = ip6_flowlabel(ipv6h);
-			priority = sk->sk_priority;
+			priority = READ_ONCE(sk->sk_priority);
 			txhash = sk->sk_txhash;
 		}
 		if (sk->sk_state == TCP_TIME_WAIT) {
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index 8260202c0066..f3485a6b35e7 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -89,7 +89,7 @@ static void mptcp_sol_socket_sync_intval(struct mptcp_sock *msk, int optname, in
 			sock_valbool_flag(ssk, SOCK_KEEPOPEN, !!val);
 			break;
 		case SO_PRIORITY:
-			ssk->sk_priority = val;
+			WRITE_ONCE(ssk->sk_priority, val);
 			break;
 		case SO_SNDBUF:
 		case SO_SNDBUFFORCE:
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index 96e91ab71573..0eed00184adf 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -487,7 +487,7 @@ static struct sock *nr_make_new(struct sock *osk)
 	sock_init_data(NULL, sk);
 
 	sk->sk_type     = osk->sk_type;
-	sk->sk_priority = osk->sk_priority;
+	sk->sk_priority = READ_ONCE(osk->sk_priority);
 	sk->sk_protocol = osk->sk_protocol;
 	sk->sk_rcvbuf   = osk->sk_rcvbuf;
 	sk->sk_sndbuf   = osk->sk_sndbuf;
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 49dafe9ac72f..0cc5a4e19900 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -583,7 +583,7 @@ static struct sock *rose_make_new(struct sock *osk)
 #endif
 
 	sk->sk_type     = osk->sk_type;
-	sk->sk_priority = osk->sk_priority;
+	sk->sk_priority = READ_ONCE(osk->sk_priority);
 	sk->sk_protocol = osk->sk_protocol;
 	sk->sk_rcvbuf   = osk->sk_rcvbuf;
 	sk->sk_sndbuf   = osk->sk_sndbuf;
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index da34fd4c9269..09d8afd04a2a 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -546,7 +546,7 @@ META_COLLECTOR(int_sk_prio)
 		*err = -1;
 		return;
 	}
-	dst->value = sk->sk_priority;
+	dst->value = READ_ONCE(sk->sk_priority);
 }
 
 META_COLLECTOR(int_sk_rcvlowat)
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 5c0ed5909d85..24368f755ab1 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -247,7 +247,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *t)
 		rcu_read_lock();
 		res = ip6_xmit(sk, skb, fl6, sk->sk_mark,
 			       rcu_dereference(np->opt),
-			       tclass, sk->sk_priority);
+			       tclass, READ_ONCE(sk->sk_priority));
 		rcu_read_unlock();
 		return res;
 	}
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index bacdd971615e..297681601414 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -493,7 +493,7 @@ static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
 	nsk->sk_sndtimeo = osk->sk_sndtimeo;
 	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
 	nsk->sk_mark = READ_ONCE(osk->sk_mark);
-	nsk->sk_priority = osk->sk_priority;
+	nsk->sk_priority = READ_ONCE(osk->sk_priority);
 	nsk->sk_rcvlowat = osk->sk_rcvlowat;
 	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
 	nsk->sk_err = osk->sk_err;
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 0fb5143bec7a..aad8ffeaee04 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -598,7 +598,7 @@ static struct sock *x25_make_new(struct sock *osk)
 	x25 = x25_sk(sk);
 
 	sk->sk_type        = osk->sk_type;
-	sk->sk_priority    = osk->sk_priority;
+	sk->sk_priority    = READ_ONCE(osk->sk_priority);
 	sk->sk_protocol    = osk->sk_protocol;
 	sk->sk_rcvbuf      = osk->sk_rcvbuf;
 	sk->sk_sndbuf      = osk->sk_sndbuf;
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 7482d0aca504..f5e96e0d6e01 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -684,7 +684,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 	}
 
 	skb->dev = dev;
-	skb->priority = xs->sk.sk_priority;
+	skb->priority = READ_ONCE(xs->sk.sk_priority);
 	skb->mark = READ_ONCE(xs->sk.sk_mark);
 	skb->destructor = xsk_destruct_skb;
 	xsk_set_destructor_arg(skb);
-- 
cgit v1.2.3-70-g09d2


From 28b24f90020fed8e8e3e8e20575f08c1cd06e54f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 21 Sep 2023 20:28:15 +0000
Subject: net: implement lockless SO_MAX_PACING_RATE

SO_MAX_PACING_RATE setsockopt() does not need to hold
the socket lock, because sk->sk_pacing_rate readers
can run fine if the value is changed by other threads,
after adding READ_ONCE() accessors.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/mptcp.h |  2 +-
 net/core/sock.c              | 40 +++++++++++++++++++++-------------------
 net/ipv4/tcp_bbr.c           | 13 +++++++------
 net/ipv4/tcp_input.c         |  4 ++--
 net/ipv4/tcp_output.c        |  9 +++++----
 net/sched/sch_fq.c           |  2 +-
 6 files changed, 37 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/mptcp.h b/include/trace/events/mptcp.h
index 563e48617374..09e72215b9f9 100644
--- a/include/trace/events/mptcp.h
+++ b/include/trace/events/mptcp.h
@@ -44,7 +44,7 @@ TRACE_EVENT(mptcp_subflow_get_send,
 		ssk = mptcp_subflow_tcp_sock(subflow);
 		if (ssk && sk_fullsock(ssk)) {
 			__entry->snd_wnd = tcp_sk(ssk)->snd_wnd;
-			__entry->pace = ssk->sk_pacing_rate;
+			__entry->pace = READ_ONCE(ssk->sk_pacing_rate);
 		} else {
 			__entry->snd_wnd = 0;
 			__entry->pace = 0;
diff --git a/net/core/sock.c b/net/core/sock.c
index 408081549bd7..4254ed0e4817 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1160,6 +1160,27 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
 		WRITE_ONCE(sk->sk_busy_poll_budget, val);
 		return 0;
 #endif
+	case SO_MAX_PACING_RATE:
+		{
+		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
+		unsigned long pacing_rate;
+
+		if (sizeof(ulval) != sizeof(val) &&
+		    optlen >= sizeof(ulval) &&
+		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
+			return -EFAULT;
+		}
+		if (ulval != ~0UL)
+			cmpxchg(&sk->sk_pacing_status,
+				SK_PACING_NONE,
+				SK_PACING_NEEDED);
+		/* Pairs with READ_ONCE() from sk_getsockopt() */
+		WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
+		pacing_rate = READ_ONCE(sk->sk_pacing_rate);
+		if (ulval < pacing_rate)
+			WRITE_ONCE(sk->sk_pacing_rate, ulval);
+		return 0;
+		}
 	}
 
 	sockopt_lock_sock(sk);
@@ -1423,25 +1444,6 @@ set_sndbuf:
 		break;
 
 
-	case SO_MAX_PACING_RATE:
-		{
-		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
-
-		if (sizeof(ulval) != sizeof(val) &&
-		    optlen >= sizeof(ulval) &&
-		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
-			ret = -EFAULT;
-			break;
-		}
-		if (ulval != ~0UL)
-			cmpxchg(&sk->sk_pacing_status,
-				SK_PACING_NONE,
-				SK_PACING_NEEDED);
-		/* Pairs with READ_ONCE() from sk_getsockopt() */
-		WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
-		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
-		break;
-		}
 	case SO_INCOMING_CPU:
 		reuseport_update_incoming_cpu(sk, val);
 		break;
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 146792cd26fe..22358032dd48 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -258,7 +258,7 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
 	u64 rate = bw;
 
 	rate = bbr_rate_bytes_per_sec(sk, rate, gain);
-	rate = min_t(u64, rate, sk->sk_max_pacing_rate);
+	rate = min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate));
 	return rate;
 }
 
@@ -278,7 +278,8 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk)
 	}
 	bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT;
 	do_div(bw, rtt_us);
-	sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain);
+	WRITE_ONCE(sk->sk_pacing_rate,
+		   bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain));
 }
 
 /* Pace using current bw estimate and a gain factor. */
@@ -290,14 +291,14 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
 
 	if (unlikely(!bbr->has_seen_rtt && tp->srtt_us))
 		bbr_init_pacing_rate_from_rtt(sk);
-	if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate)
-		sk->sk_pacing_rate = rate;
+	if (bbr_full_bw_reached(sk) || rate > READ_ONCE(sk->sk_pacing_rate))
+		WRITE_ONCE(sk->sk_pacing_rate, rate);
 }
 
 /* override sysctl_tcp_min_tso_segs */
 __bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk)
 {
-	return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2;
+	return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2;
 }
 
 static u32 bbr_tso_segs_goal(struct sock *sk)
@@ -309,7 +310,7 @@ static u32 bbr_tso_segs_goal(struct sock *sk)
 	 * driver provided sk_gso_max_size.
 	 */
 	bytes = min_t(unsigned long,
-		      sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
+		      READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift),
 		      GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER);
 	segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 584825ddd0a0..22c2a7c2e65e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -927,8 +927,8 @@ static void tcp_update_pacing_rate(struct sock *sk)
 	 * without any lock. We want to make sure compiler wont store
 	 * intermediate values in this location.
 	 */
-	WRITE_ONCE(sk->sk_pacing_rate, min_t(u64, rate,
-					     sk->sk_max_pacing_rate));
+	WRITE_ONCE(sk->sk_pacing_rate,
+		   min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)));
 }
 
 /* Calculate rto without backoff.  This is the second half of Van Jacobson's
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 2d1e4b5ac1ca..970a07289480 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1201,7 +1201,7 @@ static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	if (sk->sk_pacing_status != SK_PACING_NONE) {
-		unsigned long rate = sk->sk_pacing_rate;
+		unsigned long rate = READ_ONCE(sk->sk_pacing_rate);
 
 		/* Original sch_fq does not pace first 10 MSS
 		 * Note that tp->data_segs_out overflows after 2^32 packets,
@@ -1973,7 +1973,7 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
 	unsigned long bytes;
 	u32 r;
 
-	bytes = sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift);
+	bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift);
 
 	r = tcp_min_rtt(tcp_sk(sk)) >> READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_rtt_log);
 	if (r < BITS_PER_TYPE(sk->sk_gso_max_size))
@@ -2553,7 +2553,7 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
 
 	limit = max_t(unsigned long,
 		      2 * skb->truesize,
-		      sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift));
+		      READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift));
 	if (sk->sk_pacing_status == SK_PACING_NONE)
 		limit = min_t(unsigned long, limit,
 			      READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes));
@@ -2561,7 +2561,8 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
 
 	if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
 	    tcp_sk(sk)->tcp_tx_delay) {
-		u64 extra_bytes = (u64)sk->sk_pacing_rate * tcp_sk(sk)->tcp_tx_delay;
+		u64 extra_bytes = (u64)READ_ONCE(sk->sk_pacing_rate) *
+				  tcp_sk(sk)->tcp_tx_delay;
 
 		/* TSQ is based on skb truesize sum (sk_wmem_alloc), so we
 		 * approximate our needs assuming an ~100% skb->truesize overhead.
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 681bbf34b707..d35419db7b94 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -668,7 +668,7 @@ begin:
 	 */
 	if (!skb->tstamp) {
 		if (skb->sk)
-			rate = min(skb->sk->sk_pacing_rate, rate);
+			rate = min(READ_ONCE(skb->sk->sk_pacing_rate), rate);
 
 		if (rate <= q->low_rate_threshold) {
 			f->credit = 0;
-- 
cgit v1.2.3-70-g09d2


From 0bb4d124d34044179b42a769a0c76f389ae973b6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 21 Sep 2023 20:28:17 +0000
Subject: net: annotate data-races around sk->sk_tx_queue_mapping

This field can be read or written without socket lock being held.

Add annotations to avoid load-store tearing.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 2800d587b08d..a50e37af2b57 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2007,21 +2007,33 @@ static inline void sk_tx_queue_set(struct sock *sk, int tx_queue)
 	/* sk_tx_queue_mapping accept only upto a 16-bit value */
 	if (WARN_ON_ONCE((unsigned short)tx_queue >= USHRT_MAX))
 		return;
-	sk->sk_tx_queue_mapping = tx_queue;
+	/* Paired with READ_ONCE() in sk_tx_queue_get() and
+	 * other WRITE_ONCE() because socket lock might be not held.
+	 */
+	WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue);
 }
 
 #define NO_QUEUE_MAPPING	USHRT_MAX
 
 static inline void sk_tx_queue_clear(struct sock *sk)
 {
-	sk->sk_tx_queue_mapping = NO_QUEUE_MAPPING;
+	/* Paired with READ_ONCE() in sk_tx_queue_get() and
+	 * other WRITE_ONCE() because socket lock might be not held.
+	 */
+	WRITE_ONCE(sk->sk_tx_queue_mapping, NO_QUEUE_MAPPING);
 }
 
 static inline int sk_tx_queue_get(const struct sock *sk)
 {
-	if (sk && sk->sk_tx_queue_mapping != NO_QUEUE_MAPPING)
-		return sk->sk_tx_queue_mapping;
+	if (sk) {
+		/* Paired with WRITE_ONCE() in sk_tx_queue_clear()
+		 * and sk_tx_queue_set().
+		 */
+		int val = READ_ONCE(sk->sk_tx_queue_mapping);
 
+		if (val != NO_QUEUE_MAPPING)
+			return val;
+	}
 	return -1;
 }
 
-- 
cgit v1.2.3-70-g09d2


From eb44ad4e635132754bfbcb18103f1dcb7058aedd Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 21 Sep 2023 20:28:18 +0000
Subject: net: annotate data-races around sk->sk_dst_pending_confirm

This field can be read or written without socket lock being held.

Add annotations to avoid load-store tearing.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h    | 6 +++---
 net/core/sock.c       | 2 +-
 net/ipv4/tcp_output.c | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index a50e37af2b57..01f0005cb7d8 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2182,7 +2182,7 @@ static inline void __dst_negative_advice(struct sock *sk)
 		if (ndst != dst) {
 			rcu_assign_pointer(sk->sk_dst_cache, ndst);
 			sk_tx_queue_clear(sk);
-			sk->sk_dst_pending_confirm = 0;
+			WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
 		}
 	}
 }
@@ -2199,7 +2199,7 @@ __sk_dst_set(struct sock *sk, struct dst_entry *dst)
 	struct dst_entry *old_dst;
 
 	sk_tx_queue_clear(sk);
-	sk->sk_dst_pending_confirm = 0;
+	WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
 	old_dst = rcu_dereference_protected(sk->sk_dst_cache,
 					    lockdep_sock_is_held(sk));
 	rcu_assign_pointer(sk->sk_dst_cache, dst);
@@ -2212,7 +2212,7 @@ sk_dst_set(struct sock *sk, struct dst_entry *dst)
 	struct dst_entry *old_dst;
 
 	sk_tx_queue_clear(sk);
-	sk->sk_dst_pending_confirm = 0;
+	WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
 	old_dst = xchg((__force struct dst_entry **)&sk->sk_dst_cache, dst);
 	dst_release(old_dst);
 }
diff --git a/net/core/sock.c b/net/core/sock.c
index f0930f858714..290165954379 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -600,7 +600,7 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 			       dst, cookie) == NULL) {
 		sk_tx_queue_clear(sk);
-		sk->sk_dst_pending_confirm = 0;
+		WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 		dst_release(dst);
 		return NULL;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 970a07289480..8885552dff8e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1325,7 +1325,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 	skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
 
-	skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);
+	skb_set_dst_pending_confirm(skb, READ_ONCE(sk->sk_dst_pending_confirm));
 
 	/* Build TCP header and checksum it. */
 	th = (struct tcphdr *)skb->data;
-- 
cgit v1.2.3-70-g09d2


From ceaa714138a372ac63cc2c5c19ee0882d22827f9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 22 Sep 2023 03:42:15 +0000
Subject: inet: implement lockless IP_MTU_DISCOVER

inet->pmtudisc can be read locklessly.

Implement proper lockless reads and writes to inet->pmtudisc

ip_sock_set_mtu_discover() can now be called from arbitrary
contexts.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h                | 13 ++++++++-----
 net/ipv4/ip_output.c            |  7 ++++---
 net/ipv4/ip_sockglue.c          | 17 ++++++-----------
 net/ipv4/ping.c                 |  2 +-
 net/ipv4/raw.c                  |  2 +-
 net/ipv4/udp.c                  |  2 +-
 net/netfilter/ipvs/ip_vs_sync.c |  2 +-
 7 files changed, 22 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/net/ip.h b/include/net/ip.h
index 3489a1cca5e7..46933a0d98ea 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -434,19 +434,22 @@ int ip_dont_fragment(const struct sock *sk, const struct dst_entry *dst)
 
 static inline bool ip_sk_accept_pmtu(const struct sock *sk)
 {
-	return inet_sk(sk)->pmtudisc != IP_PMTUDISC_INTERFACE &&
-	       inet_sk(sk)->pmtudisc != IP_PMTUDISC_OMIT;
+	u8 pmtudisc = READ_ONCE(inet_sk(sk)->pmtudisc);
+
+	return pmtudisc != IP_PMTUDISC_INTERFACE &&
+	       pmtudisc != IP_PMTUDISC_OMIT;
 }
 
 static inline bool ip_sk_use_pmtu(const struct sock *sk)
 {
-	return inet_sk(sk)->pmtudisc < IP_PMTUDISC_PROBE;
+	return READ_ONCE(inet_sk(sk)->pmtudisc) < IP_PMTUDISC_PROBE;
 }
 
 static inline bool ip_sk_ignore_df(const struct sock *sk)
 {
-	return inet_sk(sk)->pmtudisc < IP_PMTUDISC_DO ||
-	       inet_sk(sk)->pmtudisc == IP_PMTUDISC_OMIT;
+	u8 pmtudisc = READ_ONCE(inet_sk(sk)->pmtudisc);
+
+	return pmtudisc < IP_PMTUDISC_DO || pmtudisc == IP_PMTUDISC_OMIT;
 }
 
 static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index f07ce051760d..9fc7be2c2033 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1387,8 +1387,8 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
 	struct ip_options *opt = NULL;
 	struct rtable *rt = (struct rtable *)cork->dst;
 	struct iphdr *iph;
+	u8 pmtudisc, ttl;
 	__be16 df = 0;
-	__u8 ttl;
 
 	skb = __skb_dequeue(queue);
 	if (!skb)
@@ -1418,8 +1418,9 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
 	/* DF bit is set when we want to see DF on outgoing frames.
 	 * If ignore_df is set too, we still allow to fragment this frame
 	 * locally. */
-	if (inet->pmtudisc == IP_PMTUDISC_DO ||
-	    inet->pmtudisc == IP_PMTUDISC_PROBE ||
+	pmtudisc = READ_ONCE(inet->pmtudisc);
+	if (pmtudisc == IP_PMTUDISC_DO ||
+	    pmtudisc == IP_PMTUDISC_PROBE ||
 	    (skb->len <= dst_mtu(&rt->dst) &&
 	     ip_dont_fragment(sk, &rt->dst)))
 		df = htons(IP_DF);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 4ad3003378ae..6d874cc03c8b 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -622,9 +622,7 @@ int ip_sock_set_mtu_discover(struct sock *sk, int val)
 {
 	if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)
 		return -EINVAL;
-	lock_sock(sk);
-	inet_sk(sk)->pmtudisc = val;
-	release_sock(sk);
+	WRITE_ONCE(inet_sk(sk)->pmtudisc, val);
 	return 0;
 }
 EXPORT_SYMBOL(ip_sock_set_mtu_discover);
@@ -1050,6 +1048,8 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
 			return -EINVAL;
 		WRITE_ONCE(inet->mc_ttl, val);
 		return 0;
+	case IP_MTU_DISCOVER:
+		return ip_sock_set_mtu_discover(sk, val);
 	}
 
 	err = 0;
@@ -1107,11 +1107,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
 	case IP_TOS:	/* This sets both TOS and Precedence */
 		__ip_sock_set_tos(sk, val);
 		break;
-	case IP_MTU_DISCOVER:
-		if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)
-			goto e_inval;
-		inet->pmtudisc = val;
-		break;
 	case IP_UNICAST_IF:
 	{
 		struct net_device *dev = NULL;
@@ -1595,6 +1590,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
 	case IP_MULTICAST_TTL:
 		val = READ_ONCE(inet->mc_ttl);
 		goto copyval;
+	case IP_MTU_DISCOVER:
+		val = READ_ONCE(inet->pmtudisc);
+		goto copyval;
 	}
 
 	if (needs_rtnl)
@@ -1634,9 +1632,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
 	case IP_TOS:
 		val = inet->tos;
 		break;
-	case IP_MTU_DISCOVER:
-		val = inet->pmtudisc;
-		break;
 	case IP_MTU:
 	{
 		struct dst_entry *dst;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 4dd809b7b188..50d12b0c8d46 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -551,7 +551,7 @@ void ping_err(struct sk_buff *skb, int offset, u32 info)
 		case ICMP_DEST_UNREACH:
 			if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
 				ipv4_sk_update_pmtu(skb, sk, info);
-				if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) {
+				if (READ_ONCE(inet_sock->pmtudisc) != IP_PMTUDISC_DONT) {
 					err = EMSGSIZE;
 					harderr = 1;
 					break;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 4b5db5d1edc2..ade1aecd7c71 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -239,7 +239,7 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
 		if (code > NR_ICMP_UNREACH)
 			break;
 		if (code == ICMP_FRAG_NEEDED) {
-			harderr = inet->pmtudisc != IP_PMTUDISC_DONT;
+			harderr = READ_ONCE(inet->pmtudisc) != IP_PMTUDISC_DONT;
 			err = EMSGSIZE;
 		} else {
 			err = icmp_err_convert[code].errno;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index c3ff984b6354..731a723dc808 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -750,7 +750,7 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
 	case ICMP_DEST_UNREACH:
 		if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
 			ipv4_sk_update_pmtu(skb, sk, info);
-			if (inet->pmtudisc != IP_PMTUDISC_DONT) {
+			if (READ_ONCE(inet->pmtudisc) != IP_PMTUDISC_DONT) {
 				err = EMSGSIZE;
 				harderr = 1;
 				break;
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 3eed16702248..4f6c795588fb 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1335,7 +1335,7 @@ static void set_mcast_pmtudisc(struct sock *sk, int val)
 
 	/* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */
 	lock_sock(sk);
-	inet->pmtudisc = val;
+	WRITE_ONCE(inet->pmtudisc, val);
 #ifdef CONFIG_IP_VS_IPV6
 	if (sk->sk_family == AF_INET6) {
 		struct ipv6_pinfo *np = inet6_sk(sk);
-- 
cgit v1.2.3-70-g09d2


From e08d0b3d172311e2bb500865c0d0038533d0ff11 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 22 Sep 2023 03:42:16 +0000
Subject: inet: implement lockless IP_TOS

Some reads of inet->tos are racy.

Add needed READ_ONCE() annotations and convert IP_TOS option lockless.

v2: missing changes in include/net/route.h (David Ahern)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h                                   |  3 +--
 include/net/route.h                                |  4 +--
 net/dccp/ipv4.c                                    |  2 +-
 net/ipv4/inet_diag.c                               |  2 +-
 net/ipv4/ip_output.c                               |  4 +--
 net/ipv4/ip_sockglue.c                             | 29 +++++++++-------------
 net/ipv4/tcp_ipv4.c                                |  9 ++++---
 net/mptcp/sockopt.c                                |  8 +++---
 net/sctp/protocol.c                                |  4 +--
 tools/testing/selftests/net/mptcp/mptcp_connect.sh |  2 +-
 10 files changed, 31 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/include/net/ip.h b/include/net/ip.h
index 46933a0d98ea..6fbc0dcf4b97 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -258,7 +258,7 @@ static inline u8 ip_sendmsg_scope(const struct inet_sock *inet,
 
 static inline __u8 get_rttos(struct ipcm_cookie* ipc, struct inet_sock *inet)
 {
-	return (ipc->tos != -1) ? RT_TOS(ipc->tos) : RT_TOS(inet->tos);
+	return (ipc->tos != -1) ? RT_TOS(ipc->tos) : RT_TOS(READ_ONCE(inet->tos));
 }
 
 /* datagram.c */
@@ -810,6 +810,5 @@ int ip_sock_set_mtu_discover(struct sock *sk, int val);
 void ip_sock_set_pktinfo(struct sock *sk);
 void ip_sock_set_recverr(struct sock *sk);
 void ip_sock_set_tos(struct sock *sk, int val);
-void  __ip_sock_set_tos(struct sock *sk, int val);
 
 #endif	/* _IP_H */
diff --git a/include/net/route.h b/include/net/route.h
index 51a45b1887b5..5c248a8e3d0e 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -37,7 +37,7 @@
 
 #define RTO_ONLINK	0x01
 
-#define RT_CONN_FLAGS(sk)   (RT_TOS(inet_sk(sk)->tos) | sock_flag(sk, SOCK_LOCALROUTE))
+#define RT_CONN_FLAGS(sk)   (RT_TOS(READ_ONCE(inet_sk(sk)->tos)) | sock_flag(sk, SOCK_LOCALROUTE))
 #define RT_CONN_FLAGS_TOS(sk,tos)   (RT_TOS(tos) | sock_flag(sk, SOCK_LOCALROUTE))
 
 static inline __u8 ip_sock_rt_scope(const struct sock *sk)
@@ -50,7 +50,7 @@ static inline __u8 ip_sock_rt_scope(const struct sock *sk)
 
 static inline __u8 ip_sock_rt_tos(const struct sock *sk)
 {
-	return RT_TOS(inet_sk(sk)->tos);
+	return RT_TOS(READ_ONCE(inet_sk(sk)->tos));
 }
 
 struct ip_tunnel_info;
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 69453b936bd5..1b8cbfda6e5d 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -511,7 +511,7 @@ static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req
 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 					    ireq->ir_rmt_addr,
 					    rcu_dereference(ireq->ireq_opt),
-					    inet_sk(sk)->tos);
+					    READ_ONCE(inet_sk(sk)->tos));
 		rcu_read_unlock();
 		err = net_xmit_eval(err);
 	}
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 9f0bd518901a..f01aee832aab 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -134,7 +134,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
 	 * hence this needs to be included regardless of socket family.
 	 */
 	if (ext & (1 << (INET_DIAG_TOS - 1)))
-		if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0)
+		if (nla_put_u8(skb, INET_DIAG_TOS, READ_ONCE(inet->tos)) < 0)
 			goto errout;
 
 #if IS_ENABLED(CONFIG_IPV6)
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 9fc7be2c2033..89e62ed08dad 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -544,7 +544,7 @@ EXPORT_SYMBOL(__ip_queue_xmit);
 
 int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
 {
-	return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos);
+	return __ip_queue_xmit(sk, skb, fl, READ_ONCE(inet_sk(sk)->tos));
 }
 EXPORT_SYMBOL(ip_queue_xmit);
 
@@ -1438,7 +1438,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
 	iph = ip_hdr(skb);
 	iph->version = 4;
 	iph->ihl = 5;
-	iph->tos = (cork->tos != -1) ? cork->tos : inet->tos;
+	iph->tos = (cork->tos != -1) ? cork->tos : READ_ONCE(inet->tos);
 	iph->frag_off = df;
 	iph->ttl = ttl;
 	iph->protocol = sk->sk_protocol;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 6d874cc03c8b..50c008efbb6d 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -585,25 +585,20 @@ out:
 	return err;
 }
 
-void __ip_sock_set_tos(struct sock *sk, int val)
+void ip_sock_set_tos(struct sock *sk, int val)
 {
+	u8 old_tos = READ_ONCE(inet_sk(sk)->tos);
+
 	if (sk->sk_type == SOCK_STREAM) {
 		val &= ~INET_ECN_MASK;
-		val |= inet_sk(sk)->tos & INET_ECN_MASK;
+		val |= old_tos & INET_ECN_MASK;
 	}
-	if (inet_sk(sk)->tos != val) {
-		inet_sk(sk)->tos = val;
+	if (old_tos != val) {
+		WRITE_ONCE(inet_sk(sk)->tos, val);
 		WRITE_ONCE(sk->sk_priority, rt_tos2priority(val));
 		sk_dst_reset(sk);
 	}
 }
-
-void ip_sock_set_tos(struct sock *sk, int val)
-{
-	lock_sock(sk);
-	__ip_sock_set_tos(sk, val);
-	release_sock(sk);
-}
 EXPORT_SYMBOL(ip_sock_set_tos);
 
 void ip_sock_set_freebind(struct sock *sk)
@@ -1050,6 +1045,9 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
 		return 0;
 	case IP_MTU_DISCOVER:
 		return ip_sock_set_mtu_discover(sk, val);
+	case IP_TOS:	/* This sets both TOS and Precedence */
+		ip_sock_set_tos(sk, val);
+		return 0;
 	}
 
 	err = 0;
@@ -1104,9 +1102,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
 			}
 		}
 		break;
-	case IP_TOS:	/* This sets both TOS and Precedence */
-		__ip_sock_set_tos(sk, val);
-		break;
 	case IP_UNICAST_IF:
 	{
 		struct net_device *dev = NULL;
@@ -1593,6 +1588,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
 	case IP_MTU_DISCOVER:
 		val = READ_ONCE(inet->pmtudisc);
 		goto copyval;
+	case IP_TOS:
+		val = READ_ONCE(inet->tos);
+		goto copyval;
 	}
 
 	if (needs_rtnl)
@@ -1629,9 +1627,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
 			return -EFAULT;
 		return 0;
 	}
-	case IP_TOS:
-		val = inet->tos;
-		break;
 	case IP_MTU:
 	{
 		struct dst_entry *dst;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 95e972be0c05..a441740616d7 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1024,10 +1024,11 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 	if (skb) {
 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
 
-		tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
-				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
-				(inet_sk(sk)->tos & INET_ECN_MASK) :
-				inet_sk(sk)->tos;
+		tos = READ_ONCE(inet_sk(sk)->tos);
+
+		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
+			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
+			      (tos & INET_ECN_MASK);
 
 		if (!INET_ECN_is_capable(tos) &&
 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index f3485a6b35e7..18ce624bfde2 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -734,11 +734,11 @@ static int mptcp_setsockopt_v4_set_tos(struct mptcp_sock *msk, int optname,
 
 	lock_sock(sk);
 	sockopt_seq_inc(msk);
-	val = inet_sk(sk)->tos;
+	val = READ_ONCE(inet_sk(sk)->tos);
 	mptcp_for_each_subflow(msk, subflow) {
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 
-		__ip_sock_set_tos(ssk, val);
+		ip_sock_set_tos(ssk, val);
 	}
 	release_sock(sk);
 
@@ -1343,7 +1343,7 @@ static int mptcp_getsockopt_v4(struct mptcp_sock *msk, int optname,
 
 	switch (optname) {
 	case IP_TOS:
-		return mptcp_put_int_option(msk, optval, optlen, inet_sk(sk)->tos);
+		return mptcp_put_int_option(msk, optval, optlen, READ_ONCE(inet_sk(sk)->tos));
 	}
 
 	return -EOPNOTSUPP;
@@ -1411,7 +1411,7 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk)
 	ssk->sk_bound_dev_if = sk->sk_bound_dev_if;
 	ssk->sk_incoming_cpu = sk->sk_incoming_cpu;
 	ssk->sk_ipv6only = sk->sk_ipv6only;
-	__ip_sock_set_tos(ssk, inet_sk(sk)->tos);
+	ip_sock_set_tos(ssk, inet_sk(sk)->tos);
 
 	if (sk->sk_userlocks & tx_rx_locks) {
 		ssk->sk_userlocks |= sk->sk_userlocks & tx_rx_locks;
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 2185f44198de..94c6dd53cd62 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -426,7 +426,7 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
 	struct dst_entry *dst = NULL;
 	union sctp_addr *daddr = &t->ipaddr;
 	union sctp_addr dst_saddr;
-	__u8 tos = inet_sk(sk)->tos;
+	u8 tos = READ_ONCE(inet_sk(sk)->tos);
 
 	if (t->dscp & SCTP_DSCP_SET_MASK)
 		tos = t->dscp & SCTP_DSCP_VAL_MASK;
@@ -1057,7 +1057,7 @@ static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *t)
 	struct flowi4 *fl4 = &t->fl.u.ip4;
 	struct sock *sk = skb->sk;
 	struct inet_sock *inet = inet_sk(sk);
-	__u8 dscp = inet->tos;
+	__u8 dscp = READ_ONCE(inet->tos);
 	__be16 df = 0;
 
 	pr_debug("%s: skb:%p, len:%d, src:%pI4, dst:%pI4\n", __func__, skb,
diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
index b1fc8afd072d..61a2a1988ce6 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
@@ -716,7 +716,7 @@ run_test_transparent()
 	# the required infrastructure in MPTCP sockopt code. To support TOS, the
 	# following function has been exported (T). Not great but better than
 	# checking for a specific kernel version.
-	if ! mptcp_lib_kallsyms_has "T __ip_sock_set_tos$"; then
+	if ! mptcp_lib_kallsyms_has "T ip_sock_set_tos$"; then
 		echo "INFO: ${msg} not supported by the kernel: SKIP"
 		mptcp_lib_result_skip "${TEST_GROUP}"
 		return
-- 
cgit v1.2.3-70-g09d2


From 0d293714ac32650bfb669ceadf7cc2fad8161401 Mon Sep 17 00:00:00 2001
From: Patrisious Haddad <phaddad@nvidia.com>
Date: Thu, 21 Sep 2023 15:10:27 +0300
Subject: RDMA/mlx5: Send events from IB driver about device affiliation state

Send blocking events from IB driver whenever the device is done being
affiliated or if it is removed from an affiliation.

This is useful since now the EN driver can register to those event and
know when a device is affiliated or not.

Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Link: https://lore.kernel.org/r/a7491c3e483cfd8d962f5f75b9a25f253043384a.1695296682.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/infiniband/hw/mlx5/main.c              | 17 +++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/main.c |  6 ++++++
 include/linux/mlx5/device.h                    |  2 ++
 include/linux/mlx5/driver.h                    |  2 ++
 4 files changed, 27 insertions(+)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index aed5cdea50e6..530d88784e41 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -24,6 +24,7 @@
 #include <linux/mlx5/vport.h>
 #include <linux/mlx5/fs.h>
 #include <linux/mlx5/eswitch.h>
+#include <linux/mlx5/driver.h>
 #include <linux/list.h>
 #include <rdma/ib_smi.h>
 #include <rdma/ib_umem_odp.h>
@@ -3175,6 +3176,13 @@ static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev,
 
 	lockdep_assert_held(&mlx5_ib_multiport_mutex);
 
+	mlx5_core_mp_event_replay(ibdev->mdev,
+				  MLX5_DRIVER_EVENT_AFFILIATION_REMOVED,
+				  NULL);
+	mlx5_core_mp_event_replay(mpi->mdev,
+				  MLX5_DRIVER_EVENT_AFFILIATION_REMOVED,
+				  NULL);
+
 	mlx5_ib_cleanup_cong_debugfs(ibdev, port_num);
 
 	spin_lock(&port->mp.mpi_lock);
@@ -3226,6 +3234,7 @@ static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev,
 				    struct mlx5_ib_multiport_info *mpi)
 {
 	u32 port_num = mlx5_core_native_port_num(mpi->mdev) - 1;
+	u64 key;
 	int err;
 
 	lockdep_assert_held(&mlx5_ib_multiport_mutex);
@@ -3254,6 +3263,14 @@ static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev,
 
 	mlx5_ib_init_cong_debugfs(ibdev, port_num);
 
+	key = ibdev->ib_dev.index;
+	mlx5_core_mp_event_replay(mpi->mdev,
+				  MLX5_DRIVER_EVENT_AFFILIATION_DONE,
+				  &key);
+	mlx5_core_mp_event_replay(ibdev->mdev,
+				  MLX5_DRIVER_EVENT_AFFILIATION_DONE,
+				  &key);
+
 	return true;
 
 unbind:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 15561965d2af..6ca91c0e8a6a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -361,6 +361,12 @@ void mlx5_core_uplink_netdev_event_replay(struct mlx5_core_dev *dev)
 }
 EXPORT_SYMBOL(mlx5_core_uplink_netdev_event_replay);
 
+void mlx5_core_mp_event_replay(struct mlx5_core_dev *dev, u32 event, void *data)
+{
+	mlx5_blocking_notifier_call_chain(dev, event, data);
+}
+EXPORT_SYMBOL(mlx5_core_mp_event_replay);
+
 int mlx5_core_get_caps_mode(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type,
 			    enum mlx5_cap_mode cap_mode)
 {
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 4d5be378fa8c..26333d602a50 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -366,6 +366,8 @@ enum mlx5_driver_event {
 	MLX5_DRIVER_EVENT_UPLINK_NETDEV,
 	MLX5_DRIVER_EVENT_MACSEC_SA_ADDED,
 	MLX5_DRIVER_EVENT_MACSEC_SA_DELETED,
+	MLX5_DRIVER_EVENT_AFFILIATION_DONE,
+	MLX5_DRIVER_EVENT_AFFILIATION_REMOVED,
 };
 
 enum {
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 3033bbaeac81..5ca4e085d813 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1027,6 +1027,8 @@ bool mlx5_cmd_is_down(struct mlx5_core_dev *dev);
 void mlx5_core_uplink_netdev_set(struct mlx5_core_dev *mdev, struct net_device *netdev);
 void mlx5_core_uplink_netdev_event_replay(struct mlx5_core_dev *mdev);
 
+void mlx5_core_mp_event_replay(struct mlx5_core_dev *dev, u32 event, void *data);
+
 void mlx5_health_cleanup(struct mlx5_core_dev *dev);
 int mlx5_health_init(struct mlx5_core_dev *dev);
 void mlx5_start_health_poll(struct mlx5_core_dev *dev);
-- 
cgit v1.2.3-70-g09d2


From ef36ffcb381096ed32c309c342a02bf62939c503 Mon Sep 17 00:00:00 2001
From: Patrisious Haddad <phaddad@nvidia.com>
Date: Thu, 21 Sep 2023 15:10:30 +0300
Subject: net/mlx5: Add alias flow table bits

Add all the capabilities needed to check for alias object support.
As well as all the fields or commands needed for its creation and
the creation of flow table that is able to jump to an alias object.

Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Link: https://lore.kernel.org/r/544c030f2a78c4adf3fe6b64f97a39cc1bbdabb9.1695296682.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/mlx5_ifc.h | 56 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 55 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 3265bfcb3156..23f9780adb83 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -312,6 +312,7 @@ enum {
 	MLX5_CMD_OP_QUERY_VHCA_STATE              = 0xb0d,
 	MLX5_CMD_OP_MODIFY_VHCA_STATE             = 0xb0e,
 	MLX5_CMD_OP_SYNC_CRYPTO                   = 0xb12,
+	MLX5_CMD_OP_ALLOW_OTHER_VHCA_ACCESS       = 0xb16,
 	MLX5_CMD_OP_MAX
 };
 
@@ -1934,6 +1935,14 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         match_definer_format_supported[0x40];
 };
 
+enum {
+	MLX5_CROSS_VHCA_OBJ_TO_OBJ_SUPPORTED_LOCAL_FLOW_TABLE_TO_REMOTE_FLOW_TABLE_MISS  = 0x80000,
+};
+
+enum {
+	MLX5_ALLOWED_OBJ_FOR_OTHER_VHCA_ACCESS_FLOW_TABLE       = 0x200,
+};
+
 struct mlx5_ifc_cmd_hca_cap_2_bits {
 	u8	   reserved_at_0[0x80];
 
@@ -1952,7 +1961,11 @@ struct mlx5_ifc_cmd_hca_cap_2_bits {
 	u8	   migration_in_chunks[0x1];
 	u8	   reserved_at_d1[0xf];
 
-	u8	   reserved_at_e0[0xc0];
+	u8	   cross_vhca_object_to_object_supported[0x20];
+
+	u8	   allowed_object_for_other_vhca_access[0x40];
+
+	u8	   reserved_at_140[0x60];
 
 	u8	   flow_table_type_2_type[0x8];
 	u8	   reserved_at_1a8[0x3];
@@ -6371,6 +6384,28 @@ struct mlx5_ifc_general_obj_out_cmd_hdr_bits {
 	u8         reserved_at_60[0x20];
 };
 
+struct mlx5_ifc_allow_other_vhca_access_in_bits {
+	u8 opcode[0x10];
+	u8 uid[0x10];
+	u8 reserved_at_20[0x10];
+	u8 op_mod[0x10];
+	u8 reserved_at_40[0x50];
+	u8 object_type_to_be_accessed[0x10];
+	u8 object_id_to_be_accessed[0x20];
+	u8 reserved_at_c0[0x40];
+	union {
+		u8 access_key_raw[0x100];
+		u8 access_key[8][0x20];
+	};
+};
+
+struct mlx5_ifc_allow_other_vhca_access_out_bits {
+	u8 status[0x8];
+	u8 reserved_at_8[0x18];
+	u8 syndrome[0x20];
+	u8 reserved_at_40[0x40];
+};
+
 struct mlx5_ifc_modify_header_arg_bits {
 	u8         reserved_at_0[0x80];
 
@@ -6393,6 +6428,24 @@ struct mlx5_ifc_create_match_definer_out_bits {
 	struct mlx5_ifc_general_obj_out_cmd_hdr_bits general_obj_out_cmd_hdr;
 };
 
+struct mlx5_ifc_alias_context_bits {
+	u8 vhca_id_to_be_accessed[0x10];
+	u8 reserved_at_10[0xd];
+	u8 status[0x3];
+	u8 object_id_to_be_accessed[0x20];
+	u8 reserved_at_40[0x40];
+	union {
+		u8 access_key_raw[0x100];
+		u8 access_key[8][0x20];
+	};
+	u8 metadata[0x80];
+};
+
+struct mlx5_ifc_create_alias_obj_in_bits {
+	struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr;
+	struct mlx5_ifc_alias_context_bits alias_ctx;
+};
+
 enum {
 	MLX5_QUERY_FLOW_GROUP_OUT_MATCH_CRITERIA_ENABLE_OUTER_HEADERS    = 0x0,
 	MLX5_QUERY_FLOW_GROUP_OUT_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS  = 0x1,
@@ -11919,6 +11972,7 @@ enum {
 	MLX5_GENERAL_OBJECT_TYPES_FLOW_METER_ASO = 0x24,
 	MLX5_GENERAL_OBJECT_TYPES_MACSEC = 0x27,
 	MLX5_GENERAL_OBJECT_TYPES_INT_KEK = 0x47,
+	MLX5_GENERAL_OBJECT_TYPES_FLOW_TABLE_ALIAS = 0xff15,
 };
 
 enum {
-- 
cgit v1.2.3-70-g09d2


From 5b98fd5dc1e3b8446f98bc8c6e71d2585f9dcdce Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 22 Sep 2023 10:28:43 -0700
Subject: ipv4: Annotate struct fib_info with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct fib_info.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Cc: David Ahern <dsahern@kernel.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/20230922172858.3822653-1-keescook@chromium.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/ip_fib.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index f0c13864180e..84b0a82c9df4 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -156,7 +156,7 @@ struct fib_info {
 	bool			nh_updated;
 	struct nexthop		*nh;
 	struct rcu_head		rcu;
-	struct fib_nh		fib_nh[];
+	struct fib_nh		fib_nh[] __counted_by(fib_nhs);
 };
 
 
-- 
cgit v1.2.3-70-g09d2


From 210d4e9c732fa87b584af72faae96c037d9d7957 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 22 Sep 2023 10:28:44 -0700
Subject: ipv4/igmp: Annotate struct ip_sf_socklist with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct ip_sf_socklist.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Cc: Martin KaFai Lau <martin.lau@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/20230922172858.3822653-2-keescook@chromium.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/igmp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index ebf4349a53af..5171231f70a8 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -39,7 +39,7 @@ struct ip_sf_socklist {
 	unsigned int		sl_max;
 	unsigned int		sl_count;
 	struct rcu_head		rcu;
-	__be32			sl_addr[];
+	__be32			sl_addr[] __counted_by(sl_max);
 };
 
 #define IP_SFBLOCK	10	/* allocate this many at once */
-- 
cgit v1.2.3-70-g09d2


From 5d22b6528073fd79f6a520234c77f77c2ed8b9e1 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 22 Sep 2023 10:28:45 -0700
Subject: ipv6: Annotate struct ip6_sf_socklist with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct ip6_sf_socklist.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/20230922172858.3822653-3-keescook@chromium.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/if_inet6.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index c8490729b4ae..3e454c4d7ba6 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -89,7 +89,7 @@ struct ip6_sf_socklist {
 	unsigned int		sl_max;
 	unsigned int		sl_count;
 	struct rcu_head		rcu;
-	struct in6_addr		sl_addr[];
+	struct in6_addr		sl_addr[] __counted_by(sl_max);
 };
 
 #define IP6_SFBLOCK	10	/* allocate this many at once */
-- 
cgit v1.2.3-70-g09d2


From a3d7a1209bbb06fe9d24b274475bc9879ca73333 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 22 Sep 2023 10:28:49 -0700
Subject: net: mana: Annotate struct mana_rxq with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct mana_rxq.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Wei Liu <wei.liu@kernel.org>
Cc: Dexuan Cui <decui@microsoft.com>
Cc: Long Li <longli@microsoft.com>
Cc: Ajay Sharma <sharmaajay@microsoft.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/20230922172858.3822653-7-keescook@chromium.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/mana/mana.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 9f70b4332238..38441be68592 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -338,7 +338,7 @@ struct mana_rxq {
 	/* MUST BE THE LAST MEMBER:
 	 * Each receive buffer has an associated mana_recv_buf_oob.
 	 */
-	struct mana_recv_buf_oob rx_oobs[];
+	struct mana_recv_buf_oob rx_oobs[] __counted_by(num_rx_buf);
 };
 
 struct mana_tx_qp {
-- 
cgit v1.2.3-70-g09d2


From 59656519763d55bb93730ca17a11fba399a6ef49 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 22 Sep 2023 10:28:51 -0700
Subject: net: mana: Annotate struct hwc_dma_buf with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct hwc_dma_buf.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Cc: Long Li <longli@microsoft.com>
Cc: Ajay Sharma <sharmaajay@microsoft.com>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Wei Liu <wei.liu@kernel.org>
Cc: Dexuan Cui <decui@microsoft.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/20230922172858.3822653-9-keescook@chromium.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/mana/hw_channel.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/mana/hw_channel.h b/include/net/mana/hw_channel.h
index 3d3b5c881bc1..158b125692c2 100644
--- a/include/net/mana/hw_channel.h
+++ b/include/net/mana/hw_channel.h
@@ -121,7 +121,7 @@ struct hwc_dma_buf {
 	u32 gpa_mkey;
 
 	u32 num_reqs;
-	struct hwc_work_request reqs[];
+	struct hwc_work_request reqs[] __counted_by(num_reqs);
 };
 
 typedef void hwc_rx_event_handler_t(void *ctx, u32 gdma_rxq_id,
-- 
cgit v1.2.3-70-g09d2


From f25e621f5d4c423fa7217afbddd427007b0e0ec0 Mon Sep 17 00:00:00 2001
From: Beniamino Galvani <b.galvani@gmail.com>
Date: Sun, 24 Sep 2023 17:30:14 +0200
Subject: ipv6: mark address parameters of udp_tunnel6_xmit_skb() as const

The function doesn't modify the addresses passed as input, mark them
as 'const' to make that clear.

Signed-off-by: Beniamino Galvani <b.galvani@gmail.com>
Reviewed-by: Guillaume Nault <gnault@redhat.com>
Link: https://lore.kernel.org/r/20230924153014.786962-1-b.galvani@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/udp_tunnel.h  | 5 +++--
 net/ipv6/ip6_udp_tunnel.c | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 29251c3519cf..21ba0a25f936 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -154,8 +154,9 @@ void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb
 
 int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
 			 struct sk_buff *skb,
-			 struct net_device *dev, struct in6_addr *saddr,
-			 struct in6_addr *daddr,
+			 struct net_device *dev,
+			 const struct in6_addr *saddr,
+			 const struct in6_addr *daddr,
 			 __u8 prio, __u8 ttl, __be32 label,
 			 __be16 src_port, __be16 dst_port, bool nocheck);
 
diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
index cdc4d4ee2420..70d38705c92f 100644
--- a/net/ipv6/ip6_udp_tunnel.c
+++ b/net/ipv6/ip6_udp_tunnel.c
@@ -75,8 +75,9 @@ EXPORT_SYMBOL_GPL(udp_sock_create6);
 
 int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
 			 struct sk_buff *skb,
-			 struct net_device *dev, struct in6_addr *saddr,
-			 struct in6_addr *daddr,
+			 struct net_device *dev,
+			 const struct in6_addr *saddr,
+			 const struct in6_addr *daddr,
 			 __u8 prio, __u8 ttl, __be32 label,
 			 __be16 src_port, __be16 dst_port, bool nocheck)
 {
-- 
cgit v1.2.3-70-g09d2


From fefe5dc4afeafe896c90d5b20b605f2759343c3b Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 22 Sep 2023 15:31:04 +0200
Subject: net: dsa: propagate extack to ds->ops->port_hsr_join()

Drivers can provide meaningful error messages which state a reason why
they can't perform an offload, and dsa_slave_changeupper() already has
the infrastructure to propagate these over netlink rather than printing
to the kernel log. So pass the extack argument and modify the xrs700x
driver's port_hsr_join() prototype.

Also take the opportunity and use the extack for the 2 -EOPNOTSUPP cases
from xrs700x_hsr_join().

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Lukasz Majewski <lukma@denx.de>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/dsa/xrs700x/xrs700x.c | 18 ++++++++++++------
 include/net/dsa.h                 |  3 ++-
 net/dsa/port.c                    |  5 +++--
 net/dsa/port.h                    |  3 ++-
 net/dsa/slave.c                   |  2 +-
 5 files changed, 20 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/xrs700x/xrs700x.c b/drivers/net/dsa/xrs700x/xrs700x.c
index 753fef757f11..5b02e9e426fd 100644
--- a/drivers/net/dsa/xrs700x/xrs700x.c
+++ b/drivers/net/dsa/xrs700x/xrs700x.c
@@ -548,7 +548,8 @@ static void xrs700x_bridge_leave(struct dsa_switch *ds, int port,
 }
 
 static int xrs700x_hsr_join(struct dsa_switch *ds, int port,
-			    struct net_device *hsr)
+			    struct net_device *hsr,
+			    struct netlink_ext_ack *extack)
 {
 	unsigned int val = XRS_HSR_CFG_HSR_PRP;
 	struct dsa_port *partner = NULL, *dp;
@@ -562,16 +563,21 @@ static int xrs700x_hsr_join(struct dsa_switch *ds, int port,
 	if (ret)
 		return ret;
 
-	/* Only ports 1 and 2 can be HSR/PRP redundant ports. */
-	if (port != 1 && port != 2)
+	if (port != 1 && port != 2) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Only ports 1 and 2 can offload HSR/PRP");
 		return -EOPNOTSUPP;
+	}
 
-	if (ver == HSR_V1)
+	if (ver == HSR_V1) {
 		val |= XRS_HSR_CFG_HSR;
-	else if (ver == PRP_V1)
+	} else if (ver == PRP_V1) {
 		val |= XRS_HSR_CFG_PRP;
-	else
+	} else {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Only HSR v1 and PRP v1 can be offloaded");
 		return -EOPNOTSUPP;
+	}
 
 	dsa_hsr_foreach_port(dp, ds, hsr) {
 		if (dp->index != port) {
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 0b9c6aa27047..426724808e76 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -1198,7 +1198,8 @@ struct dsa_switch_ops {
 	 * HSR integration
 	 */
 	int	(*port_hsr_join)(struct dsa_switch *ds, int port,
-				 struct net_device *hsr);
+				 struct net_device *hsr,
+				 struct netlink_ext_ack *extack);
 	int	(*port_hsr_leave)(struct dsa_switch *ds, int port,
 				  struct net_device *hsr);
 
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 37ab238e8304..5f01bd4f9dec 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -2024,7 +2024,8 @@ void dsa_shared_port_link_unregister_of(struct dsa_port *dp)
 		dsa_shared_port_setup_phy_of(dp, false);
 }
 
-int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr)
+int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr,
+		      struct netlink_ext_ack *extack)
 {
 	struct dsa_switch *ds = dp->ds;
 	int err;
@@ -2034,7 +2035,7 @@ int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr)
 
 	dp->hsr_dev = hsr;
 
-	err = ds->ops->port_hsr_join(ds, dp->index, hsr);
+	err = ds->ops->port_hsr_join(ds, dp->index, hsr, extack);
 	if (err)
 		dp->hsr_dev = NULL;
 
diff --git a/net/dsa/port.h b/net/dsa/port.h
index dc812512fd0e..334879964e2c 100644
--- a/net/dsa/port.h
+++ b/net/dsa/port.h
@@ -103,7 +103,8 @@ int dsa_port_phylink_create(struct dsa_port *dp);
 void dsa_port_phylink_destroy(struct dsa_port *dp);
 int dsa_shared_port_link_register_of(struct dsa_port *dp);
 void dsa_shared_port_link_unregister_of(struct dsa_port *dp);
-int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr);
+int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr,
+		      struct netlink_ext_ack *extack);
 void dsa_port_hsr_leave(struct dsa_port *dp, struct net_device *hsr);
 int dsa_port_tag_8021q_vlan_add(struct dsa_port *dp, u16 vid, bool broadcast);
 void dsa_port_tag_8021q_vlan_del(struct dsa_port *dp, u16 vid, bool broadcast);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 48db91b33390..2b3d89b77121 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -2862,7 +2862,7 @@ static int dsa_slave_changeupper(struct net_device *dev,
 		}
 	} else if (is_hsr_master(info->upper_dev)) {
 		if (info->linking) {
-			err = dsa_port_hsr_join(dp, info->upper_dev);
+			err = dsa_port_hsr_join(dp, info->upper_dev, extack);
 			if (err == -EOPNOTSUPP) {
 				NL_SET_ERR_MSG_WEAK_MOD(extack,
 							"Offloading not supported");
-- 
cgit v1.2.3-70-g09d2


From 6715042cd112e1db971583e2eed89b90d6f9b139 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 22 Sep 2023 15:31:05 +0200
Subject: net: dsa: notify drivers of MAC address changes on user ports

In some cases, drivers may need to veto the changing of a MAC address on
a user port. Such is the case with KSZ9477 when it offloads a HSR device,
because it programs the MAC address of multiple ports to a shared
hardware register. Those ports need to have equal MAC addresses for the
lifetime of the HSR offload.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Lukasz Majewski <lukma@denx.de>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/dsa.h | 10 ++++++++++
 net/dsa/slave.c   |  7 +++++++
 2 files changed, 17 insertions(+)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 426724808e76..d98439ea6146 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -969,6 +969,16 @@ struct dsa_switch_ops {
 			       struct phy_device *phy);
 	void	(*port_disable)(struct dsa_switch *ds, int port);
 
+
+	/*
+	 * Notification for MAC address changes on user ports. Drivers can
+	 * currently only veto operations. They should not use the method to
+	 * program the hardware, since the operation is not rolled back in case
+	 * of other errors.
+	 */
+	int	(*port_set_mac_address)(struct dsa_switch *ds, int port,
+					const unsigned char *addr);
+
 	/*
 	 * Compatibility between device trees defining multiple CPU ports and
 	 * drivers which are not OK to use by default the numerically smallest
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 2b3d89b77121..4c3e502d7e16 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -457,6 +457,13 @@ static int dsa_slave_set_mac_address(struct net_device *dev, void *a)
 	if (!is_valid_ether_addr(addr->sa_data))
 		return -EADDRNOTAVAIL;
 
+	if (ds->ops->port_set_mac_address) {
+		err = ds->ops->port_set_mac_address(ds, dp->index,
+						    addr->sa_data);
+		if (err)
+			return err;
+	}
+
 	/* If the port is down, the address isn't synced yet to hardware or
 	 * to the DSA master, so there is nothing to change.
 	 */
-- 
cgit v1.2.3-70-g09d2


From 75cec20345fa8e05a2b5f861fada95ad8e165257 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 8 Sep 2023 16:32:15 +0200
Subject: bpf: Remove xdp_do_flush_map().
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

xdp_do_flush_map() can be removed because there is no more user in tree.

Remove xdp_do_flush_map().

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: Jesper Dangaard Brouer <hawk@kernel.org>
Link: https://lore.kernel.org/r/20230908143215.869913-3-bigeasy@linutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/filter.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 27406aee2d40..e8822bd595f9 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1025,12 +1025,6 @@ int xdp_do_redirect_frame(struct net_device *dev,
 			  struct bpf_prog *prog);
 void xdp_do_flush(void);
 
-/* The xdp_do_flush_map() helper has been renamed to drop the _map suffix, as
- * it is no longer only flushing maps. Keep this define for compatibility
- * until all drivers are updated - do not use xdp_do_flush_map() in new code!
- */
-#define xdp_do_flush_map xdp_do_flush
-
 void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act);
 
 #ifdef CONFIG_INET
-- 
cgit v1.2.3-70-g09d2


From 26dd68d293fd1c5ac966fb5dd5f6d89de322a541 Mon Sep 17 00:00:00 2001
From: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Date: Tue, 12 Sep 2023 07:59:31 -0400
Subject: overflow: add DEFINE_FLEX() for on-stack allocs

Add DEFINE_FLEX() macro for on-stack allocations of structs with
flexible array member.

Expose __struct_size() macro outside of fortify-string.h, as it could be
used to read size of structs allocated by DEFINE_FLEX().
Move __member_size() alongside it.
-Kees

Using underlying array for on-stack storage lets us to declare
known-at-compile-time structures without kzalloc().

Actual usage for ice driver is in following patches of the series.

Missing __has_builtin() workaround is moved up to serve also assembly
compilation with m68k-linux-gcc, see [1].
Error was (note the .S file extension):
In file included from ../include/linux/linkage.h:5,
                 from ../arch/m68k/fpsp040/skeleton.S:40:
../include/linux/compiler_types.h:331:5: warning: "__has_builtin" is not defined, evaluates to 0 [-Wundef]
  331 | #if __has_builtin(__builtin_dynamic_object_size)
      |     ^~~~~~~~~~~~~
../include/linux/compiler_types.h:331:18: error: missing binary operator before token "("
  331 | #if __has_builtin(__builtin_dynamic_object_size)
      |                  ^

[1] https://lore.kernel.org/netdev/202308112122.OuF0YZqL-lkp@intel.com/
Co-developed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Link: https://lore.kernel.org/r/20230912115937.1645707-2-przemyslaw.kitszel@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/compiler_types.h | 32 +++++++++++++++++++++-----------
 include/linux/fortify-string.h |  4 ----
 include/linux/overflow.h       | 35 +++++++++++++++++++++++++++++++++++
 3 files changed, 56 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index c523c6683789..6f1ca49306d2 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -2,6 +2,15 @@
 #ifndef __LINUX_COMPILER_TYPES_H
 #define __LINUX_COMPILER_TYPES_H
 
+/*
+ * __has_builtin is supported on gcc >= 10, clang >= 3 and icc >= 21.
+ * In the meantime, to support gcc < 10, we implement __has_builtin
+ * by hand.
+ */
+#ifndef __has_builtin
+#define __has_builtin(x) (0)
+#endif
+
 #ifndef __ASSEMBLY__
 
 /*
@@ -134,17 +143,6 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { }
 # define __preserve_most
 #endif
 
-/* Builtins */
-
-/*
- * __has_builtin is supported on gcc >= 10, clang >= 3 and icc >= 21.
- * In the meantime, to support gcc < 10, we implement __has_builtin
- * by hand.
- */
-#ifndef __has_builtin
-#define __has_builtin(x) (0)
-#endif
-
 /* Compiler specific macros. */
 #ifdef __clang__
 #include <linux/compiler-clang.h>
@@ -352,6 +350,18 @@ struct ftrace_likely_data {
 # define __realloc_size(x, ...)
 #endif
 
+/*
+ * When the size of an allocated object is needed, use the best available
+ * mechanism to find it. (For cases where sizeof() cannot be used.)
+ */
+#if __has_builtin(__builtin_dynamic_object_size)
+#define __struct_size(p)	__builtin_dynamic_object_size(p, 0)
+#define __member_size(p)	__builtin_dynamic_object_size(p, 1)
+#else
+#define __struct_size(p)	__builtin_object_size(p, 0)
+#define __member_size(p)	__builtin_object_size(p, 1)
+#endif
+
 #ifndef asm_volatile_goto
 #define asm_volatile_goto(x...) asm goto(x)
 #endif
diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h
index da51a83b2829..1e7711185ec6 100644
--- a/include/linux/fortify-string.h
+++ b/include/linux/fortify-string.h
@@ -93,13 +93,9 @@ extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size)
 #if __has_builtin(__builtin_dynamic_object_size)
 #define POS			__pass_dynamic_object_size(1)
 #define POS0			__pass_dynamic_object_size(0)
-#define __struct_size(p)	__builtin_dynamic_object_size(p, 0)
-#define __member_size(p)	__builtin_dynamic_object_size(p, 1)
 #else
 #define POS			__pass_object_size(1)
 #define POS0			__pass_object_size(0)
-#define __struct_size(p)	__builtin_object_size(p, 0)
-#define __member_size(p)	__builtin_object_size(p, 1)
 #endif
 
 #define __compiletime_lessthan(bounds, length)	(	\
diff --git a/include/linux/overflow.h b/include/linux/overflow.h
index f9b60313eaea..7b5cf4a5cd19 100644
--- a/include/linux/overflow.h
+++ b/include/linux/overflow.h
@@ -309,4 +309,39 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend)
 #define struct_size_t(type, member, count)					\
 	struct_size((type *)NULL, member, count)
 
+/**
+ * _DEFINE_FLEX() - helper macro for DEFINE_FLEX() family.
+ * Enables caller macro to pass (different) initializer.
+ *
+ * @type: structure type name, including "struct" keyword.
+ * @name: Name for a variable to define.
+ * @member: Name of the array member.
+ * @count: Number of elements in the array; must be compile-time const.
+ * @initializer: initializer expression (could be empty for no init).
+ */
+#define _DEFINE_FLEX(type, name, member, count, initializer)			\
+	_Static_assert(__builtin_constant_p(count),				\
+		       "onstack flex array members require compile-time const count"); \
+	union {									\
+		u8 bytes[struct_size_t(type, member, count)];			\
+		type obj;							\
+	} name##_u initializer;							\
+	type *name = (type *)&name##_u
+
+/**
+ * DEFINE_FLEX() - Define an on-stack instance of structure with a trailing
+ * flexible array member.
+ *
+ * @type: structure type name, including "struct" keyword.
+ * @name: Name for a variable to define.
+ * @member: Name of the array member.
+ * @count: Number of elements in the array; must be compile-time const.
+ *
+ * Define a zeroed, on-stack, instance of @type structure with a trailing
+ * flexible array member.
+ * Use __struct_size(@name) to get compile-time size of it afterwards.
+ */
+#define DEFINE_FLEX(type, name, member, count)			\
+	_DEFINE_FLEX(type, name, member, count, = {})
+
 #endif /* __LINUX_OVERFLOW_H */
-- 
cgit v1.2.3-70-g09d2


From 473267a4911f2469722c74ca58087d951072f72a Mon Sep 17 00:00:00 2001
From: Patrick Rohr <prohr@google.com>
Date: Mon, 25 Sep 2023 14:47:11 -0700
Subject: net: add sysctl to disable rfc4862 5.5.3e lifetime handling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This change adds a sysctl to opt-out of RFC4862 section 5.5.3e's valid
lifetime derivation mechanism.

RFC4862 section 5.5.3e prescribes that the valid lifetime in a Router
Advertisement PIO shall be ignored if it less than 2 hours and to reset
the lifetime of the corresponding address to 2 hours. An in-progress
6man draft (see draft-ietf-6man-slaac-renum-07 section 4.2) is currently
looking to remove this mechanism. While this draft has not been moving
particularly quickly for other reasons, there is widespread consensus on
section 4.2 which updates RFC4862 section 5.5.3e.

Cc: Maciej Żenczykowski <maze@google.com>
Cc: Lorenzo Colitti <lorenzo@google.com>
Cc: Jen Linkova <furry@google.com>
Signed-off-by: Patrick Rohr <prohr@google.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20230925214711.959704-1-prohr@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/ip-sysctl.rst | 11 ++++++++++
 include/linux/ipv6.h                   |  1 +
 net/ipv6/addrconf.c                    | 38 ++++++++++++++++++++++------------
 3 files changed, 37 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index 5bfa1837968c..f7dfde3b09a9 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -2311,6 +2311,17 @@ accept_ra_pinfo - BOOLEAN
 		- enabled if accept_ra is enabled.
 		- disabled if accept_ra is disabled.
 
+ra_honor_pio_life - BOOLEAN
+	Whether to use RFC4862 Section 5.5.3e to determine the valid
+	lifetime of an address matching a prefix sent in a Router
+	Advertisement Prefix Information Option.
+
+	- If enabled, the PIO valid lifetime will always be honored.
+	- If disabled, RFC4862 section 5.5.3e is used to determine
+	  the valid lifetime of the address.
+
+	Default: 0 (disabled)
+
 accept_ra_rt_info_min_plen - INTEGER
 	Minimum prefix length of Route Information in RA.
 
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index e400ff757f13..5e605e384aac 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -82,6 +82,7 @@ struct ipv6_devconf {
 	__u32		ioam6_id_wide;
 	__u8		ioam6_enabled;
 	__u8		ndisc_evict_nocarrier;
+	__u8		ra_honor_pio_life;
 
 	struct ctl_table_header *sysctl_header;
 };
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 0b6ee962c84e..c2d471ad7922 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -236,6 +236,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
 	.ioam6_id               = IOAM6_DEFAULT_IF_ID,
 	.ioam6_id_wide		= IOAM6_DEFAULT_IF_ID_WIDE,
 	.ndisc_evict_nocarrier	= 1,
+	.ra_honor_pio_life	= 0,
 };
 
 static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -297,6 +298,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 	.ioam6_id               = IOAM6_DEFAULT_IF_ID,
 	.ioam6_id_wide		= IOAM6_DEFAULT_IF_ID_WIDE,
 	.ndisc_evict_nocarrier	= 1,
+	.ra_honor_pio_life	= 0,
 };
 
 /* Check if link is ready: is it up and is a valid qdisc available */
@@ -2657,22 +2659,23 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
 			stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ;
 		else
 			stored_lft = 0;
-		if (!create && stored_lft) {
+
+		/* RFC4862 Section 5.5.3e:
+		 * "Note that the preferred lifetime of the
+		 *  corresponding address is always reset to
+		 *  the Preferred Lifetime in the received
+		 *  Prefix Information option, regardless of
+		 *  whether the valid lifetime is also reset or
+		 *  ignored."
+		 *
+		 * So we should always update prefered_lft here.
+		 */
+		update_lft = !create && stored_lft;
+
+		if (update_lft && !in6_dev->cnf.ra_honor_pio_life) {
 			const u32 minimum_lft = min_t(u32,
 				stored_lft, MIN_VALID_LIFETIME);
 			valid_lft = max(valid_lft, minimum_lft);
-
-			/* RFC4862 Section 5.5.3e:
-			 * "Note that the preferred lifetime of the
-			 *  corresponding address is always reset to
-			 *  the Preferred Lifetime in the received
-			 *  Prefix Information option, regardless of
-			 *  whether the valid lifetime is also reset or
-			 *  ignored."
-			 *
-			 * So we should always update prefered_lft here.
-			 */
-			update_lft = 1;
 		}
 
 		if (update_lft) {
@@ -6846,6 +6849,15 @@ static const struct ctl_table addrconf_sysctl[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "ra_honor_pio_life",
+		.data		= &ipv6_devconf.ra_honor_pio_life,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
 #ifdef CONFIG_IPV6_ROUTER_PREF
 	{
 		.procname	= "accept_ra_rtr_pref",
-- 
cgit v1.2.3-70-g09d2


From 00f3696f7555d0890ae07b635e6ccbf39fd2eb3a Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Wed, 27 Sep 2023 11:00:30 +0200
Subject: net: appletalk: remove cops support

The COPS Appletalk support is very old, never said to actually work
properly, and the firmware code for the devices are under a very suspect
license.  Remove it all to clear up the license issue, if it is still
needed and actually used by anyone, we can add it back later once the
license is cleared up.

Reported-by: Prarit Bhargava <prarit@redhat.com>
Cc: jschlst@samba.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Christoph Hellwig <hch@lst.de>
Acked-by: Prarit Bhargava <prarit@redhat.com>
Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Link: https://lore.kernel.org/r/20230927090029.44704-2-gregkh@linuxfoundation.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../networking/device_drivers/appletalk/cops.rst   |   80 --
 .../networking/device_drivers/appletalk/index.rst  |   18 -
 Documentation/networking/device_drivers/index.rst  |    1 -
 drivers/net/Space.c                                |    6 -
 drivers/net/appletalk/Kconfig                      |   30 -
 drivers/net/appletalk/Makefile                     |    1 -
 drivers/net/appletalk/cops.c                       | 1005 --------------------
 drivers/net/appletalk/cops.h                       |   61 --
 drivers/net/appletalk/cops_ffdrv.h                 |  532 -----------
 drivers/net/appletalk/cops_ltdrv.h                 |  241 -----
 include/net/Space.h                                |    1 -
 11 files changed, 1976 deletions(-)
 delete mode 100644 Documentation/networking/device_drivers/appletalk/cops.rst
 delete mode 100644 Documentation/networking/device_drivers/appletalk/index.rst
 delete mode 100644 drivers/net/appletalk/cops.c
 delete mode 100644 drivers/net/appletalk/cops.h
 delete mode 100644 drivers/net/appletalk/cops_ffdrv.h
 delete mode 100644 drivers/net/appletalk/cops_ltdrv.h

(limited to 'include')

diff --git a/Documentation/networking/device_drivers/appletalk/cops.rst b/Documentation/networking/device_drivers/appletalk/cops.rst
deleted file mode 100644
index 964ba80599a9..000000000000
--- a/Documentation/networking/device_drivers/appletalk/cops.rst
+++ /dev/null
@@ -1,80 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-========================================
-The COPS LocalTalk Linux driver (cops.c)
-========================================
-
-By Jay Schulist <jschlst@samba.org>
-
-This driver has two modes and they are: Dayna mode and Tangent mode.
-Each mode corresponds with the type of card. It has been found
-that there are 2 main types of cards and all other cards are
-the same and just have different names or only have minor differences
-such as more IO ports. As this driver is tested it will
-become more clear exactly what cards are supported.
-
-Right now these cards are known to work with the COPS driver. The
-LT-200 cards work in a somewhat more limited capacity than the
-DL200 cards, which work very well and are in use by many people.
-
-TANGENT driver mode:
-	- Tangent ATB-II, Novell NL-1000, Daystar Digital LT-200
-
-DAYNA driver mode:
-	- Dayna DL2000/DaynaTalk PC (Half Length), COPS LT-95,
-	- Farallon PhoneNET PC III, Farallon PhoneNET PC II
-
-Other cards possibly supported mode unknown though:
-	- Dayna DL2000 (Full length)
-
-The COPS driver defaults to using Dayna mode. To change the driver's
-mode if you built a driver with dual support use board_type=1 or
-board_type=2 for Dayna or Tangent with insmod.
-
-Operation/loading of the driver
-===============================
-
-Use modprobe like this:	/sbin/modprobe cops.o (IO #) (IRQ #)
-If you do not specify any options the driver will try and use the IO = 0x240,
-IRQ = 5. As of right now I would only use IRQ 5 for the card, if autoprobing.
-
-To load multiple COPS driver Localtalk cards you can do one of the following::
-
-	insmod cops io=0x240 irq=5
-	insmod -o cops2 cops io=0x260 irq=3
-
-Or in lilo.conf put something like this::
-
-	append="ether=5,0x240,lt0 ether=3,0x260,lt1"
-
-Then bring up the interface with ifconfig. It will look something like this::
-
-  lt0       Link encap:UNSPEC  HWaddr 00-00-00-00-00-00-00-F7-00-00-00-00-00-00-00-00
-	    inet addr:192.168.1.2  Bcast:192.168.1.255  Mask:255.255.255.0
-	    UP BROADCAST RUNNING NOARP MULTICAST  MTU:600  Metric:1
-	    RX packets:0 errors:0 dropped:0 overruns:0 frame:0
-	    TX packets:0 errors:0 dropped:0 overruns:0 carrier:0 coll:0
-
-Netatalk Configuration
-======================
-
-You will need to configure atalkd with something like the following to make
-it work with the cops.c driver.
-
-* For single LTalk card use::
-
-    dummy -seed -phase 2 -net 2000 -addr 2000.10 -zone "1033"
-    lt0 -seed -phase 1 -net 1000 -addr 1000.50 -zone "1033"
-
-* For multiple cards, Ethernet and LocalTalk::
-
-    eth0 -seed -phase 2 -net 3000 -addr 3000.20 -zone "1033"
-    lt0 -seed -phase 1 -net 1000 -addr 1000.50 -zone "1033"
-
-* For multiple LocalTalk cards, and an Ethernet card.
-
-* Order seems to matter here, Ethernet last::
-
-    lt0 -seed -phase 1 -net 1000 -addr 1000.10 -zone "LocalTalk1"
-    lt1 -seed -phase 1 -net 2000 -addr 2000.20 -zone "LocalTalk2"
-    eth0 -seed -phase 2 -net 3000 -addr 3000.30 -zone "EtherTalk"
diff --git a/Documentation/networking/device_drivers/appletalk/index.rst b/Documentation/networking/device_drivers/appletalk/index.rst
deleted file mode 100644
index c196baeb0856..000000000000
--- a/Documentation/networking/device_drivers/appletalk/index.rst
+++ /dev/null
@@ -1,18 +0,0 @@
-.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
-
-AppleTalk Device Drivers
-========================
-
-Contents:
-
-.. toctree::
-   :maxdepth: 2
-
-   cops
-
-.. only::  subproject and html
-
-   Indices
-   =======
-
-   * :ref:`genindex`
diff --git a/Documentation/networking/device_drivers/index.rst b/Documentation/networking/device_drivers/index.rst
index 601eacaf12f3..2f0285a5bc80 100644
--- a/Documentation/networking/device_drivers/index.rst
+++ b/Documentation/networking/device_drivers/index.rst
@@ -8,7 +8,6 @@ Contents:
 .. toctree::
    :maxdepth: 2
 
-   appletalk/index
    atm/index
    cable/index
    can/index
diff --git a/drivers/net/Space.c b/drivers/net/Space.c
index 83214e2e70ab..dc50797a2ed0 100644
--- a/drivers/net/Space.c
+++ b/drivers/net/Space.c
@@ -247,12 +247,6 @@ static int __init net_olddevs_init(void)
 	for (num = 0; num < 8; ++num)
 		ethif_probe2(num);
 
-#ifdef CONFIG_COPS
-	cops_probe(0);
-	cops_probe(1);
-	cops_probe(2);
-#endif
-
 	return 0;
 }
 
diff --git a/drivers/net/appletalk/Kconfig b/drivers/net/appletalk/Kconfig
index b38ed52b82bc..b94f731e4576 100644
--- a/drivers/net/appletalk/Kconfig
+++ b/drivers/net/appletalk/Kconfig
@@ -37,36 +37,6 @@ config DEV_APPLETALK
 	  on a network.  If your Linux box is connected to such a network, and wish
 	  to do IP over it, or you have a LocalTalk card and wish to use it to
 	  connect to the AppleTalk network, say Y.
-	  
-
-config COPS
-	tristate "COPS LocalTalk PC support"
-	depends on DEV_APPLETALK && ISA
-	depends on NETDEVICES
-	select NETDEV_LEGACY_INIT
-	help
-	  This allows you to use COPS AppleTalk cards to connect to LocalTalk
-	  networks. You also need version 1.3.3 or later of the netatalk
-	  package. This driver is experimental, which means that it may not
-	  work. This driver will only work if you choose "AppleTalk DDP"
-	  networking support, above.
-	  Please read the file
-	  <file:Documentation/networking/device_drivers/appletalk/cops.rst>.
-
-config COPS_DAYNA
-	bool "Dayna firmware support"
-	depends on COPS
-	help
-	  Support COPS compatible cards with Dayna style firmware (Dayna
-	  DL2000/ Daynatalk/PC (half length), COPS LT-95, Farallon PhoneNET PC
-	  III, Farallon PhoneNET PC II).
-
-config COPS_TANGENT
-	bool "Tangent firmware support"
-	depends on COPS
-	help
-	  Support COPS compatible cards with Tangent style firmware (Tangent
-	  ATB_II, Novell NL-1000, Daystar Digital LT-200.
 
 config IPDDP
 	tristate "Appletalk-IP driver support"
diff --git a/drivers/net/appletalk/Makefile b/drivers/net/appletalk/Makefile
index 6db2943ce5d6..d8c7b23ec7ff 100644
--- a/drivers/net/appletalk/Makefile
+++ b/drivers/net/appletalk/Makefile
@@ -4,4 +4,3 @@
 #
 
 obj-$(CONFIG_IPDDP) += ipddp.o
-obj-$(CONFIG_COPS) += cops.o
diff --git a/drivers/net/appletalk/cops.c b/drivers/net/appletalk/cops.c
deleted file mode 100644
index 97f254bdbb16..000000000000
--- a/drivers/net/appletalk/cops.c
+++ /dev/null
@@ -1,1005 +0,0 @@
-/*      cops.c: LocalTalk driver for Linux.
- *
- *	Authors:
- *      - Jay Schulist <jschlst@samba.org>
- *
- *	With more than a little help from;
- *	- Alan Cox <alan@lxorguk.ukuu.org.uk>
- *
- *      Derived from:
- *      - skeleton.c: A network driver outline for linux.
- *        Written 1993-94 by Donald Becker.
- *	- ltpc.c: A driver for the LocalTalk PC card.
- *	  Written by Bradford W. Johnson.
- *
- *      Copyright 1993 United States Government as represented by the
- *      Director, National Security Agency.
- *
- *      This software may be used and distributed according to the terms
- *      of the GNU General Public License, incorporated herein by reference.
- *
- *	Changes:
- *	19970608	Alan Cox	Allowed dual card type support
- *					Can set board type in insmod
- *					Hooks for cops_setup routine
- *					(not yet implemented).
- *	19971101	Jay Schulist	Fixes for multiple lt* devices.
- *	19980607	Steven Hirsch	Fixed the badly broken support
- *					for Tangent type cards. Only
- *                                      tested on Daystar LT200. Some
- *                                      cleanup of formatting and program
- *                                      logic.  Added emacs 'local-vars'
- *                                      setup for Jay's brace style.
- *	20000211	Alan Cox	Cleaned up for softnet
- */
-
-static const char *version =
-"cops.c:v0.04 6/7/98 Jay Schulist <jschlst@samba.org>\n";
-/*
- *  Sources:
- *      COPS Localtalk SDK. This provides almost all of the information
- *      needed.
- */
-
-/*
- * insmod/modprobe configurable stuff.
- *	- IO Port, choose one your card supports or 0 if you dare.
- *	- IRQ, also choose one your card supports or nothing and let
- *	  the driver figure it out.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/fcntl.h>
-#include <linux/interrupt.h>
-#include <linux/ptrace.h>
-#include <linux/ioport.h>
-#include <linux/in.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/skbuff.h>
-#include <linux/if_arp.h>
-#include <linux/if_ltalk.h>
-#include <linux/delay.h>	/* For udelay() */
-#include <linux/atalk.h>
-#include <linux/spinlock.h>
-#include <linux/bitops.h>
-#include <linux/jiffies.h>
-
-#include <net/Space.h>
-
-#include <asm/io.h>
-#include <asm/dma.h>
-
-#include "cops.h"		/* Our Stuff */
-#include "cops_ltdrv.h"		/* Firmware code for Tangent type cards. */
-#include "cops_ffdrv.h"		/* Firmware code for Dayna type cards. */
-
-/*
- *      The name of the card. Is used for messages and in the requests for
- *      io regions, irqs and dma channels
- */
-
-static const char *cardname = "cops";
-
-#ifdef CONFIG_COPS_DAYNA
-static int board_type = DAYNA;	/* Module exported */
-#else
-static int board_type = TANGENT;
-#endif
-
-static int io = 0x240;		/* Default IO for Dayna */
-static int irq = 5;		/* Default IRQ */
-
-/*
- *	COPS Autoprobe information.
- *	Right now if port address is right but IRQ is not 5 this will
- *      return a 5 no matter what since we will still get a status response.
- *      Need one more additional check to narrow down after we have gotten
- *      the ioaddr. But since only other possible IRQs is 3 and 4 so no real
- *	hurry on this. I *STRONGLY* recommend using IRQ 5 for your card with
- *	this driver.
- * 
- *	This driver has 2 modes and they are: Dayna mode and Tangent mode.
- *	Each mode corresponds with the type of card. It has been found
- *	that there are 2 main types of cards and all other cards are
- *	the same and just have different names or only have minor differences
- *	such as more IO ports. As this driver is tested it will
- *	become more clear on exactly what cards are supported. The driver
- *	defaults to using Dayna mode. To change the drivers mode, simply
- *	select Dayna or Tangent mode when configuring the kernel.
- *
- *      This driver should support:
- *      TANGENT driver mode:
- *              Tangent ATB-II, Novell NL-1000, Daystar Digital LT-200,
- *		COPS LT-1
- *      DAYNA driver mode:
- *              Dayna DL2000/DaynaTalk PC (Half Length), COPS LT-95, 
- *		Farallon PhoneNET PC III, Farallon PhoneNET PC II
- *	Other cards possibly supported mode unknown though:
- *		Dayna DL2000 (Full length), COPS LT/M (Micro-Channel)
- *
- *	Cards NOT supported by this driver but supported by the ltpc.c
- *	driver written by Bradford W. Johnson <johns393@maroon.tc.umn.edu>
- *		Farallon PhoneNET PC
- *		Original Apple LocalTalk PC card
- * 
- *      N.B.
- *
- *      The Daystar Digital LT200 boards do not support interrupt-driven
- *      IO.  You must specify 'irq=0xff' as a module parameter to invoke
- *      polled mode.  I also believe that the port probing logic is quite
- *      dangerous at best and certainly hopeless for a polled card.  Best to 
- *      specify both. - Steve H.
- *
- */
-
-/*
- * Zero terminated list of IO ports to probe.
- */
-
-static unsigned int ports[] = { 
-	0x240, 0x340, 0x200, 0x210, 0x220, 0x230, 0x260, 
-	0x2A0, 0x300, 0x310, 0x320, 0x330, 0x350, 0x360,
-	0
-};
-
-/*
- * Zero terminated list of IRQ ports to probe.
- */
-
-static int cops_irqlist[] = {
-	5, 4, 3, 0 
-};
-
-static struct timer_list cops_timer;
-static struct net_device *cops_timer_dev;
-
-/* use 0 for production, 1 for verification, 2 for debug, 3 for verbose debug */
-#ifndef COPS_DEBUG
-#define COPS_DEBUG 1 
-#endif
-static unsigned int cops_debug = COPS_DEBUG;
-
-/* The number of low I/O ports used by the card. */
-#define COPS_IO_EXTENT       8
-
-/* Information that needs to be kept for each board. */
-
-struct cops_local
-{
-        int board;			/* Holds what board type is. */
-	int nodeid;			/* Set to 1 once have nodeid. */
-        unsigned char node_acquire;	/* Node ID when acquired. */
-        struct atalk_addr node_addr;	/* Full node address */
-	spinlock_t lock;		/* RX/TX lock */
-};
-
-/* Index to functions, as function prototypes. */
-static int  cops_probe1 (struct net_device *dev, int ioaddr);
-static int  cops_irq (int ioaddr, int board);
-
-static int  cops_open (struct net_device *dev);
-static int  cops_jumpstart (struct net_device *dev);
-static void cops_reset (struct net_device *dev, int sleep);
-static void cops_load (struct net_device *dev);
-static int  cops_nodeid (struct net_device *dev, int nodeid);
-
-static irqreturn_t cops_interrupt (int irq, void *dev_id);
-static void cops_poll(struct timer_list *t);
-static void cops_timeout(struct net_device *dev, unsigned int txqueue);
-static void cops_rx (struct net_device *dev);
-static netdev_tx_t  cops_send_packet (struct sk_buff *skb,
-					    struct net_device *dev);
-static void set_multicast_list (struct net_device *dev);
-static int  cops_ioctl (struct net_device *dev, struct ifreq *rq, int cmd);
-static int  cops_close (struct net_device *dev);
-
-static void cleanup_card(struct net_device *dev)
-{
-	if (dev->irq)
-		free_irq(dev->irq, dev);
-	release_region(dev->base_addr, COPS_IO_EXTENT);
-}
-
-/*
- *      Check for a network adaptor of this type, and return '0' iff one exists.
- *      If dev->base_addr == 0, probe all likely locations.
- *      If dev->base_addr in [1..0x1ff], always return failure.
- *        otherwise go with what we pass in.
- */
-struct net_device * __init cops_probe(int unit)
-{
-	struct net_device *dev;
-	unsigned *port;
-	int base_addr;
-	int err = 0;
-
-	dev = alloc_ltalkdev(sizeof(struct cops_local));
-	if (!dev)
-		return ERR_PTR(-ENOMEM);
-
-	if (unit >= 0) {
-		sprintf(dev->name, "lt%d", unit);
-		netdev_boot_setup_check(dev);
-		irq = dev->irq;
-		base_addr = dev->base_addr;
-	} else {
-		base_addr = dev->base_addr = io;
-	}
-
-	if (base_addr > 0x1ff) {    /* Check a single specified location. */
-		err = cops_probe1(dev, base_addr);
-	} else if (base_addr != 0) { /* Don't probe at all. */
-		err = -ENXIO;
-	} else {
-		/* FIXME  Does this really work for cards which generate irq?
-		 * It's definitely N.G. for polled Tangent. sh
-		 * Dayna cards don't autoprobe well at all, but if your card is
-		 * at IRQ 5 & IO 0x240 we find it every time. ;) JS
-		 */
-		for (port = ports; *port && cops_probe1(dev, *port) < 0; port++)
-			;
-		if (!*port)
-			err = -ENODEV;
-	}
-	if (err)
-		goto out;
-	err = register_netdev(dev);
-	if (err)
-		goto out1;
-	return dev;
-out1:
-	cleanup_card(dev);
-out:
-	free_netdev(dev);
-	return ERR_PTR(err);
-}
-
-static const struct net_device_ops cops_netdev_ops = {
-	.ndo_open               = cops_open,
-        .ndo_stop               = cops_close,
-	.ndo_start_xmit   	= cops_send_packet,
-	.ndo_tx_timeout		= cops_timeout,
-        .ndo_do_ioctl           = cops_ioctl,
-	.ndo_set_rx_mode	= set_multicast_list,
-};
-
-/*
- *      This is the real probe routine. Linux has a history of friendly device
- *      probes on the ISA bus. A good device probes avoids doing writes, and
- *      verifies that the correct device exists and functions.
- */
-static int __init cops_probe1(struct net_device *dev, int ioaddr)
-{
-        struct cops_local *lp;
-	static unsigned version_printed;
-	int board = board_type;
-	int retval;
-	
-        if(cops_debug && version_printed++ == 0)
-		printk("%s", version);
-
-	/* Grab the region so no one else tries to probe our ioports. */
-	if (!request_region(ioaddr, COPS_IO_EXTENT, dev->name))
-		return -EBUSY;
-
-        /*
-         * Since this board has jumpered interrupts, allocate the interrupt
-         * vector now. There is no point in waiting since no other device
-         * can use the interrupt, and this marks the irq as busy. Jumpered
-         * interrupts are typically not reported by the boards, and we must
-         * used AutoIRQ to find them.
-	 */
-	dev->irq = irq;
-	switch (dev->irq)
-	{
-		case 0:
-			/* COPS AutoIRQ routine */
-			dev->irq = cops_irq(ioaddr, board);
-			if (dev->irq)
-				break;
-			fallthrough;	/* Once no IRQ found on this port */
-		case 1:
-			retval = -EINVAL;
-			goto err_out;
-
-		/* Fixup for users that don't know that IRQ 2 is really
-		 * IRQ 9, or don't know which one to set.
-		 */
-		case 2:
-			dev->irq = 9;
-			break;
-
-		/* Polled operation requested. Although irq of zero passed as
-		 * a parameter tells the init routines to probe, we'll
-		 * overload it to denote polled operation at runtime.
-		 */
-		case 0xff:
-			dev->irq = 0;
-			break;
-
-		default:
-			break;
-	}
-
-	dev->base_addr = ioaddr;
-
-	/* Reserve any actual interrupt. */
-	if (dev->irq) {
-		retval = request_irq(dev->irq, cops_interrupt, 0, dev->name, dev);
-		if (retval)
-			goto err_out;
-	}
-
-        lp = netdev_priv(dev);
-        spin_lock_init(&lp->lock);
-
-	/* Copy local board variable to lp struct. */
-	lp->board               = board;
-
-	dev->netdev_ops 	= &cops_netdev_ops;
-	dev->watchdog_timeo	= HZ * 2;
-
-
-	/* Tell the user where the card is and what mode we're in. */
-	if(board==DAYNA)
-		printk("%s: %s at %#3x, using IRQ %d, in Dayna mode.\n", 
-			dev->name, cardname, ioaddr, dev->irq);
-	if(board==TANGENT) {
-		if(dev->irq)
-			printk("%s: %s at %#3x, IRQ %d, in Tangent mode\n", 
-				dev->name, cardname, ioaddr, dev->irq);
-		else
-			printk("%s: %s at %#3x, using polled IO, in Tangent mode.\n", 
-				dev->name, cardname, ioaddr);
-
-	}
-        return 0;
-
-err_out:
-	release_region(ioaddr, COPS_IO_EXTENT);
-	return retval;
-}
-
-static int __init cops_irq (int ioaddr, int board)
-{       /*
-         * This does not use the IRQ to determine where the IRQ is. We just
-         * assume that when we get a correct status response that it's the IRQ.
-         * This really just verifies the IO port but since we only have access
-         * to such a small number of IRQs (5, 4, 3) this is not bad.
-         * This will probably not work for more than one card.
-         */
-        int irqaddr=0;
-        int i, x, status;
-
-        if(board==DAYNA)
-        {
-                outb(0, ioaddr+DAYNA_RESET);
-                inb(ioaddr+DAYNA_RESET);
-                mdelay(333);
-        }
-        if(board==TANGENT)
-        {
-                inb(ioaddr);
-                outb(0, ioaddr);
-                outb(0, ioaddr+TANG_RESET);
-        }
-
-        for(i=0; cops_irqlist[i] !=0; i++)
-        {
-                irqaddr = cops_irqlist[i];
-                for(x = 0xFFFF; x>0; x --)    /* wait for response */
-                {
-                        if(board==DAYNA)
-                        {
-                                status = (inb(ioaddr+DAYNA_CARD_STATUS)&3);
-                                if(status == 1)
-                                        return irqaddr;
-                        }
-                        if(board==TANGENT)
-                        {
-                                if((inb(ioaddr+TANG_CARD_STATUS)& TANG_TX_READY) !=0)
-                                        return irqaddr;
-                        }
-                }
-        }
-        return 0;       /* no IRQ found */
-}
-
-/*
- * Open/initialize the board. This is called (in the current kernel)
- * sometime after booting when the 'ifconfig' program is run.
- */
-static int cops_open(struct net_device *dev)
-{
-    struct cops_local *lp = netdev_priv(dev);
-
-	if(dev->irq==0)
-	{
-		/*
-		 * I don't know if the Dayna-style boards support polled 
-		 * operation.  For now, only allow it for Tangent.
-		 */
-		if(lp->board==TANGENT)	/* Poll 20 times per second */
-		{
-		    cops_timer_dev = dev;
-		    timer_setup(&cops_timer, cops_poll, 0);
-		    cops_timer.expires 	= jiffies + HZ/20;
-		    add_timer(&cops_timer);
-		} 
-		else 
-		{
-			printk(KERN_WARNING "%s: No irq line set\n", dev->name);
-			return -EAGAIN;
-		}
-	}
-
-	cops_jumpstart(dev);	/* Start the card up. */
-
-	netif_start_queue(dev);
-        return 0;
-}
-
-/*
- *	This allows for a dynamic start/restart of the entire card.
- */
-static int cops_jumpstart(struct net_device *dev)
-{
-	struct cops_local *lp = netdev_priv(dev);
-
-	/*
-         *      Once the card has the firmware loaded and has acquired
-         *      the nodeid, if it is reset it will lose it all.
-         */
-        cops_reset(dev,1);	/* Need to reset card before load firmware. */
-        cops_load(dev);		/* Load the firmware. */
-
-	/*
-	 *	If atalkd already gave us a nodeid we will use that
-	 *	one again, else we wait for atalkd to give us a nodeid
-	 *	in cops_ioctl. This may cause a problem if someone steals
-	 *	our nodeid while we are resetting.
-	 */	
-	if(lp->nodeid == 1)
-		cops_nodeid(dev,lp->node_acquire);
-
-	return 0;
-}
-
-static void tangent_wait_reset(int ioaddr)
-{
-	int timeout=0;
-
-	while(timeout++ < 5 && (inb(ioaddr+TANG_CARD_STATUS)&TANG_TX_READY)==0)
-		mdelay(1);   /* Wait 1 second */
-}
-
-/*
- *      Reset the LocalTalk board.
- */
-static void cops_reset(struct net_device *dev, int sleep)
-{
-        struct cops_local *lp = netdev_priv(dev);
-        int ioaddr=dev->base_addr;
-
-        if(lp->board==TANGENT)
-        {
-                inb(ioaddr);		/* Clear request latch. */
-                outb(0,ioaddr);		/* Clear the TANG_TX_READY flop. */
-                outb(0, ioaddr+TANG_RESET);	/* Reset the adapter. */
-
-		tangent_wait_reset(ioaddr);
-                outb(0, ioaddr+TANG_CLEAR_INT);
-        }
-        if(lp->board==DAYNA)
-        {
-                outb(0, ioaddr+DAYNA_RESET);	/* Assert the reset port */
-                inb(ioaddr+DAYNA_RESET);	/* Clear the reset */
-		if (sleep)
-			msleep(333);
-		else
-			mdelay(333);
-        }
-
-	netif_wake_queue(dev);
-}
-
-static void cops_load (struct net_device *dev)
-{
-        struct ifreq ifr;
-        struct ltfirmware *ltf= (struct ltfirmware *)&ifr.ifr_ifru;
-        struct cops_local *lp = netdev_priv(dev);
-        int ioaddr=dev->base_addr;
-	int length, i = 0;
-
-        strcpy(ifr.ifr_name,"lt0");
-
-        /* Get card's firmware code and do some checks on it. */
-#ifdef CONFIG_COPS_DAYNA        
-        if(lp->board==DAYNA)
-        {
-                ltf->length=sizeof(ffdrv_code);
-                ltf->data=ffdrv_code;
-        }
-        else
-#endif        
-#ifdef CONFIG_COPS_TANGENT
-        if(lp->board==TANGENT)
-        {
-                ltf->length=sizeof(ltdrv_code);
-                ltf->data=ltdrv_code;
-        }
-        else
-#endif
-	{
-		printk(KERN_INFO "%s; unsupported board type.\n", dev->name);
-		return;
-	}
-	
-        /* Check to make sure firmware is correct length. */
-        if(lp->board==DAYNA && ltf->length!=5983)
-        {
-                printk(KERN_WARNING "%s: Firmware is not length of FFDRV.BIN.\n", dev->name);
-                return;
-        }
-        if(lp->board==TANGENT && ltf->length!=2501)
-        {
-                printk(KERN_WARNING "%s: Firmware is not length of DRVCODE.BIN.\n", dev->name);
-                return;
-        }
-
-        if(lp->board==DAYNA)
-        {
-                /*
-                 *      We must wait for a status response
-                 *      with the DAYNA board.
-                 */
-                while(++i<65536)
-                {
-                       if((inb(ioaddr+DAYNA_CARD_STATUS)&3)==1)
-                                break;
-                }
-
-                if(i==65536)
-                        return;
-        }
-
-        /*
-         *      Upload the firmware and kick. Byte-by-byte works nicely here.
-         */
-	i=0;
-        length = ltf->length;
-        while(length--)
-        {
-                outb(ltf->data[i], ioaddr);
-                i++;
-        }
-
-	if(cops_debug > 1)
-		printk("%s: Uploaded firmware - %d bytes of %d bytes.\n", 
-			dev->name, i, ltf->length);
-
-        if(lp->board==DAYNA) 	/* Tell Dayna to run the firmware code. */
-                outb(1, ioaddr+DAYNA_INT_CARD);
-	else			/* Tell Tang to run the firmware code. */
-		inb(ioaddr);
-
-        if(lp->board==TANGENT)
-        {
-                tangent_wait_reset(ioaddr);
-                inb(ioaddr);	/* Clear initial ready signal. */
-        }
-}
-
-/*
- * 	Get the LocalTalk Nodeid from the card. We can suggest
- *	any nodeid 1-254. The card will try and get that exact
- *	address else we can specify 0 as the nodeid and the card
- *	will autoprobe for a nodeid.
- */
-static int cops_nodeid (struct net_device *dev, int nodeid)
-{
-	struct cops_local *lp = netdev_priv(dev);
-	int ioaddr = dev->base_addr;
-
-	if(lp->board == DAYNA)
-        {
-		/* Empty any pending adapter responses. */
-                while((inb(ioaddr+DAYNA_CARD_STATUS)&DAYNA_TX_READY)==0)
-                {
-			outb(0, ioaddr+COPS_CLEAR_INT);	/* Clear interrupts. */
-			if((inb(ioaddr+DAYNA_CARD_STATUS)&0x03)==DAYNA_RX_REQUEST)
-				cops_rx(dev);	/* Kick any packets waiting. */
-			schedule();
-                }
-
-                outb(2, ioaddr);       	/* Output command packet length as 2. */
-                outb(0, ioaddr);
-                outb(LAP_INIT, ioaddr);	/* Send LAP_INIT command byte. */
-                outb(nodeid, ioaddr);  	/* Suggest node address. */
-        }
-
-	if(lp->board == TANGENT)
-        {
-                /* Empty any pending adapter responses. */
-                while(inb(ioaddr+TANG_CARD_STATUS)&TANG_RX_READY)
-                {
-			outb(0, ioaddr+COPS_CLEAR_INT);	/* Clear interrupt. */
-			cops_rx(dev);          	/* Kick out packets waiting. */
-			schedule();
-                }
-
-		/* Not sure what Tangent does if nodeid picked is used. */
-                if(nodeid == 0)	         		/* Seed. */
-			nodeid = jiffies&0xFF;		/* Get a random try */
-                outb(2, ioaddr);        		/* Command length LSB */
-                outb(0, ioaddr);       			/* Command length MSB */
-                outb(LAP_INIT, ioaddr); 		/* Send LAP_INIT byte */
-                outb(nodeid, ioaddr); 		  	/* LAP address hint. */
-                outb(0xFF, ioaddr);     		/* Int. level to use */
-        }
-
-	lp->node_acquire=0;		/* Set nodeid holder to 0. */
-        while(lp->node_acquire==0)	/* Get *True* nodeid finally. */
-	{
-		outb(0, ioaddr+COPS_CLEAR_INT);	/* Clear any interrupt. */
-
-		if(lp->board == DAYNA)
-		{
-			if((inb(ioaddr+DAYNA_CARD_STATUS)&0x03)==DAYNA_RX_REQUEST)
-				cops_rx(dev);	/* Grab the nodeid put in lp->node_acquire. */
-		}
-		if(lp->board == TANGENT)
-		{	
-			if(inb(ioaddr+TANG_CARD_STATUS)&TANG_RX_READY)
-				cops_rx(dev);   /* Grab the nodeid put in lp->node_acquire. */
-		}
-		schedule();
-	}
-
-	if(cops_debug > 1)
-		printk(KERN_DEBUG "%s: Node ID %d has been acquired.\n", 
-			dev->name, lp->node_acquire);
-
-	lp->nodeid=1;	/* Set got nodeid to 1. */
-
-        return 0;
-}
-
-/*
- *	Poll the Tangent type cards to see if we have work.
- */
- 
-static void cops_poll(struct timer_list *unused)
-{
-	int ioaddr, status;
-	int boguscount = 0;
-	struct net_device *dev = cops_timer_dev;
-
-	del_timer(&cops_timer);
-
-	if(dev == NULL)
-		return;	/* We've been downed */
-
-	ioaddr = dev->base_addr;
-	do {
-		status=inb(ioaddr+TANG_CARD_STATUS);
-		if(status & TANG_RX_READY)
-			cops_rx(dev);
-		if(status & TANG_TX_READY)
-			netif_wake_queue(dev);
-		status = inb(ioaddr+TANG_CARD_STATUS);
-	} while((++boguscount < 20) && (status&(TANG_RX_READY|TANG_TX_READY)));
-
-	/* poll 20 times per second */
-	cops_timer.expires = jiffies + HZ/20;
-	add_timer(&cops_timer);
-}
-
-/*
- *      The typical workload of the driver:
- *      Handle the network interface interrupts.
- */
-static irqreturn_t cops_interrupt(int irq, void *dev_id)
-{
-        struct net_device *dev = dev_id;
-        struct cops_local *lp;
-        int ioaddr, status;
-        int boguscount = 0;
-
-        ioaddr = dev->base_addr;
-        lp = netdev_priv(dev);
-
-	if(lp->board==DAYNA)
-	{
-		do {
-			outb(0, ioaddr + COPS_CLEAR_INT);
-			status=inb(ioaddr+DAYNA_CARD_STATUS);
-			if((status&0x03)==DAYNA_RX_REQUEST)
-				cops_rx(dev);
-			netif_wake_queue(dev);
-		} while(++boguscount < 20);
-	}
-	else
-	{
-		do {
-			status=inb(ioaddr+TANG_CARD_STATUS);
-			if(status & TANG_RX_READY)
-				cops_rx(dev);
-			if(status & TANG_TX_READY)
-				netif_wake_queue(dev);
-			status=inb(ioaddr+TANG_CARD_STATUS);
-		} while((++boguscount < 20) && (status&(TANG_RX_READY|TANG_TX_READY)));
-	}
-
-        return IRQ_HANDLED;
-}
-
-/*
- *      We have a good packet(s), get it/them out of the buffers.
- */
-static void cops_rx(struct net_device *dev)
-{
-        int pkt_len = 0;
-        int rsp_type = 0;
-        struct sk_buff *skb = NULL;
-        struct cops_local *lp = netdev_priv(dev);
-        int ioaddr = dev->base_addr;
-        int boguscount = 0;
-        unsigned long flags;
-
-
-	spin_lock_irqsave(&lp->lock, flags);
-	
-        if(lp->board==DAYNA)
-        {
-                outb(0, ioaddr);                /* Send out Zero length. */
-                outb(0, ioaddr);
-                outb(DATA_READ, ioaddr);        /* Send read command out. */
-
-                /* Wait for DMA to turn around. */
-                while(++boguscount<1000000)
-                {
-			barrier();
-                        if((inb(ioaddr+DAYNA_CARD_STATUS)&0x03)==DAYNA_RX_READY)
-                                break;
-                }
-
-                if(boguscount==1000000)
-                {
-                        printk(KERN_WARNING "%s: DMA timed out.\n",dev->name);
-			spin_unlock_irqrestore(&lp->lock, flags);
-                        return;
-                }
-        }
-
-        /* Get response length. */
-	pkt_len = inb(ioaddr);
-        pkt_len |= (inb(ioaddr) << 8);
-        /* Input IO code. */
-        rsp_type=inb(ioaddr);
-
-        /* Malloc up new buffer. */
-        skb = dev_alloc_skb(pkt_len);
-        if(skb == NULL)
-        {
-                printk(KERN_WARNING "%s: Memory squeeze, dropping packet.\n",
-			dev->name);
-                dev->stats.rx_dropped++;
-                while(pkt_len--)        /* Discard packet */
-                        inb(ioaddr);
-                spin_unlock_irqrestore(&lp->lock, flags);
-                return;
-        }
-        skb->dev = dev;
-        skb_put(skb, pkt_len);
-        skb->protocol = htons(ETH_P_LOCALTALK);
-
-        insb(ioaddr, skb->data, pkt_len);               /* Eat the Data */
-
-        if(lp->board==DAYNA)
-                outb(1, ioaddr+DAYNA_INT_CARD);         /* Interrupt the card */
-
-        spin_unlock_irqrestore(&lp->lock, flags);  /* Restore interrupts. */
-
-        /* Check for bad response length */
-        if(pkt_len < 0 || pkt_len > MAX_LLAP_SIZE)
-        {
-		printk(KERN_WARNING "%s: Bad packet length of %d bytes.\n", 
-			dev->name, pkt_len);
-                dev->stats.tx_errors++;
-                dev_kfree_skb_any(skb);
-                return;
-        }
-
-        /* Set nodeid and then get out. */
-        if(rsp_type == LAP_INIT_RSP)
-        {	/* Nodeid taken from received packet. */
-                lp->node_acquire = skb->data[0];
-                dev_kfree_skb_any(skb);
-                return;
-        }
-
-        /* One last check to make sure we have a good packet. */
-        if(rsp_type != LAP_RESPONSE)
-        {
-                printk(KERN_WARNING "%s: Bad packet type %d.\n", dev->name, rsp_type);
-                dev->stats.tx_errors++;
-                dev_kfree_skb_any(skb);
-                return;
-        }
-
-        skb_reset_mac_header(skb);    /* Point to entire packet. */
-        skb_pull(skb,3);
-        skb_reset_transport_header(skb);    /* Point to data (Skip header). */
-
-        /* Update the counters. */
-        dev->stats.rx_packets++;
-        dev->stats.rx_bytes += skb->len;
-
-        /* Send packet to a higher place. */
-        netif_rx(skb);
-}
-
-static void cops_timeout(struct net_device *dev, unsigned int txqueue)
-{
-        struct cops_local *lp = netdev_priv(dev);
-        int ioaddr = dev->base_addr;
-
-	dev->stats.tx_errors++;
-        if(lp->board==TANGENT)
-        {
-		if((inb(ioaddr+TANG_CARD_STATUS)&TANG_TX_READY)==0)
-			printk(KERN_WARNING "%s: No TX complete interrupt.\n", dev->name);
-	}
-	printk(KERN_WARNING "%s: Transmit timed out.\n", dev->name);
-	cops_jumpstart(dev);	/* Restart the card. */
-	netif_trans_update(dev); /* prevent tx timeout */
-	netif_wake_queue(dev);
-}
-
-
-/*
- *	Make the card transmit a LocalTalk packet.
- */
-
-static netdev_tx_t cops_send_packet(struct sk_buff *skb,
-					  struct net_device *dev)
-{
-        struct cops_local *lp = netdev_priv(dev);
-        int ioaddr = dev->base_addr;
-        unsigned long flags;
-
-        /*
-         * Block a timer-based transmit from overlapping. 
-	 */
-	 
-	netif_stop_queue(dev);
-
-	spin_lock_irqsave(&lp->lock, flags);
-	if(lp->board == DAYNA)	 /* Wait for adapter transmit buffer. */
-		while((inb(ioaddr+DAYNA_CARD_STATUS)&DAYNA_TX_READY)==0)
-			cpu_relax();
-	if(lp->board == TANGENT) /* Wait for adapter transmit buffer. */
-		while((inb(ioaddr+TANG_CARD_STATUS)&TANG_TX_READY)==0)
-			cpu_relax();
-
-	/* Output IO length. */
-	outb(skb->len, ioaddr);
-	outb(skb->len >> 8, ioaddr);
-
-	/* Output IO code. */
-	outb(LAP_WRITE, ioaddr);
-
-	if(lp->board == DAYNA)	/* Check the transmit buffer again. */
-		while((inb(ioaddr+DAYNA_CARD_STATUS)&DAYNA_TX_READY)==0);
-
-	outsb(ioaddr, skb->data, skb->len);	/* Send out the data. */
-
-	if(lp->board==DAYNA)	/* Dayna requires you kick the card */
-		outb(1, ioaddr+DAYNA_INT_CARD);
-
-	spin_unlock_irqrestore(&lp->lock, flags);	/* Restore interrupts. */
-
-	/* Done sending packet, update counters and cleanup. */
-	dev->stats.tx_packets++;
-	dev->stats.tx_bytes += skb->len;
-	dev_kfree_skb (skb);
-	return NETDEV_TX_OK;
-}
-
-/*
- *	Dummy function to keep the Appletalk layer happy.
- */
- 
-static void set_multicast_list(struct net_device *dev)
-{
-        if(cops_debug >= 3)
-		printk("%s: set_multicast_list executed\n", dev->name);
-}
-
-/*
- *      System ioctls for the COPS LocalTalk card.
- */
- 
-static int cops_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
-{
-        struct cops_local *lp = netdev_priv(dev);
-        struct sockaddr_at *sa = (struct sockaddr_at *)&ifr->ifr_addr;
-        struct atalk_addr *aa = &lp->node_addr;
-
-        switch(cmd)
-        {
-                case SIOCSIFADDR:
-			/* Get and set the nodeid and network # atalkd wants. */
-			cops_nodeid(dev, sa->sat_addr.s_node);
-			aa->s_net               = sa->sat_addr.s_net;
-                        aa->s_node              = lp->node_acquire;
-
-			/* Set broardcast address. */
-                        dev->broadcast[0]       = 0xFF;
-			
-			/* Set hardware address. */
-                        dev->addr_len           = 1;
-			dev_addr_set(dev, &aa->s_node);
-                        return 0;
-
-                case SIOCGIFADDR:
-                        sa->sat_addr.s_net      = aa->s_net;
-                        sa->sat_addr.s_node     = aa->s_node;
-                        return 0;
-
-                default:
-                        return -EOPNOTSUPP;
-        }
-}
-
-/*
- *	The inverse routine to cops_open().
- */
- 
-static int cops_close(struct net_device *dev)
-{
-	struct cops_local *lp = netdev_priv(dev);
-
-	/* If we were running polled, yank the timer.
-	 */
-	if(lp->board==TANGENT && dev->irq==0)
-		del_timer(&cops_timer);
-
-	netif_stop_queue(dev);
-        return 0;
-}
-
-
-#ifdef MODULE
-static struct net_device *cops_dev;
-
-MODULE_LICENSE("GPL");
-module_param_hw(io, int, ioport, 0);
-module_param_hw(irq, int, irq, 0);
-module_param_hw(board_type, int, other, 0);
-
-static int __init cops_module_init(void)
-{
-	if (io == 0)
-		printk(KERN_WARNING "%s: You shouldn't autoprobe with insmod\n",
-			cardname);
-	cops_dev = cops_probe(-1);
-	return PTR_ERR_OR_ZERO(cops_dev);
-}
-
-static void __exit cops_module_exit(void)
-{
-	unregister_netdev(cops_dev);
-	cleanup_card(cops_dev);
-	free_netdev(cops_dev);
-}
-module_init(cops_module_init);
-module_exit(cops_module_exit);
-#endif /* MODULE */
diff --git a/drivers/net/appletalk/cops.h b/drivers/net/appletalk/cops.h
deleted file mode 100644
index 7a0bfb351929..000000000000
--- a/drivers/net/appletalk/cops.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*      cops.h: LocalTalk driver for Linux.
- *
- *      Authors:
- *      - Jay Schulist <jschlst@samba.org>
- */
-
-#ifndef __LINUX_COPSLTALK_H
-#define __LINUX_COPSLTALK_H
-
-#ifdef __KERNEL__
-
-/* Max LLAP size we will accept. */
-#define MAX_LLAP_SIZE		603
-
-/* Tangent */
-#define TANG_CARD_STATUS        1
-#define TANG_CLEAR_INT          1
-#define TANG_RESET              3
-
-#define TANG_TX_READY           1
-#define TANG_RX_READY           2
-
-/* Dayna */
-#define DAYNA_CMD_DATA          0
-#define DAYNA_CLEAR_INT         1
-#define DAYNA_CARD_STATUS       2
-#define DAYNA_INT_CARD          3
-#define DAYNA_RESET             4
-
-#define DAYNA_RX_READY          0
-#define DAYNA_TX_READY          1
-#define DAYNA_RX_REQUEST        3
-
-/* Same on both card types */
-#define COPS_CLEAR_INT  1
-
-/* LAP response codes received from the cards. */
-#define LAP_INIT        1       /* Init cmd */
-#define LAP_INIT_RSP    2       /* Init response */
-#define LAP_WRITE       3       /* Write cmd */
-#define DATA_READ       4       /* Data read */
-#define LAP_RESPONSE    4       /* Received ALAP frame response */
-#define LAP_GETSTAT     5       /* Get LAP and HW status */
-#define LAP_RSPSTAT     6       /* Status response */
-
-#endif
-
-/*
- *	Structure to hold the firmware information.
- */
-struct ltfirmware
-{
-        unsigned int length;
-        const unsigned char *data;
-};
-
-#define DAYNA 1
-#define TANGENT 2
-
-#endif
diff --git a/drivers/net/appletalk/cops_ffdrv.h b/drivers/net/appletalk/cops_ffdrv.h
deleted file mode 100644
index b02005087c1b..000000000000
--- a/drivers/net/appletalk/cops_ffdrv.h
+++ /dev/null
@@ -1,532 +0,0 @@
-
-/*
- *	The firmware this driver downloads into the Localtalk card is a
- *	separate program and is not GPL'd source code, even though the Linux
- *	side driver and the routine that loads this data into the card are.
- *	
- *	It is taken from the COPS SDK and is under the following license
- *
- *	This material is licensed to you strictly for use in conjunction with
- *	the use of COPS LocalTalk adapters.
- *	There is no charge for this SDK. And no waranty express or implied
- *	about its fitness for any purpose. However, we will cheerefully
- *	refund every penny you paid for this SDK...
- *	Regards,
- *
- *	Thomas F. Divine
- *	Chief Scientist
- */
-
-
-/*      cops_ffdrv.h: LocalTalk driver firmware dump for Linux.
- *
- *      Authors:
- *      - Jay Schulist <jschlst@samba.org>
- */
-
-
-#ifdef CONFIG_COPS_DAYNA
-
-static const unsigned char ffdrv_code[] = {
-	58,3,0,50,228,149,33,255,255,34,226,149,
-	249,17,40,152,33,202,154,183,237,82,77,68,
-	11,107,98,19,54,0,237,176,175,50,80,0,
-	62,128,237,71,62,32,237,57,51,62,12,237,
-	57,50,237,57,54,62,6,237,57,52,62,12,
-	237,57,49,33,107,137,34,32,128,33,83,130,
-	34,40,128,33,86,130,34,42,128,33,112,130,
-	34,36,128,33,211,130,34,38,128,62,0,237,
-	57,16,33,63,148,34,34,128,237,94,205,15,
-	130,251,205,168,145,24,141,67,111,112,121,114,
-	105,103,104,116,32,40,67,41,32,49,57,56,
-	56,32,45,32,68,97,121,110,97,32,67,111,
-	109,109,117,110,105,99,97,116,105,111,110,115,
-	32,32,32,65,108,108,32,114,105,103,104,116,
-	115,32,114,101,115,101,114,118,101,100,46,32,
-	32,40,68,40,68,7,16,8,34,7,22,6,
-	16,5,12,4,8,3,6,140,0,16,39,128,
-	0,4,96,10,224,6,0,7,126,2,64,11,
-	118,12,6,13,0,14,193,15,0,5,96,3,
-	192,1,64,9,8,62,9,211,66,62,192,211,
-	66,62,100,61,32,253,6,28,33,205,129,14,
-	66,237,163,194,253,129,6,28,33,205,129,14,
-	64,237,163,194,9,130,201,62,47,50,71,152,
-	62,47,211,68,58,203,129,237,57,20,58,204,
-	129,237,57,21,33,77,152,54,132,205,233,129,
-	58,228,149,254,209,40,6,56,4,62,0,24,
-	2,219,96,33,233,149,119,230,62,33,232,149,
-	119,213,33,8,152,17,7,0,25,119,19,25,
-	119,209,201,251,237,77,245,197,213,229,221,229,
-	205,233,129,62,1,50,106,137,205,158,139,221,
-	225,225,209,193,241,251,237,77,245,197,213,219,
-	72,237,56,16,230,46,237,57,16,237,56,12,
-	58,72,152,183,32,26,6,20,17,128,2,237,
-	56,46,187,32,35,237,56,47,186,32,29,219,
-	72,230,1,32,3,5,32,232,175,50,72,152,
-	229,221,229,62,1,50,106,137,205,158,139,221,
-	225,225,24,25,62,1,50,72,152,58,201,129,
-	237,57,12,58,202,129,237,57,13,237,56,16,
-	246,17,237,57,16,209,193,241,251,237,77,245,
-	197,229,213,221,229,237,56,16,230,17,237,57,
-	16,237,56,20,58,34,152,246,16,246,8,211,
-	68,62,6,61,32,253,58,34,152,246,8,211,
-	68,58,203,129,237,57,20,58,204,129,237,57,
-	21,237,56,16,246,34,237,57,16,221,225,209,
-	225,193,241,251,237,77,33,2,0,57,126,230,
-	3,237,100,1,40,2,246,128,230,130,245,62,
-	5,211,64,241,211,64,201,229,213,243,237,56,
-	16,230,46,237,57,16,237,56,12,251,70,35,
-	35,126,254,175,202,77,133,254,129,202,15,133,
-	230,128,194,191,132,43,58,44,152,119,33,76,
-	152,119,35,62,132,119,120,254,255,40,4,58,
-	49,152,119,219,72,43,43,112,17,3,0,237,
-	56,52,230,248,237,57,52,219,72,230,1,194,
-	141,131,209,225,237,56,52,246,6,237,57,52,
-	62,1,55,251,201,62,3,211,66,62,192,211,
-	66,62,48,211,66,0,0,219,66,230,1,40,
-	4,219,67,24,240,205,203,135,58,75,152,254,
-	255,202,128,132,58,49,152,254,161,250,207,131,
-	58,34,152,211,68,62,10,211,66,62,128,211,
-	66,62,11,211,66,62,6,211,66,24,0,62,
-	14,211,66,62,33,211,66,62,1,211,66,62,
-	64,211,66,62,3,211,66,62,209,211,66,62,
-	100,71,219,66,230,1,32,6,5,32,247,195,
-	248,132,219,67,71,58,44,152,184,194,248,132,
-	62,100,71,219,66,230,1,32,6,5,32,247,
-	195,248,132,219,67,62,100,71,219,66,230,1,
-	32,6,5,32,247,195,248,132,219,67,254,133,
-	32,7,62,0,50,74,152,24,17,254,173,32,
-	7,62,1,50,74,152,24,6,254,141,194,248,
-	132,71,209,225,58,49,152,254,132,32,10,62,
-	50,205,2,134,205,144,135,24,27,254,140,32,
-	15,62,110,205,2,134,62,141,184,32,5,205,
-	144,135,24,8,62,10,205,2,134,205,8,134,
-	62,1,50,106,137,205,158,139,237,56,52,246,
-	6,237,57,52,175,183,251,201,62,20,135,237,
-	57,20,175,237,57,21,237,56,16,246,2,237,
-	57,16,237,56,20,95,237,56,21,123,254,10,
-	48,244,237,56,16,230,17,237,57,16,209,225,
-	205,144,135,62,1,50,106,137,205,158,139,237,
-	56,52,246,6,237,57,52,175,183,251,201,209,
-	225,243,219,72,230,1,40,13,62,10,211,66,
-	0,0,219,66,230,192,202,226,132,237,56,52,
-	246,6,237,57,52,62,1,55,251,201,205,203,
-	135,62,1,50,106,137,205,158,139,237,56,52,
-	246,6,237,57,52,183,251,201,209,225,62,1,
-	50,106,137,205,158,139,237,56,52,246,6,237,
-	57,52,62,2,55,251,201,209,225,243,219,72,
-	230,1,202,213,132,62,10,211,66,0,0,219,
-	66,230,192,194,213,132,229,62,1,50,106,137,
-	42,40,152,205,65,143,225,17,3,0,205,111,
-	136,62,6,211,66,58,44,152,211,66,237,56,
-	52,246,6,237,57,52,183,251,201,209,197,237,
-	56,52,230,248,237,57,52,219,72,230,1,32,
-	15,193,225,237,56,52,246,6,237,57,52,62,
-	1,55,251,201,14,23,58,37,152,254,0,40,
-	14,14,2,254,1,32,5,62,140,119,24,3,
-	62,132,119,43,43,197,205,203,135,193,62,1,
-	211,66,62,64,211,66,62,3,211,66,62,193,
-	211,66,62,100,203,39,71,219,66,230,1,32,
-	6,5,32,247,195,229,133,33,238,151,219,67,
-	71,58,44,152,184,194,229,133,119,62,100,71,
-	219,66,230,1,32,6,5,32,247,195,229,133,
-	219,67,35,119,13,32,234,193,225,62,1,50,
-	106,137,205,158,139,237,56,52,246,6,237,57,
-	52,175,183,251,201,33,234,151,35,35,62,255,
-	119,193,225,62,1,50,106,137,205,158,139,237,
-	56,52,246,6,237,57,52,175,251,201,243,61,
-	32,253,251,201,62,3,211,66,62,192,211,66,
-	58,49,152,254,140,32,19,197,229,213,17,181,
-	129,33,185,129,1,2,0,237,176,209,225,193,
-	24,27,229,213,33,187,129,58,49,152,230,15,
-	87,30,2,237,92,25,17,181,129,126,18,19,
-	35,126,18,209,225,58,34,152,246,8,211,68,
-	58,49,152,254,165,40,14,254,164,40,10,62,
-	10,211,66,62,224,211,66,24,25,58,74,152,
-	254,0,40,10,62,10,211,66,62,160,211,66,
-	24,8,62,10,211,66,62,128,211,66,62,11,
-	211,66,62,6,211,66,205,147,143,62,5,211,
-	66,62,224,211,66,62,5,211,66,62,96,211,
-	66,62,5,61,32,253,62,5,211,66,62,224,
-	211,66,62,14,61,32,253,62,5,211,66,62,
-	233,211,66,62,128,211,66,58,181,129,61,32,
-	253,62,1,211,66,62,192,211,66,1,254,19,
-	237,56,46,187,32,6,13,32,247,195,226,134,
-	62,192,211,66,0,0,219,66,203,119,40,250,
-	219,66,203,87,40,250,243,237,56,16,230,17,
-	237,57,16,237,56,20,251,62,5,211,66,62,
-	224,211,66,58,182,129,61,32,253,229,33,181,
-	129,58,183,129,203,63,119,35,58,184,129,119,
-	225,62,10,211,66,62,224,211,66,62,11,211,
-	66,62,118,211,66,62,47,211,68,62,5,211,
-	66,62,233,211,66,58,181,129,61,32,253,62,
-	5,211,66,62,224,211,66,58,182,129,61,32,
-	253,62,5,211,66,62,96,211,66,201,229,213,
-	58,50,152,230,15,87,30,2,237,92,33,187,
-	129,25,17,181,129,126,18,35,19,126,18,209,
-	225,58,71,152,246,8,211,68,58,50,152,254,
-	165,40,14,254,164,40,10,62,10,211,66,62,
-	224,211,66,24,8,62,10,211,66,62,128,211,
-	66,62,11,211,66,62,6,211,66,195,248,135,
-	62,3,211,66,62,192,211,66,197,229,213,17,
-	181,129,33,183,129,1,2,0,237,176,209,225,
-	193,62,47,211,68,62,10,211,66,62,224,211,
-	66,62,11,211,66,62,118,211,66,62,1,211,
-	66,62,0,211,66,205,147,143,195,16,136,62,
-	3,211,66,62,192,211,66,197,229,213,17,181,
-	129,33,183,129,1,2,0,237,176,209,225,193,
-	62,47,211,68,62,10,211,66,62,224,211,66,
-	62,11,211,66,62,118,211,66,205,147,143,62,
-	5,211,66,62,224,211,66,62,5,211,66,62,
-	96,211,66,62,5,61,32,253,62,5,211,66,
-	62,224,211,66,62,14,61,32,253,62,5,211,
-	66,62,233,211,66,62,128,211,66,58,181,129,
-	61,32,253,62,1,211,66,62,192,211,66,1,
-	254,19,237,56,46,187,32,6,13,32,247,195,
-	88,136,62,192,211,66,0,0,219,66,203,119,
-	40,250,219,66,203,87,40,250,62,5,211,66,
-	62,224,211,66,58,182,129,61,32,253,62,5,
-	211,66,62,96,211,66,201,197,14,67,6,0,
-	62,3,211,66,62,192,211,66,62,48,211,66,
-	0,0,219,66,230,1,40,4,219,67,24,240,
-	62,5,211,66,62,233,211,66,62,128,211,66,
-	58,181,129,61,32,253,237,163,29,62,192,211,
-	66,219,66,230,4,40,250,237,163,29,32,245,
-	219,66,230,4,40,250,62,255,71,219,66,230,
-	4,40,3,5,32,247,219,66,230,4,40,250,
-	62,5,211,66,62,224,211,66,58,182,129,61,
-	32,253,62,5,211,66,62,96,211,66,58,71,
-	152,254,1,202,18,137,62,16,211,66,62,56,
-	211,66,62,14,211,66,62,33,211,66,62,1,
-	211,66,62,248,211,66,237,56,48,246,153,230,
-	207,237,57,48,62,3,211,66,62,221,211,66,
-	193,201,58,71,152,211,68,62,10,211,66,62,
-	128,211,66,62,11,211,66,62,6,211,66,62,
-	6,211,66,58,44,152,211,66,62,16,211,66,
-	62,56,211,66,62,48,211,66,0,0,62,14,
-	211,66,62,33,211,66,62,1,211,66,62,248,
-	211,66,237,56,48,246,145,246,8,230,207,237,
-	57,48,62,3,211,66,62,221,211,66,193,201,
-	44,3,1,0,70,69,1,245,197,213,229,175,
-	50,72,152,237,56,16,230,46,237,57,16,237,
-	56,12,62,1,211,66,0,0,219,66,95,230,
-	160,32,3,195,20,139,123,230,96,194,72,139,
-	62,48,211,66,62,1,211,66,62,64,211,66,
-	237,91,40,152,205,207,143,25,43,55,237,82,
-	218,70,139,34,42,152,98,107,58,44,152,190,
-	194,210,138,35,35,62,130,190,194,200,137,62,
-	1,50,48,152,62,175,190,202,82,139,62,132,
-	190,32,44,50,50,152,62,47,50,71,152,229,
-	175,50,106,137,42,40,152,205,65,143,225,54,
-	133,43,70,58,44,152,119,43,112,17,3,0,
-	62,10,205,2,134,205,111,136,195,158,138,62,
-	140,190,32,19,50,50,152,58,233,149,230,4,
-	202,222,138,62,1,50,71,152,195,219,137,126,
-	254,160,250,185,138,254,166,242,185,138,50,50,
-	152,43,126,35,229,213,33,234,149,95,22,0,
-	25,126,254,132,40,18,254,140,40,14,58,50,
-	152,230,15,87,126,31,21,242,65,138,56,2,
-	175,119,58,50,152,230,15,87,58,233,149,230,
-	62,31,21,242,85,138,218,98,138,209,225,195,
-	20,139,58,50,152,33,100,137,230,15,95,22,
-	0,25,126,50,71,152,209,225,58,50,152,254,
-	164,250,135,138,58,73,152,254,0,40,4,54,
-	173,24,2,54,133,43,70,58,44,152,119,43,
-	112,17,3,0,205,70,135,175,50,106,137,205,
-	208,139,58,199,129,237,57,12,58,200,129,237,
-	57,13,237,56,16,246,17,237,57,16,225,209,
-	193,241,251,237,77,62,129,190,194,227,138,54,
-	130,43,70,58,44,152,119,43,112,17,3,0,
-	205,144,135,195,20,139,35,35,126,254,132,194,
-	227,138,175,50,106,137,205,158,139,24,42,58,
-	201,154,254,1,40,7,62,1,50,106,137,24,
-	237,58,106,137,254,1,202,222,138,62,128,166,
-	194,222,138,221,229,221,33,67,152,205,127,142,
-	205,109,144,221,225,225,209,193,241,251,237,77,
-	58,106,137,254,1,202,44,139,58,50,152,254,
-	164,250,44,139,58,73,152,238,1,50,73,152,
-	221,229,221,33,51,152,205,127,142,221,225,62,
-	1,50,106,137,205,158,139,195,13,139,24,208,
-	24,206,24,204,230,64,40,3,195,20,139,195,
-	20,139,43,126,33,8,152,119,35,58,44,152,
-	119,43,237,91,35,152,205,203,135,205,158,139,
-	195,13,139,175,50,78,152,62,3,211,66,62,
-	192,211,66,201,197,33,4,0,57,126,35,102,
-	111,62,1,50,106,137,219,72,205,141,139,193,
-	201,62,1,50,78,152,34,40,152,54,0,35,
-	35,54,0,195,163,139,58,78,152,183,200,229,
-	33,181,129,58,183,129,119,35,58,184,129,119,
-	225,62,47,211,68,62,14,211,66,62,193,211,
-	66,62,10,211,66,62,224,211,66,62,11,211,
-	66,62,118,211,66,195,3,140,58,78,152,183,
-	200,58,71,152,211,68,254,69,40,4,254,70,
-	32,17,58,73,152,254,0,40,10,62,10,211,
-	66,62,160,211,66,24,8,62,10,211,66,62,
-	128,211,66,62,11,211,66,62,6,211,66,62,
-	6,211,66,58,44,152,211,66,62,16,211,66,
-	62,56,211,66,62,48,211,66,0,0,219,66,
-	230,1,40,4,219,67,24,240,62,14,211,66,
-	62,33,211,66,42,40,152,205,65,143,62,1,
-	211,66,62,248,211,66,237,56,48,246,145,246,
-	8,230,207,237,57,48,62,3,211,66,62,221,
-	211,66,201,62,16,211,66,62,56,211,66,62,
-	48,211,66,0,0,219,66,230,1,40,4,219,
-	67,24,240,62,14,211,66,62,33,211,66,62,
-	1,211,66,62,248,211,66,237,56,48,246,153,
-	230,207,237,57,48,62,3,211,66,62,221,211,
-	66,201,229,213,33,234,149,95,22,0,25,126,
-	254,132,40,4,254,140,32,2,175,119,123,209,
-	225,201,6,8,14,0,31,48,1,12,16,250,
-	121,201,33,4,0,57,94,35,86,33,2,0,
-	57,126,35,102,111,221,229,34,89,152,237,83,
-	91,152,221,33,63,152,205,127,142,58,81,152,
-	50,82,152,58,80,152,135,50,80,152,205,162,
-	140,254,3,56,16,58,81,152,135,60,230,15,
-	50,81,152,175,50,80,152,24,23,58,79,152,
-	205,162,140,254,3,48,13,58,81,152,203,63,
-	50,81,152,62,255,50,79,152,58,81,152,50,
-	82,152,58,79,152,135,50,79,152,62,32,50,
-	83,152,50,84,152,237,56,16,230,17,237,57,
-	16,219,72,62,192,50,93,152,62,93,50,94,
-	152,58,93,152,61,50,93,152,32,9,58,94,
-	152,61,50,94,152,40,44,62,170,237,57,20,
-	175,237,57,21,237,56,16,246,2,237,57,16,
-	219,72,230,1,202,29,141,237,56,20,71,237,
-	56,21,120,254,10,48,237,237,56,16,230,17,
-	237,57,16,243,62,14,211,66,62,65,211,66,
-	251,58,39,152,23,23,60,50,39,152,71,58,
-	82,152,160,230,15,40,22,71,14,10,219,66,
-	230,16,202,186,141,219,72,230,1,202,186,141,
-	13,32,239,16,235,42,89,152,237,91,91,152,
-	205,47,131,48,7,61,202,186,141,195,227,141,
-	221,225,33,0,0,201,221,33,55,152,205,127,
-	142,58,84,152,61,50,84,152,40,19,58,82,
-	152,246,1,50,82,152,58,79,152,246,1,50,
-	79,152,195,29,141,221,225,33,1,0,201,221,
-	33,59,152,205,127,142,58,80,152,246,1,50,
-	80,152,58,82,152,135,246,1,50,82,152,58,
-	83,152,61,50,83,152,194,29,141,221,225,33,
-	2,0,201,221,229,33,0,0,57,17,4,0,
-	25,126,50,44,152,230,128,50,85,152,58,85,
-	152,183,40,6,221,33,88,2,24,4,221,33,
-	150,0,58,44,152,183,40,53,60,40,50,60,
-	40,47,61,61,33,86,152,119,35,119,35,54,
-	129,175,50,48,152,221,43,221,229,225,124,181,
-	40,42,33,86,152,17,3,0,205,189,140,17,
-	232,3,27,123,178,32,251,58,48,152,183,40,
-	224,58,44,152,71,62,7,128,230,127,71,58,
-	85,152,176,50,44,152,24,162,221,225,201,183,
-	221,52,0,192,221,52,1,192,221,52,2,192,
-	221,52,3,192,55,201,245,62,1,211,100,241,
-	201,245,62,1,211,96,241,201,33,2,0,57,
-	126,35,102,111,237,56,48,230,175,237,57,48,
-	62,48,237,57,49,125,237,57,32,124,237,57,
-	33,62,0,237,57,34,62,88,237,57,35,62,
-	0,237,57,36,237,57,37,33,128,2,125,237,
-	57,38,124,237,57,39,237,56,48,246,97,230,
-	207,237,57,48,62,0,237,57,0,62,0,211,
-	96,211,100,201,33,2,0,57,126,35,102,111,
-	237,56,48,230,175,237,57,48,62,12,237,57,
-	49,62,76,237,57,32,62,0,237,57,33,237,
-	57,34,125,237,57,35,124,237,57,36,62,0,
-	237,57,37,33,128,2,125,237,57,38,124,237,
-	57,39,237,56,48,246,97,230,207,237,57,48,
-	62,1,211,96,201,33,2,0,57,126,35,102,
-	111,229,237,56,48,230,87,237,57,48,125,237,
-	57,40,124,237,57,41,62,0,237,57,42,62,
-	67,237,57,43,62,0,237,57,44,58,106,137,
-	254,1,32,5,33,6,0,24,3,33,128,2,
-	125,237,57,46,124,237,57,47,237,56,50,230,
-	252,246,2,237,57,50,225,201,33,4,0,57,
-	94,35,86,33,2,0,57,126,35,102,111,237,
-	56,48,230,87,237,57,48,125,237,57,40,124,
-	237,57,41,62,0,237,57,42,62,67,237,57,
-	43,62,0,237,57,44,123,237,57,46,122,237,
-	57,47,237,56,50,230,244,246,0,237,57,50,
-	237,56,48,246,145,230,207,237,57,48,201,213,
-	237,56,46,95,237,56,47,87,237,56,46,111,
-	237,56,47,103,183,237,82,32,235,33,128,2,
-	183,237,82,209,201,213,237,56,38,95,237,56,
-	39,87,237,56,38,111,237,56,39,103,183,237,
-	82,32,235,33,128,2,183,237,82,209,201,245,
-	197,1,52,0,237,120,230,253,237,121,193,241,
-	201,245,197,1,52,0,237,120,246,2,237,121,
-	193,241,201,33,2,0,57,126,35,102,111,126,
-	35,110,103,201,33,0,0,34,102,152,34,96,
-	152,34,98,152,33,202,154,34,104,152,237,91,
-	104,152,42,226,149,183,237,82,17,0,255,25,
-	34,100,152,203,124,40,6,33,0,125,34,100,
-	152,42,104,152,35,35,35,229,205,120,139,193,
-	201,205,186,149,229,42,40,152,35,35,35,229,
-	205,39,144,193,124,230,3,103,221,117,254,221,
-	116,255,237,91,42,152,35,35,35,183,237,82,
-	32,12,17,5,0,42,42,152,205,171,149,242,
-	169,144,42,40,152,229,205,120,139,193,195,198,
-	149,237,91,42,152,42,98,152,25,34,98,152,
-	19,19,19,42,102,152,25,34,102,152,237,91,
-	100,152,33,158,253,25,237,91,102,152,205,171,
-	149,242,214,144,33,0,0,34,102,152,62,1,
-	50,95,152,205,225,144,195,198,149,58,95,152,
-	183,200,237,91,96,152,42,102,152,205,171,149,
-	242,5,145,237,91,102,152,33,98,2,25,237,
-	91,96,152,205,171,149,250,37,145,237,91,96,
-	152,42,102,152,183,237,82,32,7,42,98,152,
-	125,180,40,13,237,91,102,152,42,96,152,205,
-	171,149,242,58,145,237,91,104,152,42,102,152,
-	25,35,35,35,229,205,120,139,193,175,50,95,
-	152,201,195,107,139,205,206,149,250,255,243,205,
-	225,144,251,58,230,149,183,194,198,149,17,1,
-	0,42,98,152,205,171,149,250,198,149,62,1,
-	50,230,149,237,91,96,152,42,104,152,25,221,
-	117,252,221,116,253,237,91,104,152,42,96,152,
-	25,35,35,35,221,117,254,221,116,255,35,35,
-	35,229,205,39,144,124,230,3,103,35,35,35,
-	221,117,250,221,116,251,235,221,110,252,221,102,
-	253,115,35,114,35,54,4,62,1,211,100,211,
-	84,195,198,149,33,0,0,34,102,152,34,96,
-	152,34,98,152,33,202,154,34,104,152,237,91,
-	104,152,42,226,149,183,237,82,17,0,255,25,
-	34,100,152,33,109,152,54,0,33,107,152,229,
-	205,240,142,193,62,47,50,34,152,62,132,50,
-	49,152,205,241,145,205,61,145,58,39,152,60,
-	50,39,152,24,241,205,206,149,251,255,33,109,
-	152,126,183,202,198,149,110,221,117,251,33,109,
-	152,54,0,221,126,251,254,1,40,28,254,3,
-	40,101,254,4,202,190,147,254,5,202,147,147,
-	254,8,40,87,33,107,152,229,205,240,142,195,
-	198,149,58,201,154,183,32,21,33,111,152,126,
-	50,229,149,205,52,144,33,110,152,110,38,0,
-	229,205,11,142,193,237,91,96,152,42,104,152,
-	25,221,117,254,221,116,255,35,35,54,2,17,
-	2,0,43,43,115,35,114,58,44,152,35,35,
-	119,58,228,149,35,119,62,1,211,100,211,84,
-	62,1,50,201,154,24,169,205,153,142,58,231,
-	149,183,40,250,175,50,231,149,33,110,152,126,
-	254,255,40,91,58,233,149,230,63,183,40,83,
-	94,22,0,33,234,149,25,126,183,40,13,33,
-	110,152,94,33,234,150,25,126,254,3,32,36,
-	205,81,148,125,180,33,110,152,94,22,0,40,
-	17,33,234,149,25,54,0,33,107,152,229,205,
-	240,142,193,195,198,149,33,234,150,25,54,0,
-	33,110,152,94,22,0,33,234,149,25,126,50,
-	49,152,254,132,32,37,62,47,50,34,152,42,
-	107,152,229,33,110,152,229,205,174,140,193,193,
-	125,180,33,110,152,94,22,0,33,234,150,202,
-	117,147,25,52,195,120,147,58,49,152,254,140,
-	32,7,62,1,50,34,152,24,210,62,32,50,
-	106,152,24,19,58,49,152,95,58,106,152,163,
-	183,58,106,152,32,11,203,63,50,106,152,58,
-	106,152,183,32,231,254,2,40,51,254,4,40,
-	38,254,8,40,26,254,16,40,13,254,32,32,
-	158,62,165,50,49,152,62,69,24,190,62,164,
-	50,49,152,62,70,24,181,62,163,50,49,152,
-	175,24,173,62,162,50,49,152,62,1,24,164,
-	62,161,50,49,152,62,3,24,155,25,54,0,
-	221,126,251,254,8,40,7,58,230,149,183,202,
-	32,146,33,107,152,229,205,240,142,193,211,84,
-	195,198,149,237,91,96,152,42,104,152,25,221,
-	117,254,221,116,255,35,35,54,6,17,2,0,
-	43,43,115,35,114,58,228,149,35,35,119,58,
-	233,149,35,119,205,146,142,195,32,146,237,91,
-	96,152,42,104,152,25,229,205,160,142,193,58,
-	231,149,183,40,250,175,50,231,149,243,237,91,
-	96,152,42,104,152,25,221,117,254,221,116,255,
-	78,35,70,221,113,252,221,112,253,89,80,42,
-	98,152,183,237,82,34,98,152,203,124,40,19,
-	33,0,0,34,98,152,34,102,152,34,96,152,
-	62,1,50,95,152,24,40,221,94,252,221,86,
-	253,19,19,19,42,96,152,25,34,96,152,237,
-	91,100,152,33,158,253,25,237,91,96,152,205,
-	171,149,242,55,148,33,0,0,34,96,152,175,
-	50,230,149,251,195,32,146,245,62,1,50,231,
-	149,62,16,237,57,0,211,80,241,251,237,77,
-	201,205,186,149,229,229,33,0,0,34,37,152,
-	33,110,152,126,50,234,151,58,44,152,33,235,
-	151,119,221,54,253,0,221,54,254,0,195,230,
-	148,33,236,151,54,175,33,3,0,229,33,234,
-	151,229,205,174,140,193,193,33,236,151,126,254,
-	255,40,74,33,245,151,110,221,117,255,33,249,
-	151,126,221,166,255,221,119,255,33,253,151,126,
-	221,166,255,221,119,255,58,232,149,95,221,126,
-	255,163,221,119,255,183,40,15,230,191,33,110,
-	152,94,22,0,33,234,149,25,119,24,12,33,
-	110,152,94,22,0,33,234,149,25,54,132,33,
-	0,0,195,198,149,221,110,253,221,102,254,35,
-	221,117,253,221,116,254,17,32,0,221,110,253,
-	221,102,254,205,171,149,250,117,148,58,233,149,
-	203,87,40,84,33,1,0,34,37,152,221,54,
-	253,0,221,54,254,0,24,53,33,236,151,54,
-	175,33,3,0,229,33,234,151,229,205,174,140,
-	193,193,33,236,151,126,254,255,40,14,33,110,
-	152,94,22,0,33,234,149,25,54,140,24,159,
-	221,110,253,221,102,254,35,221,117,253,221,116,
-	254,17,32,0,221,110,253,221,102,254,205,171,
-	149,250,12,149,33,2,0,34,37,152,221,54,
-	253,0,221,54,254,0,24,54,33,236,151,54,
-	175,33,3,0,229,33,234,151,229,205,174,140,
-	193,193,33,236,151,126,254,255,40,15,33,110,
-	152,94,22,0,33,234,149,25,54,132,195,211,
-	148,221,110,253,221,102,254,35,221,117,253,221,
-	116,254,17,32,0,221,110,253,221,102,254,205,
-	171,149,250,96,149,33,1,0,195,198,149,124,
-	170,250,179,149,237,82,201,124,230,128,237,82,
-	60,201,225,253,229,221,229,221,33,0,0,221,
-	57,233,221,249,221,225,253,225,201,233,225,253,
-	229,221,229,221,33,0,0,221,57,94,35,86,
-	35,235,57,249,235,233,0,0,0,0,0,0,
-	62,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	175,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,133,1,0,0,0,63,
-	255,255,255,255,0,0,0,63,0,0,0,0,
-	0,0,0,0,0,0,0,24,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0
-	} ;
-
-#endif
diff --git a/drivers/net/appletalk/cops_ltdrv.h b/drivers/net/appletalk/cops_ltdrv.h
deleted file mode 100644
index c699b1ad31da..000000000000
--- a/drivers/net/appletalk/cops_ltdrv.h
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- *	The firmware this driver downloads into the Localtalk card is a
- *	separate program and is not GPL'd source code, even though the Linux
- *	side driver and the routine that loads this data into the card are.
- *	
- *	It is taken from the COPS SDK and is under the following license
- *
- *	This material is licensed to you strictly for use in conjunction with
- *	the use of COPS LocalTalk adapters.
- *	There is no charge for this SDK. And no waranty express or implied
- *	about its fitness for any purpose. However, we will cheerefully
- *	refund every penny you paid for this SDK...
- *	Regards,
- *
- *	Thomas F. Divine
- *	Chief Scientist
- */
-
-
-/*      cops_ltdrv.h: LocalTalk driver firmware dump for Linux.
- *
- *      Authors:
- *      - Jay Schulist <jschlst@samba.org>
- */
- 
-
-#ifdef CONFIG_COPS_TANGENT
-
-static const unsigned char ltdrv_code[] = {
-	58,3,0,50,148,10,33,143,15,62,85,119,
-	190,32,9,62,170,119,190,32,3,35,24,241,
-	34,146,10,249,17,150,10,33,143,15,183,237,
-	82,77,68,11,107,98,19,54,0,237,176,62,
-	16,237,57,51,62,0,237,57,50,237,57,54,
-	62,12,237,57,49,62,195,33,39,2,50,56,
-	0,34,57,0,237,86,205,30,2,251,205,60,
-	10,24,169,67,111,112,121,114,105,103,104,116,
-	32,40,99,41,32,49,57,56,56,45,49,57,
-	57,50,44,32,80,114,105,110,116,105,110,103,
-	32,67,111,109,109,117,110,105,99,97,116,105,
-	111,110,115,32,65,115,115,111,99,105,97,116,
-	101,115,44,32,73,110,99,46,65,108,108,32,
-	114,105,103,104,116,115,32,114,101,115,101,114,
-	118,101,100,46,32,32,4,4,22,40,255,60,
-	4,96,10,224,6,0,7,126,2,64,11,246,
-	12,6,13,0,14,193,15,0,5,96,3,192,
-	1,0,9,8,62,3,211,82,62,192,211,82,
-	201,62,3,211,82,62,213,211,82,201,62,5,
-	211,82,62,224,211,82,201,62,5,211,82,62,
-	224,211,82,201,62,5,211,82,62,96,211,82,
-	201,6,28,33,180,1,14,82,237,163,194,4,
-	2,33,39,2,34,64,0,58,3,0,230,1,
-	192,62,11,237,121,62,118,237,121,201,33,182,
-	10,54,132,205,253,1,201,245,197,213,229,42,
-	150,10,14,83,17,98,2,67,20,237,162,58,
-	179,1,95,219,82,230,1,32,6,29,32,247,
-	195,17,3,62,1,211,82,219,82,95,230,160,
-	32,10,237,162,32,225,21,32,222,195,15,3,
-	237,162,123,230,96,194,21,3,62,48,211,82,
-	62,1,211,82,175,211,82,237,91,150,10,43,
-	55,237,82,218,19,3,34,152,10,98,107,58,
-	154,10,190,32,81,62,1,50,158,10,35,35,
-	62,132,190,32,44,54,133,43,70,58,154,10,
-	119,43,112,17,3,0,205,137,3,62,16,211,
-	82,62,56,211,82,205,217,1,42,150,10,14,
-	83,17,98,2,67,20,58,178,1,95,195,59,
-	2,62,129,190,194,227,2,54,130,43,70,58,
-	154,10,119,43,112,17,3,0,205,137,3,195,
-	254,2,35,35,126,254,132,194,227,2,205,61,
-	3,24,20,62,128,166,194,222,2,221,229,221,
-	33,175,10,205,93,6,205,144,7,221,225,225,
-	209,193,241,251,237,77,221,229,221,33,159,10,
-	205,93,6,221,225,205,61,3,195,247,2,24,
-	237,24,235,24,233,230,64,40,2,24,227,24,
-	225,175,50,179,10,205,208,1,201,197,33,4,
-	0,57,126,35,102,111,205,51,3,193,201,62,
-	1,50,179,10,34,150,10,54,0,58,179,10,
-	183,200,62,14,211,82,62,193,211,82,62,10,
-	211,82,62,224,211,82,62,6,211,82,58,154,
-	10,211,82,62,16,211,82,62,56,211,82,62,
-	48,211,82,219,82,230,1,40,4,219,83,24,
-	242,62,14,211,82,62,33,211,82,62,1,211,
-	82,62,9,211,82,62,32,211,82,205,217,1,
-	201,14,83,205,208,1,24,23,14,83,205,208,
-	1,205,226,1,58,174,1,61,32,253,205,244,
-	1,58,174,1,61,32,253,205,226,1,58,175,
-	1,61,32,253,62,5,211,82,62,233,211,82,
-	62,128,211,82,58,176,1,61,32,253,237,163,
-	27,62,192,211,82,219,82,230,4,40,250,237,
-	163,27,122,179,32,243,219,82,230,4,40,250,
-	58,178,1,71,219,82,230,4,40,3,5,32,
-	247,219,82,230,4,40,250,205,235,1,58,177,
-	1,61,32,253,205,244,1,201,229,213,35,35,
-	126,230,128,194,145,4,43,58,154,10,119,43,
-	70,33,181,10,119,43,112,17,3,0,243,62,
-	10,211,82,219,82,230,128,202,41,4,209,225,
-	62,1,55,251,201,205,144,3,58,180,10,254,
-	255,202,127,4,205,217,1,58,178,1,71,219,
-	82,230,1,32,6,5,32,247,195,173,4,219,
-	83,71,58,154,10,184,194,173,4,58,178,1,
-	71,219,82,230,1,32,6,5,32,247,195,173,
-	4,219,83,58,178,1,71,219,82,230,1,32,
-	6,5,32,247,195,173,4,219,83,254,133,194,
-	173,4,58,179,1,24,4,58,179,1,135,61,
-	32,253,209,225,205,137,3,205,61,3,183,251,
-	201,209,225,243,62,10,211,82,219,82,230,128,
-	202,164,4,62,1,55,251,201,205,144,3,205,
-	61,3,183,251,201,209,225,62,2,55,251,201,
-	243,62,14,211,82,62,33,211,82,251,201,33,
-	4,0,57,94,35,86,33,2,0,57,126,35,
-	102,111,221,229,34,193,10,237,83,195,10,221,
-	33,171,10,205,93,6,58,185,10,50,186,10,
-	58,184,10,135,50,184,10,205,112,6,254,3,
-	56,16,58,185,10,135,60,230,15,50,185,10,
-	175,50,184,10,24,23,58,183,10,205,112,6,
-	254,3,48,13,58,185,10,203,63,50,185,10,
-	62,255,50,183,10,58,185,10,50,186,10,58,
-	183,10,135,50,183,10,62,32,50,187,10,50,
-	188,10,6,255,219,82,230,16,32,3,5,32,
-	247,205,180,4,6,40,219,82,230,16,40,3,
-	5,32,247,62,10,211,82,219,82,230,128,194,
-	46,5,219,82,230,16,40,214,237,95,71,58,
-	186,10,160,230,15,40,32,71,14,10,62,10,
-	211,82,219,82,230,128,202,119,5,205,180,4,
-	195,156,5,219,82,230,16,202,156,5,13,32,
-	229,16,225,42,193,10,237,91,195,10,205,252,
-	3,48,7,61,202,156,5,195,197,5,221,225,
-	33,0,0,201,221,33,163,10,205,93,6,58,
-	188,10,61,50,188,10,40,19,58,186,10,246,
-	1,50,186,10,58,183,10,246,1,50,183,10,
-	195,46,5,221,225,33,1,0,201,221,33,167,
-	10,205,93,6,58,184,10,246,1,50,184,10,
-	58,186,10,135,246,1,50,186,10,58,187,10,
-	61,50,187,10,194,46,5,221,225,33,2,0,
-	201,221,229,33,0,0,57,17,4,0,25,126,
-	50,154,10,230,128,50,189,10,58,189,10,183,
-	40,6,221,33,88,2,24,4,221,33,150,0,
-	58,154,10,183,40,49,60,40,46,61,33,190,
-	10,119,35,119,35,54,129,175,50,158,10,221,
-	43,221,229,225,124,181,40,42,33,190,10,17,
-	3,0,205,206,4,17,232,3,27,123,178,32,
-	251,58,158,10,183,40,224,58,154,10,71,62,
-	7,128,230,127,71,58,189,10,176,50,154,10,
-	24,166,221,225,201,183,221,52,0,192,221,52,
-	1,192,221,52,2,192,221,52,3,192,55,201,
-	6,8,14,0,31,48,1,12,16,250,121,201,
-	33,2,0,57,94,35,86,35,78,35,70,35,
-	126,35,102,105,79,120,68,103,237,176,201,33,
-	2,0,57,126,35,102,111,62,17,237,57,48,
-	125,237,57,40,124,237,57,41,62,0,237,57,
-	42,62,64,237,57,43,62,0,237,57,44,33,
-	128,2,125,237,57,46,124,237,57,47,62,145,
-	237,57,48,211,68,58,149,10,211,66,201,33,
-	2,0,57,126,35,102,111,62,33,237,57,48,
-	62,64,237,57,32,62,0,237,57,33,237,57,
-	34,125,237,57,35,124,237,57,36,62,0,237,
-	57,37,33,128,2,125,237,57,38,124,237,57,
-	39,62,97,237,57,48,211,67,58,149,10,211,
-	66,201,237,56,46,95,237,56,47,87,237,56,
-	46,111,237,56,47,103,183,237,82,32,235,33,
-	128,2,183,237,82,201,237,56,38,95,237,56,
-	39,87,237,56,38,111,237,56,39,103,183,237,
-	82,32,235,33,128,2,183,237,82,201,205,106,
-	10,221,110,6,221,102,7,126,35,110,103,195,
-	118,10,205,106,10,33,0,0,34,205,10,34,
-	198,10,34,200,10,33,143,15,34,207,10,237,
-	91,207,10,42,146,10,183,237,82,17,0,255,
-	25,34,203,10,203,124,40,6,33,0,125,34,
-	203,10,42,207,10,229,205,37,3,195,118,10,
-	205,106,10,229,42,150,10,35,35,35,229,205,
-	70,7,193,124,230,3,103,221,117,254,221,116,
-	255,237,91,152,10,35,35,35,183,237,82,32,
-	12,17,5,0,42,152,10,205,91,10,242,203,
-	7,42,150,10,229,205,37,3,195,118,10,237,
-	91,152,10,42,200,10,25,34,200,10,42,205,
-	10,25,34,205,10,237,91,203,10,33,158,253,
-	25,237,91,205,10,205,91,10,242,245,7,33,
-	0,0,34,205,10,62,1,50,197,10,205,5,
-	8,33,0,0,57,249,195,118,10,205,106,10,
-	58,197,10,183,202,118,10,237,91,198,10,42,
-	205,10,205,91,10,242,46,8,237,91,205,10,
-	33,98,2,25,237,91,198,10,205,91,10,250,
-	78,8,237,91,198,10,42,205,10,183,237,82,
-	32,7,42,200,10,125,180,40,13,237,91,205,
-	10,42,198,10,205,91,10,242,97,8,237,91,
-	207,10,42,205,10,25,229,205,37,3,175,50,
-	197,10,195,118,10,205,29,3,33,0,0,57,
-	249,195,118,10,205,106,10,58,202,10,183,40,
-	22,205,14,7,237,91,209,10,19,19,19,205,
-	91,10,242,139,8,33,1,0,195,118,10,33,
-	0,0,195,118,10,205,126,10,252,255,205,108,
-	8,125,180,194,118,10,237,91,200,10,33,0,
-	0,205,91,10,242,118,10,237,91,207,10,42,
-	198,10,25,221,117,254,221,116,255,35,35,35,
-	229,205,70,7,193,124,230,3,103,35,35,35,
-	221,117,252,221,116,253,229,221,110,254,221,102,
-	255,229,33,212,10,229,205,124,6,193,193,221,
-	110,252,221,102,253,34,209,10,33,211,10,54,
-	4,33,209,10,227,205,147,6,193,62,1,50,
-	202,10,243,221,94,252,221,86,253,42,200,10,
-	183,237,82,34,200,10,203,124,40,17,33,0,
-	0,34,200,10,34,205,10,34,198,10,50,197,
-	10,24,37,221,94,252,221,86,253,42,198,10,
-	25,34,198,10,237,91,203,10,33,158,253,25,
-	237,91,198,10,205,91,10,242,68,9,33,0,
-	0,34,198,10,205,5,8,33,0,0,57,249,
-	251,195,118,10,205,106,10,33,49,13,126,183,
-	40,16,205,42,7,237,91,47,13,19,19,19,
-	205,91,10,242,117,9,58,142,15,198,1,50,
-	142,15,195,118,10,33,49,13,126,254,1,40,
-	25,254,3,202,7,10,254,5,202,21,10,33,
-	49,13,54,0,33,47,13,229,205,207,6,195,
-	118,10,58,141,15,183,32,72,33,51,13,126,
-	50,149,10,205,86,7,33,50,13,126,230,127,
-	183,32,40,58,142,15,230,127,50,142,15,183,
-	32,5,198,1,50,142,15,33,50,13,126,111,
-	23,159,103,203,125,58,142,15,40,5,198,128,
-	50,142,15,33,50,13,119,33,50,13,126,111,
-	23,159,103,229,205,237,5,193,33,211,10,54,
-	2,33,2,0,34,209,10,58,154,10,33,212,
-	10,119,58,148,10,33,213,10,119,33,209,10,
-	229,205,147,6,193,24,128,42,47,13,229,33,
-	50,13,229,205,191,4,193,24,239,33,211,10,
-	54,6,33,3,0,34,209,10,58,154,10,33,
-	212,10,119,58,148,10,33,213,10,119,33,214,
-	10,54,5,33,209,10,229,205,147,6,24,200,
-	205,106,10,33,49,13,54,0,33,47,13,229,
-	205,207,6,33,209,10,227,205,147,6,193,205,
-	80,9,205,145,8,24,248,124,170,250,99,10,
-	237,82,201,124,230,128,237,82,60,201,225,253,
-	229,221,229,221,33,0,0,221,57,233,221,249,
-	221,225,253,225,201,233,225,253,229,221,229,221,
-	33,0,0,221,57,94,35,86,35,235,57,249,
-	235,233,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0
-	} ;
-
-#endif
diff --git a/include/net/Space.h b/include/net/Space.h
index c29f3d51c078..ef42629f4258 100644
--- a/include/net/Space.h
+++ b/include/net/Space.h
@@ -10,4 +10,3 @@ struct net_device *smc_init(int unit);
 struct net_device *cs89x0_probe(int unit);
 struct net_device *tc515_probe(int unit);
 struct net_device *lance_probe(int unit);
-struct net_device *cops_probe(int unit);
-- 
cgit v1.2.3-70-g09d2


From 5579ee462dfe768297563a6083e21df52c3ad856 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 2 Oct 2023 13:17:36 +0000
Subject: net_sched: export pfifo_fast prio2band[]
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

pfifo_fast prio2band[] is renamed to sch_default_prio2band[]
and exported because we want to share it in FQ.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Dave Taht <dave.taht@gmail.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/sch_generic.h | 1 +
 net/sched/sch_generic.c   | 9 +++++----
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index f232512505f8..c7318c73cfd6 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -587,6 +587,7 @@ static inline void sch_tree_unlock(struct Qdisc *q)
 extern struct Qdisc noop_qdisc;
 extern struct Qdisc_ops noop_qdisc_ops;
 extern struct Qdisc_ops pfifo_fast_ops;
+extern const u8 sch_default_prio2band[TC_PRIO_MAX + 1];
 extern struct Qdisc_ops mq_qdisc_ops;
 extern struct Qdisc_ops noqueue_qdisc_ops;
 extern const struct Qdisc_ops *default_qdisc_ops;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 5d7e23f4cc0e..4195a4bc26ca 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -694,9 +694,10 @@ struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
 	.owner		=	THIS_MODULE,
 };
 
-static const u8 prio2band[TC_PRIO_MAX + 1] = {
-	1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
+const u8 sch_default_prio2band[TC_PRIO_MAX + 1] = {
+	1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
 };
+EXPORT_SYMBOL(sch_default_prio2band);
 
 /* 3-band FIFO queue: old style, but should be a bit faster than
    generic prio+fifo combination.
@@ -721,7 +722,7 @@ static inline struct skb_array *band2list(struct pfifo_fast_priv *priv,
 static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
 			      struct sk_buff **to_free)
 {
-	int band = prio2band[skb->priority & TC_PRIO_MAX];
+	int band = sch_default_prio2band[skb->priority & TC_PRIO_MAX];
 	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
 	struct skb_array *q = band2list(priv, band);
 	unsigned int pkt_len = qdisc_pkt_len(skb);
@@ -830,7 +831,7 @@ static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
 {
 	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
 
-	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1);
+	memcpy(&opt.priomap, sch_default_prio2band, TC_PRIO_MAX + 1);
 	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
 		goto nla_put_failure;
 	return skb->len;
-- 
cgit v1.2.3-70-g09d2


From 29f834aa326e659ed354c406056e94ea3d29706a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 2 Oct 2023 13:17:37 +0000
Subject: net_sched: sch_fq: add 3 bands and WRR scheduling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Before Google adopted FQ for its production servers,
we had to ensure AF4 packets would get a higher share
than BE1 ones.

As discussed this week in Netconf 2023 in Paris, it is time
to upstream this for public use.

After this patch FQ can replace pfifo_fast, with the following
differences :

- FQ uses WRR instead of strict prio, to avoid starvation of
  low priority packets.

- We make sure each band/prio tracks its own usage against sch->limit.
  This was done to make sure flood of low priority packets would not
  prevent AF4 packets to be queued. Contributed by Willem.

- priomap can be changed, if needed (default value are the ones
  coming from pfifo_fast).

In this patch, we set default band weights so that :

- high prio (band=0) packets get 90% of the bandwidth
  if they compete with low prio (band=2) packets.

- high prio packets get 75% of the bandwidth
  if they compete with medium prio (band=1) packets.

Following patch in this series adds the possibility to tune
the per-band weights.

As we added many fields in 'struct fq_sched_data', we had
to make sure to have the first cache line read-mostly, and
avoid wasting precious cache lines.

More optimizations are possible but will be sent separately.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Dave Taht <dave.taht@gmail.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/uapi/linux/pkt_sched.h |  11 ++-
 net/sched/sch_fq.c             | 204 ++++++++++++++++++++++++++++++++---------
 2 files changed, 171 insertions(+), 44 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 579f641846b8..ec5ab44d41a2 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -941,15 +941,19 @@ enum {
 
 	TCA_FQ_HORIZON_DROP,	/* drop packets beyond horizon, or cap their EDT */
 
+	TCA_FQ_PRIOMAP,		/* prio2band */
+
 	__TCA_FQ_MAX
 };
 
 #define TCA_FQ_MAX	(__TCA_FQ_MAX - 1)
 
+#define FQ_BANDS 3
+
 struct tc_fq_qd_stats {
 	__u64	gc_flows;
-	__u64	highprio_packets;
-	__u64	tcp_retrans;
+	__u64	highprio_packets;	/* obsolete */
+	__u64	tcp_retrans;		/* obsolete */
 	__u64	throttled;
 	__u64	flows_plimit;
 	__u64	pkts_too_long;
@@ -963,6 +967,9 @@ struct tc_fq_qd_stats {
 	__u64	horizon_drops;
 	__u64	horizon_caps;
 	__u64	fastpath_packets;
+	__u64	band_drops[FQ_BANDS];
+	__u32	band_pkt_count[FQ_BANDS];
+	__u32	pad;
 };
 
 /* Heavy-Hitter Filter */
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 818ac786379d..081105801fa6 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -51,7 +51,8 @@
 #include <net/tcp.h>
 
 struct fq_skb_cb {
-	u64	        time_to_send;
+	u64	time_to_send;
+	u8	band;
 };
 
 static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb)
@@ -84,32 +85,28 @@ struct fq_flow {
 	u32		socket_hash;	/* sk_hash */
 	int		qlen;		/* number of packets in flow queue */
 
-/* Second cache line, used in fq_dequeue() */
+/* Second cache line */
 	int		credit;
-	/* 32bit hole on 64bit arches */
-
+	int		band;
 	struct fq_flow *next;		/* next pointer in RR lists */
 
 	struct rb_node  rate_node;	/* anchor in q->delayed tree */
 	u64		time_next_packet;
-} ____cacheline_aligned_in_smp;
+};
 
 struct fq_flow_head {
 	struct fq_flow *first;
 	struct fq_flow *last;
 };
 
-struct fq_sched_data {
+struct fq_perband_flows {
 	struct fq_flow_head new_flows;
-
 	struct fq_flow_head old_flows;
+	int		    credit;
+	int		    quantum; /* based on band nr : 576KB, 192KB, 64KB */
+};
 
-	struct rb_root	delayed;	/* for rate limited flows */
-	u64		time_next_delayed_flow;
-	unsigned long	unthrottle_latency_ns;
-
-	struct fq_flow	internal;	/* for non classified or high prio packets */
-
+struct fq_sched_data {
 /* Read mostly cache line */
 
 	u32		quantum;
@@ -125,10 +122,21 @@ struct fq_sched_data {
 	u8		rate_enable;
 	u8		fq_trees_log;
 	u8		horizon_drop;
+	u8		prio2band[(TC_PRIO_MAX + 1) >> 2];
 	u32		timer_slack; /* hrtimer slack in ns */
 
 /* Read/Write fields. */
 
+	unsigned int band_nr; /* band being serviced in fq_dequeue() */
+
+	struct fq_perband_flows band_flows[FQ_BANDS];
+
+	struct fq_flow	internal;	/* fastpath queue. */
+	struct rb_root	delayed;	/* for rate limited flows */
+	u64		time_next_delayed_flow;
+	unsigned long	unthrottle_latency_ns;
+
+	u32		band_pkt_count[FQ_BANDS];
 	u32		flows;
 	u32		inactive_flows; /* Flows with no packet to send. */
 	u32		throttled_flows;
@@ -139,7 +147,7 @@ struct fq_sched_data {
 
 /* Seldom used fields. */
 
-	u64		stat_internal_packets; /* aka highprio */
+	u64		stat_band_drops[FQ_BANDS];
 	u64		stat_ce_mark;
 	u64		stat_horizon_drops;
 	u64		stat_horizon_caps;
@@ -148,6 +156,12 @@ struct fq_sched_data {
 	u64		stat_allocation_errors;
 };
 
+/* return the i-th 2-bit value ("crumb") */
+static u8 fq_prio2band(const u8 *prio2band, unsigned int prio)
+{
+	return (prio2band[prio / 4] >> (2 * (prio & 0x3))) & 0x3;
+}
+
 /*
  * f->tail and f->age share the same location.
  * We can use the low order bit to differentiate if this location points
@@ -172,8 +186,19 @@ static bool fq_flow_is_throttled(const struct fq_flow *f)
 	return f->next == &throttled;
 }
 
-static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow)
+enum new_flow {
+	NEW_FLOW,
+	OLD_FLOW
+};
+
+static void fq_flow_add_tail(struct fq_sched_data *q, struct fq_flow *flow,
+			     enum new_flow list_sel)
 {
+	struct fq_perband_flows *pband = &q->band_flows[flow->band];
+	struct fq_flow_head *head = (list_sel == NEW_FLOW) ?
+					&pband->new_flows :
+					&pband->old_flows;
+
 	if (head->first)
 		head->last->next = flow;
 	else
@@ -186,7 +211,7 @@ static void fq_flow_unset_throttled(struct fq_sched_data *q, struct fq_flow *f)
 {
 	rb_erase(&f->rate_node, &q->delayed);
 	q->throttled_flows--;
-	fq_flow_add_tail(&q->old_flows, f);
+	fq_flow_add_tail(q, f, OLD_FLOW);
 }
 
 static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
@@ -326,11 +351,6 @@ static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb,
 	struct rb_root *root;
 	struct fq_flow *f;
 
-	/* warning: no starvation prevention... */
-	if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL)) {
-		q->stat_internal_packets++; /* highprio packet */
-		return &q->internal;
-	}
 	/* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket
 	 * or a listener (SYNCOOKIE mode)
 	 * 1) request sockets are not full blown,
@@ -509,9 +529,13 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	struct fq_sched_data *q = qdisc_priv(sch);
 	struct fq_flow *f;
 	u64 now;
+	u8 band;
 
-	if (unlikely(sch->q.qlen >= sch->limit))
+	band = fq_prio2band(q->prio2band, skb->priority & TC_PRIO_MAX);
+	if (unlikely(q->band_pkt_count[band] >= sch->limit)) {
+		q->stat_band_drops[band]++;
 		return qdisc_drop(skb, sch, to_free);
+	}
 
 	now = ktime_get_ns();
 	if (!skb->tstamp) {
@@ -538,11 +562,14 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 		}
 
 		if (fq_flow_is_detached(f)) {
-			fq_flow_add_tail(&q->new_flows, f);
+			fq_flow_add_tail(q, f, NEW_FLOW);
 			if (time_after(jiffies, f->age + q->flow_refill_delay))
 				f->credit = max_t(u32, f->credit, q->quantum);
 		}
 
+		f->band = band;
+		q->band_pkt_count[band]++;
+		fq_skb_cb(skb)->band = band;
 		if (f->qlen == 0)
 			q->inactive_flows--;
 	}
@@ -584,13 +611,26 @@ static void fq_check_throttled(struct fq_sched_data *q, u64 now)
 	}
 }
 
+static struct fq_flow_head *fq_pband_head_select(struct fq_perband_flows *pband)
+{
+	if (pband->credit <= 0)
+		return NULL;
+
+	if (pband->new_flows.first)
+		return &pband->new_flows;
+
+	return pband->old_flows.first ? &pband->old_flows : NULL;
+}
+
 static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 {
 	struct fq_sched_data *q = qdisc_priv(sch);
+	struct fq_perband_flows *pband;
 	struct fq_flow_head *head;
 	struct sk_buff *skb;
 	struct fq_flow *f;
 	unsigned long rate;
+	int retry;
 	u32 plen;
 	u64 now;
 
@@ -606,24 +646,31 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 
 	now = ktime_get_ns();
 	fq_check_throttled(q, now);
+	retry = 0;
+	pband = &q->band_flows[q->band_nr];
 begin:
-	head = &q->new_flows;
-	if (!head->first) {
-		head = &q->old_flows;
-		if (!head->first) {
-			if (q->time_next_delayed_flow != ~0ULL)
-				qdisc_watchdog_schedule_range_ns(&q->watchdog,
+	head = fq_pband_head_select(pband);
+	if (!head) {
+		while (++retry < FQ_BANDS) {
+			if (++q->band_nr == FQ_BANDS)
+				q->band_nr = 0;
+			pband = &q->band_flows[q->band_nr];
+			pband->credit = min(pband->credit + pband->quantum,
+					    pband->quantum);
+			goto begin;
+		}
+		if (q->time_next_delayed_flow != ~0ULL)
+			qdisc_watchdog_schedule_range_ns(&q->watchdog,
 							q->time_next_delayed_flow,
 							q->timer_slack);
-			return NULL;
-		}
+		return NULL;
 	}
 	f = head->first;
-
+	retry = 0;
 	if (f->credit <= 0) {
 		f->credit += q->quantum;
 		head->first = f->next;
-		fq_flow_add_tail(&q->old_flows, f);
+		fq_flow_add_tail(q, f, OLD_FLOW);
 		goto begin;
 	}
 
@@ -645,12 +692,13 @@ begin:
 		}
 		if (--f->qlen == 0)
 			q->inactive_flows++;
+		q->band_pkt_count[fq_skb_cb(skb)->band]--;
 		fq_dequeue_skb(sch, f, skb);
 	} else {
 		head->first = f->next;
 		/* force a pass through old_flows to prevent starvation */
-		if ((head == &q->new_flows) && q->old_flows.first) {
-			fq_flow_add_tail(&q->old_flows, f);
+		if (head == &pband->new_flows) {
+			fq_flow_add_tail(q, f, OLD_FLOW);
 		} else {
 			fq_flow_set_detached(f);
 		}
@@ -658,6 +706,7 @@ begin:
 	}
 	plen = qdisc_pkt_len(skb);
 	f->credit -= plen;
+	pband->credit -= plen;
 
 	if (!q->rate_enable)
 		goto out;
@@ -749,8 +798,10 @@ static void fq_reset(struct Qdisc *sch)
 			kmem_cache_free(fq_flow_cachep, f);
 		}
 	}
-	q->new_flows.first	= NULL;
-	q->old_flows.first	= NULL;
+	for (idx = 0; idx < FQ_BANDS; idx++) {
+		q->band_flows[idx].new_flows.first = NULL;
+		q->band_flows[idx].old_flows.first = NULL;
+	}
 	q->delayed		= RB_ROOT;
 	q->flows		= 0;
 	q->inactive_flows	= 0;
@@ -864,8 +915,54 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
 	[TCA_FQ_TIMER_SLACK]		= { .type = NLA_U32 },
 	[TCA_FQ_HORIZON]		= { .type = NLA_U32 },
 	[TCA_FQ_HORIZON_DROP]		= { .type = NLA_U8 },
+	[TCA_FQ_PRIOMAP]		= {
+			.type = NLA_BINARY,
+			.len = sizeof(struct tc_prio_qopt),
+		},
 };
 
+/* compress a u8 array with all elems <= 3 to an array of 2-bit fields */
+static void fq_prio2band_compress_crumb(const u8 *in, u8 *out)
+{
+	const int num_elems = TC_PRIO_MAX + 1;
+	int i;
+
+	memset(out, 0, num_elems / 4);
+	for (i = 0; i < num_elems; i++)
+		out[i / 4] |= in[i] << (2 * (i & 0x3));
+}
+
+static void fq_prio2band_decompress_crumb(const u8 *in, u8 *out)
+{
+	const int num_elems = TC_PRIO_MAX + 1;
+	int i;
+
+	for (i = 0; i < num_elems; i++)
+		out[i] = fq_prio2band(in, i);
+}
+
+static int fq_load_priomap(struct fq_sched_data *q,
+			   const struct nlattr *attr,
+			   struct netlink_ext_ack *extack)
+{
+	const struct tc_prio_qopt *map = nla_data(attr);
+	int i;
+
+	if (map->bands != FQ_BANDS) {
+		NL_SET_ERR_MSG_MOD(extack, "FQ only supports 3 bands");
+		return -EINVAL;
+	}
+	for (i = 0; i < TC_PRIO_MAX + 1; i++) {
+		if (map->priomap[i] >= FQ_BANDS) {
+			NL_SET_ERR_MSG_FMT_MOD(extack, "FQ priomap field %d maps to a too high band %d",
+					       i, map->priomap[i]);
+			return -EINVAL;
+		}
+	}
+	fq_prio2band_compress_crumb(map->priomap, q->prio2band);
+	return 0;
+}
+
 static int fq_change(struct Qdisc *sch, struct nlattr *opt,
 		     struct netlink_ext_ack *extack)
 {
@@ -940,6 +1037,9 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt,
 		q->flow_refill_delay = usecs_to_jiffies(usecs_delay);
 	}
 
+	if (!err && tb[TCA_FQ_PRIOMAP])
+		err = fq_load_priomap(q, tb[TCA_FQ_PRIOMAP], extack);
+
 	if (tb[TCA_FQ_ORPHAN_MASK])
 		q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]);
 
@@ -991,7 +1091,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
 		   struct netlink_ext_ack *extack)
 {
 	struct fq_sched_data *q = qdisc_priv(sch);
-	int err;
+	int i, err;
 
 	sch->limit		= 10000;
 	q->flow_plimit		= 100;
@@ -1001,8 +1101,13 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
 	q->flow_max_rate	= ~0UL;
 	q->time_next_delayed_flow = ~0ULL;
 	q->rate_enable		= 1;
-	q->new_flows.first	= NULL;
-	q->old_flows.first	= NULL;
+	for (i = 0; i < FQ_BANDS; i++) {
+		q->band_flows[i].new_flows.first = NULL;
+		q->band_flows[i].old_flows.first = NULL;
+	}
+	q->band_flows[0].quantum = 9 << 16;
+	q->band_flows[1].quantum = 3 << 16;
+	q->band_flows[2].quantum = 1 << 16;
 	q->delayed		= RB_ROOT;
 	q->fq_root		= NULL;
 	q->fq_trees_log		= ilog2(1024);
@@ -1017,6 +1122,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
 	/* Default ce_threshold of 4294 seconds */
 	q->ce_threshold		= (u64)NSEC_PER_USEC * ~0U;
 
+	fq_prio2band_compress_crumb(sch_default_prio2band, q->prio2band);
 	qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC);
 
 	if (opt)
@@ -1031,6 +1137,9 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
 	struct fq_sched_data *q = qdisc_priv(sch);
 	u64 ce_threshold = q->ce_threshold;
+	struct tc_prio_qopt prio = {
+		.bands = FQ_BANDS,
+	};
 	u64 horizon = q->horizon;
 	struct nlattr *opts;
 
@@ -1062,6 +1171,10 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
 	    nla_put_u8(skb, TCA_FQ_HORIZON_DROP, q->horizon_drop))
 		goto nla_put_failure;
 
+	fq_prio2band_decompress_crumb(q->prio2band, prio.priomap);
+	if (nla_put(skb, TCA_FQ_PRIOMAP, sizeof(prio), &prio))
+		goto nla_put_failure;
+
 	return nla_nest_end(skb, opts);
 
 nla_put_failure:
@@ -1072,11 +1185,14 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 {
 	struct fq_sched_data *q = qdisc_priv(sch);
 	struct tc_fq_qd_stats st;
+	int i;
+
+	st.pad = 0;
 
 	sch_tree_lock(sch);
 
 	st.gc_flows		  = q->stat_gc_flows;
-	st.highprio_packets	  = q->stat_internal_packets;
+	st.highprio_packets	  = 0;
 	st.fastpath_packets	  = q->internal.stat_fastpath_packets;
 	st.tcp_retrans		  = 0;
 	st.throttled		  = q->stat_throttled;
@@ -1093,6 +1209,10 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 	st.ce_mark		  = q->stat_ce_mark;
 	st.horizon_drops	  = q->stat_horizon_drops;
 	st.horizon_caps		  = q->stat_horizon_caps;
+	for (i = 0; i < FQ_BANDS; i++) {
+		st.band_drops[i]  = q->stat_band_drops[i];
+		st.band_pkt_count[i] = q->band_pkt_count[i];
+	}
 	sch_tree_unlock(sch);
 
 	return gnet_stats_copy_app(d, &st, sizeof(st));
@@ -1120,7 +1240,7 @@ static int __init fq_module_init(void)
 
 	fq_flow_cachep = kmem_cache_create("fq_flow_cache",
 					   sizeof(struct fq_flow),
-					   0, 0, NULL);
+					   0, SLAB_HWCACHE_ALIGN, NULL);
 	if (!fq_flow_cachep)
 		return -ENOMEM;
 
-- 
cgit v1.2.3-70-g09d2


From 49e7265fd098fdade2bbdd9331e6b914cda7fa83 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 2 Oct 2023 13:17:38 +0000
Subject: net_sched: sch_fq: add TCA_FQ_WEIGHTS attribute
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This attribute can be used to tune the per band weight
and report them in "tc qdisc show" output:

qdisc fq 802f: parent 1:9 limit 100000p flow_limit 500p buckets 1024 orphan_mask 1023
 quantum 8364b initial_quantum 41820b low_rate_threshold 550Kbit
 refill_delay 40ms timer_slack 10us horizon 10s horizon_drop
 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 weights 589824 196608 65536
 Sent 236460814 bytes 792991 pkt (dropped 0, overlimits 0 requeues 0)
 rate 25816bit 10pps backlog 0b 0p requeues 0
  flows 4 (inactive 4 throttled 0)
  gc 0 throttled 19 latency 17.6us fastpath 773882

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Dave Taht <dave.taht@gmail.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/uapi/linux/pkt_sched.h |  3 +++
 net/sched/sch_fq.c             | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index ec5ab44d41a2..f762a10bfb78 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -943,12 +943,15 @@ enum {
 
 	TCA_FQ_PRIOMAP,		/* prio2band */
 
+	TCA_FQ_WEIGHTS,		/* Weights for each band */
+
 	__TCA_FQ_MAX
 };
 
 #define TCA_FQ_MAX	(__TCA_FQ_MAX - 1)
 
 #define FQ_BANDS 3
+#define FQ_MIN_WEIGHT 16384
 
 struct tc_fq_qd_stats {
 	__u64	gc_flows;
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 081105801fa6..8eacdb54e72f 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -919,6 +919,10 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
 			.type = NLA_BINARY,
 			.len = sizeof(struct tc_prio_qopt),
 		},
+	[TCA_FQ_WEIGHTS]		= {
+			.type = NLA_BINARY,
+			.len = FQ_BANDS * sizeof(s32),
+		},
 };
 
 /* compress a u8 array with all elems <= 3 to an array of 2-bit fields */
@@ -941,6 +945,25 @@ static void fq_prio2band_decompress_crumb(const u8 *in, u8 *out)
 		out[i] = fq_prio2band(in, i);
 }
 
+static int fq_load_weights(struct fq_sched_data *q,
+			   const struct nlattr *attr,
+			   struct netlink_ext_ack *extack)
+{
+	s32 *weights = nla_data(attr);
+	int i;
+
+	for (i = 0; i < FQ_BANDS; i++) {
+		if (weights[i] < FQ_MIN_WEIGHT) {
+			NL_SET_ERR_MSG_FMT_MOD(extack, "Weight %d less that minimum allowed %d",
+					       weights[i], FQ_MIN_WEIGHT);
+			return -EINVAL;
+		}
+	}
+	for (i = 0; i < FQ_BANDS; i++)
+		q->band_flows[i].quantum = weights[i];
+	return 0;
+}
+
 static int fq_load_priomap(struct fq_sched_data *q,
 			   const struct nlattr *attr,
 			   struct netlink_ext_ack *extack)
@@ -1040,6 +1063,9 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt,
 	if (!err && tb[TCA_FQ_PRIOMAP])
 		err = fq_load_priomap(q, tb[TCA_FQ_PRIOMAP], extack);
 
+	if (!err && tb[TCA_FQ_WEIGHTS])
+		err = fq_load_weights(q, tb[TCA_FQ_WEIGHTS], extack);
+
 	if (tb[TCA_FQ_ORPHAN_MASK])
 		q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]);
 
@@ -1142,6 +1168,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
 	};
 	u64 horizon = q->horizon;
 	struct nlattr *opts;
+	s32 weights[3];
 
 	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
 	if (opts == NULL)
@@ -1175,6 +1202,12 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
 	if (nla_put(skb, TCA_FQ_PRIOMAP, sizeof(prio), &prio))
 		goto nla_put_failure;
 
+	weights[0] = q->band_flows[0].quantum;
+	weights[1] = q->band_flows[1].quantum;
+	weights[2] = q->band_flows[2].quantum;
+	if (nla_put(skb, TCA_FQ_WEIGHTS, sizeof(weights), &weights))
+		goto nla_put_failure;
+
 	return nla_nest_end(skb, opts);
 
 nla_put_failure:
-- 
cgit v1.2.3-70-g09d2


From 7151d87a175c6618fe81705755eb3dc4199cad4e Mon Sep 17 00:00:00 2001
From: Ivan Vecera <ivecera@redhat.com>
Date: Wed, 27 Sep 2023 10:31:30 +0200
Subject: virtchnl: Add header dependencies

The <linux/avf/virtchnl.h> uses BIT, struct_size and ETH_ALEN macros
but does not include appropriate header files that defines them.
Add these dependencies so this header file can be included anywhere.

Signed-off-by: Ivan Vecera <ivecera@redhat.com>
Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/linux/avf/virtchnl.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index dd71d3009771..6b3acf15be5c 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -4,6 +4,10 @@
 #ifndef _VIRTCHNL_H_
 #define _VIRTCHNL_H_
 
+#include <linux/bitops.h>
+#include <linux/overflow.h>
+#include <uapi/linux/if_ether.h>
+
 /* Description:
  * This header file describes the Virtual Function (VF) - Physical Function
  * (PF) communication protocol used by the drivers for all devices starting
-- 
cgit v1.2.3-70-g09d2


From 9beebc2b5d0038a65977a7a14909598c64ce070f Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Thu, 28 Sep 2023 09:24:28 +0200
Subject: can: dev: add can_state_get_by_berr_counter() to return the CAN state
 based on the current error counters

Some CAN controllers do not have a register that contains the current
CAN state, but only a register that contains the error counters.

Introduce a new function can_state_get_by_berr_counter() that returns
the current TX and RX state depending on the provided CAN bit error
counters.

Link: https://lore.kernel.org/all/20231005-at91_can-rx_offload-v2-1-9987d53600e0@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev/dev.c | 22 ++++++++++++++++++++++
 include/linux/can/dev.h   |  4 ++++
 2 files changed, 26 insertions(+)

(limited to 'include')

diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c
index 82b12902fc35..3a3be5cdfc1f 100644
--- a/drivers/net/can/dev/dev.c
+++ b/drivers/net/can/dev/dev.c
@@ -90,6 +90,28 @@ const char *can_get_state_str(const enum can_state state)
 }
 EXPORT_SYMBOL_GPL(can_get_state_str);
 
+static enum can_state can_state_err_to_state(u16 err)
+{
+	if (err < CAN_ERROR_WARNING_THRESHOLD)
+		return CAN_STATE_ERROR_ACTIVE;
+	if (err < CAN_ERROR_PASSIVE_THRESHOLD)
+		return CAN_STATE_ERROR_WARNING;
+	if (err < CAN_BUS_OFF_THRESHOLD)
+		return CAN_STATE_ERROR_PASSIVE;
+
+	return CAN_STATE_BUS_OFF;
+}
+
+void can_state_get_by_berr_counter(const struct net_device *dev,
+				   const struct can_berr_counter *bec,
+				   enum can_state *tx_state,
+				   enum can_state *rx_state)
+{
+	*tx_state = can_state_err_to_state(bec->txerr);
+	*rx_state = can_state_err_to_state(bec->rxerr);
+}
+EXPORT_SYMBOL_GPL(can_state_get_by_berr_counter);
+
 void can_change_state(struct net_device *dev, struct can_frame *cf,
 		      enum can_state tx_state, enum can_state rx_state)
 {
diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index 982ba245eb41..1b92aed49363 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -195,6 +195,10 @@ int can_restart_now(struct net_device *dev);
 void can_bus_off(struct net_device *dev);
 
 const char *can_get_state_str(const enum can_state state);
+void can_state_get_by_berr_counter(const struct net_device *dev,
+				   const struct can_berr_counter *bec,
+				   enum can_state *tx_state,
+				   enum can_state *rx_state);
 void can_change_state(struct net_device *dev, struct can_frame *cf,
 		      enum can_state tx_state, enum can_state rx_state);
 
-- 
cgit v1.2.3-70-g09d2


From 2253bb3ff242af3b1a53cb98e2e4eb1eec82dd2a Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 3 Oct 2023 16:18:13 -0700
Subject: nexthop: Annotate struct nh_res_table with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time via CONFIG_UBSAN_BOUNDS (for
array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct nh_res_table.

Link: https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci [1]
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20231003231813.work.042-kees@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/nexthop.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index 2b12725de9c0..0b1172dbb334 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -92,7 +92,7 @@ struct nh_res_table {
 	u32			unbalanced_timer;
 
 	u16			num_nh_buckets;
-	struct nh_res_bucket	nh_buckets[];
+	struct nh_res_bucket	nh_buckets[] __counted_by(num_nh_buckets);
 };
 
 struct nh_grp_entry {
-- 
cgit v1.2.3-70-g09d2


From 3e584e32b19d4c8d4591dc720277e2001d135ee1 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 3 Oct 2023 16:18:18 -0700
Subject: nexthop: Annotate struct nh_notifier_res_table_info with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time via CONFIG_UBSAN_BOUNDS (for
array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct
nh_notifier_res_table_info.

Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Tom Rix <trix@redhat.com>
Cc: llvm@lists.linux.dev
Link: https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci [1]
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20231003231818.work.883-kees@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/nexthop.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index 0b1172dbb334..dbd90e8fe9d6 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -200,7 +200,7 @@ struct nh_notifier_res_bucket_info {
 
 struct nh_notifier_res_table_info {
 	u16 num_nh_buckets;
-	struct nh_notifier_single_info nhs[];
+	struct nh_notifier_single_info nhs[] __counted_by(num_nh_buckets);
 };
 
 struct nh_notifier_info {
-- 
cgit v1.2.3-70-g09d2


From 9c8c3fa3a52bc55696ccc4dfcb8a49f969b5fb0e Mon Sep 17 00:00:00 2001
From: Akihiko Odaki <akihiko.odaki@daynix.com>
Date: Thu, 5 Oct 2023 16:21:36 +0900
Subject: bpf: Fix the comment for bpf_restore_data_end()

The comment used to say:
> Restore data saved by bpf_compute_data_pointers().

But bpf_compute_data_pointers() does not save the data;
bpf_compute_and_save_data_end() does.

Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
Acked-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/20231005072137.29870-1-akihiko.odaki@daynix.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/filter.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 27406aee2d40..ff7ecc89d3dd 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -736,7 +736,7 @@ static inline void bpf_compute_and_save_data_end(
 	cb->data_end  = skb->data + skb_headlen(skb);
 }
 
-/* Restore data saved by bpf_compute_data_pointers(). */
+/* Restore data saved by bpf_compute_and_save_data_end(). */
 static inline void bpf_restore_data_end(
 	struct sk_buff *skb, void *saved_data_end)
 {
-- 
cgit v1.2.3-70-g09d2


From cf9ecad9772554a5b78c4f31b098ed680bc5251e Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 3 Oct 2023 16:21:47 -0700
Subject: nexthop: Annotate struct nh_notifier_grp_info with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time via CONFIG_UBSAN_BOUNDS (for
array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct nh_notifier_grp_info.

Cc: David Ahern <dsahern@kernel.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Tom Rix <trix@redhat.com>
Cc: netdev@vger.kernel.org
Cc: llvm@lists.linux.dev
Link: https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci [1]
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/nexthop.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index dbd90e8fe9d6..8f5a2ab45920 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -187,7 +187,7 @@ struct nh_notifier_grp_entry_info {
 struct nh_notifier_grp_info {
 	u16 num_nh;
 	bool is_fdb;
-	struct nh_notifier_grp_entry_info nh_entries[];
+	struct nh_notifier_grp_entry_info nh_entries[] __counted_by(num_nh);
 };
 
 struct nh_notifier_res_bucket_info {
-- 
cgit v1.2.3-70-g09d2


From 2a92fccdaca837f65990139207981014970f996c Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 3 Oct 2023 18:44:49 -0700
Subject: nexthop: Annotate struct nh_group with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time via CONFIG_UBSAN_BOUNDS (for
array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct nh_group.

Cc: David Ahern <dsahern@kernel.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: netdev@vger.kernel.org
Link: https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci [1]
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/nexthop.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index 8f5a2ab45920..d92046a4a078 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -126,7 +126,7 @@ struct nh_group {
 	bool			has_v4;
 
 	struct nh_res_table __rcu *res_table;
-	struct nh_grp_entry	nh_entries[];
+	struct nh_grp_entry	nh_entries[] __counted_by(num_nh);
 };
 
 struct nexthop {
-- 
cgit v1.2.3-70-g09d2


From 99474727d5d5b56f6d417ae6010d93407bbaaebb Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 3 Oct 2023 16:18:33 -0700
Subject: flow_offload: Annotate struct flow_action_entry with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time via CONFIG_UBSAN_BOUNDS (for
array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct flow_action_entry.

Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: netdev@vger.kernel.org
Link: https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci [1]
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_offload.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index 9efa9a59e81f..314087a5e181 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -333,7 +333,7 @@ struct flow_action_entry {
 
 struct flow_action {
 	unsigned int			num_entries;
-	struct flow_action_entry	entries[];
+	struct flow_action_entry	entries[] __counted_by(num_entries);
 };
 
 static inline bool flow_action_has_entries(const struct flow_action *action)
-- 
cgit v1.2.3-70-g09d2


From bc5bc309db45a7ab218ce8259ba9bc7659be61ca Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Thu, 5 Oct 2023 08:41:23 +0000
Subject: bpf: Inherit system settings for CPU security mitigations

Currently, there exists a system-wide setting related to CPU security
mitigations, denoted as 'mitigations='. When set to 'mitigations=off', it
deactivates all optional CPU mitigations. Therefore, if we implement a
system-wide 'mitigations=off' setting, it should inherently bypass Spectre
v1 and Spectre v4 in the BPF subsystem.

Please note that there is also a more specific 'nospectre_v1' setting on
x86 and ppc architectures, though it is not currently exported. For the
time being, let's disregard more fine-grained options.

This idea emerged during our discussion about potential Spectre v1 attacks
with Luis [0].

  [0] https://lore.kernel.org/bpf/b4fc15f7-b204-767e-ebb9-fdb4233961fb@iogearbox.net

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Stanislav Fomichev <sdf@google.com>
Acked-by: Song Liu <song@kernel.org>
Acked-by: KP Singh <kpsingh@kernel.org>
Cc: Luis Gerhorst <gerhorst@cs.fau.de>
Link: https://lore.kernel.org/bpf/20231005084123.1338-1-laoar.shao@gmail.com
---
 include/linux/bpf.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a82efd34b741..61bde4520f5c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2164,12 +2164,12 @@ static inline bool bpf_allow_uninit_stack(void)
 
 static inline bool bpf_bypass_spec_v1(void)
 {
-	return perfmon_capable();
+	return perfmon_capable() || cpu_mitigations_off();
 }
 
 static inline bool bpf_bypass_spec_v4(void)
 {
-	return perfmon_capable();
+	return perfmon_capable() || cpu_mitigations_off();
 }
 
 int bpf_map_new_fd(struct bpf_map *map, int flags);
-- 
cgit v1.2.3-70-g09d2


From 3abbd0699b678fc48e0100704338cff9180fe4bb Mon Sep 17 00:00:00 2001
From: Giulio Benetti <giulio.benetti+tekvox@benettiengineering.com>
Date: Thu, 5 Oct 2023 20:29:15 +0200
Subject: net: phy: broadcom: add support for BCM5221 phy

This patch adds the BCM5221 PHY support by reusing brcm_fet_*()
callbacks and adding quirks for BCM5221 when needed.

Cc: Jim Reinhart <jimr@tekvox.com>
Cc: James Autry <jautry@tekvox.com>
Cc: Matthew Maron <matthewm@tekvox.com>
Signed-off-by: Giulio Benetti <giulio.benetti+tekvox@benettiengineering.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://lore.kernel.org/r/20231005182915.153815-1-giulio.benetti@benettiengineering.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/broadcom.c | 154 +++++++++++++++++++++++++++++++++++----------
 include/linux/brcmphy.h    |  10 +++
 2 files changed, 131 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c
index 04b2e6eeb195..3a627105675a 100644
--- a/drivers/net/phy/broadcom.c
+++ b/drivers/net/phy/broadcom.c
@@ -704,16 +704,21 @@ static int brcm_fet_config_init(struct phy_device *phydev)
 	if (err < 0 && err != -EIO)
 		return err;
 
+	/* Read to clear status bits */
 	reg = phy_read(phydev, MII_BRCM_FET_INTREG);
 	if (reg < 0)
 		return reg;
 
 	/* Unmask events we are interested in and mask interrupts globally. */
-	reg = MII_BRCM_FET_IR_DUPLEX_EN |
-	      MII_BRCM_FET_IR_SPEED_EN |
-	      MII_BRCM_FET_IR_LINK_EN |
-	      MII_BRCM_FET_IR_ENABLE |
-	      MII_BRCM_FET_IR_MASK;
+	if (phydev->phy_id == PHY_ID_BCM5221)
+		reg = MII_BRCM_FET_IR_ENABLE |
+		      MII_BRCM_FET_IR_MASK;
+	else
+		reg = MII_BRCM_FET_IR_DUPLEX_EN |
+		      MII_BRCM_FET_IR_SPEED_EN |
+		      MII_BRCM_FET_IR_LINK_EN |
+		      MII_BRCM_FET_IR_ENABLE |
+		      MII_BRCM_FET_IR_MASK;
 
 	err = phy_write(phydev, MII_BRCM_FET_INTREG, reg);
 	if (err < 0)
@@ -726,42 +731,49 @@ static int brcm_fet_config_init(struct phy_device *phydev)
 
 	reg = brcmtest | MII_BRCM_FET_BT_SRE;
 
-	err = phy_write(phydev, MII_BRCM_FET_BRCMTEST, reg);
-	if (err < 0)
-		return err;
+	phy_lock_mdio_bus(phydev);
 
-	/* Set the LED mode */
-	reg = phy_read(phydev, MII_BRCM_FET_SHDW_AUXMODE4);
-	if (reg < 0) {
-		err = reg;
-		goto done;
+	err = __phy_write(phydev, MII_BRCM_FET_BRCMTEST, reg);
+	if (err < 0) {
+		phy_unlock_mdio_bus(phydev);
+		return err;
 	}
 
-	reg &= ~MII_BRCM_FET_SHDW_AM4_LED_MASK;
-	reg |= MII_BRCM_FET_SHDW_AM4_LED_MODE1;
+	if (phydev->phy_id != PHY_ID_BCM5221) {
+		/* Set the LED mode */
+		reg = __phy_read(phydev, MII_BRCM_FET_SHDW_AUXMODE4);
+		if (reg < 0) {
+			err = reg;
+			goto done;
+		}
 
-	err = phy_write(phydev, MII_BRCM_FET_SHDW_AUXMODE4, reg);
-	if (err < 0)
-		goto done;
+		err = __phy_modify(phydev, MII_BRCM_FET_SHDW_AUXMODE4,
+				   MII_BRCM_FET_SHDW_AM4_LED_MASK,
+				   MII_BRCM_FET_SHDW_AM4_LED_MODE1);
+		if (err < 0)
+			goto done;
 
-	/* Enable auto MDIX */
-	err = phy_set_bits(phydev, MII_BRCM_FET_SHDW_MISCCTRL,
-			   MII_BRCM_FET_SHDW_MC_FAME);
-	if (err < 0)
-		goto done;
+		/* Enable auto MDIX */
+		err = __phy_set_bits(phydev, MII_BRCM_FET_SHDW_MISCCTRL,
+				     MII_BRCM_FET_SHDW_MC_FAME);
+		if (err < 0)
+			goto done;
+	}
 
 	if (phydev->dev_flags & PHY_BRCM_AUTO_PWRDWN_ENABLE) {
 		/* Enable auto power down */
-		err = phy_set_bits(phydev, MII_BRCM_FET_SHDW_AUXSTAT2,
-				   MII_BRCM_FET_SHDW_AS2_APDE);
+		err = __phy_set_bits(phydev, MII_BRCM_FET_SHDW_AUXSTAT2,
+				     MII_BRCM_FET_SHDW_AS2_APDE);
 	}
 
 done:
 	/* Disable shadow register access */
-	err2 = phy_write(phydev, MII_BRCM_FET_BRCMTEST, brcmtest);
+	err2 = __phy_write(phydev, MII_BRCM_FET_BRCMTEST, brcmtest);
 	if (!err)
 		err = err2;
 
+	phy_unlock_mdio_bus(phydev);
+
 	return err;
 }
 
@@ -840,23 +852,86 @@ static int brcm_fet_suspend(struct phy_device *phydev)
 
 	reg = brcmtest | MII_BRCM_FET_BT_SRE;
 
-	err = phy_write(phydev, MII_BRCM_FET_BRCMTEST, reg);
-	if (err < 0)
+	phy_lock_mdio_bus(phydev);
+
+	err = __phy_write(phydev, MII_BRCM_FET_BRCMTEST, reg);
+	if (err < 0) {
+		phy_unlock_mdio_bus(phydev);
 		return err;
+	}
+
+	if (phydev->phy_id == PHY_ID_BCM5221)
+		/* Force Low Power Mode with clock enabled */
+		reg = BCM5221_SHDW_AM4_EN_CLK_LPM | BCM5221_SHDW_AM4_FORCE_LPM;
+	else
+		/* Set standby mode */
+		reg = MII_BRCM_FET_SHDW_AM4_STANDBY;
 
-	/* Set standby mode */
-	err = phy_modify(phydev, MII_BRCM_FET_SHDW_AUXMODE4,
-			 MII_BRCM_FET_SHDW_AM4_STANDBY,
-			 MII_BRCM_FET_SHDW_AM4_STANDBY);
+	err = __phy_set_bits(phydev, MII_BRCM_FET_SHDW_AUXMODE4, reg);
 
 	/* Disable shadow register access */
-	err2 = phy_write(phydev, MII_BRCM_FET_BRCMTEST, brcmtest);
+	err2 = __phy_write(phydev, MII_BRCM_FET_BRCMTEST, brcmtest);
 	if (!err)
 		err = err2;
 
+	phy_unlock_mdio_bus(phydev);
+
 	return err;
 }
 
+static int bcm5221_config_aneg(struct phy_device *phydev)
+{
+	int ret, val;
+
+	ret = genphy_config_aneg(phydev);
+	if (ret)
+		return ret;
+
+	switch (phydev->mdix_ctrl) {
+	case ETH_TP_MDI:
+		val = BCM5221_AEGSR_MDIX_DIS;
+		break;
+	case ETH_TP_MDI_X:
+		val = BCM5221_AEGSR_MDIX_DIS | BCM5221_AEGSR_MDIX_MAN_SWAP;
+		break;
+	case ETH_TP_MDI_AUTO:
+		val = 0;
+		break;
+	default:
+		return 0;
+	}
+
+	return phy_modify(phydev, BCM5221_AEGSR, BCM5221_AEGSR_MDIX_MAN_SWAP |
+						 BCM5221_AEGSR_MDIX_DIS,
+						 val);
+}
+
+static int bcm5221_read_status(struct phy_device *phydev)
+{
+	int ret;
+
+	/* Read MDIX status */
+	ret = phy_read(phydev, BCM5221_AEGSR);
+	if (ret < 0)
+		return ret;
+
+	if (ret & BCM5221_AEGSR_MDIX_DIS) {
+		if (ret & BCM5221_AEGSR_MDIX_MAN_SWAP)
+			phydev->mdix_ctrl = ETH_TP_MDI_X;
+		else
+			phydev->mdix_ctrl = ETH_TP_MDI;
+	} else {
+		phydev->mdix_ctrl = ETH_TP_MDI_AUTO;
+	}
+
+	if (ret & BCM5221_AEGSR_MDIX_STATUS)
+		phydev->mdix = ETH_TP_MDI_X;
+	else
+		phydev->mdix = ETH_TP_MDI;
+
+	return genphy_read_status(phydev);
+}
+
 static void bcm54xx_phy_get_wol(struct phy_device *phydev,
 				struct ethtool_wolinfo *wol)
 {
@@ -1221,6 +1296,18 @@ static struct phy_driver broadcom_drivers[] = {
 	.handle_interrupt = brcm_fet_handle_interrupt,
 	.suspend	= brcm_fet_suspend,
 	.resume		= brcm_fet_config_init,
+}, {
+	.phy_id		= PHY_ID_BCM5221,
+	.phy_id_mask	= 0xfffffff0,
+	.name		= "Broadcom BCM5221",
+	/* PHY_BASIC_FEATURES */
+	.config_init	= brcm_fet_config_init,
+	.config_intr	= brcm_fet_config_intr,
+	.handle_interrupt = brcm_fet_handle_interrupt,
+	.suspend	= brcm_fet_suspend,
+	.resume		= brcm_fet_config_init,
+	.config_aneg	= bcm5221_config_aneg,
+	.read_status	= bcm5221_read_status,
 }, {
 	.phy_id		= PHY_ID_BCM5395,
 	.phy_id_mask	= 0xfffffff0,
@@ -1296,6 +1383,7 @@ static struct mdio_device_id __maybe_unused broadcom_tbl[] = {
 	{ PHY_ID_BCM50610M, 0xfffffff0 },
 	{ PHY_ID_BCM57780, 0xfffffff0 },
 	{ PHY_ID_BCMAC131, 0xfffffff0 },
+	{ PHY_ID_BCM5221, 0xfffffff0 },
 	{ PHY_ID_BCM5241, 0xfffffff0 },
 	{ PHY_ID_BCM5395, 0xfffffff0 },
 	{ PHY_ID_BCM53125, 0xfffffff0 },
diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index c55810a43541..1394ba302367 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -11,6 +11,7 @@
 
 #define PHY_ID_BCM50610			0x0143bd60
 #define PHY_ID_BCM50610M		0x0143bd70
+#define PHY_ID_BCM5221			0x004061e0
 #define PHY_ID_BCM5241			0x0143bc30
 #define PHY_ID_BCMAC131			0x0143bc70
 #define PHY_ID_BCM5481			0x0143bca0
@@ -331,6 +332,15 @@
 
 #define BCM54XX_WOL_INT_STATUS		(MII_BCM54XX_EXP_SEL_WOL + 0x94)
 
+/* BCM5221 Registers */
+#define BCM5221_AEGSR			0x1C
+#define BCM5221_AEGSR_MDIX_STATUS	BIT(13)
+#define BCM5221_AEGSR_MDIX_MAN_SWAP	BIT(12)
+#define BCM5221_AEGSR_MDIX_DIS		BIT(11)
+
+#define BCM5221_SHDW_AM4_EN_CLK_LPM	BIT(2)
+#define BCM5221_SHDW_AM4_FORCE_LPM	BIT(1)
+
 /*****************************************************************************/
 /* Fast Ethernet Transceiver definitions. */
 /*****************************************************************************/
-- 
cgit v1.2.3-70-g09d2


From d6247ecb6c1e17d7a33317090627f5bfe563cbb2 Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Wed, 4 Oct 2023 11:23:38 -0500
Subject: bpf: Add ability to pin bpf timer to calling CPU

BPF supports creating high resolution timers using bpf_timer_* helper
functions. Currently, only the BPF_F_TIMER_ABS flag is supported, which
specifies that the timeout should be interpreted as absolute time. It
would also be useful to be able to pin that timer to a core. For
example, if you wanted to make a subset of cores run without timer
interrupts, and only have the timer be invoked on a single core.

This patch adds support for this with a new BPF_F_TIMER_CPU_PIN flag.
When specified, the HRTIMER_MODE_PINNED flag is passed to
hrtimer_start(). A subsequent patch will update selftests to validate.

Signed-off-by: David Vernet <void@manifault.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Liu <song@kernel.org>
Acked-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/bpf/20231004162339.200702-2-void@manifault.com
---
 include/uapi/linux/bpf.h       | 4 ++++
 kernel/bpf/helpers.c           | 5 ++++-
 tools/include/uapi/linux/bpf.h | 4 ++++
 3 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 70bfa997e896..a7d4a1a69f21 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5096,6 +5096,8 @@ union bpf_attr {
  *		**BPF_F_TIMER_ABS**
  *			Start the timer in absolute expire value instead of the
  *			default relative one.
+ *		**BPF_F_TIMER_CPU_PIN**
+ *			Timer will be pinned to the CPU of the caller.
  *
  *	Return
  *		0 on success.
@@ -7309,9 +7311,11 @@ struct bpf_core_relo {
  * Flags to control bpf_timer_start() behaviour.
  *     - BPF_F_TIMER_ABS: Timeout passed is absolute time, by default it is
  *       relative to current time.
+ *     - BPF_F_TIMER_CPU_PIN: Timer will be pinned to the CPU of the caller.
  */
 enum {
 	BPF_F_TIMER_ABS = (1ULL << 0),
+	BPF_F_TIMER_CPU_PIN = (1ULL << 1),
 };
 
 /* BPF numbers iterator state */
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index dd1c69ee3375..d2840dd5b00d 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1272,7 +1272,7 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla
 
 	if (in_nmi())
 		return -EOPNOTSUPP;
-	if (flags > BPF_F_TIMER_ABS)
+	if (flags & ~(BPF_F_TIMER_ABS | BPF_F_TIMER_CPU_PIN))
 		return -EINVAL;
 	__bpf_spin_lock_irqsave(&timer->lock);
 	t = timer->timer;
@@ -1286,6 +1286,9 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla
 	else
 		mode = HRTIMER_MODE_REL_SOFT;
 
+	if (flags & BPF_F_TIMER_CPU_PIN)
+		mode |= HRTIMER_MODE_PINNED;
+
 	hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode);
 out:
 	__bpf_spin_unlock_irqrestore(&timer->lock);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 70bfa997e896..a7d4a1a69f21 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5096,6 +5096,8 @@ union bpf_attr {
  *		**BPF_F_TIMER_ABS**
  *			Start the timer in absolute expire value instead of the
  *			default relative one.
+ *		**BPF_F_TIMER_CPU_PIN**
+ *			Timer will be pinned to the CPU of the caller.
  *
  *	Return
  *		0 on success.
@@ -7309,9 +7311,11 @@ struct bpf_core_relo {
  * Flags to control bpf_timer_start() behaviour.
  *     - BPF_F_TIMER_ABS: Timeout passed is absolute time, by default it is
  *       relative to current time.
+ *     - BPF_F_TIMER_CPU_PIN: Timer will be pinned to the CPU of the caller.
  */
 enum {
 	BPF_F_TIMER_ABS = (1ULL << 0),
+	BPF_F_TIMER_CPU_PIN = (1ULL << 1),
 };
 
 /* BPF numbers iterator state */
-- 
cgit v1.2.3-70-g09d2


From dab4e1f06cabb6834de14264394ccab197007302 Mon Sep 17 00:00:00 2001
From: Martynas Pumputis <m@lambda.lt>
Date: Sat, 7 Oct 2023 10:14:14 +0200
Subject: bpf: Derive source IP addr via bpf_*_fib_lookup()

Extend the bpf_fib_lookup() helper by making it to return the source
IPv4/IPv6 address if the BPF_FIB_LOOKUP_SRC flag is set.

For example, the following snippet can be used to derive the desired
source IP address:

    struct bpf_fib_lookup p = { .ipv4_dst = ip4->daddr };

    ret = bpf_skb_fib_lookup(skb, p, sizeof(p),
            BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_SKIP_NEIGH);
    if (ret != BPF_FIB_LKUP_RET_SUCCESS)
        return TC_ACT_SHOT;

    /* the p.ipv4_src now contains the source address */

The inability to derive the proper source address may cause malfunctions
in BPF-based dataplanes for hosts containing netdevs with more than one
routable IP address or for multi-homed hosts.

For example, Cilium implements packet masquerading in BPF. If an
egressing netdev to which the Cilium's BPF prog is attached has
multiple IP addresses, then only one [hardcoded] IP address can be used for
masquerading. This breaks connectivity if any other IP address should have
been selected instead, for example, when a public and private addresses
are attached to the same egress interface.

The change was tested with Cilium [1].

Nikolay Aleksandrov helped to figure out the IPv6 addr selection.

[1]: https://github.com/cilium/cilium/pull/28283

Signed-off-by: Martynas Pumputis <m@lambda.lt>
Link: https://lore.kernel.org/r/20231007081415.33502-2-m@lambda.lt
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/net/ipv6_stubs.h       |  5 +++++
 include/uapi/linux/bpf.h       | 10 ++++++++++
 net/core/filter.c              | 18 +++++++++++++++++-
 net/ipv6/af_inet6.c            |  1 +
 tools/include/uapi/linux/bpf.h | 10 ++++++++++
 5 files changed, 43 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h
index c48186bf4737..21da31e1dff5 100644
--- a/include/net/ipv6_stubs.h
+++ b/include/net/ipv6_stubs.h
@@ -85,6 +85,11 @@ struct ipv6_bpf_stub {
 			       sockptr_t optval, unsigned int optlen);
 	int (*ipv6_getsockopt)(struct sock *sk, int level, int optname,
 			       sockptr_t optval, sockptr_t optlen);
+	int (*ipv6_dev_get_saddr)(struct net *net,
+				  const struct net_device *dst_dev,
+				  const struct in6_addr *daddr,
+				  unsigned int prefs,
+				  struct in6_addr *saddr);
 };
 extern const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly;
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a7d4a1a69f21..e0aa457f94a9 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3264,6 +3264,11 @@ union bpf_attr {
  *			and *params*->smac will not be set as output. A common
  *			use case is to call **bpf_redirect_neigh**\ () after
  *			doing **bpf_fib_lookup**\ ().
+ *		**BPF_FIB_LOOKUP_SRC**
+ *			Derive and set source IP addr in *params*->ipv{4,6}_src
+ *			for the nexthop. If the src addr cannot be derived,
+ *			**BPF_FIB_LKUP_RET_NO_SRC_ADDR** is returned. In this
+ *			case, *params*->dmac and *params*->smac are not set either.
  *
  *		*ctx* is either **struct xdp_md** for XDP programs or
  *		**struct sk_buff** tc cls_act programs.
@@ -6964,6 +6969,7 @@ enum {
 	BPF_FIB_LOOKUP_OUTPUT  = (1U << 1),
 	BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2),
 	BPF_FIB_LOOKUP_TBID    = (1U << 3),
+	BPF_FIB_LOOKUP_SRC     = (1U << 4),
 };
 
 enum {
@@ -6976,6 +6982,7 @@ enum {
 	BPF_FIB_LKUP_RET_UNSUPP_LWT,   /* fwd requires encapsulation */
 	BPF_FIB_LKUP_RET_NO_NEIGH,     /* no neighbor entry for nh */
 	BPF_FIB_LKUP_RET_FRAG_NEEDED,  /* fragmentation required to fwd */
+	BPF_FIB_LKUP_RET_NO_SRC_ADDR,  /* failed to derive IP src addr */
 };
 
 struct bpf_fib_lookup {
@@ -7010,6 +7017,9 @@ struct bpf_fib_lookup {
 		__u32	rt_metric;
 	};
 
+	/* input: source address to consider for lookup
+	 * output: source address result from lookup
+	 */
 	union {
 		__be32		ipv4_src;
 		__u32		ipv6_src[4];  /* in6_addr; network order */
diff --git a/net/core/filter.c b/net/core/filter.c
index a094694899c9..3880bf0b740d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5850,6 +5850,9 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	params->rt_metric = res.fi->fib_priority;
 	params->ifindex = dev->ifindex;
 
+	if (flags & BPF_FIB_LOOKUP_SRC)
+		params->ipv4_src = fib_result_prefsrc(net, &res);
+
 	/* xdp and cls_bpf programs are run in RCU-bh so
 	 * rcu_read_lock_bh is not needed here
 	 */
@@ -5992,6 +5995,18 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	params->rt_metric = res.f6i->fib6_metric;
 	params->ifindex = dev->ifindex;
 
+	if (flags & BPF_FIB_LOOKUP_SRC) {
+		if (res.f6i->fib6_prefsrc.plen) {
+			*src = res.f6i->fib6_prefsrc.addr;
+		} else {
+			err = ipv6_bpf_stub->ipv6_dev_get_saddr(net, dev,
+								&fl6.daddr, 0,
+								src);
+			if (err)
+				return BPF_FIB_LKUP_RET_NO_SRC_ADDR;
+		}
+	}
+
 	if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH)
 		goto set_fwd_params;
 
@@ -6010,7 +6025,8 @@ set_fwd_params:
 #endif
 
 #define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \
-			     BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID)
+			     BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \
+			     BPF_FIB_LOOKUP_SRC)
 
 BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
 	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index c6ad0d6e99b5..6337fb4504fd 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -1061,6 +1061,7 @@ static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = {
 	.udp6_lib_lookup = __udp6_lib_lookup,
 	.ipv6_setsockopt = do_ipv6_setsockopt,
 	.ipv6_getsockopt = do_ipv6_getsockopt,
+	.ipv6_dev_get_saddr = ipv6_dev_get_saddr,
 };
 
 static int __init inet6_init(void)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index a7d4a1a69f21..e0aa457f94a9 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3264,6 +3264,11 @@ union bpf_attr {
  *			and *params*->smac will not be set as output. A common
  *			use case is to call **bpf_redirect_neigh**\ () after
  *			doing **bpf_fib_lookup**\ ().
+ *		**BPF_FIB_LOOKUP_SRC**
+ *			Derive and set source IP addr in *params*->ipv{4,6}_src
+ *			for the nexthop. If the src addr cannot be derived,
+ *			**BPF_FIB_LKUP_RET_NO_SRC_ADDR** is returned. In this
+ *			case, *params*->dmac and *params*->smac are not set either.
  *
  *		*ctx* is either **struct xdp_md** for XDP programs or
  *		**struct sk_buff** tc cls_act programs.
@@ -6964,6 +6969,7 @@ enum {
 	BPF_FIB_LOOKUP_OUTPUT  = (1U << 1),
 	BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2),
 	BPF_FIB_LOOKUP_TBID    = (1U << 3),
+	BPF_FIB_LOOKUP_SRC     = (1U << 4),
 };
 
 enum {
@@ -6976,6 +6982,7 @@ enum {
 	BPF_FIB_LKUP_RET_UNSUPP_LWT,   /* fwd requires encapsulation */
 	BPF_FIB_LKUP_RET_NO_NEIGH,     /* no neighbor entry for nh */
 	BPF_FIB_LKUP_RET_FRAG_NEEDED,  /* fragmentation required to fwd */
+	BPF_FIB_LKUP_RET_NO_SRC_ADDR,  /* failed to derive IP src addr */
 };
 
 struct bpf_fib_lookup {
@@ -7010,6 +7017,9 @@ struct bpf_fib_lookup {
 		__u32	rt_metric;
 	};
 
+	/* input: source address to consider for lookup
+	 * output: source address result from lookup
+	 */
 	union {
 		__be32		ipv4_src;
 		__u32		ipv6_src[4];  /* in6_addr; network order */
-- 
cgit v1.2.3-70-g09d2


From 95b9a87c6a6b708cccda1f9b7baf9920b80cdabf Mon Sep 17 00:00:00 2001
From: David Morley <morleyd@google.com>
Date: Fri, 6 Oct 2023 01:18:40 +0000
Subject: tcp: record last received ipv6 flowlabel

In order to better estimate whether a data packet has been
retransmitted or is the result of a TLP, we save the last received
ipv6 flowlabel.

To make space for this field we resize the "ato" field in
inet_connection_sock as the current value of TCP_DELACK_MAX can be
fully contained in 8 bits and add a compile_time_assert ensuring this
field is the required size.

v2: addressed kernel bot feedback about dccp_delack_timer()
v3: addressed build error introduced by commit bbf80d713fe7 ("tcp:
derive delack_max from rto_min")

Signed-off-by: David Morley <morleyd@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Tested-by: David Morley <morleyd@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/inet_connection_sock.h |  5 ++++-
 include/net/tcp.h                  |  2 ++
 net/dccp/timer.c                   |  4 ++--
 net/ipv4/tcp.c                     |  4 ++--
 net/ipv4/tcp_input.c               | 15 +++++++++++++++
 net/ipv4/tcp_timer.c               |  2 +-
 6 files changed, 26 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 5d2fcc137b88..d6d9d1c1985a 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -114,7 +114,10 @@ struct inet_connection_sock {
 		__u8		  quick;	 /* Scheduled number of quick acks	   */
 		__u8		  pingpong;	 /* The session is interactive		   */
 		__u8		  retry;	 /* Number of attempts			   */
-		__u32		  ato;		 /* Predicted tick of soft clock	   */
+		#define ATO_BITS 8
+		__u32		  ato:ATO_BITS,	 /* Predicted tick of soft clock	   */
+				  lrcv_flowlabel:20, /* last received ipv6 flowlabel	   */
+				  unused:4;
 		unsigned long	  timeout;	 /* Currently scheduled timeout		   */
 		__u32		  lrcvtime;	 /* timestamp of last received data packet */
 		__u16		  last_seg_size; /* Size of last incoming segment	   */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 9eb0a2855311..7fdedf5c71f0 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -131,6 +131,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define TCP_FIN_TIMEOUT_MAX (120 * HZ) /* max TCP_LINGER2 value (two minutes) */
 
 #define TCP_DELACK_MAX	((unsigned)(HZ/5))	/* maximal time to delay before sending an ACK */
+static_assert((1 << ATO_BITS) > TCP_DELACK_MAX);
+
 #if HZ >= 100
 #define TCP_DELACK_MIN	((unsigned)(HZ/25))	/* minimal time to delay before sending an ACK */
 #define TCP_ATO_MIN	((unsigned)(HZ/25))
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
index b3255e87cc7e..a4cfb47b60e5 100644
--- a/net/dccp/timer.c
+++ b/net/dccp/timer.c
@@ -196,8 +196,8 @@ static void dccp_delack_timer(struct timer_list *t)
 	if (inet_csk_ack_scheduled(sk)) {
 		if (!inet_csk_in_pingpong_mode(sk)) {
 			/* Delayed ACK missed: inflate ATO. */
-			icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1,
-						 icsk->icsk_rto);
+			icsk->icsk_ack.ato = min_t(u32, icsk->icsk_ack.ato << 1,
+						   icsk->icsk_rto);
 		} else {
 			/* Delayed ACK missed: leave pingpong mode and
 			 * deflate ATO.
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 9a8b134d8ada..faabb5a4a378 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3756,8 +3756,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 		info->tcpi_options |= TCPI_OPT_SYN_DATA;
 
 	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
-	info->tcpi_ato = jiffies_to_usecs(min(icsk->icsk_ack.ato,
-					      tcp_delack_max(sk)));
+	info->tcpi_ato = jiffies_to_usecs(min_t(u32, icsk->icsk_ack.ato,
+						tcp_delack_max(sk)));
 	info->tcpi_snd_mss = tp->mss_cache;
 	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4b8f2e74d71d..2ae4b211c125 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -778,6 +778,16 @@ new_measure:
 	tp->rcvq_space.time = tp->tcp_mstamp;
 }
 
+static void tcp_save_lrcv_flowlabel(struct sock *sk, const struct sk_buff *skb)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (skb->protocol == htons(ETH_P_IPV6))
+		icsk->icsk_ack.lrcv_flowlabel = ntohl(ip6_flowlabel(ipv6_hdr(skb)));
+#endif
+}
+
 /* There is something which you must keep in mind when you analyze the
  * behavior of the tp->ato delayed ack timeout interval.  When a
  * connection starts up, we want to ack as quickly as possible.  The
@@ -826,6 +836,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
 		}
 	}
 	icsk->icsk_ack.lrcvtime = now;
+	tcp_save_lrcv_flowlabel(sk, skb);
 
 	tcp_ecn_check_ce(sk, skb);
 
@@ -4519,6 +4530,9 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb)
 	if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq &&
 	    sk_rethink_txhash(sk))
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH);
+
+	/* Save last flowlabel after a spurious retrans. */
+	tcp_save_lrcv_flowlabel(sk, skb);
 }
 
 static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
@@ -4835,6 +4849,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 	u32 seq, end_seq;
 	bool fragstolen;
 
+	tcp_save_lrcv_flowlabel(sk, skb);
 	tcp_ecn_check_ce(sk, skb);
 
 	if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 3f61c6a70a1f..0862b73dd3b5 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -322,7 +322,7 @@ void tcp_delack_timer_handler(struct sock *sk)
 	if (inet_csk_ack_scheduled(sk)) {
 		if (!inet_csk_in_pingpong_mode(sk)) {
 			/* Delayed ACK missed: inflate ATO. */
-			icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto);
+			icsk->icsk_ack.ato = min_t(u32, icsk->icsk_ack.ato << 1, icsk->icsk_rto);
 		} else {
 			/* Delayed ACK missed: leave pingpong mode and
 			 * deflate ATO.
-- 
cgit v1.2.3-70-g09d2


From 8a23f4ab92f9b7f258ca28cce2e34b80f56ab9d1 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 6 Oct 2023 11:27:29 +0200
Subject: netfilter: conntrack: simplify nf_conntrack_alter_reply

nf_conntrack_alter_reply doesn't do helper reassignment anymore.
Remove the comments that make this claim.

Furthermore, remove dead code from the function and place ot
in nf_conntrack.h.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/net/netfilter/nf_conntrack.h | 14 ++++++++++----
 net/netfilter/nf_conntrack_core.c    | 18 ------------------
 net/netfilter/nf_conntrack_helper.c  |  7 +------
 3 files changed, 11 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 4085765c3370..cba3ccf03fcc 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -160,10 +160,6 @@ static inline struct net *nf_ct_net(const struct nf_conn *ct)
 	return read_pnet(&ct->ct_net);
 }
 
-/* Alter reply tuple (maybe alter helper). */
-void nf_conntrack_alter_reply(struct nf_conn *ct,
-			      const struct nf_conntrack_tuple *newreply);
-
 /* Is this tuple taken? (ignoring any belonging to the given
    conntrack). */
 int nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
@@ -284,6 +280,16 @@ static inline bool nf_is_loopback_packet(const struct sk_buff *skb)
 	return skb->dev && skb->skb_iif && skb->dev->flags & IFF_LOOPBACK;
 }
 
+static inline void nf_conntrack_alter_reply(struct nf_conn *ct,
+					    const struct nf_conntrack_tuple *newreply)
+{
+	/* Must be unconfirmed, so not in hash table yet */
+	if (WARN_ON(nf_ct_is_confirmed(ct)))
+		return;
+
+	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
+}
+
 #define nfct_time_stamp ((u32)(jiffies))
 
 /* jiffies until ct expires, 0 if already expired */
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 9f6f2e643575..124136b5a79a 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -2042,24 +2042,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_in);
 
-/* Alter reply tuple (maybe alter helper).  This is for NAT, and is
-   implicitly racy: see __nf_conntrack_confirm */
-void nf_conntrack_alter_reply(struct nf_conn *ct,
-			      const struct nf_conntrack_tuple *newreply)
-{
-	struct nf_conn_help *help = nfct_help(ct);
-
-	/* Should be unconfirmed, so not in hash table yet */
-	WARN_ON(nf_ct_is_confirmed(ct));
-
-	nf_ct_dump_tuple(newreply);
-
-	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
-	if (ct->master || (help && !hlist_empty(&help->expectations)))
-		return;
-}
-EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);
-
 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
 void __nf_ct_refresh_acct(struct nf_conn *ct,
 			  enum ip_conntrack_info ctinfo,
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index f22691f83853..4ed5878cb25b 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -194,12 +194,7 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
 	struct nf_conntrack_helper *helper = NULL;
 	struct nf_conn_help *help;
 
-	/* We already got a helper explicitly attached. The function
-	 * nf_conntrack_alter_reply - in case NAT is in use - asks for looking
-	 * the helper up again. Since now the user is in full control of
-	 * making consistent helper configurations, skip this automatic
-	 * re-lookup, otherwise we'll lose the helper.
-	 */
+	/* We already got a helper explicitly attached (e.g. nft_ct) */
 	if (test_bit(IPS_HELPER_BIT, &ct->status))
 		return 0;
 
-- 
cgit v1.2.3-70-g09d2


From 94ecde833be5779f8086c3a094dfa51e1dbce75f Mon Sep 17 00:00:00 2001
From: George Guo <guodongtai@kylinos.cn>
Date: Mon, 9 Oct 2023 10:55:48 +0800
Subject: netfilter: cleanup struct nft_table

Add comments for nlpid, family, udlen and udata in struct nft_table, and
afinfo is no longer a member of struct nft_table, so remove the comment
for it.

Signed-off-by: George Guo <guodongtai@kylinos.cn>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/net/netfilter/nf_tables.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 7c816359d5a9..9fb16485d08f 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1198,10 +1198,13 @@ static inline void nft_use_inc_restore(u32 *use)
  *	@hgenerator: handle generator state
  *	@handle: table handle
  *	@use: number of chain references to this table
+ *	@family:address family
  *	@flags: table flag (see enum nft_table_flags)
  *	@genmask: generation mask
- *	@afinfo: address family info
+ *	@nlpid: netlink port ID
  *	@name: name of the table
+ *	@udlen: length of the user data
+ *	@udata: user data
  *	@validate_state: internal, set when transaction adds jumps
  */
 struct nft_table {
-- 
cgit v1.2.3-70-g09d2


From 5247dbf16cee4e83eb89e4d3b87bd5e79c5d1655 Mon Sep 17 00:00:00 2001
From: Yajun Deng <yajun.deng@linux.dev>
Date: Mon, 9 Oct 2023 19:16:33 +0800
Subject: net/core: Introduce netdev_core_stats_inc()

Although there is a kfree_skb_reason() helper function that can be used to
find the reason why this skb is dropped, but most callers didn't increase
one of rx_dropped, tx_dropped, rx_nohandler and rx_otherhost_dropped.

For the users, people are more concerned about why the dropped in ip
is increasing.

Introduce netdev_core_stats_inc() for trace the caller of
dev_core_stats_*_inc().

Also, add __code to netdev_core_stats_alloc(), as it's called with small
probability. And add noinline make sure netdev_core_stats_inc was never
inlined.

Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
Suggested-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 21 ++++-----------------
 net/core/dev.c            | 21 +++++++++++++++++++--
 2 files changed, 23 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e070a4540fba..11d704bfec9b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4002,32 +4002,19 @@ static __always_inline bool __is_skb_forwardable(const struct net_device *dev,
 	return false;
 }
 
-struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev);
-
-static inline struct net_device_core_stats __percpu *dev_core_stats(struct net_device *dev)
-{
-	/* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
-	struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats);
-
-	if (likely(p))
-		return p;
-
-	return netdev_core_stats_alloc(dev);
-}
+void netdev_core_stats_inc(struct net_device *dev, u32 offset);
 
 #define DEV_CORE_STATS_INC(FIELD)						\
 static inline void dev_core_stats_##FIELD##_inc(struct net_device *dev)		\
 {										\
-	struct net_device_core_stats __percpu *p;				\
-										\
-	p = dev_core_stats(dev);						\
-	if (p)									\
-		this_cpu_inc(p->FIELD);						\
+	netdev_core_stats_inc(dev,						\
+			offsetof(struct net_device_core_stats, FIELD));		\
 }
 DEV_CORE_STATS_INC(rx_dropped)
 DEV_CORE_STATS_INC(tx_dropped)
 DEV_CORE_STATS_INC(rx_nohandler)
 DEV_CORE_STATS_INC(rx_otherhost_dropped)
+#undef DEV_CORE_STATS_INC
 
 static __always_inline int ____dev_forward_skb(struct net_device *dev,
 					       struct sk_buff *skb,
diff --git a/net/core/dev.c b/net/core/dev.c
index 606a366cc209..02949a929e7f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10497,7 +10497,8 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
 }
 EXPORT_SYMBOL(netdev_stats_to_stats64);
 
-struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev)
+static __cold struct net_device_core_stats __percpu *netdev_core_stats_alloc(
+		struct net_device *dev)
 {
 	struct net_device_core_stats __percpu *p;
 
@@ -10510,7 +10511,23 @@ struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device
 	/* This READ_ONCE() pairs with the cmpxchg() above */
 	return READ_ONCE(dev->core_stats);
 }
-EXPORT_SYMBOL(netdev_core_stats_alloc);
+
+noinline void netdev_core_stats_inc(struct net_device *dev, u32 offset)
+{
+	/* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
+	struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats);
+	unsigned long __percpu *field;
+
+	if (unlikely(!p)) {
+		p = netdev_core_stats_alloc(dev);
+		if (!p)
+			return;
+	}
+
+	field = (__force unsigned long __percpu *)((__force void *)p + offset);
+	this_cpu_inc(*field);
+}
+EXPORT_SYMBOL_GPL(netdev_core_stats_inc);
 
 /**
  *	dev_get_stats	- get network device statistics
-- 
cgit v1.2.3-70-g09d2


From fefba7d1ae198dcbf8b3b432de46a4e29f8dbd8c Mon Sep 17 00:00:00 2001
From: Daan De Meyer <daan.j.demeyer@gmail.com>
Date: Wed, 11 Oct 2023 20:51:04 +0200
Subject: bpf: Propagate modified uaddrlen from cgroup sockaddr programs

As prep for adding unix socket support to the cgroup sockaddr hooks,
let's propagate the sockaddr length back to the caller after running
a bpf cgroup sockaddr hook program. While not important for AF_INET or
AF_INET6, the sockaddr length is important when working with AF_UNIX
sockaddrs as the size of the sockaddr cannot be determined just from the
address family or the sockaddr's contents.

__cgroup_bpf_run_filter_sock_addr() is modified to take the uaddrlen as
an input/output argument. After running the program, the modified sockaddr
length is stored in the uaddrlen pointer.

Signed-off-by: Daan De Meyer <daan.j.demeyer@gmail.com>
Link: https://lore.kernel.org/r/20231011185113.140426-3-daan.j.demeyer@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf-cgroup.h | 73 +++++++++++++++++++++++-----------------------
 include/linux/filter.h     |  1 +
 kernel/bpf/cgroup.c        | 17 +++++++++--
 net/ipv4/af_inet.c         |  7 +++--
 net/ipv4/ping.c            |  2 +-
 net/ipv4/tcp_ipv4.c        |  2 +-
 net/ipv4/udp.c             |  9 ++++--
 net/ipv6/af_inet6.c        |  9 +++---
 net/ipv6/ping.c            |  2 +-
 net/ipv6/tcp_ipv6.c        |  2 +-
 net/ipv6/udp.c             |  6 ++--
 11 files changed, 76 insertions(+), 54 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 8506690dbb9c..31561e789715 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -120,6 +120,7 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
 
 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 				      struct sockaddr *uaddr,
+				      int *uaddrlen,
 				      enum cgroup_bpf_attach_type atype,
 				      void *t_ctx,
 				      u32 *flags);
@@ -230,22 +231,22 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
 #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk)				       \
 	BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET6_POST_BIND)
 
-#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype)				       \
+#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, atype)		       \
 ({									       \
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled(atype))					       \
-		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype,     \
-							  NULL, NULL);	       \
+		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, uaddrlen, \
+							  atype, NULL, NULL);  \
 	__ret;								       \
 })
 
-#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx)		       \
+#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, atype, t_ctx)	       \
 ({									       \
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled(atype))	{				       \
 		lock_sock(sk);						       \
-		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype,     \
-							  t_ctx, NULL);	       \
+		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, uaddrlen, \
+							  atype, t_ctx, NULL); \
 		release_sock(sk);					       \
 	}								       \
 	__ret;								       \
@@ -256,14 +257,14 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
  * (at bit position 0) is to indicate CAP_NET_BIND_SERVICE capability check
  * should be bypassed (BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE).
  */
-#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, atype, bind_flags)	       \
+#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, uaddrlen, atype, bind_flags) \
 ({									       \
 	u32 __flags = 0;						       \
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled(atype))	{				       \
 		lock_sock(sk);						       \
-		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype,     \
-							  NULL, &__flags);     \
+		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, uaddrlen, \
+							  atype, NULL, &__flags); \
 		release_sock(sk);					       \
 		if (__flags & BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE)	       \
 			*bind_flags |= BIND_NO_CAP_NET_BIND_SERVICE;	       \
@@ -276,29 +277,29 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
 	  cgroup_bpf_enabled(CGROUP_INET6_CONNECT)) &&		       \
 	 (sk)->sk_prot->pre_connect)
 
-#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr)			       \
-	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, CGROUP_INET4_CONNECT)
+#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, uaddrlen)			\
+	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, CGROUP_INET4_CONNECT)
 
-#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr)			       \
-	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, CGROUP_INET6_CONNECT)
+#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, uaddrlen)			\
+	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, CGROUP_INET6_CONNECT)
 
-#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr)		       \
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_INET4_CONNECT, NULL)
+#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, uaddrlen)		\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_INET4_CONNECT, NULL)
 
-#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr)		       \
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_INET6_CONNECT, NULL)
+#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, uaddrlen)		\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_INET6_CONNECT, NULL)
 
-#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx)		       \
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP4_SENDMSG, t_ctx)
+#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx)	\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP4_SENDMSG, t_ctx)
 
-#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx)		       \
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP6_SENDMSG, t_ctx)
+#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx)	\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP6_SENDMSG, t_ctx)
 
-#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr)			\
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP4_RECVMSG, NULL)
+#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr, uaddrlen)		\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP4_RECVMSG, NULL)
 
-#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr)			\
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP6_RECVMSG, NULL)
+#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr, uaddrlen)		\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP6_RECVMSG, NULL)
 
 /* The SOCK_OPS"_SK" macro should be used when sock_ops->sk is not a
  * fullsock and its parent fullsock cannot be traced by
@@ -477,24 +478,24 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 }
 
 #define cgroup_bpf_enabled(atype) (0)
-#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) ({ 0; })
-#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype) ({ 0; })
+#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, atype, t_ctx) ({ 0; })
+#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, atype) ({ 0; })
 #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, atype, flags) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, uaddrlen, atype, flags) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, uaddrlen) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, uaddrlen) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos) ({ 0; })
diff --git a/include/linux/filter.h b/include/linux/filter.h
index ff7ecc89d3dd..bcd2bc15ff56 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1335,6 +1335,7 @@ struct bpf_sock_addr_kern {
 	 */
 	u64 tmp_reg;
 	void *t_ctx;	/* Attach type specific context. */
+	u32 uaddrlen;
 };
 
 struct bpf_sock_ops_kern {
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 03b3d4492980..ac37bd53aee0 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1450,6 +1450,9 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
  *                                       provided by user sockaddr
  * @sk: sock struct that will use sockaddr
  * @uaddr: sockaddr struct provided by user
+ * @uaddrlen: Pointer to the size of the sockaddr struct provided by user. It is
+ *            read-only for AF_INET[6] uaddr but can be modified for AF_UNIX
+ *            uaddr.
  * @atype: The type of program to be executed
  * @t_ctx: Pointer to attach type specific context
  * @flags: Pointer to u32 which contains higher bits of BPF program
@@ -1462,6 +1465,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
  */
 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 				      struct sockaddr *uaddr,
+				      int *uaddrlen,
 				      enum cgroup_bpf_attach_type atype,
 				      void *t_ctx,
 				      u32 *flags)
@@ -1473,6 +1477,7 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 	};
 	struct sockaddr_storage unspec;
 	struct cgroup *cgrp;
+	int ret;
 
 	/* Check socket family since not all sockets represent network
 	 * endpoint (e.g. AF_UNIX).
@@ -1483,11 +1488,19 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 	if (!ctx.uaddr) {
 		memset(&unspec, 0, sizeof(unspec));
 		ctx.uaddr = (struct sockaddr *)&unspec;
+		ctx.uaddrlen = 0;
+	} else {
+		ctx.uaddrlen = *uaddrlen;
 	}
 
 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-	return bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run,
-				     0, flags);
+	ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run,
+				    0, flags);
+
+	if (!ret && uaddr)
+		*uaddrlen = ctx.uaddrlen;
+
+	return ret;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 3d2e30e20473..7e27ad37b939 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -452,7 +452,7 @@ int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	/* BPF prog is run before any checks are done so that if the prog
 	 * changes context in a wrong way it will be caught.
 	 */
-	err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr,
+	err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, &addr_len,
 						 CGROUP_INET4_BIND, &flags);
 	if (err)
 		return err;
@@ -788,6 +788,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
 	struct sock *sk		= sock->sk;
 	struct inet_sock *inet	= inet_sk(sk);
 	DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr);
+	int sin_addr_len = sizeof(*sin);
 
 	sin->sin_family = AF_INET;
 	lock_sock(sk);
@@ -800,7 +801,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
 		}
 		sin->sin_port = inet->inet_dport;
 		sin->sin_addr.s_addr = inet->inet_daddr;
-		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin,
+		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
 				       CGROUP_INET4_GETPEERNAME);
 	} else {
 		__be32 addr = inet->inet_rcv_saddr;
@@ -808,7 +809,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
 			addr = inet->inet_saddr;
 		sin->sin_port = inet->inet_sport;
 		sin->sin_addr.s_addr = addr;
-		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin,
+		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
 				       CGROUP_INET4_GETSOCKNAME);
 	}
 	release_sock(sk);
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 4dd809b7b188..2887177822c9 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -301,7 +301,7 @@ static int ping_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 	if (addr_len < sizeof(struct sockaddr_in))
 		return -EINVAL;
 
-	return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr);
+	return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, &addr_len);
 }
 
 /* Checks the bind address and possibly modifies sk->sk_bound_dev_if. */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f13eb7e23d03..7c18dd3ce011 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -194,7 +194,7 @@ static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 
 	sock_owned_by_me(sk);
 
-	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
+	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
 }
 
 /* This will initiate an outgoing connection. */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index c3ff984b6354..7b21a51dd25a 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1143,7 +1143,9 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 
 	if (cgroup_bpf_enabled(CGROUP_UDP4_SENDMSG) && !connected) {
 		err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
-					    (struct sockaddr *)usin, &ipc.addr);
+					    (struct sockaddr *)usin,
+					    &msg->msg_namelen,
+					    &ipc.addr);
 		if (err)
 			goto out_free;
 		if (usin) {
@@ -1865,7 +1867,8 @@ try_again:
 		*addr_len = sizeof(*sin);
 
 		BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk,
-						      (struct sockaddr *)sin);
+						      (struct sockaddr *)sin,
+						      addr_len);
 	}
 
 	if (udp_test_bit(GRO_ENABLED, sk))
@@ -1904,7 +1907,7 @@ int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	if (addr_len < sizeof(struct sockaddr_in))
 		return -EINVAL;
 
-	return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr);
+	return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, &addr_len);
 }
 EXPORT_SYMBOL(udp_pre_connect);
 
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 6337fb4504fd..c35d302a3da9 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -454,7 +454,7 @@ int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	/* BPF prog is run before any checks are done so that if the prog
 	 * changes context in a wrong way it will be caught.
 	 */
-	err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr,
+	err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, &addr_len,
 						 CGROUP_INET6_BIND, &flags);
 	if (err)
 		return err;
@@ -520,6 +520,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
 		  int peer)
 {
 	struct sockaddr_in6 *sin = (struct sockaddr_in6 *)uaddr;
+	int sin_addr_len = sizeof(*sin);
 	struct sock *sk = sock->sk;
 	struct inet_sock *inet = inet_sk(sk);
 	struct ipv6_pinfo *np = inet6_sk(sk);
@@ -539,7 +540,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
 		sin->sin6_addr = sk->sk_v6_daddr;
 		if (inet6_test_bit(SNDFLOW, sk))
 			sin->sin6_flowinfo = np->flow_label;
-		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin,
+		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
 				       CGROUP_INET6_GETPEERNAME);
 	} else {
 		if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
@@ -547,13 +548,13 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
 		else
 			sin->sin6_addr = sk->sk_v6_rcv_saddr;
 		sin->sin6_port = inet->inet_sport;
-		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin,
+		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
 				       CGROUP_INET6_GETSOCKNAME);
 	}
 	sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr,
 						 sk->sk_bound_dev_if);
 	release_sock(sk);
-	return sizeof(*sin);
+	return sin_addr_len;
 }
 EXPORT_SYMBOL(inet6_getname);
 
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index e8fb0d275cc2..d2098dd4ceae 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -56,7 +56,7 @@ static int ping_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 	if (addr_len < SIN6_LEN_RFC2133)
 		return -EINVAL;
 
-	return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr);
+	return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, &addr_len);
 }
 
 static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 94afb8d0f2d0..3a1e76a2d33e 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -135,7 +135,7 @@ static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 
 	sock_owned_by_me(sk);
 
-	return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr);
+	return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, &addr_len);
 }
 
 static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 5e9312eefed0..622b10a549f7 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -410,7 +410,8 @@ try_again:
 		*addr_len = sizeof(*sin6);
 
 		BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk,
-						      (struct sockaddr *)sin6);
+						      (struct sockaddr *)sin6,
+						      addr_len);
 	}
 
 	if (udp_test_bit(GRO_ENABLED, sk))
@@ -1157,7 +1158,7 @@ static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 	if (addr_len < SIN6_LEN_RFC2133)
 		return -EINVAL;
 
-	return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr);
+	return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, &addr_len);
 }
 
 /**
@@ -1510,6 +1511,7 @@ do_udp_sendmsg:
 	if (cgroup_bpf_enabled(CGROUP_UDP6_SENDMSG) && !connected) {
 		err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
 					   (struct sockaddr *)sin6,
+					   &addr_len,
 					   &fl6->saddr);
 		if (err)
 			goto out_no_dst;
-- 
cgit v1.2.3-70-g09d2


From 859051dd165ec6cc915f0f2114699021144fd249 Mon Sep 17 00:00:00 2001
From: Daan De Meyer <daan.j.demeyer@gmail.com>
Date: Wed, 11 Oct 2023 20:51:06 +0200
Subject: bpf: Implement cgroup sockaddr hooks for unix sockets

These hooks allows intercepting connect(), getsockname(),
getpeername(), sendmsg() and recvmsg() for unix sockets. The unix
socket hooks get write access to the address length because the
address length is not fixed when dealing with unix sockets and
needs to be modified when a unix socket address is modified by
the hook. Because abstract socket unix addresses start with a
NUL byte, we cannot recalculate the socket address in kernelspace
after running the hook by calculating the length of the unix socket
path using strlen().

These hooks can be used when users want to multiplex syscall to a
single unix socket to multiple different processes behind the scenes
by redirecting the connect() and other syscalls to process specific
sockets.

We do not implement support for intercepting bind() because when
using bind() with unix sockets with a pathname address, this creates
an inode in the filesystem which must be cleaned up. If we rewrite
the address, the user might try to clean up the wrong file, leaking
the socket in the filesystem where it is never cleaned up. Until we
figure out a solution for this (and a use case for intercepting bind()),
we opt to not allow rewriting the sockaddr in bind() calls.

We also implement recvmsg() support for connected streams so that
after a connect() that is modified by a sockaddr hook, any corresponding
recmvsg() on the connected socket can also be modified to make the
connected program think it is connected to the "intended" remote.

Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: Daan De Meyer <daan.j.demeyer@gmail.com>
Link: https://lore.kernel.org/r/20231011185113.140426-5-daan.j.demeyer@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf-cgroup-defs.h |  5 +++++
 include/linux/bpf-cgroup.h      | 17 +++++++++++++++++
 include/uapi/linux/bpf.h        | 13 +++++++++----
 kernel/bpf/cgroup.c             | 11 +++++++++--
 kernel/bpf/syscall.c            | 15 +++++++++++++++
 kernel/bpf/verifier.c           |  5 ++++-
 net/core/filter.c               | 14 ++++++++++++--
 net/unix/af_unix.c              | 35 ++++++++++++++++++++++++++++++++++-
 tools/include/uapi/linux/bpf.h  | 13 +++++++++----
 9 files changed, 114 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf-cgroup-defs.h b/include/linux/bpf-cgroup-defs.h
index 7b121bd780eb..0985221d5478 100644
--- a/include/linux/bpf-cgroup-defs.h
+++ b/include/linux/bpf-cgroup-defs.h
@@ -28,19 +28,24 @@ enum cgroup_bpf_attach_type {
 	CGROUP_INET6_BIND,
 	CGROUP_INET4_CONNECT,
 	CGROUP_INET6_CONNECT,
+	CGROUP_UNIX_CONNECT,
 	CGROUP_INET4_POST_BIND,
 	CGROUP_INET6_POST_BIND,
 	CGROUP_UDP4_SENDMSG,
 	CGROUP_UDP6_SENDMSG,
+	CGROUP_UNIX_SENDMSG,
 	CGROUP_SYSCTL,
 	CGROUP_UDP4_RECVMSG,
 	CGROUP_UDP6_RECVMSG,
+	CGROUP_UNIX_RECVMSG,
 	CGROUP_GETSOCKOPT,
 	CGROUP_SETSOCKOPT,
 	CGROUP_INET4_GETPEERNAME,
 	CGROUP_INET6_GETPEERNAME,
+	CGROUP_UNIX_GETPEERNAME,
 	CGROUP_INET4_GETSOCKNAME,
 	CGROUP_INET6_GETSOCKNAME,
+	CGROUP_UNIX_GETSOCKNAME,
 	CGROUP_INET_SOCK_RELEASE,
 	CGROUP_LSM_START,
 	CGROUP_LSM_END = CGROUP_LSM_START + CGROUP_LSM_NUM - 1,
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 31561e789715..98b8cea904fe 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -48,19 +48,24 @@ to_cgroup_bpf_attach_type(enum bpf_attach_type attach_type)
 	CGROUP_ATYPE(CGROUP_INET6_BIND);
 	CGROUP_ATYPE(CGROUP_INET4_CONNECT);
 	CGROUP_ATYPE(CGROUP_INET6_CONNECT);
+	CGROUP_ATYPE(CGROUP_UNIX_CONNECT);
 	CGROUP_ATYPE(CGROUP_INET4_POST_BIND);
 	CGROUP_ATYPE(CGROUP_INET6_POST_BIND);
 	CGROUP_ATYPE(CGROUP_UDP4_SENDMSG);
 	CGROUP_ATYPE(CGROUP_UDP6_SENDMSG);
+	CGROUP_ATYPE(CGROUP_UNIX_SENDMSG);
 	CGROUP_ATYPE(CGROUP_SYSCTL);
 	CGROUP_ATYPE(CGROUP_UDP4_RECVMSG);
 	CGROUP_ATYPE(CGROUP_UDP6_RECVMSG);
+	CGROUP_ATYPE(CGROUP_UNIX_RECVMSG);
 	CGROUP_ATYPE(CGROUP_GETSOCKOPT);
 	CGROUP_ATYPE(CGROUP_SETSOCKOPT);
 	CGROUP_ATYPE(CGROUP_INET4_GETPEERNAME);
 	CGROUP_ATYPE(CGROUP_INET6_GETPEERNAME);
+	CGROUP_ATYPE(CGROUP_UNIX_GETPEERNAME);
 	CGROUP_ATYPE(CGROUP_INET4_GETSOCKNAME);
 	CGROUP_ATYPE(CGROUP_INET6_GETSOCKNAME);
+	CGROUP_ATYPE(CGROUP_UNIX_GETSOCKNAME);
 	CGROUP_ATYPE(CGROUP_INET_SOCK_RELEASE);
 	default:
 		return CGROUP_BPF_ATTACH_TYPE_INVALID;
@@ -289,18 +294,27 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
 #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, uaddrlen)		\
 	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_INET6_CONNECT, NULL)
 
+#define BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, uaddrlen)		\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UNIX_CONNECT, NULL)
+
 #define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx)	\
 	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP4_SENDMSG, t_ctx)
 
 #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx)	\
 	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP6_SENDMSG, t_ctx)
 
+#define BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx)	\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UNIX_SENDMSG, t_ctx)
+
 #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr, uaddrlen)		\
 	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP4_RECVMSG, NULL)
 
 #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr, uaddrlen)		\
 	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP6_RECVMSG, NULL)
 
+#define BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, uaddr, uaddrlen)		\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UNIX_RECVMSG, NULL)
+
 /* The SOCK_OPS"_SK" macro should be used when sock_ops->sk is not a
  * fullsock and its parent fullsock cannot be traced by
  * sk_to_full_sk().
@@ -492,10 +506,13 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, uaddrlen) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos) ({ 0; })
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e0aa457f94a9..7ba61b75bc0e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1047,6 +1047,11 @@ enum bpf_attach_type {
 	BPF_TCX_INGRESS,
 	BPF_TCX_EGRESS,
 	BPF_TRACE_UPROBE_MULTI,
+	BPF_CGROUP_UNIX_CONNECT,
+	BPF_CGROUP_UNIX_SENDMSG,
+	BPF_CGROUP_UNIX_RECVMSG,
+	BPF_CGROUP_UNIX_GETPEERNAME,
+	BPF_CGROUP_UNIX_GETSOCKNAME,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -2704,8 +2709,8 @@ union bpf_attr {
  * 		*bpf_socket* should be one of the following:
  *
  * 		* **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
- * 		* **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
- * 		  and **BPF_CGROUP_INET6_CONNECT**.
+ *		* **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**,
+ *		  **BPF_CGROUP_INET6_CONNECT** and **BPF_CGROUP_UNIX_CONNECT**.
  *
  * 		This helper actually implements a subset of **setsockopt()**.
  * 		It supports the following *level*\ s:
@@ -2943,8 +2948,8 @@ union bpf_attr {
  * 		*bpf_socket* should be one of the following:
  *
  * 		* **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
- * 		* **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
- * 		  and **BPF_CGROUP_INET6_CONNECT**.
+ *		* **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**,
+ *		  **BPF_CGROUP_INET6_CONNECT** and **BPF_CGROUP_UNIX_CONNECT**.
  *
  * 		This helper actually implements a subset of **getsockopt()**.
  * 		It supports the same set of *optname*\ s that is supported by
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index ac37bd53aee0..74ad2215e1ba 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1458,7 +1458,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
  * @flags: Pointer to u32 which contains higher bits of BPF program
  *         return value (OR'ed together).
  *
- * socket is expected to be of type INET or INET6.
+ * socket is expected to be of type INET, INET6 or UNIX.
  *
  * This function will return %-EPERM if an attached program is found and
  * returned value != 1 during execution. In all other cases, 0 is returned.
@@ -1482,7 +1482,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 	/* Check socket family since not all sockets represent network
 	 * endpoint (e.g. AF_UNIX).
 	 */
-	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
+	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6 &&
+	    sk->sk_family != AF_UNIX)
 		return 0;
 
 	if (!ctx.uaddr) {
@@ -2533,10 +2534,13 @@ cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		case BPF_CGROUP_SOCK_OPS:
 		case BPF_CGROUP_UDP4_RECVMSG:
 		case BPF_CGROUP_UDP6_RECVMSG:
+		case BPF_CGROUP_UNIX_RECVMSG:
 		case BPF_CGROUP_INET4_GETPEERNAME:
 		case BPF_CGROUP_INET6_GETPEERNAME:
+		case BPF_CGROUP_UNIX_GETPEERNAME:
 		case BPF_CGROUP_INET4_GETSOCKNAME:
 		case BPF_CGROUP_INET6_GETSOCKNAME:
+		case BPF_CGROUP_UNIX_GETSOCKNAME:
 			return NULL;
 		default:
 			return &bpf_get_retval_proto;
@@ -2548,10 +2552,13 @@ cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		case BPF_CGROUP_SOCK_OPS:
 		case BPF_CGROUP_UDP4_RECVMSG:
 		case BPF_CGROUP_UDP6_RECVMSG:
+		case BPF_CGROUP_UNIX_RECVMSG:
 		case BPF_CGROUP_INET4_GETPEERNAME:
 		case BPF_CGROUP_INET6_GETPEERNAME:
+		case BPF_CGROUP_UNIX_GETPEERNAME:
 		case BPF_CGROUP_INET4_GETSOCKNAME:
 		case BPF_CGROUP_INET6_GETSOCKNAME:
+		case BPF_CGROUP_UNIX_GETSOCKNAME:
 			return NULL;
 		default:
 			return &bpf_set_retval_proto;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6b5280f14a53..8677837f3deb 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2446,14 +2446,19 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
 		case BPF_CGROUP_INET6_BIND:
 		case BPF_CGROUP_INET4_CONNECT:
 		case BPF_CGROUP_INET6_CONNECT:
+		case BPF_CGROUP_UNIX_CONNECT:
 		case BPF_CGROUP_INET4_GETPEERNAME:
 		case BPF_CGROUP_INET6_GETPEERNAME:
+		case BPF_CGROUP_UNIX_GETPEERNAME:
 		case BPF_CGROUP_INET4_GETSOCKNAME:
 		case BPF_CGROUP_INET6_GETSOCKNAME:
+		case BPF_CGROUP_UNIX_GETSOCKNAME:
 		case BPF_CGROUP_UDP4_SENDMSG:
 		case BPF_CGROUP_UDP6_SENDMSG:
+		case BPF_CGROUP_UNIX_SENDMSG:
 		case BPF_CGROUP_UDP4_RECVMSG:
 		case BPF_CGROUP_UDP6_RECVMSG:
+		case BPF_CGROUP_UNIX_RECVMSG:
 			return 0;
 		default:
 			return -EINVAL;
@@ -3678,14 +3683,19 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
 	case BPF_CGROUP_INET6_BIND:
 	case BPF_CGROUP_INET4_CONNECT:
 	case BPF_CGROUP_INET6_CONNECT:
+	case BPF_CGROUP_UNIX_CONNECT:
 	case BPF_CGROUP_INET4_GETPEERNAME:
 	case BPF_CGROUP_INET6_GETPEERNAME:
+	case BPF_CGROUP_UNIX_GETPEERNAME:
 	case BPF_CGROUP_INET4_GETSOCKNAME:
 	case BPF_CGROUP_INET6_GETSOCKNAME:
+	case BPF_CGROUP_UNIX_GETSOCKNAME:
 	case BPF_CGROUP_UDP4_SENDMSG:
 	case BPF_CGROUP_UDP6_SENDMSG:
+	case BPF_CGROUP_UNIX_SENDMSG:
 	case BPF_CGROUP_UDP4_RECVMSG:
 	case BPF_CGROUP_UDP6_RECVMSG:
+	case BPF_CGROUP_UNIX_RECVMSG:
 		return BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
 	case BPF_CGROUP_SOCK_OPS:
 		return BPF_PROG_TYPE_SOCK_OPS;
@@ -3942,14 +3952,19 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_CGROUP_INET6_POST_BIND:
 	case BPF_CGROUP_INET4_CONNECT:
 	case BPF_CGROUP_INET6_CONNECT:
+	case BPF_CGROUP_UNIX_CONNECT:
 	case BPF_CGROUP_INET4_GETPEERNAME:
 	case BPF_CGROUP_INET6_GETPEERNAME:
+	case BPF_CGROUP_UNIX_GETPEERNAME:
 	case BPF_CGROUP_INET4_GETSOCKNAME:
 	case BPF_CGROUP_INET6_GETSOCKNAME:
+	case BPF_CGROUP_UNIX_GETSOCKNAME:
 	case BPF_CGROUP_UDP4_SENDMSG:
 	case BPF_CGROUP_UDP6_SENDMSG:
+	case BPF_CGROUP_UNIX_SENDMSG:
 	case BPF_CGROUP_UDP4_RECVMSG:
 	case BPF_CGROUP_UDP6_RECVMSG:
+	case BPF_CGROUP_UNIX_RECVMSG:
 	case BPF_CGROUP_SOCK_OPS:
 	case BPF_CGROUP_DEVICE:
 	case BPF_CGROUP_SYSCTL:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index eed7350e15f4..e777f50401b6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -14797,10 +14797,13 @@ static int check_return_code(struct bpf_verifier_env *env, int regno)
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
 		if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
 		    env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG ||
+		    env->prog->expected_attach_type == BPF_CGROUP_UNIX_RECVMSG ||
 		    env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME ||
 		    env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME ||
+		    env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETPEERNAME ||
 		    env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME ||
-		    env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME)
+		    env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME ||
+		    env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETSOCKNAME)
 			range = tnum_range(1, 1);
 		if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND ||
 		    env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND)
diff --git a/net/core/filter.c b/net/core/filter.c
index ff0bd9f20b95..cc2e4babc85f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -7875,14 +7875,19 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		case BPF_CGROUP_INET6_BIND:
 		case BPF_CGROUP_INET4_CONNECT:
 		case BPF_CGROUP_INET6_CONNECT:
+		case BPF_CGROUP_UNIX_CONNECT:
 		case BPF_CGROUP_UDP4_RECVMSG:
 		case BPF_CGROUP_UDP6_RECVMSG:
+		case BPF_CGROUP_UNIX_RECVMSG:
 		case BPF_CGROUP_UDP4_SENDMSG:
 		case BPF_CGROUP_UDP6_SENDMSG:
+		case BPF_CGROUP_UNIX_SENDMSG:
 		case BPF_CGROUP_INET4_GETPEERNAME:
 		case BPF_CGROUP_INET6_GETPEERNAME:
+		case BPF_CGROUP_UNIX_GETPEERNAME:
 		case BPF_CGROUP_INET4_GETSOCKNAME:
 		case BPF_CGROUP_INET6_GETSOCKNAME:
+		case BPF_CGROUP_UNIX_GETSOCKNAME:
 			return &bpf_sock_addr_setsockopt_proto;
 		default:
 			return NULL;
@@ -7893,14 +7898,19 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		case BPF_CGROUP_INET6_BIND:
 		case BPF_CGROUP_INET4_CONNECT:
 		case BPF_CGROUP_INET6_CONNECT:
+		case BPF_CGROUP_UNIX_CONNECT:
 		case BPF_CGROUP_UDP4_RECVMSG:
 		case BPF_CGROUP_UDP6_RECVMSG:
+		case BPF_CGROUP_UNIX_RECVMSG:
 		case BPF_CGROUP_UDP4_SENDMSG:
 		case BPF_CGROUP_UDP6_SENDMSG:
+		case BPF_CGROUP_UNIX_SENDMSG:
 		case BPF_CGROUP_INET4_GETPEERNAME:
 		case BPF_CGROUP_INET6_GETPEERNAME:
+		case BPF_CGROUP_UNIX_GETPEERNAME:
 		case BPF_CGROUP_INET4_GETSOCKNAME:
 		case BPF_CGROUP_INET6_GETSOCKNAME:
+		case BPF_CGROUP_UNIX_GETSOCKNAME:
 			return &bpf_sock_addr_getsockopt_proto;
 		default:
 			return NULL;
@@ -8948,8 +8958,8 @@ static bool sock_addr_is_valid_access(int off, int size,
 	if (off % size != 0)
 		return false;
 
-	/* Disallow access to IPv6 fields from IPv4 contex and vise
-	 * versa.
+	/* Disallow access to fields not belonging to the attach type's address
+	 * family.
 	 */
 	switch (off) {
 	case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 3e8a04a13668..e10d07c76044 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -116,6 +116,7 @@
 #include <linux/freezer.h>
 #include <linux/file.h>
 #include <linux/btf_ids.h>
+#include <linux/bpf-cgroup.h>
 
 #include "scm.h"
 
@@ -1381,6 +1382,10 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
 		if (err)
 			goto out;
 
+		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
+		if (err)
+			goto out;
+
 		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
 		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
 		    !unix_sk(sk)->addr) {
@@ -1490,6 +1495,10 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	if (err)
 		goto out;
 
+	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
+	if (err)
+		goto out;
+
 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
 		err = unix_autobind(sk);
@@ -1770,6 +1779,13 @@ static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
 	} else {
 		err = addr->len;
 		memcpy(sunaddr, addr->name, addr->len);
+
+		if (peer)
+			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
+					       CGROUP_UNIX_GETPEERNAME);
+		else
+			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
+					       CGROUP_UNIX_GETSOCKNAME);
 	}
 	sock_put(sk);
 out:
@@ -1922,6 +1938,13 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
 		err = unix_validate_addr(sunaddr, msg->msg_namelen);
 		if (err)
 			goto out;
+
+		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
+							    msg->msg_name,
+							    &msg->msg_namelen,
+							    NULL);
+		if (err)
+			goto out;
 	} else {
 		sunaddr = NULL;
 		err = -ENOTCONN;
@@ -2390,9 +2413,14 @@ int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
 						EPOLLOUT | EPOLLWRNORM |
 						EPOLLWRBAND);
 
-	if (msg->msg_name)
+	if (msg->msg_name) {
 		unix_copy_addr(msg, skb->sk);
 
+		BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
+						      msg->msg_name,
+						      &msg->msg_namelen);
+	}
+
 	if (size > skb->len - skip)
 		size = skb->len - skip;
 	else if (size < skb->len - skip)
@@ -2744,6 +2772,11 @@ unlock:
 			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
 					 state->msg->msg_name);
 			unix_copy_addr(state->msg, skb->sk);
+
+			BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
+							      state->msg->msg_name,
+							      &state->msg->msg_namelen);
+
 			sunaddr = NULL;
 		}
 
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e0aa457f94a9..7ba61b75bc0e 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1047,6 +1047,11 @@ enum bpf_attach_type {
 	BPF_TCX_INGRESS,
 	BPF_TCX_EGRESS,
 	BPF_TRACE_UPROBE_MULTI,
+	BPF_CGROUP_UNIX_CONNECT,
+	BPF_CGROUP_UNIX_SENDMSG,
+	BPF_CGROUP_UNIX_RECVMSG,
+	BPF_CGROUP_UNIX_GETPEERNAME,
+	BPF_CGROUP_UNIX_GETSOCKNAME,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -2704,8 +2709,8 @@ union bpf_attr {
  * 		*bpf_socket* should be one of the following:
  *
  * 		* **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
- * 		* **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
- * 		  and **BPF_CGROUP_INET6_CONNECT**.
+ *		* **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**,
+ *		  **BPF_CGROUP_INET6_CONNECT** and **BPF_CGROUP_UNIX_CONNECT**.
  *
  * 		This helper actually implements a subset of **setsockopt()**.
  * 		It supports the following *level*\ s:
@@ -2943,8 +2948,8 @@ union bpf_attr {
  * 		*bpf_socket* should be one of the following:
  *
  * 		* **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
- * 		* **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
- * 		  and **BPF_CGROUP_INET6_CONNECT**.
+ *		* **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**,
+ *		  **BPF_CGROUP_INET6_CONNECT** and **BPF_CGROUP_UNIX_CONNECT**.
  *
  * 		This helper actually implements a subset of **getsockopt()**.
  * 		It supports the same set of *optname*\ s that is supported by
-- 
cgit v1.2.3-70-g09d2


From 0a779003213b589d7b3eef72f69e19a30f603ebc Mon Sep 17 00:00:00 2001
From: Christian Marangi <ansuelsmth@gmail.com>
Date: Mon, 9 Oct 2023 15:37:51 +0200
Subject: netdev: make napi_schedule return bool on NAPI successful schedule

Change napi_schedule to return a bool on NAPI successful schedule.
This might be useful for some driver to do additional steps after a
NAPI has been scheduled.

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20231009133754.9834-2-ansuelsmth@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 11d704bfec9b..2874770a0a74 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -490,11 +490,18 @@ bool napi_schedule_prep(struct napi_struct *n);
  *
  * Schedule NAPI poll routine to be called if it is not already
  * running.
+ * Return true if we schedule a NAPI or false if not.
+ * Refer to napi_schedule_prep() for additional reason on why
+ * a NAPI might not be scheduled.
  */
-static inline void napi_schedule(struct napi_struct *n)
+static inline bool napi_schedule(struct napi_struct *n)
 {
-	if (napi_schedule_prep(n))
+	if (napi_schedule_prep(n)) {
 		__napi_schedule(n);
+		return true;
+	}
+
+	return false;
 }
 
 /**
-- 
cgit v1.2.3-70-g09d2


From 73382e919f3d938554dadd01d95760f90d1c25c1 Mon Sep 17 00:00:00 2001
From: Christian Marangi <ansuelsmth@gmail.com>
Date: Mon, 9 Oct 2023 15:37:52 +0200
Subject: netdev: replace napi_reschedule with napi_schedule

Now that napi_schedule return a bool, we can drop napi_reschedule that
does the same exact function. The function comes from a very old commit
bfe13f54f502 ("ibm_emac: Convert to use napi_struct independent of struct
net_device") and the purpose is actually deprecated in favour of
different logic.

Convert every user of napi_reschedule to napi_schedule.

Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Acked-by: Jeff Johnson <quic_jjohnson@quicinc.com> # ath10k
Acked-by: Nick Child <nnac123@linux.ibm.com> # ibm
Acked-by: Marc Kleine-Budde <mkl@pengutronix.de> # for can/dev/rx-offload.c
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://lore.kernel.org/r/20231009133754.9834-3-ansuelsmth@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/infiniband/ulp/ipoib/ipoib_ib.c                |  4 ++--
 drivers/net/can/dev/rx-offload.c                       |  2 +-
 drivers/net/ethernet/chelsio/cxgb4/sge.c               |  2 +-
 drivers/net/ethernet/chelsio/cxgb4vf/sge.c             |  2 +-
 drivers/net/ethernet/ezchip/nps_enet.c                 |  2 +-
 drivers/net/ethernet/google/gve/gve_main.c             |  2 +-
 drivers/net/ethernet/ibm/ehea/ehea_main.c              |  2 +-
 drivers/net/ethernet/ibm/emac/mal.c                    |  2 +-
 drivers/net/ethernet/ibm/ibmveth.c                     |  2 +-
 drivers/net/ethernet/ibm/ibmvnic.c                     |  2 +-
 drivers/net/ethernet/mellanox/mlx4/en_rx.c             |  2 +-
 drivers/net/ethernet/ni/nixge.c                        |  2 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c |  2 +-
 drivers/net/ethernet/xscale/ixp4xx_eth.c               |  4 ++--
 drivers/net/fjes/fjes_main.c                           |  2 +-
 drivers/net/wan/ixp4xx_hss.c                           |  4 ++--
 drivers/net/wireless/ath/ath10k/pci.c                  |  2 +-
 drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c             |  2 +-
 include/linux/netdevice.h                              | 10 ----------
 19 files changed, 21 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index ed25061fac62..7f84d9866cef 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -488,7 +488,7 @@ poll_more:
 		if (unlikely(ib_req_notify_cq(priv->recv_cq,
 					      IB_CQ_NEXT_COMP |
 					      IB_CQ_REPORT_MISSED_EVENTS)) &&
-		    napi_reschedule(napi))
+		    napi_schedule(napi))
 			goto poll_more;
 	}
 
@@ -518,7 +518,7 @@ poll_more:
 		napi_complete(napi);
 		if (unlikely(ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP |
 					      IB_CQ_REPORT_MISSED_EVENTS)) &&
-		    napi_reschedule(napi))
+		    napi_schedule(napi))
 			goto poll_more;
 	}
 	return n < 0 ? 0 : n;
diff --git a/drivers/net/can/dev/rx-offload.c b/drivers/net/can/dev/rx-offload.c
index 77091f7d1fa7..46e7b6db4a1e 100644
--- a/drivers/net/can/dev/rx-offload.c
+++ b/drivers/net/can/dev/rx-offload.c
@@ -67,7 +67,7 @@ static int can_rx_offload_napi_poll(struct napi_struct *napi, int quota)
 
 		/* Check if there was another interrupt */
 		if (!skb_queue_empty(&offload->skb_queue))
-			napi_reschedule(&offload->napi);
+			napi_schedule(&offload->napi);
 	}
 
 	return work_done;
diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c
index 98dd78551d89..b5ff2e1a9975 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c
@@ -4261,7 +4261,7 @@ static void sge_rx_timer_cb(struct timer_list *t)
 
 			if (fl_starving(adap, fl)) {
 				rxq = container_of(fl, struct sge_eth_rxq, fl);
-				if (napi_reschedule(&rxq->rspq.napi))
+				if (napi_schedule(&rxq->rspq.napi))
 					fl->starving++;
 				else
 					set_bit(id, s->starving_fl);
diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c
index 2d0cf76fb3c5..5b1d746e6563 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c
@@ -2094,7 +2094,7 @@ static void sge_rx_timer_cb(struct timer_list *t)
 				struct sge_eth_rxq *rxq;
 
 				rxq = container_of(fl, struct sge_eth_rxq, fl);
-				if (napi_reschedule(&rxq->rspq.napi))
+				if (napi_schedule(&rxq->rspq.napi))
 					fl->starving++;
 				else
 					set_bit(id, s->starving_fl);
diff --git a/drivers/net/ethernet/ezchip/nps_enet.c b/drivers/net/ethernet/ezchip/nps_enet.c
index edf000e7bab4..4d7184d46824 100644
--- a/drivers/net/ethernet/ezchip/nps_enet.c
+++ b/drivers/net/ethernet/ezchip/nps_enet.c
@@ -198,7 +198,7 @@ static int nps_enet_poll(struct napi_struct *napi, int budget)
 		 */
 		if (nps_enet_is_tx_pending(priv)) {
 			nps_enet_reg_set(priv, NPS_ENET_REG_BUF_INT_ENABLE, 0);
-			napi_reschedule(napi);
+			napi_schedule(napi);
 		}
 	}
 
diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c
index 83b09dcfafc4..276f996f95dc 100644
--- a/drivers/net/ethernet/google/gve/gve_main.c
+++ b/drivers/net/ethernet/google/gve/gve_main.c
@@ -281,7 +281,7 @@ static int gve_napi_poll(struct napi_struct *napi, int budget)
 		if (block->rx)
 			reschedule |= gve_rx_work_pending(block->rx);
 
-		if (reschedule && napi_reschedule(napi))
+		if (reschedule && napi_schedule(napi))
 			iowrite32be(GVE_IRQ_MASK, irq_doorbell);
 	}
 	return work_done;
diff --git a/drivers/net/ethernet/ibm/ehea/ehea_main.c b/drivers/net/ethernet/ibm/ehea/ehea_main.c
index 251dedd55cfb..1e29e5c9a2df 100644
--- a/drivers/net/ethernet/ibm/ehea/ehea_main.c
+++ b/drivers/net/ethernet/ibm/ehea/ehea_main.c
@@ -900,7 +900,7 @@ static int ehea_poll(struct napi_struct *napi, int budget)
 		if (!cqe && !cqe_skb)
 			return rx;
 
-		if (!napi_reschedule(napi))
+		if (!napi_schedule(napi))
 			return rx;
 
 		cqe_skb = ehea_proc_cqes(pr, EHEA_POLL_MAX_CQES);
diff --git a/drivers/net/ethernet/ibm/emac/mal.c b/drivers/net/ethernet/ibm/emac/mal.c
index 462646d1b817..2439f7e96e05 100644
--- a/drivers/net/ethernet/ibm/emac/mal.c
+++ b/drivers/net/ethernet/ibm/emac/mal.c
@@ -442,7 +442,7 @@ static int mal_poll(struct napi_struct *napi, int budget)
 		if (unlikely(mc->ops->peek_rx(mc->dev) ||
 			     test_bit(MAL_COMMAC_RX_STOPPED, &mc->flags))) {
 			MAL_DBG2(mal, "rotting packet" NL);
-			if (!napi_reschedule(napi))
+			if (!napi_schedule(napi))
 				goto more_work;
 
 			spin_lock_irqsave(&mal->lock, flags);
diff --git a/drivers/net/ethernet/ibm/ibmveth.c b/drivers/net/ethernet/ibm/ibmveth.c
index a8d79ee350f8..b5aef0b29efe 100644
--- a/drivers/net/ethernet/ibm/ibmveth.c
+++ b/drivers/net/ethernet/ibm/ibmveth.c
@@ -1432,7 +1432,7 @@ static int ibmveth_poll(struct napi_struct *napi, int budget)
 		BUG_ON(lpar_rc != H_SUCCESS);
 
 		if (ibmveth_rxq_pending_buffer(adapter) &&
-		    napi_reschedule(napi)) {
+		    napi_schedule(napi)) {
 			lpar_rc = h_vio_signal(adapter->vdev->unit_address,
 					       VIO_IRQ_DISABLE);
 		}
diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index cdf5251e5679..2094f413cbe4 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -3519,7 +3519,7 @@ restart_poll:
 		if (napi_complete_done(napi, frames_processed)) {
 			enable_scrq_irq(adapter, rx_scrq);
 			if (pending_scrq(adapter, rx_scrq)) {
-				if (napi_reschedule(napi)) {
+				if (napi_schedule(napi)) {
 					disable_scrq_irq(adapter, rx_scrq);
 					goto restart_poll;
 				}
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 332472fe4990..a09b6e05337d 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -400,7 +400,7 @@ void mlx4_en_recover_from_oom(struct mlx4_en_priv *priv)
 	for (ring = 0; ring < priv->rx_ring_num; ring++) {
 		if (mlx4_en_is_ring_empty(priv->rx_ring[ring])) {
 			local_bh_disable();
-			napi_reschedule(&priv->rx_cq[ring]->napi);
+			napi_schedule(&priv->rx_cq[ring]->napi);
 			local_bh_enable();
 		}
 	}
diff --git a/drivers/net/ethernet/ni/nixge.c b/drivers/net/ethernet/ni/nixge.c
index f71a4f8bbb89..fa1f78b03cb2 100644
--- a/drivers/net/ethernet/ni/nixge.c
+++ b/drivers/net/ethernet/ni/nixge.c
@@ -683,7 +683,7 @@ static int nixge_poll(struct napi_struct *napi, int budget)
 		if (status & (XAXIDMA_IRQ_IOC_MASK | XAXIDMA_IRQ_DELAY_MASK)) {
 			/* If there's more, reschedule, but clear */
 			nixge_dma_write_reg(priv, XAXIDMA_RX_SR_OFFSET, status);
-			napi_reschedule(napi);
+			napi_schedule(napi);
 		} else {
 			/* if not, turn on RX IRQs again ... */
 			cr = nixge_dma_read_reg(priv, XAXIDMA_RX_CR_OFFSET);
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c
index f9e43fc32ee8..3ca1c2a816ff 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c
@@ -802,7 +802,7 @@ static int stmmac_test_flowctrl(struct stmmac_priv *priv)
 		stmmac_start_rx(priv, priv->ioaddr, i);
 
 		local_bh_disable();
-		napi_reschedule(&ch->rx_napi);
+		napi_schedule(&ch->rx_napi);
 		local_bh_enable();
 	}
 
diff --git a/drivers/net/ethernet/xscale/ixp4xx_eth.c b/drivers/net/ethernet/xscale/ixp4xx_eth.c
index c47108f2a0a0..531bf919aef5 100644
--- a/drivers/net/ethernet/xscale/ixp4xx_eth.c
+++ b/drivers/net/ethernet/xscale/ixp4xx_eth.c
@@ -723,9 +723,9 @@ static int eth_poll(struct napi_struct *napi, int budget)
 			napi_complete(napi);
 			qmgr_enable_irq(rxq);
 			if (!qmgr_stat_below_low_watermark(rxq) &&
-			    napi_reschedule(napi)) { /* not empty again */
+			    napi_schedule(napi)) { /* not empty again */
 #if DEBUG_RX
-				netdev_debug(dev, "eth_poll napi_reschedule succeeded\n");
+				netdev_debug(dev, "eth_poll napi_schedule succeeded\n");
 #endif
 				qmgr_disable_irq(rxq);
 				continue;
diff --git a/drivers/net/fjes/fjes_main.c b/drivers/net/fjes/fjes_main.c
index 2513be6d4e11..cd8cf08477ec 100644
--- a/drivers/net/fjes/fjes_main.c
+++ b/drivers/net/fjes/fjes_main.c
@@ -1030,7 +1030,7 @@ static int fjes_poll(struct napi_struct *napi, int budget)
 		}
 
 		if (((long)jiffies - (long)adapter->rx_last_jiffies) < 3) {
-			napi_reschedule(napi);
+			napi_schedule(napi);
 		} else {
 			spin_lock(&hw->rx_status_lock);
 			for (epidx = 0; epidx < hw->max_epid; epidx++) {
diff --git a/drivers/net/wan/ixp4xx_hss.c b/drivers/net/wan/ixp4xx_hss.c
index e46b7f5ee49e..b09f4c235142 100644
--- a/drivers/net/wan/ixp4xx_hss.c
+++ b/drivers/net/wan/ixp4xx_hss.c
@@ -687,10 +687,10 @@ static int hss_hdlc_poll(struct napi_struct *napi, int budget)
 			napi_complete(napi);
 			qmgr_enable_irq(rxq);
 			if (!qmgr_stat_empty(rxq) &&
-			    napi_reschedule(napi)) {
+			    napi_schedule(napi)) {
 #if DEBUG_RX
 				printk(KERN_DEBUG "%s: hss_hdlc_poll"
-				       " napi_reschedule succeeded\n",
+				       " napi_schedule succeeded\n",
 				       dev->name);
 #endif
 				qmgr_disable_irq(rxq);
diff --git a/drivers/net/wireless/ath/ath10k/pci.c b/drivers/net/wireless/ath/ath10k/pci.c
index 23f366221939..2f8c785277af 100644
--- a/drivers/net/wireless/ath/ath10k/pci.c
+++ b/drivers/net/wireless/ath/ath10k/pci.c
@@ -3148,7 +3148,7 @@ static int ath10k_pci_napi_poll(struct napi_struct *ctx, int budget)
 		 * immediate servicing.
 		 */
 		if (ath10k_ce_interrupt_summary(ar)) {
-			napi_reschedule(ctx);
+			napi_schedule(ctx);
 			goto out;
 		}
 		ath10k_pci_enable_legacy_irq(ar);
diff --git a/drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c b/drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c
index f4ff2198b5ef..210d84c67ef9 100644
--- a/drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c
+++ b/drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c
@@ -852,7 +852,7 @@ int t7xx_dpmaif_napi_rx_poll(struct napi_struct *napi, const int budget)
 	if (!ret) {
 		napi_complete_done(napi, work_done);
 		rxq->sleep_lock_pending = true;
-		napi_reschedule(napi);
+		napi_schedule(napi);
 		return work_done;
 	}
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2874770a0a74..ae553f886796 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -516,16 +516,6 @@ static inline void napi_schedule_irqoff(struct napi_struct *n)
 		__napi_schedule_irqoff(n);
 }
 
-/* Try to reschedule poll. Called by dev->poll() after napi_complete().  */
-static inline bool napi_reschedule(struct napi_struct *napi)
-{
-	if (napi_schedule_prep(napi)) {
-		__napi_schedule(napi);
-		return true;
-	}
-	return false;
-}
-
 /**
  * napi_complete_done - NAPI processing complete
  * @n: NAPI context
-- 
cgit v1.2.3-70-g09d2


From 61b546b1f3e24007bc39e31e332466725e7df470 Mon Sep 17 00:00:00 2001
From: Ping-Ke Shih <pkshih@realtek.com>
Date: Wed, 11 Oct 2023 19:52:51 +0800
Subject: wifi: radiotap: add bandwidth definition of EHT U-SIG

Define EHT U-SIG bandwidth used by radiotap according to Table 36-28
"U-SIG field of an EHT MU PPDU" in 802.11be (D3.0).

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Kalle Valo <kvalo@kernel.org>
Link: https://lore.kernel.org/r/20231011115256.6121-2-pkshih@realtek.com
---
 include/net/ieee80211_radiotap.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/net/ieee80211_radiotap.h b/include/net/ieee80211_radiotap.h
index 2338f8d2a8b3..925bac726a92 100644
--- a/include/net/ieee80211_radiotap.h
+++ b/include/net/ieee80211_radiotap.h
@@ -539,6 +539,12 @@ enum ieee80211_radiotap_eht_usig_common {
 	IEEE80211_RADIOTAP_EHT_USIG_COMMON_VALIDATE_BITS_OK	= 0x00000080,
 	IEEE80211_RADIOTAP_EHT_USIG_COMMON_PHY_VER		= 0x00007000,
 	IEEE80211_RADIOTAP_EHT_USIG_COMMON_BW			= 0x00038000,
+		IEEE80211_RADIOTAP_EHT_USIG_COMMON_BW_20MHZ		= 0,
+		IEEE80211_RADIOTAP_EHT_USIG_COMMON_BW_40MHZ		= 1,
+		IEEE80211_RADIOTAP_EHT_USIG_COMMON_BW_80MHZ		= 2,
+		IEEE80211_RADIOTAP_EHT_USIG_COMMON_BW_160MHZ		= 3,
+		IEEE80211_RADIOTAP_EHT_USIG_COMMON_BW_320MHZ_1		= 4,
+		IEEE80211_RADIOTAP_EHT_USIG_COMMON_BW_320MHZ_2		= 5,
 	IEEE80211_RADIOTAP_EHT_USIG_COMMON_UL_DL		= 0x00040000,
 	IEEE80211_RADIOTAP_EHT_USIG_COMMON_BSS_COLOR		= 0x01f80000,
 	IEEE80211_RADIOTAP_EHT_USIG_COMMON_TXOP			= 0xfe000000,
-- 
cgit v1.2.3-70-g09d2


From 38985e8c278b82e6d4d62d4acd57c761cc23ce63 Mon Sep 17 00:00:00 2001
From: Amit Cohen <amcohen@nvidia.com>
Date: Mon, 9 Oct 2023 13:06:08 +0300
Subject: net: Handle bulk delete policy in bridge driver

The merge commit 92716869375b ("Merge branch 'br-flush-filtering'")
added support for FDB flushing in bridge driver. The following patches
will extend VXLAN driver to support FDB flushing as well. The netlink
message for bulk delete is shared between the drivers. With the existing
implementation, there is no way to prevent user from flushing with
attributes that are not supported per driver. For example, when VNI will
be added, user will not get an error for flush FDB entries in bridge
with VNI, although this attribute is not relevant for bridge.

As preparation for support of FDB flush in VXLAN driver, move the policy
to be handled in bridge driver, later a new policy for VXLAN will be
added in VXLAN driver. Do not pass 'vid' as part of ndo_fdb_del_bulk(),
as this field is relevant only for bridge.

Signed-off-by: Amit Cohen <amcohen@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  8 ++------
 net/bridge/br_fdb.c       | 29 ++++++++++++++++++++++++-----
 net/bridge/br_private.h   |  3 +--
 net/core/rtnetlink.c      | 27 ++++++++++-----------------
 4 files changed, 37 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ae553f886796..1c7681263d30 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1287,9 +1287,7 @@ struct netdev_net_notifier {
  *		      struct net_device *dev,
  *		      const unsigned char *addr, u16 vid)
  *	Deletes the FDB entry from dev coresponding to addr.
- * int (*ndo_fdb_del_bulk)(struct ndmsg *ndm, struct nlattr *tb[],
- *			   struct net_device *dev,
- *			   u16 vid,
+ * int (*ndo_fdb_del_bulk)(struct nlmsghdr *nlh, struct net_device *dev,
  *			   struct netlink_ext_ack *extack);
  * int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb,
  *		       struct net_device *dev, struct net_device *filter_dev,
@@ -1564,10 +1562,8 @@ struct net_device_ops {
 					       struct net_device *dev,
 					       const unsigned char *addr,
 					       u16 vid, struct netlink_ext_ack *extack);
-	int			(*ndo_fdb_del_bulk)(struct ndmsg *ndm,
-						    struct nlattr *tb[],
+	int			(*ndo_fdb_del_bulk)(struct nlmsghdr *nlh,
 						    struct net_device *dev,
-						    u16 vid,
 						    struct netlink_ext_ack *extack);
 	int			(*ndo_fdb_dump)(struct sk_buff *skb,
 						struct netlink_callback *cb,
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index e69a872bfc1d..a98ad763b368 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -661,14 +661,30 @@ static int __fdb_flush_validate_ifindex(const struct net_bridge *br,
 	return 0;
 }
 
-int br_fdb_delete_bulk(struct ndmsg *ndm, struct nlattr *tb[],
-		       struct net_device *dev, u16 vid,
+static const struct nla_policy br_fdb_del_bulk_policy[NDA_MAX + 1] = {
+	[NDA_VLAN]	= NLA_POLICY_RANGE(NLA_U16, 1, VLAN_N_VID - 2),
+	[NDA_IFINDEX]	= NLA_POLICY_MIN(NLA_S32, 1),
+	[NDA_NDM_STATE_MASK]	= { .type = NLA_U16 },
+	[NDA_NDM_FLAGS_MASK]	= { .type = NLA_U8 },
+};
+
+int br_fdb_delete_bulk(struct nlmsghdr *nlh, struct net_device *dev,
 		       struct netlink_ext_ack *extack)
 {
-	u8 ndm_flags = ndm->ndm_flags & ~FDB_FLUSH_IGNORED_NDM_FLAGS;
-	struct net_bridge_fdb_flush_desc desc = { .vlan_id = vid };
+	struct net_bridge_fdb_flush_desc desc = {};
+	struct ndmsg *ndm = nlmsg_data(nlh);
 	struct net_bridge_port *p = NULL;
+	struct nlattr *tb[NDA_MAX + 1];
 	struct net_bridge *br;
+	u8 ndm_flags;
+	int err;
+
+	ndm_flags = ndm->ndm_flags & ~FDB_FLUSH_IGNORED_NDM_FLAGS;
+
+	err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX,
+			  br_fdb_del_bulk_policy, extack);
+	if (err)
+		return err;
 
 	if (netif_is_bridge_master(dev)) {
 		br = netdev_priv(dev);
@@ -681,6 +697,9 @@ int br_fdb_delete_bulk(struct ndmsg *ndm, struct nlattr *tb[],
 		br = p->br;
 	}
 
+	if (tb[NDA_VLAN])
+		desc.vlan_id = nla_get_u16(tb[NDA_VLAN]);
+
 	if (ndm_flags & ~FDB_FLUSH_ALLOWED_NDM_FLAGS) {
 		NL_SET_ERR_MSG(extack, "Unsupported fdb flush ndm flag bits set");
 		return -EINVAL;
@@ -703,7 +722,7 @@ int br_fdb_delete_bulk(struct ndmsg *ndm, struct nlattr *tb[],
 		desc.flags_mask |= __ndm_flags_to_fdb_flags(ndm_flags_mask);
 	}
 	if (tb[NDA_IFINDEX]) {
-		int err, ifidx = nla_get_s32(tb[NDA_IFINDEX]);
+		int ifidx = nla_get_s32(tb[NDA_IFINDEX]);
 
 		err = __fdb_flush_validate_ifindex(br, ifidx, extack);
 		if (err)
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index a1f4acfa6994..cbbe35278459 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -847,8 +847,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
 int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 		  struct net_device *dev, const unsigned char *addr, u16 vid,
 		  struct netlink_ext_ack *extack);
-int br_fdb_delete_bulk(struct ndmsg *ndm, struct nlattr *tb[],
-		       struct net_device *dev, u16 vid,
+int br_fdb_delete_bulk(struct nlmsghdr *nlh, struct net_device *dev,
 		       struct netlink_ext_ack *extack);
 int br_fdb_add(struct ndmsg *nlh, struct nlattr *tb[], struct net_device *dev,
 	       const unsigned char *addr, u16 vid, u16 nlh_flags,
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 7452a6d190c5..eef7f7788996 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -4367,13 +4367,6 @@ int ndo_dflt_fdb_del(struct ndmsg *ndm,
 }
 EXPORT_SYMBOL(ndo_dflt_fdb_del);
 
-static const struct nla_policy fdb_del_bulk_policy[NDA_MAX + 1] = {
-	[NDA_VLAN]	= { .type = NLA_U16 },
-	[NDA_IFINDEX]	= NLA_POLICY_MIN(NLA_S32, 1),
-	[NDA_NDM_STATE_MASK]	= { .type = NLA_U16  },
-	[NDA_NDM_FLAGS_MASK]	= { .type = NLA_U8 },
-};
-
 static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
 			struct netlink_ext_ack *extack)
 {
@@ -4394,8 +4387,10 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
 		err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX,
 					     NULL, extack);
 	} else {
-		err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX,
-				  fdb_del_bulk_policy, extack);
+		/* For bulk delete, the drivers will parse the message with
+		 * policy.
+		 */
+		err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack);
 	}
 	if (err < 0)
 		return err;
@@ -4418,6 +4413,10 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
 			return -EINVAL;
 		}
 		addr = nla_data(tb[NDA_LLADDR]);
+
+		err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack);
+		if (err)
+			return err;
 	}
 
 	if (dev->type != ARPHRD_ETHER) {
@@ -4425,10 +4424,6 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
 		return -EINVAL;
 	}
 
-	err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack);
-	if (err)
-		return err;
-
 	err = -EOPNOTSUPP;
 
 	/* Support fdb on master device the net/bridge default case */
@@ -4442,8 +4437,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
 				err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack);
 		} else {
 			if (ops->ndo_fdb_del_bulk)
-				err = ops->ndo_fdb_del_bulk(ndm, tb, dev, vid,
-							    extack);
+				err = ops->ndo_fdb_del_bulk(nlh, dev, extack);
 		}
 
 		if (err)
@@ -4464,8 +4458,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
 			/* in case err was cleared by NTF_MASTER call */
 			err = -EOPNOTSUPP;
 			if (ops->ndo_fdb_del_bulk)
-				err = ops->ndo_fdb_del_bulk(ndm, tb, dev, vid,
-							    extack);
+				err = ops->ndo_fdb_del_bulk(nlh, dev, extack);
 		}
 
 		if (!err) {
-- 
cgit v1.2.3-70-g09d2


From 6d5029e54700b2427581513c533232b02ce05043 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Mon, 9 Oct 2023 22:50:43 +0200
Subject: tls: store rec_seq directly within cipher_context

TLS_MAX_REC_SEQ_SIZE is 8B, we don't get anything by using kmalloc.

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h    |  2 +-
 net/tls/tls_device.c | 11 ++---------
 net/tls/tls_main.c   |  1 -
 net/tls/tls_sw.c     | 13 ++-----------
 4 files changed, 5 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index a2b44578dcb7..f3f22b08af26 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -194,7 +194,7 @@ enum tls_context_flags {
 
 struct cipher_context {
 	char *iv;
-	char *rec_seq;
+	char rec_seq[TLS_MAX_REC_SEQ_SIZE];
 };
 
 union tls_crypto_context {
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index fbd687a0c66f..525d7b813869 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -58,7 +58,6 @@ static void tls_device_free_ctx(struct tls_context *ctx)
 {
 	if (ctx->tx_conf == TLS_HW) {
 		kfree(tls_offload_ctx_tx(ctx));
-		kfree(ctx->tx.rec_seq);
 		kfree(ctx->tx.iv);
 	}
 
@@ -1098,16 +1097,12 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
 	memcpy(ctx->tx.iv + cipher_desc->salt, iv, cipher_desc->iv);
 
 	prot->rec_seq_size = cipher_desc->rec_seq;
-	ctx->tx.rec_seq = kmemdup(rec_seq, cipher_desc->rec_seq, GFP_KERNEL);
-	if (!ctx->tx.rec_seq) {
-		rc = -ENOMEM;
-		goto free_iv;
-	}
+	memcpy(ctx->tx.rec_seq, rec_seq, cipher_desc->rec_seq);
 
 	start_marker_record = kmalloc(sizeof(*start_marker_record), GFP_KERNEL);
 	if (!start_marker_record) {
 		rc = -ENOMEM;
-		goto free_rec_seq;
+		goto free_iv;
 	}
 
 	offload_ctx = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE_TX, GFP_KERNEL);
@@ -1192,8 +1187,6 @@ free_offload_ctx:
 	ctx->priv_ctx_tx = NULL;
 free_marker_record:
 	kfree(start_marker_record);
-free_rec_seq:
-	kfree(ctx->tx.rec_seq);
 free_iv:
 	kfree(ctx->tx.iv);
 release_netdev:
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 02f583ff9239..f705d812fc36 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -344,7 +344,6 @@ static void tls_sk_proto_cleanup(struct sock *sk,
 
 	/* We need these for tls_sw_fallback handling of other packets */
 	if (ctx->tx_conf == TLS_SW) {
-		kfree(ctx->tx.rec_seq);
 		kfree(ctx->tx.iv);
 		tls_sw_release_resources_tx(sk);
 		TLS_DEC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXSW);
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 270712b8d391..93d40c9a6823 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -2467,7 +2467,6 @@ void tls_sw_release_resources_rx(struct sock *sk)
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
 
-	kfree(tls_ctx->rx.rec_seq);
 	kfree(tls_ctx->rx.iv);
 
 	if (ctx->aead_recv) {
@@ -2692,19 +2691,14 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 	prot->rec_seq_size = cipher_desc->rec_seq;
 	memcpy(cctx->iv, salt, cipher_desc->salt);
 	memcpy(cctx->iv + cipher_desc->salt, iv, cipher_desc->iv);
-
-	cctx->rec_seq = kmemdup(rec_seq, cipher_desc->rec_seq, GFP_KERNEL);
-	if (!cctx->rec_seq) {
-		rc = -ENOMEM;
-		goto free_iv;
-	}
+	memcpy(cctx->rec_seq, rec_seq, cipher_desc->rec_seq);
 
 	if (!*aead) {
 		*aead = crypto_alloc_aead(cipher_desc->cipher_name, 0, 0);
 		if (IS_ERR(*aead)) {
 			rc = PTR_ERR(*aead);
 			*aead = NULL;
-			goto free_rec_seq;
+			goto free_iv;
 		}
 	}
 
@@ -2736,9 +2730,6 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 free_aead:
 	crypto_free_aead(*aead);
 	*aead = NULL;
-free_rec_seq:
-	kfree(cctx->rec_seq);
-	cctx->rec_seq = NULL;
 free_iv:
 	kfree(cctx->iv);
 	cctx->iv = NULL;
-- 
cgit v1.2.3-70-g09d2


From bee6b7b30706e7693d91cb28c8ff3cb69e094f65 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Mon, 9 Oct 2023 22:50:44 +0200
Subject: tls: rename MAX_IV_SIZE to TLS_MAX_IV_SIZE

It's defined in include/net/tls.h, avoid using an overly generic name.

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h             | 2 +-
 net/tls/tls.h                 | 2 +-
 net/tls/tls_device_fallback.c | 2 +-
 net/tls/tls_main.c            | 2 +-
 net/tls/tls_sw.c              | 6 +++---
 5 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index f3f22b08af26..5200ce27db91 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -61,7 +61,7 @@ struct tls_rec;
 
 #define TLS_AAD_SPACE_SIZE		13
 
-#define MAX_IV_SIZE			16
+#define TLS_MAX_IV_SIZE			16
 #define TLS_TAG_SIZE			16
 #define TLS_MAX_REC_SEQ_SIZE		8
 #define TLS_MAX_AAD_SIZE		TLS_AAD_SPACE_SIZE
diff --git a/net/tls/tls.h b/net/tls/tls.h
index 28a8c0e80e3c..16830aa2d6ec 100644
--- a/net/tls/tls.h
+++ b/net/tls/tls.h
@@ -127,7 +127,7 @@ struct tls_rec {
 	struct sock *sk;
 
 	char aad_space[TLS_AAD_SPACE_SIZE];
-	u8 iv_data[MAX_IV_SIZE];
+	u8 iv_data[TLS_MAX_IV_SIZE];
 	struct aead_request aead_req;
 	u8 aead_req_ctx[];
 };
diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c
index 1d2b4d83ccab..4e7228f275fa 100644
--- a/net/tls/tls_device_fallback.c
+++ b/net/tls/tls_device_fallback.c
@@ -54,7 +54,7 @@ static int tls_enc_record(struct aead_request *aead_req,
 			  struct scatter_walk *out, int *in_len,
 			  struct tls_prot_info *prot)
 {
-	unsigned char buf[TLS_HEADER_SIZE + MAX_IV_SIZE];
+	unsigned char buf[TLS_HEADER_SIZE + TLS_MAX_IV_SIZE];
 	const struct tls_cipher_desc *cipher_desc;
 	struct scatterlist sg_in[3];
 	struct scatterlist sg_out[3];
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index f705d812fc36..58f13660fe6b 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -59,7 +59,7 @@ enum {
 };
 
 #define CHECK_CIPHER_DESC(cipher,ci)				\
-	static_assert(cipher ## _IV_SIZE <= MAX_IV_SIZE);		\
+	static_assert(cipher ## _IV_SIZE <= TLS_MAX_IV_SIZE);		\
 	static_assert(cipher ## _REC_SEQ_SIZE <= TLS_MAX_REC_SEQ_SIZE);	\
 	static_assert(cipher ## _TAG_SIZE == TLS_TAG_SIZE);		\
 	static_assert(sizeof_field(struct ci, iv) == cipher ## _IV_SIZE);	\
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 93d40c9a6823..5b6175f9b9a6 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -60,7 +60,7 @@ struct tls_decrypt_arg {
 
 struct tls_decrypt_ctx {
 	struct sock *sk;
-	u8 iv[MAX_IV_SIZE];
+	u8 iv[TLS_MAX_IV_SIZE];
 	u8 aad[TLS_MAX_AAD_SIZE];
 	u8 tail;
 	struct scatterlist sg[];
@@ -2319,7 +2319,7 @@ int tls_rx_msg_size(struct tls_strparser *strp, struct sk_buff *skb)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(strp->sk);
 	struct tls_prot_info *prot = &tls_ctx->prot_info;
-	char header[TLS_HEADER_SIZE + MAX_IV_SIZE];
+	char header[TLS_HEADER_SIZE + TLS_MAX_IV_SIZE];
 	size_t cipher_overhead;
 	size_t data_len = 0;
 	int ret;
@@ -2669,7 +2669,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 	}
 
 	/* Sanity-check the sizes for stack allocations. */
-	if (nonce_size > MAX_IV_SIZE || prot->aad_size > TLS_MAX_AAD_SIZE) {
+	if (nonce_size > TLS_MAX_IV_SIZE || prot->aad_size > TLS_MAX_AAD_SIZE) {
 		rc = -EINVAL;
 		goto free_priv;
 	}
-- 
cgit v1.2.3-70-g09d2


From 1c1cb3110d7ed2897e65d9a352a8fb709723e057 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Mon, 9 Oct 2023 22:50:45 +0200
Subject: tls: store iv directly within cipher_context

TLS_MAX_IV_SIZE + TLS_MAX_SALT_SIZE is 20B, we don't get much benefit
in cipher_context's size and can simplify the init code a bit.

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h    |  3 ++-
 net/tls/tls_device.c | 13 ++-----------
 net/tls/tls_main.c   |  2 +-
 net/tls/tls_sw.c     | 13 ++-----------
 4 files changed, 7 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index 5200ce27db91..28cc40d7b945 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -62,6 +62,7 @@ struct tls_rec;
 #define TLS_AAD_SPACE_SIZE		13
 
 #define TLS_MAX_IV_SIZE			16
+#define TLS_MAX_SALT_SIZE		4
 #define TLS_TAG_SIZE			16
 #define TLS_MAX_REC_SEQ_SIZE		8
 #define TLS_MAX_AAD_SIZE		TLS_AAD_SPACE_SIZE
@@ -193,7 +194,7 @@ enum tls_context_flags {
 };
 
 struct cipher_context {
-	char *iv;
+	char iv[TLS_MAX_IV_SIZE + TLS_MAX_SALT_SIZE];
 	char rec_seq[TLS_MAX_REC_SEQ_SIZE];
 };
 
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 525d7b813869..0981496c6294 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -56,10 +56,8 @@ static struct page *dummy_page;
 
 static void tls_device_free_ctx(struct tls_context *ctx)
 {
-	if (ctx->tx_conf == TLS_HW) {
+	if (ctx->tx_conf == TLS_HW)
 		kfree(tls_offload_ctx_tx(ctx));
-		kfree(ctx->tx.iv);
-	}
 
 	if (ctx->rx_conf == TLS_HW)
 		kfree(tls_offload_ctx_rx(ctx));
@@ -1088,11 +1086,6 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
 	prot->overhead_size = prot->prepend_size + prot->tag_size;
 	prot->iv_size = cipher_desc->iv;
 	prot->salt_size = cipher_desc->salt;
-	ctx->tx.iv = kmalloc(cipher_desc->iv + cipher_desc->salt, GFP_KERNEL);
-	if (!ctx->tx.iv) {
-		rc = -ENOMEM;
-		goto release_netdev;
-	}
 
 	memcpy(ctx->tx.iv + cipher_desc->salt, iv, cipher_desc->iv);
 
@@ -1102,7 +1095,7 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
 	start_marker_record = kmalloc(sizeof(*start_marker_record), GFP_KERNEL);
 	if (!start_marker_record) {
 		rc = -ENOMEM;
-		goto free_iv;
+		goto release_netdev;
 	}
 
 	offload_ctx = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE_TX, GFP_KERNEL);
@@ -1187,8 +1180,6 @@ free_offload_ctx:
 	ctx->priv_ctx_tx = NULL;
 free_marker_record:
 	kfree(start_marker_record);
-free_iv:
-	kfree(ctx->tx.iv);
 release_netdev:
 	dev_put(netdev);
 	return rc;
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 58f13660fe6b..b91524ac1009 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -60,6 +60,7 @@ enum {
 
 #define CHECK_CIPHER_DESC(cipher,ci)				\
 	static_assert(cipher ## _IV_SIZE <= TLS_MAX_IV_SIZE);		\
+	static_assert(cipher ## _SALT_SIZE <= TLS_MAX_SALT_SIZE);		\
 	static_assert(cipher ## _REC_SEQ_SIZE <= TLS_MAX_REC_SEQ_SIZE);	\
 	static_assert(cipher ## _TAG_SIZE == TLS_TAG_SIZE);		\
 	static_assert(sizeof_field(struct ci, iv) == cipher ## _IV_SIZE);	\
@@ -344,7 +345,6 @@ static void tls_sk_proto_cleanup(struct sock *sk,
 
 	/* We need these for tls_sw_fallback handling of other packets */
 	if (ctx->tx_conf == TLS_SW) {
-		kfree(ctx->tx.iv);
 		tls_sw_release_resources_tx(sk);
 		TLS_DEC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXSW);
 	} else if (ctx->tx_conf == TLS_HW) {
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 5b6175f9b9a6..c3da937b8207 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -2467,8 +2467,6 @@ void tls_sw_release_resources_rx(struct sock *sk)
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
 
-	kfree(tls_ctx->rx.iv);
-
 	if (ctx->aead_recv) {
 		__skb_queue_purge(&ctx->rx_list);
 		crypto_free_aead(ctx->aead_recv);
@@ -2682,11 +2680,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 			      prot->tag_size + prot->tail_size;
 	prot->iv_size = cipher_desc->iv;
 	prot->salt_size = cipher_desc->salt;
-	cctx->iv = kmalloc(cipher_desc->iv + cipher_desc->salt, GFP_KERNEL);
-	if (!cctx->iv) {
-		rc = -ENOMEM;
-		goto free_priv;
-	}
+
 	/* Note: 128 & 256 bit salt are the same size */
 	prot->rec_seq_size = cipher_desc->rec_seq;
 	memcpy(cctx->iv, salt, cipher_desc->salt);
@@ -2698,7 +2692,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 		if (IS_ERR(*aead)) {
 			rc = PTR_ERR(*aead);
 			*aead = NULL;
-			goto free_iv;
+			goto free_priv;
 		}
 	}
 
@@ -2730,9 +2724,6 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 free_aead:
 	crypto_free_aead(*aead);
 	*aead = NULL;
-free_iv:
-	kfree(cctx->iv);
-	cctx->iv = NULL;
 free_priv:
 	if (tx) {
 		kfree(ctx->priv_ctx_tx);
-- 
cgit v1.2.3-70-g09d2


From 9f0c8245516bc30cff770c3a69a6baaf8eef8810 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Mon, 9 Oct 2023 22:50:54 +0200
Subject: tls: use fixed size for tls_offload_context_{tx,rx}.driver_state

driver_state is a flex array, but is always allocated by the tls core
to a fixed size (TLS_DRIVER_STATE_SIZE_{TX,RX}). Simplify the code by
making that size explicit so that sizeof(struct
tls_offload_context_{tx,rx}) works.

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h    | 14 ++++----------
 net/tls/tls_device.c |  4 ++--
 2 files changed, 6 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index 28cc40d7b945..962f0c501111 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -150,6 +150,7 @@ struct tls_record_info {
 	skb_frag_t frags[MAX_SKB_FRAGS];
 };
 
+#define TLS_DRIVER_STATE_SIZE_TX	16
 struct tls_offload_context_tx {
 	struct crypto_aead *aead_send;
 	spinlock_t lock;	/* protects records list */
@@ -163,17 +164,13 @@ struct tls_offload_context_tx {
 	void (*sk_destruct)(struct sock *sk);
 	struct work_struct destruct_work;
 	struct tls_context *ctx;
-	u8 driver_state[] __aligned(8);
 	/* The TLS layer reserves room for driver specific state
 	 * Currently the belief is that there is not enough
 	 * driver specific state to justify another layer of indirection
 	 */
-#define TLS_DRIVER_STATE_SIZE_TX	16
+	u8 driver_state[TLS_DRIVER_STATE_SIZE_TX] __aligned(8);
 };
 
-#define TLS_OFFLOAD_CONTEXT_SIZE_TX                                            \
-	(sizeof(struct tls_offload_context_tx) + TLS_DRIVER_STATE_SIZE_TX)
-
 enum tls_context_flags {
 	/* tls_device_down was called after the netdev went down, device state
 	 * was released, and kTLS works in software, even though rx_conf is
@@ -303,6 +300,7 @@ struct tls_offload_resync_async {
 	u32 log[TLS_DEVICE_RESYNC_ASYNC_LOGMAX];
 };
 
+#define TLS_DRIVER_STATE_SIZE_RX	8
 struct tls_offload_context_rx {
 	/* sw must be the first member of tls_offload_context_rx */
 	struct tls_sw_context_rx sw;
@@ -326,17 +324,13 @@ struct tls_offload_context_rx {
 			struct tls_offload_resync_async *resync_async;
 		};
 	};
-	u8 driver_state[] __aligned(8);
 	/* The TLS layer reserves room for driver specific state
 	 * Currently the belief is that there is not enough
 	 * driver specific state to justify another layer of indirection
 	 */
-#define TLS_DRIVER_STATE_SIZE_RX	8
+	u8 driver_state[TLS_DRIVER_STATE_SIZE_RX] __aligned(8);
 };
 
-#define TLS_OFFLOAD_CONTEXT_SIZE_RX					\
-	(sizeof(struct tls_offload_context_rx) + TLS_DRIVER_STATE_SIZE_RX)
-
 struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context,
 				       u32 seq, u64 *p_record_sn);
 
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index fe52765beaee..f01543557a60 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -1038,7 +1038,7 @@ static struct tls_offload_context_tx *alloc_offload_ctx_tx(struct tls_context *c
 	struct tls_offload_context_tx *offload_ctx;
 	__be64 rcd_sn;
 
-	offload_ctx = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE_TX, GFP_KERNEL);
+	offload_ctx = kzalloc(sizeof(*offload_ctx), GFP_KERNEL);
 	if (!offload_ctx)
 		return NULL;
 
@@ -1225,7 +1225,7 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx)
 		goto release_lock;
 	}
 
-	context = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE_RX, GFP_KERNEL);
+	context = kzalloc(sizeof(*context), GFP_KERNEL);
 	if (!context) {
 		rc = -ENOMEM;
 		goto release_lock;
-- 
cgit v1.2.3-70-g09d2


From 236334aeec0f93217cf9235f2004e61a0a1a5985 Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Fri, 13 Oct 2023 08:39:16 +0000
Subject: bpf: Avoid unnecessary audit log for CPU security mitigations

Check cpu_mitigations_off() first to avoid calling capable() if it is off.
This can avoid unnecessary audit log.

Fixes: bc5bc309db45 ("bpf: Inherit system settings for CPU security mitigations")
Suggested-by: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/CAEf4Bza6UVUWqcWQ-66weZ-nMDr+TFU3Mtq=dumZFD-pSqU7Ow@mail.gmail.com/
Link: https://lore.kernel.org/bpf/20231013083916.4199-1-laoar.shao@gmail.com
---
 include/linux/bpf.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 61bde4520f5c..f0891ba24cb1 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2164,12 +2164,12 @@ static inline bool bpf_allow_uninit_stack(void)
 
 static inline bool bpf_bypass_spec_v1(void)
 {
-	return perfmon_capable() || cpu_mitigations_off();
+	return cpu_mitigations_off() || perfmon_capable();
 }
 
 static inline bool bpf_bypass_spec_v4(void)
 {
-	return perfmon_capable() || cpu_mitigations_off();
+	return cpu_mitigations_off() || perfmon_capable();
 }
 
 int bpf_map_new_fd(struct bpf_map *map, int flags);
-- 
cgit v1.2.3-70-g09d2


From 3f7f31fff2510272334f3d0374c432bdaa4f1536 Mon Sep 17 00:00:00 2001
From: Wei Zhang <weizhang@nvidia.com>
Date: Thu, 12 Oct 2023 12:27:36 -0700
Subject: net/mlx5: Parallelize vhca event handling

At present, mlx5 driver have a general purpose
event handler which not only handles vhca event
but also many other events. This incurs a huge
bottleneck because the event handler is
implemented by single threaded workqueue and all
events are forced to be handled in serial manner
even though application tries to create multiple
SFs simultaneously.

Introduce a dedicated vhca event handler which
manages SFs parallel creation.

Signed-off-by: Wei Zhang <weizhang@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/events.c   |  5 --
 .../net/ethernet/mellanox/mlx5/core/mlx5_core.h    |  3 +-
 .../ethernet/mellanox/mlx5/core/sf/vhca_event.c    | 57 ++++++++++++++++++++--
 include/linux/mlx5/driver.h                        |  1 +
 4 files changed, 57 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/events.c b/drivers/net/ethernet/mellanox/mlx5/core/events.c
index 3ec892d51f57..d91ea53eb394 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/events.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/events.c
@@ -441,8 +441,3 @@ int mlx5_blocking_notifier_call_chain(struct mlx5_core_dev *dev, unsigned int ev
 
 	return blocking_notifier_call_chain(&events->sw_nh, event, data);
 }
-
-void mlx5_events_work_enqueue(struct mlx5_core_dev *dev, struct work_struct *work)
-{
-	queue_work(dev->priv.events->wq, work);
-}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index 19ffd1816474..d348a7f9511f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -159,6 +159,8 @@ enum mlx5_semaphore_space_address {
 
 #define MLX5_DEFAULT_PROF       2
 #define MLX5_SF_PROF		3
+#define MLX5_NUM_FW_CMD_THREADS 8
+#define MLX5_DEV_MAX_WQS	MLX5_NUM_FW_CMD_THREADS
 
 static inline int mlx5_flexible_inlen(struct mlx5_core_dev *dev, size_t fixed,
 				      size_t item_size, size_t num_items,
@@ -347,7 +349,6 @@ int mlx5_vport_set_other_func_cap(struct mlx5_core_dev *dev, const void *hca_cap
 #define mlx5_vport_get_other_func_general_cap(dev, vport, out)		\
 	mlx5_vport_get_other_func_cap(dev, vport, out, MLX5_CAP_GENERAL)
 
-void mlx5_events_work_enqueue(struct mlx5_core_dev *dev, struct work_struct *work);
 static inline u32 mlx5_sriov_get_vf_total_msix(struct pci_dev *pdev)
 {
 	struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c
index d908fba968f0..c6fd729de8b2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c
@@ -21,6 +21,15 @@ struct mlx5_vhca_event_work {
 	struct mlx5_vhca_state_event event;
 };
 
+struct mlx5_vhca_event_handler {
+	struct workqueue_struct *wq;
+};
+
+struct mlx5_vhca_events {
+	struct mlx5_core_dev *dev;
+	struct mlx5_vhca_event_handler handler[MLX5_DEV_MAX_WQS];
+};
+
 int mlx5_cmd_query_vhca_state(struct mlx5_core_dev *dev, u16 function_id, u32 *out, u32 outlen)
 {
 	u32 in[MLX5_ST_SZ_DW(query_vhca_state_in)] = {};
@@ -99,6 +108,12 @@ static void mlx5_vhca_state_work_handler(struct work_struct *_work)
 	kfree(work);
 }
 
+static void
+mlx5_vhca_events_work_enqueue(struct mlx5_core_dev *dev, int idx, struct work_struct *work)
+{
+	queue_work(dev->priv.vhca_events->handler[idx].wq, work);
+}
+
 static int
 mlx5_vhca_state_change_notifier(struct notifier_block *nb, unsigned long type, void *data)
 {
@@ -106,6 +121,7 @@ mlx5_vhca_state_change_notifier(struct notifier_block *nb, unsigned long type, v
 				mlx5_nb_cof(nb, struct mlx5_vhca_state_notifier, nb);
 	struct mlx5_vhca_event_work *work;
 	struct mlx5_eqe *eqe = data;
+	int wq_idx;
 
 	work = kzalloc(sizeof(*work), GFP_ATOMIC);
 	if (!work)
@@ -113,7 +129,8 @@ mlx5_vhca_state_change_notifier(struct notifier_block *nb, unsigned long type, v
 	INIT_WORK(&work->work, &mlx5_vhca_state_work_handler);
 	work->notifier = notifier;
 	work->event.function_id = be16_to_cpu(eqe->data.vhca_state.function_id);
-	mlx5_events_work_enqueue(notifier->dev, &work->work);
+	wq_idx = work->event.function_id % MLX5_DEV_MAX_WQS;
+	mlx5_vhca_events_work_enqueue(notifier->dev, wq_idx, &work->work);
 	return NOTIFY_OK;
 }
 
@@ -132,28 +149,62 @@ void mlx5_vhca_state_cap_handle(struct mlx5_core_dev *dev, void *set_hca_cap)
 int mlx5_vhca_event_init(struct mlx5_core_dev *dev)
 {
 	struct mlx5_vhca_state_notifier *notifier;
+	char wq_name[MLX5_CMD_WQ_MAX_NAME];
+	struct mlx5_vhca_events *events;
+	int err, i;
 
 	if (!mlx5_vhca_event_supported(dev))
 		return 0;
 
-	notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
-	if (!notifier)
+	events = kzalloc(sizeof(*events), GFP_KERNEL);
+	if (!events)
 		return -ENOMEM;
 
+	events->dev = dev;
+	dev->priv.vhca_events = events;
+	for (i = 0; i < MLX5_DEV_MAX_WQS; i++) {
+		snprintf(wq_name, MLX5_CMD_WQ_MAX_NAME, "mlx5_vhca_event%d", i);
+		events->handler[i].wq = create_singlethread_workqueue(wq_name);
+		if (!events->handler[i].wq) {
+			err = -ENOMEM;
+			goto err_create_wq;
+		}
+	}
+
+	notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
+	if (!notifier) {
+		err = -ENOMEM;
+		goto err_notifier;
+	}
+
 	dev->priv.vhca_state_notifier = notifier;
 	notifier->dev = dev;
 	BLOCKING_INIT_NOTIFIER_HEAD(&notifier->n_head);
 	MLX5_NB_INIT(&notifier->nb, mlx5_vhca_state_change_notifier, VHCA_STATE_CHANGE);
 	return 0;
+
+err_notifier:
+err_create_wq:
+	for (--i; i >= 0; i--)
+		destroy_workqueue(events->handler[i].wq);
+	kfree(events);
+	return err;
 }
 
 void mlx5_vhca_event_cleanup(struct mlx5_core_dev *dev)
 {
+	struct mlx5_vhca_events *vhca_events;
+	int i;
+
 	if (!mlx5_vhca_event_supported(dev))
 		return;
 
 	kfree(dev->priv.vhca_state_notifier);
 	dev->priv.vhca_state_notifier = NULL;
+	vhca_events = dev->priv.vhca_events;
+	for (i = 0; i < MLX5_DEV_MAX_WQS; i++)
+		destroy_workqueue(vhca_events->handler[i].wq);
+	kvfree(vhca_events);
 }
 
 void mlx5_vhca_event_start(struct mlx5_core_dev *dev)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 52e982bc0f50..7968c5ee85c4 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -615,6 +615,7 @@ struct mlx5_priv {
 	int			adev_idx;
 	int			sw_vhca_id;
 	struct mlx5_events      *events;
+	struct mlx5_vhca_events *vhca_events;
 
 	struct mlx5_flow_steering *steering;
 	struct mlx5_mpfs        *mpfs;
-- 
cgit v1.2.3-70-g09d2


From e534552c92a44690e48593f9567fe689545ded73 Mon Sep 17 00:00:00 2001
From: Shay Drory <shayd@nvidia.com>
Date: Thu, 12 Oct 2023 12:27:39 -0700
Subject: net/mlx5: Refactor LAG peer device lookout bus logic to mlx5 devcom

LAG peer device lookout bus logic required the usage of global lock,
mlx5_intf_mutex.
As part of the effort to remove this global lock, refactor LAG peer
device lookout to use mlx5 devcom layer.

Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/dev.c      | 68 ----------------------
 drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c  | 12 ++--
 .../net/ethernet/mellanox/mlx5/core/lib/devcom.c   | 14 +++++
 .../net/ethernet/mellanox/mlx5/core/lib/devcom.h   |  4 ++
 drivers/net/ethernet/mellanox/mlx5/core/main.c     | 25 ++++++++
 .../net/ethernet/mellanox/mlx5/core/mlx5_core.h    |  1 -
 include/linux/mlx5/driver.h                        |  1 +
 7 files changed, 52 insertions(+), 73 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/dev.c
index 1fc03480c2ff..6e3a8c22881f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/dev.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/dev.c
@@ -566,74 +566,6 @@ bool mlx5_same_hw_devs(struct mlx5_core_dev *dev, struct mlx5_core_dev *peer_dev
 	return (fsystem_guid && psystem_guid && fsystem_guid == psystem_guid);
 }
 
-static u32 mlx5_gen_pci_id(const struct mlx5_core_dev *dev)
-{
-	return (u32)((pci_domain_nr(dev->pdev->bus) << 16) |
-		     (dev->pdev->bus->number << 8) |
-		     PCI_SLOT(dev->pdev->devfn));
-}
-
-static int _next_phys_dev(struct mlx5_core_dev *mdev,
-			  const struct mlx5_core_dev *curr)
-{
-	if (!mlx5_core_is_pf(mdev))
-		return 0;
-
-	if (mdev == curr)
-		return 0;
-
-	if (!mlx5_same_hw_devs(mdev, (struct mlx5_core_dev *)curr) &&
-	    mlx5_gen_pci_id(mdev) != mlx5_gen_pci_id(curr))
-		return 0;
-
-	return 1;
-}
-
-static void *pci_get_other_drvdata(struct device *this, struct device *other)
-{
-	if (this->driver != other->driver)
-		return NULL;
-
-	return pci_get_drvdata(to_pci_dev(other));
-}
-
-static int next_phys_dev_lag(struct device *dev, const void *data)
-{
-	struct mlx5_core_dev *mdev, *this = (struct mlx5_core_dev *)data;
-
-	mdev = pci_get_other_drvdata(this->device, dev);
-	if (!mdev)
-		return 0;
-
-	if (!mlx5_lag_is_supported(mdev))
-		return 0;
-
-	return _next_phys_dev(mdev, data);
-}
-
-static struct mlx5_core_dev *mlx5_get_next_dev(struct mlx5_core_dev *dev,
-					       int (*match)(struct device *dev, const void *data))
-{
-	struct device *next;
-
-	if (!mlx5_core_is_pf(dev))
-		return NULL;
-
-	next = bus_find_device(&pci_bus_type, NULL, dev, match);
-	if (!next)
-		return NULL;
-
-	put_device(next);
-	return pci_get_drvdata(to_pci_dev(next));
-}
-
-/* Must be called with intf_mutex held */
-struct mlx5_core_dev *mlx5_get_next_phys_dev_lag(struct mlx5_core_dev *dev)
-{
-	lockdep_assert_held(&mlx5_intf_mutex);
-	return mlx5_get_next_dev(dev, &next_phys_dev_lag);
-}
-
 void mlx5_dev_list_lock(void)
 {
 	mutex_lock(&mlx5_intf_mutex);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
index af3fac090b82..f0b57f97739f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
@@ -1212,13 +1212,14 @@ static void mlx5_ldev_remove_mdev(struct mlx5_lag *ldev,
 	dev->priv.lag = NULL;
 }
 
-/* Must be called with intf_mutex held */
+/* Must be called with HCA devcom component lock held */
 static int __mlx5_lag_dev_add_mdev(struct mlx5_core_dev *dev)
 {
+	struct mlx5_devcom_comp_dev *pos = NULL;
 	struct mlx5_lag *ldev = NULL;
 	struct mlx5_core_dev *tmp_dev;
 
-	tmp_dev = mlx5_get_next_phys_dev_lag(dev);
+	tmp_dev = mlx5_devcom_get_next_peer_data(dev->priv.hca_devcom_comp, &pos);
 	if (tmp_dev)
 		ldev = mlx5_lag_dev(tmp_dev);
 
@@ -1275,10 +1276,13 @@ void mlx5_lag_add_mdev(struct mlx5_core_dev *dev)
 	if (!mlx5_lag_is_supported(dev))
 		return;
 
+	if (IS_ERR_OR_NULL(dev->priv.hca_devcom_comp))
+		return;
+
 recheck:
-	mlx5_dev_list_lock();
+	mlx5_devcom_comp_lock(dev->priv.hca_devcom_comp);
 	err = __mlx5_lag_dev_add_mdev(dev);
-	mlx5_dev_list_unlock();
+	mlx5_devcom_comp_unlock(dev->priv.hca_devcom_comp);
 
 	if (err) {
 		msleep(100);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c
index 89ac3209277e..f4d5c300ddd6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c
@@ -387,3 +387,17 @@ void *mlx5_devcom_get_next_peer_data_rcu(struct mlx5_devcom_comp_dev *devcom,
 	*pos = tmp;
 	return data;
 }
+
+void mlx5_devcom_comp_lock(struct mlx5_devcom_comp_dev *devcom)
+{
+	if (IS_ERR_OR_NULL(devcom))
+		return;
+	down_write(&devcom->comp->sem);
+}
+
+void mlx5_devcom_comp_unlock(struct mlx5_devcom_comp_dev *devcom)
+{
+	if (IS_ERR_OR_NULL(devcom))
+		return;
+	up_write(&devcom->comp->sem);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h
index 8220d180e33c..f06529cc3c61 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h
@@ -9,6 +9,7 @@
 enum mlx5_devcom_component {
 	MLX5_DEVCOM_ESW_OFFLOADS,
 	MLX5_DEVCOM_MPV,
+	MLX5_DEVCOM_HCA_PORTS,
 	MLX5_DEVCOM_NUM_COMPONENTS,
 };
 
@@ -52,4 +53,7 @@ void *mlx5_devcom_get_next_peer_data_rcu(struct mlx5_devcom_comp_dev *devcom,
 	     data;								  \
 	     data = mlx5_devcom_get_next_peer_data_rcu(devcom, &pos))
 
+void mlx5_devcom_comp_lock(struct mlx5_devcom_comp_dev *devcom);
+void mlx5_devcom_comp_unlock(struct mlx5_devcom_comp_dev *devcom);
+
 #endif /* __LIB_MLX5_DEVCOM_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 307ffe6300f8..a17152c1cbb2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -73,6 +73,7 @@
 #include "sf/sf.h"
 #include "mlx5_irq.h"
 #include "hwmon.h"
+#include "lag/lag.h"
 
 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
 MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX series) core driver");
@@ -952,6 +953,27 @@ static void mlx5_pci_close(struct mlx5_core_dev *dev)
 	mlx5_pci_disable_device(dev);
 }
 
+static void mlx5_register_hca_devcom_comp(struct mlx5_core_dev *dev)
+{
+	/* This component is use to sync adding core_dev to lag_dev and to sync
+	 * changes of mlx5_adev_devices between LAG layer and other layers.
+	 */
+	if (!mlx5_lag_is_supported(dev))
+		return;
+
+	dev->priv.hca_devcom_comp =
+		mlx5_devcom_register_component(dev->priv.devc, MLX5_DEVCOM_HCA_PORTS,
+					       mlx5_query_nic_system_image_guid(dev),
+					       NULL, dev);
+	if (IS_ERR_OR_NULL(dev->priv.hca_devcom_comp))
+		mlx5_core_err(dev, "Failed to register devcom HCA component\n");
+}
+
+static void mlx5_unregister_hca_devcom_comp(struct mlx5_core_dev *dev)
+{
+	mlx5_devcom_unregister_component(dev->priv.hca_devcom_comp);
+}
+
 static int mlx5_init_once(struct mlx5_core_dev *dev)
 {
 	int err;
@@ -960,6 +982,7 @@ static int mlx5_init_once(struct mlx5_core_dev *dev)
 	if (IS_ERR(dev->priv.devc))
 		mlx5_core_warn(dev, "failed to register devcom device %ld\n",
 			       PTR_ERR(dev->priv.devc));
+	mlx5_register_hca_devcom_comp(dev);
 
 	err = mlx5_query_board_id(dev);
 	if (err) {
@@ -1094,6 +1117,7 @@ err_eq_cleanup:
 err_irq_cleanup:
 	mlx5_irq_table_cleanup(dev);
 err_devcom:
+	mlx5_unregister_hca_devcom_comp(dev);
 	mlx5_devcom_unregister_device(dev->priv.devc);
 
 	return err;
@@ -1123,6 +1147,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
 	mlx5_events_cleanup(dev);
 	mlx5_eq_table_cleanup(dev);
 	mlx5_irq_table_cleanup(dev);
+	mlx5_unregister_hca_devcom_comp(dev);
 	mlx5_devcom_unregister_device(dev->priv.devc);
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index d348a7f9511f..f191a90f337b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -266,7 +266,6 @@ int mlx5_register_device(struct mlx5_core_dev *dev);
 void mlx5_unregister_device(struct mlx5_core_dev *dev);
 void mlx5_dev_set_lightweight(struct mlx5_core_dev *dev);
 bool mlx5_dev_is_lightweight(struct mlx5_core_dev *dev);
-struct mlx5_core_dev *mlx5_get_next_phys_dev_lag(struct mlx5_core_dev *dev);
 void mlx5_dev_list_lock(void);
 void mlx5_dev_list_unlock(void);
 int mlx5_dev_list_trylock(void);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 7968c5ee85c4..f60cdc9bd40f 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -624,6 +624,7 @@ struct mlx5_priv {
 	struct mlx5_lag		*lag;
 	u32			flags;
 	struct mlx5_devcom_dev	*devc;
+	struct mlx5_devcom_comp_dev *hca_devcom_comp;
 	struct mlx5_fw_reset	*fw_reset;
 	struct mlx5_core_roce	roce;
 	struct mlx5_fc_stats		fc_stats;
-- 
cgit v1.2.3-70-g09d2


From 0d2d6bc7e74fee586f587d33484b797bb78f334c Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Thu, 12 Oct 2023 12:27:41 -0700
Subject: net/mlx5: Remove unused declaration

Commit 2ac9cfe78223 ("net/mlx5e: IPSec, Add Innova IPSec offload TX data path")
declared mlx5e_ipsec_inverse_table_init() but never implemented it.
Commit f52f2faee581 ("net/mlx5e: Introduce flow steering API")
declared mlx5e_fs_set_tc() but never implemented it.
Commit f2f3df550139 ("net/mlx5: EQ, Privatize eq_table and friends")
declared mlx5_eq_comp_cpumask() but never implemented it.
Commit cac1eb2cf2e3 ("net/mlx5: Lag, properly lock eswitch if needed")
removed mlx5_lag_update() but not its declaration.
Commit 35ba005d820b ("net/mlx5: DR, Set flex parser for TNL_MPLS dynamically")
removed mlx5dr_ste_build_tnl_mpls() but not its declaration.

Commit e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters")
declared but never implemented mlx5_alloc_cmd_mailbox_chain() and mlx5_free_cmd_mailbox_chain().
Commit 0cf53c124756 ("net/mlx5: FWPage, Use async events chain")
removed mlx5_core_req_pages_handler() but not its declaration.
Commit 938fe83c8dcb ("net/mlx5_core: New device capabilities handling")
removed mlx5_query_odp_caps() but not its declaration.
Commit f6a8a19bb11b ("RDMA/netdev: Hoist alloc_netdev_mqs out of the driver")
removed mlx5_rdma_netdev_alloc() but not its declaration.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en/fs.h            |  1 -
 .../net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h  |  1 -
 drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h           |  1 -
 drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h        |  2 --
 .../net/ethernet/mellanox/mlx5/core/steering/dr_types.h    |  4 ----
 include/linux/mlx5/driver.h                                | 14 --------------
 6 files changed, 23 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h
index e5a44b0b9616..4d6225e0eec7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h
@@ -150,7 +150,6 @@ struct mlx5e_flow_steering *mlx5e_fs_init(const struct mlx5e_profile *profile,
 					  struct dentry *dfs_root);
 void mlx5e_fs_cleanup(struct mlx5e_flow_steering *fs);
 struct mlx5e_vlan_table *mlx5e_fs_get_vlan(struct mlx5e_flow_steering *fs);
-void mlx5e_fs_set_tc(struct mlx5e_flow_steering *fs, struct mlx5e_tc_table *tc);
 struct mlx5e_tc_table *mlx5e_fs_get_tc(struct mlx5e_flow_steering *fs);
 struct mlx5e_l2_table *mlx5e_fs_get_l2(struct mlx5e_flow_steering *fs);
 struct mlx5_flow_namespace *mlx5e_fs_get_ns(struct mlx5e_flow_steering *fs, bool egress);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h
index 9ee014a8ad24..2ed99772f168 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h
@@ -54,7 +54,6 @@ struct mlx5e_accel_tx_ipsec_state {
 
 #ifdef CONFIG_MLX5_EN_IPSEC
 
-void mlx5e_ipsec_inverse_table_init(void);
 void mlx5e_ipsec_set_iv_esn(struct sk_buff *skb, struct xfrm_state *x,
 			    struct xfrm_offload *xo);
 void mlx5e_ipsec_set_iv(struct sk_buff *skb, struct xfrm_state *x,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
index 69a75459775d..4b7f7131c560 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
@@ -85,7 +85,6 @@ void mlx5_eq_del_cq(struct mlx5_eq *eq, struct mlx5_core_cq *cq);
 struct mlx5_eq_comp *mlx5_eqn2comp_eq(struct mlx5_core_dev *dev, int eqn);
 struct mlx5_eq *mlx5_get_async_eq(struct mlx5_core_dev *dev);
 void mlx5_cq_tasklet_cb(struct tasklet_struct *t);
-struct cpumask *mlx5_eq_comp_cpumask(struct mlx5_core_dev *dev, int ix);
 
 u32 mlx5_eq_poll_irq_disabled(struct mlx5_eq_comp *eq);
 void mlx5_cmd_eq_recover(struct mlx5_core_dev *dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index 13af3ae2cce5..6b14e347d914 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -311,8 +311,6 @@ static inline int mlx5_rescan_drivers(struct mlx5_core_dev *dev)
 	return ret;
 }
 
-void mlx5_lag_update(struct mlx5_core_dev *dev);
-
 enum {
 	MLX5_NIC_IFC_FULL		= 0,
 	MLX5_NIC_IFC_DISABLED		= 1,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
index 55dc7383477c..81eff6c410ce 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
@@ -436,10 +436,6 @@ void mlx5dr_ste_build_mpls(struct mlx5dr_ste_ctx *ste_ctx,
 			   struct mlx5dr_ste_build *sb,
 			   struct mlx5dr_match_param *mask,
 			   bool inner, bool rx);
-void mlx5dr_ste_build_tnl_mpls(struct mlx5dr_ste_ctx *ste_ctx,
-			       struct mlx5dr_ste_build *sb,
-			       struct mlx5dr_match_param *mask,
-			       bool inner, bool rx);
 void mlx5dr_ste_build_tnl_mpls_over_gre(struct mlx5dr_ste_ctx *ste_ctx,
 					struct mlx5dr_ste_build *sb,
 					struct mlx5dr_match_param *mask,
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index f60cdc9bd40f..d2b8d4a74a30 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1043,10 +1043,6 @@ void mlx5_trigger_health_work(struct mlx5_core_dev *dev);
 int mlx5_frag_buf_alloc_node(struct mlx5_core_dev *dev, int size,
 			     struct mlx5_frag_buf *buf, int node);
 void mlx5_frag_buf_free(struct mlx5_core_dev *dev, struct mlx5_frag_buf *buf);
-struct mlx5_cmd_mailbox *mlx5_alloc_cmd_mailbox_chain(struct mlx5_core_dev *dev,
-						      gfp_t flags, int npages);
-void mlx5_free_cmd_mailbox_chain(struct mlx5_core_dev *dev,
-				 struct mlx5_cmd_mailbox *head);
 int mlx5_core_create_mkey(struct mlx5_core_dev *dev, u32 *mkey, u32 *in,
 			  int inlen);
 int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, u32 mkey);
@@ -1060,8 +1056,6 @@ void mlx5_pagealloc_start(struct mlx5_core_dev *dev);
 void mlx5_pagealloc_stop(struct mlx5_core_dev *dev);
 void mlx5_pages_debugfs_init(struct mlx5_core_dev *dev);
 void mlx5_pages_debugfs_cleanup(struct mlx5_core_dev *dev);
-void mlx5_core_req_pages_handler(struct mlx5_core_dev *dev, u16 func_id,
-				 s32 npages, bool ec_function);
 int mlx5_satisfy_startup_pages(struct mlx5_core_dev *dev, int boot);
 int mlx5_reclaim_startup_pages(struct mlx5_core_dev *dev);
 void mlx5_register_debugfs(void);
@@ -1101,8 +1095,6 @@ int mlx5_core_create_psv(struct mlx5_core_dev *dev, u32 pdn,
 int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num);
 __be32 mlx5_core_get_terminate_scatter_list_mkey(struct mlx5_core_dev *dev);
 void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common);
-int mlx5_query_odp_caps(struct mlx5_core_dev *dev,
-			struct mlx5_odp_caps *odp_caps);
 
 int mlx5_init_rl_table(struct mlx5_core_dev *dev);
 void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev);
@@ -1204,12 +1196,6 @@ int mlx5_sriov_blocking_notifier_register(struct mlx5_core_dev *mdev,
 void mlx5_sriov_blocking_notifier_unregister(struct mlx5_core_dev *mdev,
 					     int vf_id,
 					     struct notifier_block *nb);
-#ifdef CONFIG_MLX5_CORE_IPOIB
-struct net_device *mlx5_rdma_netdev_alloc(struct mlx5_core_dev *mdev,
-					  struct ib_device *ibdev,
-					  const char *name,
-					  void (*setup)(struct net_device *));
-#endif /* CONFIG_MLX5_CORE_IPOIB */
 int mlx5_rdma_rn_get_params(struct mlx5_core_dev *mdev,
 			    struct ib_device *device,
 			    struct rdma_netdev_alloc_params *params);
-- 
cgit v1.2.3-70-g09d2


From 49dbe25adac42d3e06f65d1420946bec65896222 Mon Sep 17 00:00:00 2001
From: Arseniy Krasnov <avkrasnov@salutedevices.com>
Date: Tue, 10 Oct 2023 22:15:14 +0300
Subject: vsock: read from socket's error queue

This adds handling of MSG_ERRQUEUE input flag in receive call. This flag
is used to read socket's error queue instead of data queue. Possible
scenario of error queue usage is receiving completions for transmission
with MSG_ZEROCOPY flag. This patch also adds new defines: 'SOL_VSOCK'
and 'VSOCK_RECVERR'.

Signed-off-by: Arseniy Krasnov <avkrasnov@salutedevices.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/socket.h          |  1 +
 include/uapi/linux/vm_sockets.h | 17 +++++++++++++++++
 net/vmw_vsock/af_vsock.c        |  6 ++++++
 3 files changed, 24 insertions(+)

(limited to 'include')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 39b74d83c7c4..cfcb7e2c3813 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -383,6 +383,7 @@ struct ucred {
 #define SOL_MPTCP	284
 #define SOL_MCTP	285
 #define SOL_SMC		286
+#define SOL_VSOCK	287
 
 /* IPX options */
 #define IPX_TYPE	1
diff --git a/include/uapi/linux/vm_sockets.h b/include/uapi/linux/vm_sockets.h
index c60ca33eac59..ed07181d4eff 100644
--- a/include/uapi/linux/vm_sockets.h
+++ b/include/uapi/linux/vm_sockets.h
@@ -191,4 +191,21 @@ struct sockaddr_vm {
 
 #define IOCTL_VM_SOCKETS_GET_LOCAL_CID		_IO(7, 0xb9)
 
+/* MSG_ZEROCOPY notifications are encoded in the standard error format,
+ * sock_extended_err. See Documentation/networking/msg_zerocopy.rst in
+ * kernel source tree for more details.
+ */
+
+/* 'cmsg_level' field value of 'struct cmsghdr' for notification parsing
+ * when MSG_ZEROCOPY flag is used on transmissions.
+ */
+
+#define SOL_VSOCK	287
+
+/* 'cmsg_type' field value of 'struct cmsghdr' for notification parsing
+ * when MSG_ZEROCOPY flag is used on transmissions.
+ */
+
+#define VSOCK_RECVERR	1
+
 #endif /* _UAPI_VM_SOCKETS_H */
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index d841f4de33b0..38486efd3d05 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -89,6 +89,7 @@
 #include <linux/types.h>
 #include <linux/bitops.h>
 #include <linux/cred.h>
+#include <linux/errqueue.h>
 #include <linux/init.h>
 #include <linux/io.h>
 #include <linux/kernel.h>
@@ -110,6 +111,7 @@
 #include <linux/workqueue.h>
 #include <net/sock.h>
 #include <net/af_vsock.h>
+#include <uapi/linux/vm_sockets.h>
 
 static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr);
 static void vsock_sk_destruct(struct sock *sk);
@@ -2137,6 +2139,10 @@ vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 	int err;
 
 	sk = sock->sk;
+
+	if (unlikely(flags & MSG_ERRQUEUE))
+		return sock_recv_errqueue(sk, msg, len, SOL_VSOCK, VSOCK_RECVERR);
+
 	vsk = vsock_sk(sk);
 	err = 0;
 
-- 
cgit v1.2.3-70-g09d2


From 5fbfc7d243343917793ae95a6011f03b5aac4735 Mon Sep 17 00:00:00 2001
From: Arseniy Krasnov <avkrasnov@salutedevices.com>
Date: Tue, 10 Oct 2023 22:15:15 +0300
Subject: vsock: check for MSG_ZEROCOPY support on send

This feature totally depends on transport, so if transport doesn't
support it, return error.

Signed-off-by: Arseniy Krasnov <avkrasnov@salutedevices.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/af_vsock.h   | 7 +++++++
 net/vmw_vsock/af_vsock.c | 6 ++++++
 2 files changed, 13 insertions(+)

(limited to 'include')

diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index b01cf9ac2437..e302c0e804d0 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -177,6 +177,9 @@ struct vsock_transport {
 
 	/* Read a single skb */
 	int (*read_skb)(struct vsock_sock *, skb_read_actor_t);
+
+	/* Zero-copy. */
+	bool (*msgzerocopy_allow)(void);
 };
 
 /**** CORE ****/
@@ -241,4 +244,8 @@ static inline void __init vsock_bpf_build_proto(void)
 {}
 #endif
 
+static inline bool vsock_msgzerocopy_allow(const struct vsock_transport *t)
+{
+	return t->msgzerocopy_allow && t->msgzerocopy_allow();
+}
 #endif /* __AF_VSOCK_H__ */
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 38486efd3d05..71108b1f0dfc 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1824,6 +1824,12 @@ static int vsock_connectible_sendmsg(struct socket *sock, struct msghdr *msg,
 		goto out;
 	}
 
+	if (msg->msg_flags & MSG_ZEROCOPY &&
+	    !vsock_msgzerocopy_allow(transport)) {
+		err = -EOPNOTSUPP;
+		goto out;
+	}
+
 	/* Wait for room in the produce queue to enqueue our user's data. */
 	timeout = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
 
-- 
cgit v1.2.3-70-g09d2


From c3c6ab95c397134bf5948f18743b3ba8008e7c47 Mon Sep 17 00:00:00 2001
From: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Date: Wed, 11 Oct 2023 12:12:33 +0200
Subject: dpll: spec: add support for pin-dpll signal phase offset/adjust

Add attributes for providing the user with:
- measurement of signals phase offset between pin and dpll
- ability to adjust the phase of pin signal

Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/netlink/specs/dpll.yaml | 30 ++++++++++++++++++++++++++++++
 drivers/dpll/dpll_nl.c                |  8 +++++---
 drivers/dpll/dpll_nl.h                |  2 +-
 include/uapi/linux/dpll.h             |  6 ++++++
 4 files changed, 42 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/Documentation/netlink/specs/dpll.yaml b/Documentation/netlink/specs/dpll.yaml
index 1c1b53136c7b..cf8abe1c0550 100644
--- a/Documentation/netlink/specs/dpll.yaml
+++ b/Documentation/netlink/specs/dpll.yaml
@@ -164,6 +164,18 @@ definitions:
       -
         name: state-can-change
         doc: pin state can be changed
+  -
+    type: const
+    name: phase-offset-divider
+    value: 1000
+    doc: |
+      phase offset divider allows userspace to calculate a value of
+      measured signal phase difference between a pin and dpll device
+      as a fractional value with three digit decimal precision.
+      Value of (DPLL_A_PHASE_OFFSET / DPLL_PHASE_OFFSET_DIVIDER) is an
+      integer part of a measured phase offset value.
+      Value of (DPLL_A_PHASE_OFFSET % DPLL_PHASE_OFFSET_DIVIDER) is a
+      fractional part of a measured phase offset value.
 
 attribute-sets:
   -
@@ -272,6 +284,18 @@ attribute-sets:
         type: nest
         multi-attr: true
         nested-attributes: pin-parent-pin
+      -
+        name: phase-adjust-min
+        type: s32
+      -
+        name: phase-adjust-max
+        type: s32
+      -
+        name: phase-adjust
+        type: s32
+      -
+        name: phase-offset
+        type: s64
   -
     name: pin-parent-device
     subset-of: pin
@@ -284,6 +308,8 @@ attribute-sets:
         name: prio
       -
         name: state
+      -
+        name: phase-offset
   -
     name: pin-parent-pin
     subset-of: pin
@@ -431,6 +457,9 @@ operations:
             - capabilities
             - parent-device
             - parent-pin
+            - phase-adjust-min
+            - phase-adjust-max
+            - phase-adjust
 
       dump:
         pre: dpll-lock-dumpit
@@ -458,6 +487,7 @@ operations:
             - state
             - parent-device
             - parent-pin
+            - phase-adjust
     -
       name: pin-create-ntf
       doc: Notification about pin appearing
diff --git a/drivers/dpll/dpll_nl.c b/drivers/dpll/dpll_nl.c
index 14064c8c783b..eaee5be7aa64 100644
--- a/drivers/dpll/dpll_nl.c
+++ b/drivers/dpll/dpll_nl.c
@@ -11,11 +11,12 @@
 #include <uapi/linux/dpll.h>
 
 /* Common nested types */
-const struct nla_policy dpll_pin_parent_device_nl_policy[DPLL_A_PIN_STATE + 1] = {
+const struct nla_policy dpll_pin_parent_device_nl_policy[DPLL_A_PIN_PHASE_OFFSET + 1] = {
 	[DPLL_A_PIN_PARENT_ID] = { .type = NLA_U32, },
 	[DPLL_A_PIN_DIRECTION] = NLA_POLICY_RANGE(NLA_U32, 1, 2),
 	[DPLL_A_PIN_PRIO] = { .type = NLA_U32, },
 	[DPLL_A_PIN_STATE] = NLA_POLICY_RANGE(NLA_U32, 1, 3),
+	[DPLL_A_PIN_PHASE_OFFSET] = { .type = NLA_S64, },
 };
 
 const struct nla_policy dpll_pin_parent_pin_nl_policy[DPLL_A_PIN_STATE + 1] = {
@@ -61,7 +62,7 @@ static const struct nla_policy dpll_pin_get_dump_nl_policy[DPLL_A_PIN_ID + 1] =
 };
 
 /* DPLL_CMD_PIN_SET - do */
-static const struct nla_policy dpll_pin_set_nl_policy[DPLL_A_PIN_PARENT_PIN + 1] = {
+static const struct nla_policy dpll_pin_set_nl_policy[DPLL_A_PIN_PHASE_ADJUST + 1] = {
 	[DPLL_A_PIN_ID] = { .type = NLA_U32, },
 	[DPLL_A_PIN_FREQUENCY] = { .type = NLA_U64, },
 	[DPLL_A_PIN_DIRECTION] = NLA_POLICY_RANGE(NLA_U32, 1, 2),
@@ -69,6 +70,7 @@ static const struct nla_policy dpll_pin_set_nl_policy[DPLL_A_PIN_PARENT_PIN + 1]
 	[DPLL_A_PIN_STATE] = NLA_POLICY_RANGE(NLA_U32, 1, 3),
 	[DPLL_A_PIN_PARENT_DEVICE] = NLA_POLICY_NESTED(dpll_pin_parent_device_nl_policy),
 	[DPLL_A_PIN_PARENT_PIN] = NLA_POLICY_NESTED(dpll_pin_parent_pin_nl_policy),
+	[DPLL_A_PIN_PHASE_ADJUST] = { .type = NLA_S32, },
 };
 
 /* Ops table for dpll */
@@ -140,7 +142,7 @@ static const struct genl_split_ops dpll_nl_ops[] = {
 		.doit		= dpll_nl_pin_set_doit,
 		.post_doit	= dpll_pin_post_doit,
 		.policy		= dpll_pin_set_nl_policy,
-		.maxattr	= DPLL_A_PIN_PARENT_PIN,
+		.maxattr	= DPLL_A_PIN_PHASE_ADJUST,
 		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
 	},
 };
diff --git a/drivers/dpll/dpll_nl.h b/drivers/dpll/dpll_nl.h
index 1f67aaed4742..92d4c9c4f788 100644
--- a/drivers/dpll/dpll_nl.h
+++ b/drivers/dpll/dpll_nl.h
@@ -12,7 +12,7 @@
 #include <uapi/linux/dpll.h>
 
 /* Common nested types */
-extern const struct nla_policy dpll_pin_parent_device_nl_policy[DPLL_A_PIN_STATE + 1];
+extern const struct nla_policy dpll_pin_parent_device_nl_policy[DPLL_A_PIN_PHASE_OFFSET + 1];
 extern const struct nla_policy dpll_pin_parent_pin_nl_policy[DPLL_A_PIN_STATE + 1];
 
 int dpll_lock_doit(const struct genl_split_ops *ops, struct sk_buff *skb,
diff --git a/include/uapi/linux/dpll.h b/include/uapi/linux/dpll.h
index 20ef0718f8dc..715a491d2727 100644
--- a/include/uapi/linux/dpll.h
+++ b/include/uapi/linux/dpll.h
@@ -138,6 +138,8 @@ enum dpll_pin_capabilities {
 	DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE = 4,
 };
 
+#define DPLL_PHASE_OFFSET_DIVIDER	1000
+
 enum dpll_a {
 	DPLL_A_ID = 1,
 	DPLL_A_MODULE_NAME,
@@ -173,6 +175,10 @@ enum dpll_a_pin {
 	DPLL_A_PIN_CAPABILITIES,
 	DPLL_A_PIN_PARENT_DEVICE,
 	DPLL_A_PIN_PARENT_PIN,
+	DPLL_A_PIN_PHASE_ADJUST_MIN,
+	DPLL_A_PIN_PHASE_ADJUST_MAX,
+	DPLL_A_PIN_PHASE_ADJUST,
+	DPLL_A_PIN_PHASE_OFFSET,
 
 	__DPLL_A_PIN_MAX,
 	DPLL_A_PIN_MAX = (__DPLL_A_PIN_MAX - 1)
-- 
cgit v1.2.3-70-g09d2


From d7fbc0b7e846e9e0e70ae766d274b8720fbab412 Mon Sep 17 00:00:00 2001
From: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Date: Wed, 11 Oct 2023 12:12:34 +0200
Subject: dpll: netlink/core: add support for pin-dpll signal phase
 offset/adjust

Add callback ops for pin-dpll phase measurement.
Add callback for pin signal phase adjustment.
Add min and max phase adjustment values to pin proprties.
Invoke callbacks in dpll_netlink.c when filling the pin details to
provide user with phase related attribute values.

Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/dpll/dpll_netlink.c | 138 +++++++++++++++++++++++++++++++++++++++++++-
 include/linux/dpll.h        |  18 ++++++
 2 files changed, 155 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c
index e20daba6896a..09a6c2a1ea92 100644
--- a/drivers/dpll/dpll_netlink.c
+++ b/drivers/dpll/dpll_netlink.c
@@ -212,6 +212,53 @@ dpll_msg_add_pin_direction(struct sk_buff *msg, struct dpll_pin *pin,
 	return 0;
 }
 
+static int
+dpll_msg_add_pin_phase_adjust(struct sk_buff *msg, struct dpll_pin *pin,
+			      struct dpll_pin_ref *ref,
+			      struct netlink_ext_ack *extack)
+{
+	const struct dpll_pin_ops *ops = dpll_pin_ops(ref);
+	struct dpll_device *dpll = ref->dpll;
+	s32 phase_adjust;
+	int ret;
+
+	if (!ops->phase_adjust_get)
+		return 0;
+	ret = ops->phase_adjust_get(pin, dpll_pin_on_dpll_priv(dpll, pin),
+				    dpll, dpll_priv(dpll),
+				    &phase_adjust, extack);
+	if (ret)
+		return ret;
+	if (nla_put_s32(msg, DPLL_A_PIN_PHASE_ADJUST, phase_adjust))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int
+dpll_msg_add_phase_offset(struct sk_buff *msg, struct dpll_pin *pin,
+			  struct dpll_pin_ref *ref,
+			  struct netlink_ext_ack *extack)
+{
+	const struct dpll_pin_ops *ops = dpll_pin_ops(ref);
+	struct dpll_device *dpll = ref->dpll;
+	s64 phase_offset;
+	int ret;
+
+	if (!ops->phase_offset_get)
+		return 0;
+	ret = ops->phase_offset_get(pin, dpll_pin_on_dpll_priv(dpll, pin),
+				    dpll, dpll_priv(dpll), &phase_offset,
+				    extack);
+	if (ret)
+		return ret;
+	if (nla_put_64bit(msg, DPLL_A_PIN_PHASE_OFFSET, sizeof(phase_offset),
+			  &phase_offset, DPLL_A_PIN_PAD))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
 static int
 dpll_msg_add_pin_freq(struct sk_buff *msg, struct dpll_pin *pin,
 		      struct dpll_pin_ref *ref, struct netlink_ext_ack *extack)
@@ -330,6 +377,9 @@ dpll_msg_add_pin_dplls(struct sk_buff *msg, struct dpll_pin *pin,
 		if (ret)
 			goto nest_cancel;
 		ret = dpll_msg_add_pin_direction(msg, pin, ref, extack);
+		if (ret)
+			goto nest_cancel;
+		ret = dpll_msg_add_phase_offset(msg, pin, ref, extack);
 		if (ret)
 			goto nest_cancel;
 		nla_nest_end(msg, attr);
@@ -377,6 +427,15 @@ dpll_cmd_pin_get_one(struct sk_buff *msg, struct dpll_pin *pin,
 	if (nla_put_u32(msg, DPLL_A_PIN_CAPABILITIES, prop->capabilities))
 		return -EMSGSIZE;
 	ret = dpll_msg_add_pin_freq(msg, pin, ref, extack);
+	if (ret)
+		return ret;
+	if (nla_put_s32(msg, DPLL_A_PIN_PHASE_ADJUST_MIN,
+			prop->phase_range.min))
+		return -EMSGSIZE;
+	if (nla_put_s32(msg, DPLL_A_PIN_PHASE_ADJUST_MAX,
+			prop->phase_range.max))
+		return -EMSGSIZE;
+	ret = dpll_msg_add_pin_phase_adjust(msg, pin, ref, extack);
 	if (ret)
 		return ret;
 	if (xa_empty(&pin->parent_refs))
@@ -416,7 +475,7 @@ dpll_device_get_one(struct dpll_device *dpll, struct sk_buff *msg,
 	if (nla_put_u32(msg, DPLL_A_TYPE, dpll->type))
 		return -EMSGSIZE;
 
-	return ret;
+	return 0;
 }
 
 static int
@@ -705,6 +764,78 @@ dpll_pin_direction_set(struct dpll_pin *pin, struct dpll_device *dpll,
 	return 0;
 }
 
+static int
+dpll_pin_phase_adj_set(struct dpll_pin *pin, struct nlattr *phase_adj_attr,
+		       struct netlink_ext_ack *extack)
+{
+	struct dpll_pin_ref *ref, *failed;
+	const struct dpll_pin_ops *ops;
+	s32 phase_adj, old_phase_adj;
+	struct dpll_device *dpll;
+	unsigned long i;
+	int ret;
+
+	phase_adj = nla_get_s32(phase_adj_attr);
+	if (phase_adj > pin->prop->phase_range.max ||
+	    phase_adj < pin->prop->phase_range.min) {
+		NL_SET_ERR_MSG_ATTR(extack, phase_adj_attr,
+				    "phase adjust value not supported");
+		return -EINVAL;
+	}
+
+	xa_for_each(&pin->dpll_refs, i, ref) {
+		ops = dpll_pin_ops(ref);
+		if (!ops->phase_adjust_set || !ops->phase_adjust_get) {
+			NL_SET_ERR_MSG(extack, "phase adjust not supported");
+			return -EOPNOTSUPP;
+		}
+	}
+	ref = dpll_xa_ref_dpll_first(&pin->dpll_refs);
+	ops = dpll_pin_ops(ref);
+	dpll = ref->dpll;
+	ret = ops->phase_adjust_get(pin, dpll_pin_on_dpll_priv(dpll, pin),
+				    dpll, dpll_priv(dpll), &old_phase_adj,
+				    extack);
+	if (ret) {
+		NL_SET_ERR_MSG(extack, "unable to get old phase adjust value");
+		return ret;
+	}
+	if (phase_adj == old_phase_adj)
+		return 0;
+
+	xa_for_each(&pin->dpll_refs, i, ref) {
+		ops = dpll_pin_ops(ref);
+		dpll = ref->dpll;
+		ret = ops->phase_adjust_set(pin,
+					    dpll_pin_on_dpll_priv(dpll, pin),
+					    dpll, dpll_priv(dpll), phase_adj,
+					    extack);
+		if (ret) {
+			failed = ref;
+			NL_SET_ERR_MSG_FMT(extack,
+					   "phase adjust set failed for dpll_id:%u",
+					   dpll->id);
+			goto rollback;
+		}
+	}
+	__dpll_pin_change_ntf(pin);
+
+	return 0;
+
+rollback:
+	xa_for_each(&pin->dpll_refs, i, ref) {
+		if (ref == failed)
+			break;
+		ops = dpll_pin_ops(ref);
+		dpll = ref->dpll;
+		if (ops->phase_adjust_set(pin, dpll_pin_on_dpll_priv(dpll, pin),
+					  dpll, dpll_priv(dpll), old_phase_adj,
+					  extack))
+			NL_SET_ERR_MSG(extack, "set phase adjust rollback failed");
+	}
+	return ret;
+}
+
 static int
 dpll_pin_parent_device_set(struct dpll_pin *pin, struct nlattr *parent_nest,
 			   struct netlink_ext_ack *extack)
@@ -793,6 +924,11 @@ dpll_pin_set_from_nlattr(struct dpll_pin *pin, struct genl_info *info)
 			if (ret)
 				return ret;
 			break;
+		case DPLL_A_PIN_PHASE_ADJUST:
+			ret = dpll_pin_phase_adj_set(pin, a, info->extack);
+			if (ret)
+				return ret;
+			break;
 		case DPLL_A_PIN_PARENT_DEVICE:
 			ret = dpll_pin_parent_device_set(pin, a, info->extack);
 			if (ret)
diff --git a/include/linux/dpll.h b/include/linux/dpll.h
index bbc480cd2932..578fc5fa3750 100644
--- a/include/linux/dpll.h
+++ b/include/linux/dpll.h
@@ -68,6 +68,18 @@ struct dpll_pin_ops {
 	int (*prio_set)(const struct dpll_pin *pin, void *pin_priv,
 			const struct dpll_device *dpll, void *dpll_priv,
 			const u32 prio, struct netlink_ext_ack *extack);
+	int (*phase_offset_get)(const struct dpll_pin *pin, void *pin_priv,
+				const struct dpll_device *dpll, void *dpll_priv,
+				s64 *phase_offset,
+				struct netlink_ext_ack *extack);
+	int (*phase_adjust_get)(const struct dpll_pin *pin, void *pin_priv,
+				const struct dpll_device *dpll, void *dpll_priv,
+				s32 *phase_adjust,
+				struct netlink_ext_ack *extack);
+	int (*phase_adjust_set)(const struct dpll_pin *pin, void *pin_priv,
+				const struct dpll_device *dpll, void *dpll_priv,
+				const s32 phase_adjust,
+				struct netlink_ext_ack *extack);
 };
 
 struct dpll_pin_frequency {
@@ -91,6 +103,11 @@ struct dpll_pin_frequency {
 #define DPLL_PIN_FREQUENCY_DCF77 \
 	DPLL_PIN_FREQUENCY(DPLL_PIN_FREQUENCY_77_5_KHZ)
 
+struct dpll_pin_phase_adjust_range {
+	s32 min;
+	s32 max;
+};
+
 struct dpll_pin_properties {
 	const char *board_label;
 	const char *panel_label;
@@ -99,6 +116,7 @@ struct dpll_pin_properties {
 	unsigned long capabilities;
 	u32 freq_supported_num;
 	struct dpll_pin_frequency *freq_supported;
+	struct dpll_pin_phase_adjust_range phase_range;
 };
 
 #if IS_ENABLED(CONFIG_DPLL)
-- 
cgit v1.2.3-70-g09d2


From 60c6946675fc06dd2fd2b7a4b6fd1c1f046f1056 Mon Sep 17 00:00:00 2001
From: Xabier Marquiegui <reibax@gmail.com>
Date: Thu, 12 Oct 2023 00:39:53 +0200
Subject: posix-clock: introduce posix_clock_context concept

Add the necessary structure to support custom private-data per
posix-clock user.

The previous implementation of posix-clock assumed all file open
instances need access to the same clock structure on private_data.

The need for individual data structures per file open instance has been
identified when developing support for multiple timestamp event queue
users for ptp_clock.

Signed-off-by: Xabier Marquiegui <reibax@gmail.com>
Suggested-by: Richard Cochran <richardcochran@gmail.com>
Suggested-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ptp/ptp_chardev.c   | 21 +++++++++++++--------
 drivers/ptp/ptp_private.h   | 16 +++++++++-------
 include/linux/posix-clock.h | 35 +++++++++++++++++++++++++++--------
 kernel/time/posix-clock.c   | 36 +++++++++++++++++++++++++++---------
 4 files changed, 76 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c
index 362bf756e6b7..0ba3e7064df2 100644
--- a/drivers/ptp/ptp_chardev.c
+++ b/drivers/ptp/ptp_chardev.c
@@ -101,14 +101,16 @@ int ptp_set_pinfunc(struct ptp_clock *ptp, unsigned int pin,
 	return 0;
 }
 
-int ptp_open(struct posix_clock *pc, fmode_t fmode)
+int ptp_open(struct posix_clock_context *pccontext, fmode_t fmode)
 {
 	return 0;
 }
 
-long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
+long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd,
+	       unsigned long arg)
 {
-	struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock);
+	struct ptp_clock *ptp =
+		container_of(pccontext->clk, struct ptp_clock, clock);
 	struct ptp_sys_offset_extended *extoff = NULL;
 	struct ptp_sys_offset_precise precise_offset;
 	struct system_device_crosststamp xtstamp;
@@ -432,9 +434,11 @@ out:
 	return err;
 }
 
-__poll_t ptp_poll(struct posix_clock *pc, struct file *fp, poll_table *wait)
+__poll_t ptp_poll(struct posix_clock_context *pccontext, struct file *fp,
+		  poll_table *wait)
 {
-	struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock);
+	struct ptp_clock *ptp =
+		container_of(pccontext->clk, struct ptp_clock, clock);
 
 	poll_wait(fp, &ptp->tsev_wq, wait);
 
@@ -443,10 +447,11 @@ __poll_t ptp_poll(struct posix_clock *pc, struct file *fp, poll_table *wait)
 
 #define EXTTS_BUFSIZE (PTP_BUF_TIMESTAMPS * sizeof(struct ptp_extts_event))
 
-ssize_t ptp_read(struct posix_clock *pc,
-		 uint rdflags, char __user *buf, size_t cnt)
+ssize_t ptp_read(struct posix_clock_context *pccontext, uint rdflags,
+		 char __user *buf, size_t cnt)
 {
-	struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock);
+	struct ptp_clock *ptp =
+		container_of(pccontext->clk, struct ptp_clock, clock);
 	struct timestamp_event_queue *queue = &ptp->tsevq;
 	struct ptp_extts_event *event;
 	unsigned long flags;
diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h
index 75f58fc468a7..a3110c85f694 100644
--- a/drivers/ptp/ptp_private.h
+++ b/drivers/ptp/ptp_private.h
@@ -117,16 +117,18 @@ extern struct class *ptp_class;
 int ptp_set_pinfunc(struct ptp_clock *ptp, unsigned int pin,
 		    enum ptp_pin_function func, unsigned int chan);
 
-long ptp_ioctl(struct posix_clock *pc,
-	       unsigned int cmd, unsigned long arg);
+long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd,
+	       unsigned long arg);
 
-int ptp_open(struct posix_clock *pc, fmode_t fmode);
+int ptp_open(struct posix_clock_context *pccontext, fmode_t fmode);
 
-ssize_t ptp_read(struct posix_clock *pc,
-		 uint flags, char __user *buf, size_t cnt);
+int ptp_release(struct posix_clock_context *pccontext);
 
-__poll_t ptp_poll(struct posix_clock *pc,
-	      struct file *fp, poll_table *wait);
+ssize_t ptp_read(struct posix_clock_context *pccontext, uint flags, char __user *buf,
+		 size_t cnt);
+
+__poll_t ptp_poll(struct posix_clock_context *pccontext, struct file *fp,
+		  poll_table *wait);
 
 /*
  * see ptp_sysfs.c
diff --git a/include/linux/posix-clock.h b/include/linux/posix-clock.h
index 468328b1e1dd..ef8619f48920 100644
--- a/include/linux/posix-clock.h
+++ b/include/linux/posix-clock.h
@@ -14,6 +14,7 @@
 #include <linux/rwsem.h>
 
 struct posix_clock;
+struct posix_clock_context;
 
 /**
  * struct posix_clock_operations - functional interface to the clock
@@ -50,18 +51,18 @@ struct posix_clock_operations {
 	/*
 	 * Optional character device methods:
 	 */
-	long    (*ioctl)   (struct posix_clock *pc,
-			    unsigned int cmd, unsigned long arg);
+	long (*ioctl)(struct posix_clock_context *pccontext, unsigned int cmd,
+		      unsigned long arg);
 
-	int     (*open)    (struct posix_clock *pc, fmode_t f_mode);
+	int (*open)(struct posix_clock_context *pccontext, fmode_t f_mode);
 
-	__poll_t (*poll)   (struct posix_clock *pc,
-			    struct file *file, poll_table *wait);
+	__poll_t (*poll)(struct posix_clock_context *pccontext, struct file *file,
+			 poll_table *wait);
 
-	int     (*release) (struct posix_clock *pc);
+	int (*release)(struct posix_clock_context *pccontext);
 
-	ssize_t (*read)    (struct posix_clock *pc,
-			    uint flags, char __user *buf, size_t cnt);
+	ssize_t (*read)(struct posix_clock_context *pccontext, uint flags,
+			char __user *buf, size_t cnt);
 };
 
 /**
@@ -90,6 +91,24 @@ struct posix_clock {
 	bool zombie;
 };
 
+/**
+ * struct posix_clock_context - represents clock file operations context
+ *
+ * @clk:              Pointer to the clock
+ * @private_clkdata:  Pointer to user data
+ *
+ * Drivers should use struct posix_clock_context during specific character
+ * device file operation methods to access the posix clock.
+ *
+ * Drivers can store a private data structure during the open operation
+ * if they have specific information that is required in other file
+ * operations.
+ */
+struct posix_clock_context {
+	struct posix_clock *clk;
+	void *private_clkdata;
+};
+
 /**
  * posix_clock_register() - register a new clock
  * @clk:   Pointer to the clock. Caller must provide 'ops' field
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 77c0c2370b6d..9de66bbbb3d1 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -19,7 +19,8 @@
  */
 static struct posix_clock *get_posix_clock(struct file *fp)
 {
-	struct posix_clock *clk = fp->private_data;
+	struct posix_clock_context *pccontext = fp->private_data;
+	struct posix_clock *clk = pccontext->clk;
 
 	down_read(&clk->rwsem);
 
@@ -39,6 +40,7 @@ static void put_posix_clock(struct posix_clock *clk)
 static ssize_t posix_clock_read(struct file *fp, char __user *buf,
 				size_t count, loff_t *ppos)
 {
+	struct posix_clock_context *pccontext = fp->private_data;
 	struct posix_clock *clk = get_posix_clock(fp);
 	int err = -EINVAL;
 
@@ -46,7 +48,7 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf,
 		return -ENODEV;
 
 	if (clk->ops.read)
-		err = clk->ops.read(clk, fp->f_flags, buf, count);
+		err = clk->ops.read(pccontext, fp->f_flags, buf, count);
 
 	put_posix_clock(clk);
 
@@ -55,6 +57,7 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf,
 
 static __poll_t posix_clock_poll(struct file *fp, poll_table *wait)
 {
+	struct posix_clock_context *pccontext = fp->private_data;
 	struct posix_clock *clk = get_posix_clock(fp);
 	__poll_t result = 0;
 
@@ -62,7 +65,7 @@ static __poll_t posix_clock_poll(struct file *fp, poll_table *wait)
 		return EPOLLERR;
 
 	if (clk->ops.poll)
-		result = clk->ops.poll(clk, fp, wait);
+		result = clk->ops.poll(pccontext, fp, wait);
 
 	put_posix_clock(clk);
 
@@ -72,6 +75,7 @@ static __poll_t posix_clock_poll(struct file *fp, poll_table *wait)
 static long posix_clock_ioctl(struct file *fp,
 			      unsigned int cmd, unsigned long arg)
 {
+	struct posix_clock_context *pccontext = fp->private_data;
 	struct posix_clock *clk = get_posix_clock(fp);
 	int err = -ENOTTY;
 
@@ -79,7 +83,7 @@ static long posix_clock_ioctl(struct file *fp,
 		return -ENODEV;
 
 	if (clk->ops.ioctl)
-		err = clk->ops.ioctl(clk, cmd, arg);
+		err = clk->ops.ioctl(pccontext, cmd, arg);
 
 	put_posix_clock(clk);
 
@@ -90,6 +94,7 @@ static long posix_clock_ioctl(struct file *fp,
 static long posix_clock_compat_ioctl(struct file *fp,
 				     unsigned int cmd, unsigned long arg)
 {
+	struct posix_clock_context *pccontext = fp->private_data;
 	struct posix_clock *clk = get_posix_clock(fp);
 	int err = -ENOTTY;
 
@@ -97,7 +102,7 @@ static long posix_clock_compat_ioctl(struct file *fp,
 		return -ENODEV;
 
 	if (clk->ops.ioctl)
-		err = clk->ops.ioctl(clk, cmd, arg);
+		err = clk->ops.ioctl(pccontext, cmd, arg);
 
 	put_posix_clock(clk);
 
@@ -110,6 +115,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp)
 	int err;
 	struct posix_clock *clk =
 		container_of(inode->i_cdev, struct posix_clock, cdev);
+	struct posix_clock_context *pccontext;
 
 	down_read(&clk->rwsem);
 
@@ -117,14 +123,20 @@ static int posix_clock_open(struct inode *inode, struct file *fp)
 		err = -ENODEV;
 		goto out;
 	}
+	pccontext = kzalloc(sizeof(*pccontext), GFP_KERNEL);
+	if (!pccontext) {
+		err = -ENOMEM;
+		goto out;
+	}
+	pccontext->clk = clk;
+	fp->private_data = pccontext;
 	if (clk->ops.open)
-		err = clk->ops.open(clk, fp->f_mode);
+		err = clk->ops.open(pccontext, fp->f_mode);
 	else
 		err = 0;
 
 	if (!err) {
 		get_device(clk->dev);
-		fp->private_data = clk;
 	}
 out:
 	up_read(&clk->rwsem);
@@ -133,14 +145,20 @@ out:
 
 static int posix_clock_release(struct inode *inode, struct file *fp)
 {
-	struct posix_clock *clk = fp->private_data;
+	struct posix_clock_context *pccontext = fp->private_data;
+	struct posix_clock *clk;
 	int err = 0;
 
+	if (!pccontext)
+		return -ENODEV;
+	clk = pccontext->clk;
+
 	if (clk->ops.release)
-		err = clk->ops.release(clk);
+		err = clk->ops.release(pccontext);
 
 	put_device(clk->dev);
 
+	kfree(pccontext);
 	fp->private_data = NULL;
 
 	return err;
-- 
cgit v1.2.3-70-g09d2


From c5a445b1e9347b14752b01f1a304bd7a2f260acc Mon Sep 17 00:00:00 2001
From: Xabier Marquiegui <reibax@gmail.com>
Date: Thu, 12 Oct 2023 00:39:56 +0200
Subject: ptp: support event queue reader channel masks

On systems with multiple timestamp event channels, some readers might
want to receive only a subset of those channels.

Add the necessary modifications to support timestamp event channel
filtering, including two IOCTL operations:

- Clear all channels
- Enable one channel

The mask modification operations will be applied exclusively on the
event queue assigned to the file descriptor used on the IOCTL operation,
so the typical procedure to have a reader receiving only a subset of the
enabled channels would be:

- Open device file
- ioctl: clear all channels
- ioctl: enable one channel
- start reading

Calling the enable one channel ioctl more than once will result in
multiple enabled channels.

Signed-off-by: Xabier Marquiegui <reibax@gmail.com>
Suggested-by: Richard Cochran <richardcochran@gmail.com>
Suggested-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ptp/ptp_chardev.c      | 26 ++++++++++++++++++++++++++
 drivers/ptp/ptp_clock.c        | 12 ++++++++++--
 drivers/ptp/ptp_private.h      |  3 +++
 include/uapi/linux/ptp_clock.h |  2 ++
 4 files changed, 41 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c
index abe94bb80cf6..ac2f2b5ea0b7 100644
--- a/drivers/ptp/ptp_chardev.c
+++ b/drivers/ptp/ptp_chardev.c
@@ -110,6 +110,12 @@ int ptp_open(struct posix_clock_context *pccontext, fmode_t fmode)
 	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
 	if (!queue)
 		return -EINVAL;
+	queue->mask = bitmap_alloc(PTP_MAX_CHANNELS, GFP_KERNEL);
+	if (!queue->mask) {
+		kfree(queue);
+		return -EINVAL;
+	}
+	bitmap_set(queue->mask, 0, PTP_MAX_CHANNELS);
 	spin_lock_init(&queue->lock);
 	list_add_tail(&queue->qlist, &ptp->tsevqs);
 	pccontext->private_clkdata = queue;
@@ -126,6 +132,7 @@ int ptp_release(struct posix_clock_context *pccontext)
 		spin_lock_irqsave(&queue->lock, flags);
 		list_del(&queue->qlist);
 		spin_unlock_irqrestore(&queue->lock, flags);
+		bitmap_free(queue->mask);
 		kfree(queue);
 	}
 	return 0;
@@ -141,6 +148,7 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd,
 	struct system_device_crosststamp xtstamp;
 	struct ptp_clock_info *ops = ptp->info;
 	struct ptp_sys_offset *sysoff = NULL;
+	struct timestamp_event_queue *tsevq;
 	struct ptp_system_timestamp sts;
 	struct ptp_clock_request req;
 	struct ptp_clock_caps caps;
@@ -150,6 +158,8 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd,
 	struct timespec64 ts;
 	int enable, err = 0;
 
+	tsevq = pccontext->private_clkdata;
+
 	switch (cmd) {
 
 	case PTP_CLOCK_GETCAPS:
@@ -448,6 +458,22 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd,
 		mutex_unlock(&ptp->pincfg_mux);
 		break;
 
+	case PTP_MASK_CLEAR_ALL:
+		bitmap_clear(tsevq->mask, 0, PTP_MAX_CHANNELS);
+		break;
+
+	case PTP_MASK_EN_SINGLE:
+		if (copy_from_user(&i, (void __user *)arg, sizeof(i))) {
+			err = -EFAULT;
+			break;
+		}
+		if (i >= PTP_MAX_CHANNELS) {
+			err = -EFAULT;
+			break;
+		}
+		set_bit(i, tsevq->mask);
+		break;
+
 	default:
 		err = -ENOTTY;
 		break;
diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c
index 74f1ce2dbccb..ed16d9787ce9 100644
--- a/drivers/ptp/ptp_clock.c
+++ b/drivers/ptp/ptp_clock.c
@@ -183,6 +183,7 @@ static void ptp_clock_release(struct device *dev)
 	spin_lock_irqsave(&tsevq->lock, flags);
 	list_del(&tsevq->qlist);
 	spin_unlock_irqrestore(&tsevq->lock, flags);
+	bitmap_free(tsevq->mask);
 	kfree(tsevq);
 	ida_free(&ptp_clocks_map, ptp->index);
 	kfree(ptp);
@@ -243,6 +244,10 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info,
 	if (!queue)
 		goto no_memory_queue;
 	list_add_tail(&queue->qlist, &ptp->tsevqs);
+	queue->mask = bitmap_alloc(PTP_MAX_CHANNELS, GFP_KERNEL);
+	if (!queue->mask)
+		goto no_memory_bitmap;
+	bitmap_set(queue->mask, 0, PTP_MAX_CHANNELS);
 	spin_lock_init(&queue->lock);
 	mutex_init(&ptp->pincfg_mux);
 	mutex_init(&ptp->n_vclocks_mux);
@@ -346,6 +351,8 @@ no_mem_for_vclocks:
 kworker_err:
 	mutex_destroy(&ptp->pincfg_mux);
 	mutex_destroy(&ptp->n_vclocks_mux);
+	bitmap_free(queue->mask);
+no_memory_bitmap:
 	list_del(&queue->qlist);
 	kfree(queue);
 no_memory_queue:
@@ -400,9 +407,10 @@ void ptp_clock_event(struct ptp_clock *ptp, struct ptp_clock_event *event)
 		break;
 
 	case PTP_CLOCK_EXTTS:
-		/* Enqueue timestamp on all queues */
+		/* Enqueue timestamp on selected queues */
 		list_for_each_entry(tsevq, &ptp->tsevqs, qlist) {
-			enqueue_external_timestamp(tsevq, event);
+			if (test_bit((unsigned int)event->index, tsevq->mask))
+				enqueue_external_timestamp(tsevq, event);
 		}
 		wake_up_interruptible(&ptp->tsev_wq);
 		break;
diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h
index 9d5f3d95058e..ad4ce1b25c86 100644
--- a/drivers/ptp/ptp_private.h
+++ b/drivers/ptp/ptp_private.h
@@ -16,10 +16,12 @@
 #include <linux/ptp_clock_kernel.h>
 #include <linux/time.h>
 #include <linux/list.h>
+#include <linux/bitmap.h>
 
 #define PTP_MAX_TIMESTAMPS 128
 #define PTP_BUF_TIMESTAMPS 30
 #define PTP_DEFAULT_MAX_VCLOCKS 20
+#define PTP_MAX_CHANNELS 2048
 
 struct timestamp_event_queue {
 	struct ptp_extts_event buf[PTP_MAX_TIMESTAMPS];
@@ -27,6 +29,7 @@ struct timestamp_event_queue {
 	int tail;
 	spinlock_t lock;
 	struct list_head qlist;
+	unsigned long *mask;
 };
 
 struct ptp_clock {
diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h
index 05cc35fc94ac..da700999cad4 100644
--- a/include/uapi/linux/ptp_clock.h
+++ b/include/uapi/linux/ptp_clock.h
@@ -224,6 +224,8 @@ struct ptp_pin_desc {
 	_IOWR(PTP_CLK_MAGIC, 17, struct ptp_sys_offset_precise)
 #define PTP_SYS_OFFSET_EXTENDED2 \
 	_IOWR(PTP_CLK_MAGIC, 18, struct ptp_sys_offset_extended)
+#define PTP_MASK_CLEAR_ALL  _IO(PTP_CLK_MAGIC, 19)
+#define PTP_MASK_EN_SINGLE  _IOW(PTP_CLK_MAGIC, 20, unsigned int)
 
 struct ptp_extts_event {
 	struct ptp_clock_time t; /* Time event occured. */
-- 
cgit v1.2.3-70-g09d2


From bf3fcbf7e7a08015d3b169bad6281b29d45c272d Mon Sep 17 00:00:00 2001
From: Beniamino Galvani <b.galvani@gmail.com>
Date: Mon, 16 Oct 2023 09:15:20 +0200
Subject: ipv4: rename and move ip_route_output_tunnel()

At the moment ip_route_output_tunnel() is used only by bareudp.
Ideally, other UDP tunnel implementations should use it, but to do so
the function needs to accept new parameters that are specific for UDP
tunnels, such as the ports.

Prepare for these changes by renaming the function to
udp_tunnel_dst_lookup() and move it to file
net/ipv4/udp_tunnel_core.c.

Suggested-by: Guillaume Nault <gnault@redhat.com>
Signed-off-by: Beniamino Galvani <b.galvani@gmail.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bareudp.c      |  8 ++++----
 include/net/route.h        |  6 ------
 include/net/udp_tunnel.h   |  6 ++++++
 net/ipv4/route.c           | 48 ----------------------------------------------
 net/ipv4/udp_tunnel_core.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 58 insertions(+), 58 deletions(-)

(limited to 'include')

diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c
index 683203f87ae2..63fc32fa1af5 100644
--- a/drivers/net/bareudp.c
+++ b/drivers/net/bareudp.c
@@ -306,8 +306,8 @@ static int bareudp_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	if (!sock)
 		return -ESHUTDOWN;
 
-	rt = ip_route_output_tunnel(skb, dev, bareudp->net, &saddr, info,
-				    IPPROTO_UDP, use_cache);
+	rt = udp_tunnel_dst_lookup(skb, dev, bareudp->net, &saddr, info,
+				   IPPROTO_UDP, use_cache);
 
 	if (IS_ERR(rt))
 		return PTR_ERR(rt);
@@ -483,8 +483,8 @@ static int bareudp_fill_metadata_dst(struct net_device *dev,
 		struct rtable *rt;
 		__be32 saddr;
 
-		rt = ip_route_output_tunnel(skb, dev, bareudp->net, &saddr,
-					    info, IPPROTO_UDP, use_cache);
+		rt = udp_tunnel_dst_lookup(skb, dev, bareudp->net, &saddr,
+					   info, IPPROTO_UDP, use_cache);
 		if (IS_ERR(rt))
 			return PTR_ERR(rt);
 
diff --git a/include/net/route.h b/include/net/route.h
index 5c248a8e3d0e..980ab474eabd 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -136,12 +136,6 @@ static inline struct rtable *__ip_route_output_key(struct net *net,
 
 struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
 				    const struct sock *sk);
-struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
-				      struct net_device *dev,
-				      struct net *net, __be32 *saddr,
-				      const struct ip_tunnel_info *info,
-				      u8 protocol, bool use_cache);
-
 struct dst_entry *ipv4_blackhole_route(struct net *net,
 				       struct dst_entry *dst_orig);
 
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 21ba0a25f936..11e810ca5088 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -162,6 +162,12 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
 
 void udp_tunnel_sock_release(struct socket *sock);
 
+struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb,
+				     struct net_device *dev,
+				     struct net *net, __be32 *saddr,
+				     const struct ip_tunnel_info *info,
+				     u8 protocol, bool use_cache);
+
 struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family,
 				    __be16 flags, __be64 tunnel_id,
 				    int md_size);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index e2bf4602b559..3290a4442b4a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2885,54 +2885,6 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
 }
 EXPORT_SYMBOL_GPL(ip_route_output_flow);
 
-struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
-				      struct net_device *dev,
-				      struct net *net, __be32 *saddr,
-				      const struct ip_tunnel_info *info,
-				      u8 protocol, bool use_cache)
-{
-#ifdef CONFIG_DST_CACHE
-	struct dst_cache *dst_cache;
-#endif
-	struct rtable *rt = NULL;
-	struct flowi4 fl4;
-	__u8 tos;
-
-#ifdef CONFIG_DST_CACHE
-	dst_cache = (struct dst_cache *)&info->dst_cache;
-	if (use_cache) {
-		rt = dst_cache_get_ip4(dst_cache, saddr);
-		if (rt)
-			return rt;
-	}
-#endif
-	memset(&fl4, 0, sizeof(fl4));
-	fl4.flowi4_mark = skb->mark;
-	fl4.flowi4_proto = protocol;
-	fl4.daddr = info->key.u.ipv4.dst;
-	fl4.saddr = info->key.u.ipv4.src;
-	tos = info->key.tos;
-	fl4.flowi4_tos = RT_TOS(tos);
-
-	rt = ip_route_output_key(net, &fl4);
-	if (IS_ERR(rt)) {
-		netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
-		return ERR_PTR(-ENETUNREACH);
-	}
-	if (rt->dst.dev == dev) { /* is this necessary? */
-		netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
-		ip_rt_put(rt);
-		return ERR_PTR(-ELOOP);
-	}
-#ifdef CONFIG_DST_CACHE
-	if (use_cache)
-		dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
-#endif
-	*saddr = fl4.saddr;
-	return rt;
-}
-EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
-
 /* called with rcu_read_lock held */
 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
 			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c
index 1e7e4aecdc48..96f93f92b6ce 100644
--- a/net/ipv4/udp_tunnel_core.c
+++ b/net/ipv4/udp_tunnel_core.c
@@ -204,4 +204,52 @@ struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb,  unsigned short family,
 }
 EXPORT_SYMBOL_GPL(udp_tun_rx_dst);
 
+struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb,
+				     struct net_device *dev,
+				     struct net *net, __be32 *saddr,
+				     const struct ip_tunnel_info *info,
+				     u8 protocol, bool use_cache)
+{
+#ifdef CONFIG_DST_CACHE
+	struct dst_cache *dst_cache;
+#endif
+	struct rtable *rt = NULL;
+	struct flowi4 fl4;
+	__u8 tos;
+
+#ifdef CONFIG_DST_CACHE
+	dst_cache = (struct dst_cache *)&info->dst_cache;
+	if (use_cache) {
+		rt = dst_cache_get_ip4(dst_cache, saddr);
+		if (rt)
+			return rt;
+	}
+#endif
+	memset(&fl4, 0, sizeof(fl4));
+	fl4.flowi4_mark = skb->mark;
+	fl4.flowi4_proto = protocol;
+	fl4.daddr = info->key.u.ipv4.dst;
+	fl4.saddr = info->key.u.ipv4.src;
+	tos = info->key.tos;
+	fl4.flowi4_tos = RT_TOS(tos);
+
+	rt = ip_route_output_key(net, &fl4);
+	if (IS_ERR(rt)) {
+		netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
+		return ERR_PTR(-ENETUNREACH);
+	}
+	if (rt->dst.dev == dev) { /* is this necessary? */
+		netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
+		ip_rt_put(rt);
+		return ERR_PTR(-ELOOP);
+	}
+#ifdef CONFIG_DST_CACHE
+	if (use_cache)
+		dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
+#endif
+	*saddr = fl4.saddr;
+	return rt;
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_dst_lookup);
+
 MODULE_LICENSE("GPL");
-- 
cgit v1.2.3-70-g09d2


From 78f3655adcb52412275f282267ee771421731632 Mon Sep 17 00:00:00 2001
From: Beniamino Galvani <b.galvani@gmail.com>
Date: Mon, 16 Oct 2023 09:15:21 +0200
Subject: ipv4: remove "proto" argument from udp_tunnel_dst_lookup()

The function is now UDP-specific, the protocol is always IPPROTO_UDP.

Suggested-by: Guillaume Nault <gnault@redhat.com>
Signed-off-by: Beniamino Galvani <b.galvani@gmail.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bareudp.c      | 4 ++--
 include/net/udp_tunnel.h   | 2 +-
 net/ipv4/udp_tunnel_core.c | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c
index 63fc32fa1af5..6af67cac6bde 100644
--- a/drivers/net/bareudp.c
+++ b/drivers/net/bareudp.c
@@ -307,7 +307,7 @@ static int bareudp_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 		return -ESHUTDOWN;
 
 	rt = udp_tunnel_dst_lookup(skb, dev, bareudp->net, &saddr, info,
-				   IPPROTO_UDP, use_cache);
+				   use_cache);
 
 	if (IS_ERR(rt))
 		return PTR_ERR(rt);
@@ -484,7 +484,7 @@ static int bareudp_fill_metadata_dst(struct net_device *dev,
 		__be32 saddr;
 
 		rt = udp_tunnel_dst_lookup(skb, dev, bareudp->net, &saddr,
-					   info, IPPROTO_UDP, use_cache);
+					   info, use_cache);
 		if (IS_ERR(rt))
 			return PTR_ERR(rt);
 
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 11e810ca5088..8f110dbd3784 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -166,7 +166,7 @@ struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb,
 				     struct net_device *dev,
 				     struct net *net, __be32 *saddr,
 				     const struct ip_tunnel_info *info,
-				     u8 protocol, bool use_cache);
+				     bool use_cache);
 
 struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family,
 				    __be16 flags, __be64 tunnel_id,
diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c
index 96f93f92b6ce..9b0cfd72d5fd 100644
--- a/net/ipv4/udp_tunnel_core.c
+++ b/net/ipv4/udp_tunnel_core.c
@@ -208,7 +208,7 @@ struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb,
 				     struct net_device *dev,
 				     struct net *net, __be32 *saddr,
 				     const struct ip_tunnel_info *info,
-				     u8 protocol, bool use_cache)
+				     bool use_cache)
 {
 #ifdef CONFIG_DST_CACHE
 	struct dst_cache *dst_cache;
@@ -227,7 +227,7 @@ struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb,
 #endif
 	memset(&fl4, 0, sizeof(fl4));
 	fl4.flowi4_mark = skb->mark;
-	fl4.flowi4_proto = protocol;
+	fl4.flowi4_proto = IPPROTO_UDP;
 	fl4.daddr = info->key.u.ipv4.dst;
 	fl4.saddr = info->key.u.ipv4.src;
 	tos = info->key.tos;
-- 
cgit v1.2.3-70-g09d2


From 72fc68c6356b663a8763f02d9b0ec773d59a4949 Mon Sep 17 00:00:00 2001
From: Beniamino Galvani <b.galvani@gmail.com>
Date: Mon, 16 Oct 2023 09:15:22 +0200
Subject: ipv4: add new arguments to udp_tunnel_dst_lookup()

We want to make the function more generic so that it can be used by
other UDP tunnel implementations such as geneve and vxlan. To do that,
add the following arguments:

 - source and destination UDP port;
 - ifindex of the output interface, needed by vxlan;
 - the tos, because in some cases it is not taken from struct
   ip_tunnel_info (for example, when it's inherited from the inner
   packet);
 - the dst cache, because not all tunnel types (e.g. vxlan) want to
   use the one from struct ip_tunnel_info.

With these parameters, the function no longer needs the full struct
ip_tunnel_info as argument and we can pass only the relevant part of
it (struct ip_tunnel_key).

Suggested-by: Guillaume Nault <gnault@redhat.com>
Signed-off-by: Beniamino Galvani <b.galvani@gmail.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bareudp.c      | 11 +++++++----
 include/net/udp_tunnel.h   |  8 +++++---
 net/ipv4/udp_tunnel_core.c | 26 +++++++++++++-------------
 3 files changed, 25 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c
index 6af67cac6bde..47a9c2a5583c 100644
--- a/drivers/net/bareudp.c
+++ b/drivers/net/bareudp.c
@@ -306,8 +306,10 @@ static int bareudp_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	if (!sock)
 		return -ESHUTDOWN;
 
-	rt = udp_tunnel_dst_lookup(skb, dev, bareudp->net, &saddr, info,
-				   use_cache);
+	rt = udp_tunnel_dst_lookup(skb, dev, bareudp->net, 0, &saddr, &info->key,
+				   0, 0, key->tos,
+				   use_cache ?
+				   (struct dst_cache *)&info->dst_cache : NULL);
 
 	if (IS_ERR(rt))
 		return PTR_ERR(rt);
@@ -483,8 +485,9 @@ static int bareudp_fill_metadata_dst(struct net_device *dev,
 		struct rtable *rt;
 		__be32 saddr;
 
-		rt = udp_tunnel_dst_lookup(skb, dev, bareudp->net, &saddr,
-					   info, use_cache);
+		rt = udp_tunnel_dst_lookup(skb, dev, bareudp->net, 0, &saddr,
+					   &info->key, 0, 0, info->key.tos,
+					   use_cache ? &info->dst_cache : NULL);
 		if (IS_ERR(rt))
 			return PTR_ERR(rt);
 
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 8f110dbd3784..4d0578fab01a 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -164,9 +164,11 @@ void udp_tunnel_sock_release(struct socket *sock);
 
 struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb,
 				     struct net_device *dev,
-				     struct net *net, __be32 *saddr,
-				     const struct ip_tunnel_info *info,
-				     bool use_cache);
+				     struct net *net, int oif,
+				     __be32 *saddr,
+				     const struct ip_tunnel_key *key,
+				     __be16 sport, __be16 dport, u8 tos,
+				     struct dst_cache *dst_cache);
 
 struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family,
 				    __be16 flags, __be64 tunnel_id,
diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c
index 9b0cfd72d5fd..494685e82856 100644
--- a/net/ipv4/udp_tunnel_core.c
+++ b/net/ipv4/udp_tunnel_core.c
@@ -206,31 +206,31 @@ EXPORT_SYMBOL_GPL(udp_tun_rx_dst);
 
 struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb,
 				     struct net_device *dev,
-				     struct net *net, __be32 *saddr,
-				     const struct ip_tunnel_info *info,
-				     bool use_cache)
+				     struct net *net, int oif,
+				     __be32 *saddr,
+				     const struct ip_tunnel_key *key,
+				     __be16 sport, __be16 dport, u8 tos,
+				     struct dst_cache *dst_cache)
 {
-#ifdef CONFIG_DST_CACHE
-	struct dst_cache *dst_cache;
-#endif
 	struct rtable *rt = NULL;
 	struct flowi4 fl4;
-	__u8 tos;
 
 #ifdef CONFIG_DST_CACHE
-	dst_cache = (struct dst_cache *)&info->dst_cache;
-	if (use_cache) {
+	if (dst_cache) {
 		rt = dst_cache_get_ip4(dst_cache, saddr);
 		if (rt)
 			return rt;
 	}
 #endif
+
 	memset(&fl4, 0, sizeof(fl4));
 	fl4.flowi4_mark = skb->mark;
 	fl4.flowi4_proto = IPPROTO_UDP;
-	fl4.daddr = info->key.u.ipv4.dst;
-	fl4.saddr = info->key.u.ipv4.src;
-	tos = info->key.tos;
+	fl4.flowi4_oif = oif;
+	fl4.daddr = key->u.ipv4.dst;
+	fl4.saddr = key->u.ipv4.src;
+	fl4.fl4_dport = dport;
+	fl4.fl4_sport = sport;
 	fl4.flowi4_tos = RT_TOS(tos);
 
 	rt = ip_route_output_key(net, &fl4);
@@ -244,7 +244,7 @@ struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb,
 		return ERR_PTR(-ELOOP);
 	}
 #ifdef CONFIG_DST_CACHE
-	if (use_cache)
+	if (dst_cache)
 		dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
 #endif
 	*saddr = fl4.saddr;
-- 
cgit v1.2.3-70-g09d2


From 54a59aed395ce0f4177b5212e5746a6462de3ad9 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 9 Oct 2023 11:26:54 +0200
Subject: net, sched: Make tc-related drop reason more flexible

Currently, the kfree_skb_reason() in sch_handle_{ingress,egress}() can only
express a basic SKB_DROP_REASON_TC_INGRESS or SKB_DROP_REASON_TC_EGRESS reason.

Victor kicked-off an initial proposal to make this more flexible by disambiguating
verdict from return code by moving the verdict into struct tcf_result and
letting tcf_classify() return a negative error. If hit, then two new drop
reasons were added in the proposal, that is SKB_DROP_REASON_TC_INGRESS_ERROR
as well as SKB_DROP_REASON_TC_EGRESS_ERROR. Further analysis of the actual
error codes would have required to attach to tcf_classify via kprobe/kretprobe
to more deeply debug skb and the returned error.

In order to make the kfree_skb_reason() in sch_handle_{ingress,egress}() more
extensible, it can be addressed in a more straight forward way, that is: Instead
of placing the verdict into struct tcf_result, we can just put the drop reason
in there, which does not require changes throughout various classful schedulers
given the existing verdict logic can stay as is.

Then, SKB_DROP_REASON_TC_ERROR{,_*} can be added to the enum skb_drop_reason
to disambiguate between an error or an intentional drop. New drop reason error
codes can be added successively to the tc code base.

For internal error locations which have not yet been annotated with a
SKB_DROP_REASON_TC_ERROR{,_*}, the fallback is SKB_DROP_REASON_TC_INGRESS and
SKB_DROP_REASON_TC_EGRESS, respectively. Generic errors could be marked with a
SKB_DROP_REASON_TC_ERROR code until they are converted to more specific ones
if it is found that they would be useful for troubleshooting.

While drop reasons have infrastructure for subsystem specific error codes which
are currently used by mac80211 and ovs, Jakub mentioned that it is preferred
for tc to use the enum skb_drop_reason core codes given it is a better fit and
currently the tooling support is better, too.

With regards to the latter:

  [...] I think Alastair (bpftrace) is working on auto-prettifying enums when
  bpftrace outputs maps. So we can do something like:

  $ bpftrace -e 'tracepoint:skb:kfree_skb { @[args->reason] = count(); }'
  Attaching 1 probe...
  ^C

  @[SKB_DROP_REASON_TC_INGRESS]: 2
  @[SKB_CONSUMED]: 34

  ^^^^^^^^^^^^ names!!

  Auto-magically. [...]

Add a small helper tcf_set_drop_reason() which can be used to set the drop reason
into the tcf_result.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Victor Nogueira <victor@mojatatu.com>
Link: https://lore.kernel.org/netdev/20231006063233.74345d36@kernel.org
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/r/20231009092655.22025-1-daniel@iogearbox.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/pkt_cls.h     |  6 ++++++
 include/net/sch_generic.h |  3 +--
 net/core/dev.c            | 15 ++++++++++-----
 3 files changed, 17 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index f308e8268651..a76c9171db0e 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -154,6 +154,12 @@ __cls_set_class(unsigned long *clp, unsigned long cl)
 	return xchg(clp, cl);
 }
 
+static inline void tcf_set_drop_reason(struct tcf_result *res,
+				       enum skb_drop_reason reason)
+{
+	res->drop_reason = reason;
+}
+
 static inline void
 __tcf_bind_filter(struct Qdisc *q, struct tcf_result *r, unsigned long base)
 {
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index c7318c73cfd6..dcb9160e6467 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -324,7 +324,6 @@ struct Qdisc_ops {
 	struct module		*owner;
 };
 
-
 struct tcf_result {
 	union {
 		struct {
@@ -332,8 +331,8 @@ struct tcf_result {
 			u32		classid;
 		};
 		const struct tcf_proto *goto_tp;
-
 	};
+	enum skb_drop_reason		drop_reason;
 };
 
 struct tcf_chain;
diff --git a/net/core/dev.c b/net/core/dev.c
index 3ca746a5f0ad..97e7b9833db9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3914,7 +3914,8 @@ EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
 #endif /* CONFIG_NET_EGRESS */
 
 #ifdef CONFIG_NET_XGRESS
-static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)
+static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
+		  enum skb_drop_reason *drop_reason)
 {
 	int ret = TC_ACT_UNSPEC;
 #ifdef CONFIG_NET_CLS_ACT
@@ -3926,12 +3927,14 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)
 
 	tc_skb_cb(skb)->mru = 0;
 	tc_skb_cb(skb)->post_ct = false;
+	res.drop_reason = *drop_reason;
 
 	mini_qdisc_bstats_cpu_update(miniq, skb);
 	ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
 	/* Only tcf related quirks below. */
 	switch (ret) {
 	case TC_ACT_SHOT:
+		*drop_reason = res.drop_reason;
 		mini_qdisc_qstats_cpu_drop(miniq);
 		break;
 	case TC_ACT_OK:
@@ -3981,6 +3984,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
 		   struct net_device *orig_dev, bool *another)
 {
 	struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
+	enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS;
 	int sch_ret;
 
 	if (!entry)
@@ -3998,7 +4002,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
 		if (sch_ret != TC_ACT_UNSPEC)
 			goto ingress_verdict;
 	}
-	sch_ret = tc_run(tcx_entry(entry), skb);
+	sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
 ingress_verdict:
 	switch (sch_ret) {
 	case TC_ACT_REDIRECT:
@@ -4015,7 +4019,7 @@ ingress_verdict:
 		*ret = NET_RX_SUCCESS;
 		return NULL;
 	case TC_ACT_SHOT:
-		kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS);
+		kfree_skb_reason(skb, drop_reason);
 		*ret = NET_RX_DROP;
 		return NULL;
 	/* used by tc_run */
@@ -4036,6 +4040,7 @@ static __always_inline struct sk_buff *
 sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 {
 	struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
+	enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS;
 	int sch_ret;
 
 	if (!entry)
@@ -4049,7 +4054,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 		if (sch_ret != TC_ACT_UNSPEC)
 			goto egress_verdict;
 	}
-	sch_ret = tc_run(tcx_entry(entry), skb);
+	sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
 egress_verdict:
 	switch (sch_ret) {
 	case TC_ACT_REDIRECT:
@@ -4058,7 +4063,7 @@ egress_verdict:
 		*ret = NET_XMIT_SUCCESS;
 		return NULL;
 	case TC_ACT_SHOT:
-		kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS);
+		kfree_skb_reason(skb, drop_reason);
 		*ret = NET_XMIT_DROP;
 		return NULL;
 	/* used by tc_run */
-- 
cgit v1.2.3-70-g09d2


From 39d08b91646d83e87f7cbcd846b3ef33b1a53b79 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 9 Oct 2023 11:26:55 +0200
Subject: net, sched: Add tcf_set_drop_reason for {__,}tcf_classify

Add an initial user for the newly added tcf_set_drop_reason() helper to set the
drop reason for internal errors leading to TC_ACT_SHOT inside {__,}tcf_classify().

Right now this only adds a very basic SKB_DROP_REASON_TC_ERROR as a generic
fallback indicator to mark drop locations. Where needed, such locations can be
converted to more specific codes, for example, when hitting the reclassification
limit, etc.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Victor Nogueira <victor@mojatatu.com>
Link: https://lore.kernel.org/r/20231009092655.22025-2-daniel@iogearbox.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dropreason-core.h |  3 +++
 net/sched/cls_api.c           | 26 ++++++++++++++++++++------
 2 files changed, 23 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
index a587e83fc169..845dce805de7 100644
--- a/include/net/dropreason-core.h
+++ b/include/net/dropreason-core.h
@@ -80,6 +80,7 @@
 	FN(IPV6_NDISC_BAD_OPTIONS)	\
 	FN(IPV6_NDISC_NS_OTHERHOST)	\
 	FN(QUEUE_PURGE)			\
+	FN(TC_ERROR)			\
 	FNe(MAX)
 
 /**
@@ -345,6 +346,8 @@ enum skb_drop_reason {
 	SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST,
 	/** @SKB_DROP_REASON_QUEUE_PURGE: bulk free. */
 	SKB_DROP_REASON_QUEUE_PURGE,
+	/** @SKB_DROP_REASON_TC_ERROR: generic internal tc error. */
+	SKB_DROP_REASON_TC_ERROR,
 	/**
 	 * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which
 	 * shouldn't be used as a real 'reason' - only for tracing code gen
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index a193cc7b3241..1daeb2182b70 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -1681,12 +1681,16 @@ reclassify:
 			 * time we got here with a cookie from hardware.
 			 */
 			if (unlikely(n->tp != tp || n->tp->chain != n->chain ||
-				     !tp->ops->get_exts))
+				     !tp->ops->get_exts)) {
+				tcf_set_drop_reason(res, SKB_DROP_REASON_TC_ERROR);
 				return TC_ACT_SHOT;
+			}
 
 			exts = tp->ops->get_exts(tp, n->handle);
-			if (unlikely(!exts || n->exts != exts))
+			if (unlikely(!exts || n->exts != exts)) {
+				tcf_set_drop_reason(res, SKB_DROP_REASON_TC_ERROR);
 				return TC_ACT_SHOT;
+			}
 
 			n = NULL;
 			err = tcf_exts_exec_ex(skb, exts, act_index, res);
@@ -1712,8 +1716,10 @@ reclassify:
 			return err;
 	}
 
-	if (unlikely(n))
+	if (unlikely(n)) {
+		tcf_set_drop_reason(res, SKB_DROP_REASON_TC_ERROR);
 		return TC_ACT_SHOT;
+	}
 
 	return TC_ACT_UNSPEC; /* signal: continue lookup */
 #ifdef CONFIG_NET_CLS_ACT
@@ -1723,6 +1729,7 @@ reset:
 				       tp->chain->block->index,
 				       tp->prio & 0xffff,
 				       ntohs(tp->protocol));
+		tcf_set_drop_reason(res, SKB_DROP_REASON_TC_ERROR);
 		return TC_ACT_SHOT;
 	}
 
@@ -1759,8 +1766,10 @@ int tcf_classify(struct sk_buff *skb,
 			if (ext->act_miss) {
 				n = tcf_exts_miss_cookie_lookup(ext->act_miss_cookie,
 								&act_index);
-				if (!n)
+				if (!n) {
+					tcf_set_drop_reason(res, SKB_DROP_REASON_TC_ERROR);
 					return TC_ACT_SHOT;
+				}
 
 				chain = n->chain_index;
 			} else {
@@ -1768,8 +1777,10 @@ int tcf_classify(struct sk_buff *skb,
 			}
 
 			fchain = tcf_chain_lookup_rcu(block, chain);
-			if (!fchain)
+			if (!fchain) {
+				tcf_set_drop_reason(res, SKB_DROP_REASON_TC_ERROR);
 				return TC_ACT_SHOT;
+			}
 
 			/* Consume, so cloned/redirect skbs won't inherit ext */
 			skb_ext_del(skb, TC_SKB_EXT);
@@ -1788,8 +1799,11 @@ int tcf_classify(struct sk_buff *skb,
 			struct tc_skb_cb *cb = tc_skb_cb(skb);
 
 			ext = tc_skb_ext_alloc(skb);
-			if (WARN_ON_ONCE(!ext))
+			if (WARN_ON_ONCE(!ext)) {
+				tcf_set_drop_reason(res, SKB_DROP_REASON_TC_ERROR);
 				return TC_ACT_SHOT;
+			}
+
 			ext->chain = last_executed_chain;
 			ext->mru = cb->mru;
 			ext->post_ct = cb->post_ct;
-- 
cgit v1.2.3-70-g09d2


From 562b1fdf061bff9394ccd884456ed1173c224fdc Mon Sep 17 00:00:00 2001
From: Haiyang Zhang <haiyangz@microsoft.com>
Date: Wed, 11 Oct 2023 13:30:44 -0700
Subject: tcp: Set pingpong threshold via sysctl

TCP pingpong threshold is 1 by default. But some applications, like SQL DB
may prefer a higher pingpong threshold to activate delayed acks in quick
ack mode for better performance.

The pingpong threshold and related code were changed to 3 in the year
2019 in:
  commit 4a41f453bedf ("tcp: change pingpong threshold to 3")
And reverted to 1 in the year 2022 in:
  commit 4d8f24eeedc5 ("Revert "tcp: change pingpong threshold to 3"")

There is no single value that fits all applications.
Add net.ipv4.tcp_pingpong_thresh sysctl tunable, so it can be tuned for
optimal performance based on the application needs.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://lore.kernel.org/r/1697056244-21888-1-git-send-email-haiyangz@microsoft.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/ip-sysctl.rst | 13 +++++++++++++
 include/net/inet_connection_sock.h     | 16 ++++++++++++----
 include/net/netns/ipv4.h               |  2 ++
 net/ipv4/sysctl_net_ipv4.c             |  8 ++++++++
 net/ipv4/tcp_ipv4.c                    |  2 ++
 net/ipv4/tcp_output.c                  |  4 ++--
 6 files changed, 39 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index f7dfde3b09a9..e7ec9026e5db 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -1183,6 +1183,19 @@ tcp_plb_cong_thresh - INTEGER
 
 	Default: 128
 
+tcp_pingpong_thresh - INTEGER
+	The number of estimated data replies sent for estimated incoming data
+	requests that must happen before TCP considers that a connection is a
+	"ping-pong" (request-response) connection for which delayed
+	acknowledgments can provide benefits.
+
+	This threshold is 1 by default, but some applications may need a higher
+	threshold for optimal performance.
+
+	Possible Values: 1 - 255
+
+	Default: 1
+
 UDP variables
 =============
 
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index d6d9d1c1985a..086d1193c9ef 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -328,11 +328,10 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
 
 struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu);
 
-#define TCP_PINGPONG_THRESH	1
-
 static inline void inet_csk_enter_pingpong_mode(struct sock *sk)
 {
-	inet_csk(sk)->icsk_ack.pingpong = TCP_PINGPONG_THRESH;
+	inet_csk(sk)->icsk_ack.pingpong =
+		READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pingpong_thresh);
 }
 
 static inline void inet_csk_exit_pingpong_mode(struct sock *sk)
@@ -342,7 +341,16 @@ static inline void inet_csk_exit_pingpong_mode(struct sock *sk)
 
 static inline bool inet_csk_in_pingpong_mode(struct sock *sk)
 {
-	return inet_csk(sk)->icsk_ack.pingpong >= TCP_PINGPONG_THRESH;
+	return inet_csk(sk)->icsk_ack.pingpong >=
+	       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pingpong_thresh);
+}
+
+static inline void inet_csk_inc_pingpong_cnt(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (icsk->icsk_ack.pingpong < U8_MAX)
+		icsk->icsk_ack.pingpong++;
 }
 
 static inline bool inet_csk_has_ulp(const struct sock *sk)
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index d96d05b08819..73f43f699199 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -133,6 +133,8 @@ struct netns_ipv4 {
 	u8 sysctl_tcp_migrate_req;
 	u8 sysctl_tcp_comp_sack_nr;
 	u8 sysctl_tcp_backlog_ack_defer;
+	u8 sysctl_tcp_pingpong_thresh;
+
 	int sysctl_tcp_reordering;
 	u8 sysctl_tcp_retries1;
 	u8 sysctl_tcp_retries2;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index e7f024d93572..f63a545a7374 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1498,6 +1498,14 @@ static struct ctl_table ipv4_net_table[] = {
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= SYSCTL_ONE,
 	},
+	{
+		.procname	= "tcp_pingpong_thresh",
+		.data		= &init_net.ipv4.sysctl_tcp_pingpong_thresh,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ONE,
+	},
 	{ }
 };
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a441740616d7..f603ad9307af 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -3288,6 +3288,8 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
 	net->ipv4.sysctl_tcp_shrink_window = 0;
 
+	net->ipv4.sysctl_tcp_pingpong_thresh = 1;
+
 	return 0;
 }
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d5961a82a9e8..8e6ebf35ed58 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -170,10 +170,10 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
 	tp->lsndtime = now;
 
 	/* If it is a reply for ato after last received
-	 * packet, enter pingpong mode.
+	 * packet, increase pingpong count.
 	 */
 	if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
-		inet_csk_enter_pingpong_mode(sk);
+		inet_csk_inc_pingpong_cnt(sk);
 }
 
 /* Account for an ACK we sent. */
-- 
cgit v1.2.3-70-g09d2


From e411a8e3bb2d12a59d5fb3590863cb6a16f27b7a Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Fri, 13 Oct 2023 11:54:50 -0700
Subject: net: stub tcp_gro_complete if CONFIG_INET=n

A few networking drivers including bnx2x, bnxt, qede, and idpf call
tcp_gro_complete as part of offloading TCP GRO. The function is only
defined if CONFIG_INET is true, since its TCP specific and is meaningless
if the kernel lacks IP networking support.

The combination of trying to use the complex network drivers with
CONFIG_NET but not CONFIG_INET is rather unlikely in practice: most use
cases are going to need IP networking.

The tcp_gro_complete function just sets some data in the socket buffer for
use in processing the TCP packet in the event that the GRO was offloaded to
the device. If the kernel lacks TCP support, such setup will simply go
unused.

The bnx2x, bnxt, and qede drivers wrap their TCP offload support in
CONFIG_INET checks and skip handling on such kernels.

The idpf driver did not check CONFIG_INET and thus fails to link if the
kernel is configured  with CONFIG_NET=y, CONFIG_IDPF=(m|y), and
CONFIG_INET=n.

While checking CONFIG_INET does allow the driver to bypass significantly
more instructions in the event that we know TCP networking isn't supported,
the configuration is unlikely to be used widely.

Rather than require driver authors to care about this, stub the
tcp_gro_complete function when CONFIG_INET=n. This allows drivers to be
left as-is. It does mean the idpf driver will perform slightly more work
than strictly necessary when CONFIG_INET=n, since it will still execute
some of the skb setup in idpf_rx_rsc. However, that work would be performed
in the case where CONFIG_INET=y anyways.

I did not change the existing drivers, since they appear to wrap a
significant portion of code when CONFIG_INET=n. There is little benefit in
trashing these drivers just to unwrap and remove the CONFIG_INET check.

Using a stub for tcp_gro_complete is still beneficial, as it means future
drivers no longer need to worry about this case of CONFIG_NET=y and
CONFIG_INET=n, which should reduce noise from buildbots that check such a
configuration.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org> # build-tested
Link: https://lore.kernel.org/r/20231013185502.1473541-1-jacob.e.keller@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tcp.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7fdedf5c71f0..32146088a095 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2081,7 +2081,11 @@ INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff))
 INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb));
 INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *skb, int thoff));
 INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb));
+#ifdef CONFIG_INET
 void tcp_gro_complete(struct sk_buff *skb);
+#else
+static inline void tcp_gro_complete(struct sk_buff *skb) { }
+#endif
 
 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr);
 
-- 
cgit v1.2.3-70-g09d2


From 90de47f020db086f7929e09f64efd0cf627d6869 Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Fri, 13 Oct 2023 14:48:21 +0800
Subject: page_pool: fragment API support for 32-bit arch with 64-bit DMA

Currently page_pool_alloc_frag() is not supported in 32-bit
arch with 64-bit DMA because of the overlap issue between
pp_frag_count and dma_addr_upper in 'struct page' for those
arches, which seems to be quite common, see [1], which means
driver may need to handle it when using fragment API.

It is assumed that the combination of the above arch with an
address space >16TB does not exist, as all those arches have
64b equivalent, it seems logical to use the 64b version for a
system with a large address space. It is also assumed that dma
address is page aligned when we are dma mapping a page aligned
buffer, see [2].

That means we're storing 12 bits of 0 at the lower end for a
dma address, we can reuse those bits for the above arches to
support 32b+12b, which is 16TB of memory.

If we make a wrong assumption, a warning is emitted so that
user can report to us.

1. https://lore.kernel.org/all/20211117075652.58299-1-linyunsheng@huawei.com/
2. https://lore.kernel.org/all/20230818145145.4b357c89@kernel.org/

Tested-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
CC: Lorenzo Bianconi <lorenzo@kernel.org>
CC: Alexander Duyck <alexander.duyck@gmail.com>
CC: Liang Chen <liangchen.linux@gmail.com>
CC: Guillaume Tucker <guillaume.tucker@collabora.com>
CC: Matthew Wilcox <willy@infradead.org>
CC: Linux-MM <linux-mm@kvack.org>
Link: https://lore.kernel.org/r/20231013064827.61135-2-linyunsheng@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/mm_types.h        | 13 +------------
 include/net/page_pool/helpers.h | 20 ++++++++++++++------
 net/core/page_pool.c            | 14 +++++++++-----
 3 files changed, 24 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 36c5b43999e6..74b49c4c7a52 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -125,18 +125,7 @@ struct page {
 			struct page_pool *pp;
 			unsigned long _pp_mapping_pad;
 			unsigned long dma_addr;
-			union {
-				/**
-				 * dma_addr_upper: might require a 64-bit
-				 * value on 32-bit architectures.
-				 */
-				unsigned long dma_addr_upper;
-				/**
-				 * For frag page support, not supported in
-				 * 32-bit architectures with 64-bit DMA.
-				 */
-				atomic_long_t pp_frag_count;
-			};
+			atomic_long_t pp_frag_count;
 		};
 		struct {	/* Tail pages of compound page */
 			unsigned long compound_head;	/* Bit zero is set */
diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h
index 8e7751464ff5..8f64adf86f5b 100644
--- a/include/net/page_pool/helpers.h
+++ b/include/net/page_pool/helpers.h
@@ -197,7 +197,7 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
 	page_pool_put_full_page(pool, page, true);
 }
 
-#define PAGE_POOL_DMA_USE_PP_FRAG_COUNT	\
+#define PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA	\
 		(sizeof(dma_addr_t) > sizeof(unsigned long))
 
 /**
@@ -211,17 +211,25 @@ static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
 {
 	dma_addr_t ret = page->dma_addr;
 
-	if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT)
-		ret |= (dma_addr_t)page->dma_addr_upper << 16 << 16;
+	if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA)
+		ret <<= PAGE_SHIFT;
 
 	return ret;
 }
 
-static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
+static inline bool page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
 {
+	if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) {
+		page->dma_addr = addr >> PAGE_SHIFT;
+
+		/* We assume page alignment to shave off bottom bits,
+		 * if this "compression" doesn't work we need to drop.
+		 */
+		return addr != (dma_addr_t)page->dma_addr << PAGE_SHIFT;
+	}
+
 	page->dma_addr = addr;
-	if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT)
-		page->dma_addr_upper = upper_32_bits(addr);
+	return false;
 }
 
 static inline bool page_pool_put(struct page_pool *pool)
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 77cb75e63aca..8a9868ea5067 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -211,10 +211,6 @@ static int page_pool_init(struct page_pool *pool,
 		 */
 	}
 
-	if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT &&
-	    pool->p.flags & PP_FLAG_PAGE_FRAG)
-		return -EINVAL;
-
 #ifdef CONFIG_PAGE_POOL_STATS
 	pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
 	if (!pool->recycle_stats)
@@ -359,12 +355,20 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
 	if (dma_mapping_error(pool->p.dev, dma))
 		return false;
 
-	page_pool_set_dma_addr(page, dma);
+	if (page_pool_set_dma_addr(page, dma))
+		goto unmap_failed;
 
 	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
 		page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
 
 	return true;
+
+unmap_failed:
+	WARN_ON_ONCE("unexpected DMA address, please report to netdev@");
+	dma_unmap_page_attrs(pool->p.dev, dma,
+			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
+			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
+	return false;
 }
 
 static void page_pool_set_pp_info(struct page_pool *pool,
-- 
cgit v1.2.3-70-g09d2


From 0ac6486e5cbd1af682d3b52fcd3615247e13cb57 Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@codeconstruct.com.au>
Date: Fri, 13 Oct 2023 12:06:24 +0800
Subject: i3c: Add support for bus enumeration & notification

This allows other drivers to be notified when new i3c busses are
attached, referring to a whole i3c bus as opposed to individual
devices.

Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au>
Signed-off-by: Matt Johnston <matt@codeconstruct.com.au>
Acked-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/i3c/master.c       | 35 +++++++++++++++++++++++++++++++++++
 include/linux/i3c/master.h | 11 +++++++++++
 2 files changed, 46 insertions(+)

(limited to 'include')

diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c
index 87283e4a4607..959ec5269376 100644
--- a/drivers/i3c/master.c
+++ b/drivers/i3c/master.c
@@ -22,6 +22,7 @@
 static DEFINE_IDR(i3c_bus_idr);
 static DEFINE_MUTEX(i3c_core_lock);
 static int __i3c_first_dynamic_bus_num;
+static BLOCKING_NOTIFIER_HEAD(i3c_bus_notifier);
 
 /**
  * i3c_bus_maintenance_lock - Lock the bus for a maintenance operation
@@ -453,6 +454,36 @@ static int i3c_bus_init(struct i3c_bus *i3cbus, struct device_node *np)
 	return 0;
 }
 
+void i3c_for_each_bus_locked(int (*fn)(struct i3c_bus *bus, void *data),
+			     void *data)
+{
+	struct i3c_bus *bus;
+	int id;
+
+	mutex_lock(&i3c_core_lock);
+	idr_for_each_entry(&i3c_bus_idr, bus, id)
+		fn(bus, data);
+	mutex_unlock(&i3c_core_lock);
+}
+EXPORT_SYMBOL_GPL(i3c_for_each_bus_locked);
+
+int i3c_register_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&i3c_bus_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(i3c_register_notifier);
+
+int i3c_unregister_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_unregister(&i3c_bus_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(i3c_unregister_notifier);
+
+static void i3c_bus_notify(struct i3c_bus *bus, unsigned int action)
+{
+	blocking_notifier_call_chain(&i3c_bus_notifier, action, bus);
+}
+
 static const char * const i3c_bus_mode_strings[] = {
 	[I3C_BUS_MODE_PURE] = "pure",
 	[I3C_BUS_MODE_MIXED_FAST] = "mixed-fast",
@@ -2682,6 +2713,8 @@ int i3c_master_register(struct i3c_master_controller *master,
 	if (ret)
 		goto err_del_dev;
 
+	i3c_bus_notify(i3cbus, I3C_NOTIFY_BUS_ADD);
+
 	/*
 	 * We're done initializing the bus and the controller, we can now
 	 * register I3C devices discovered during the initial DAA.
@@ -2714,6 +2747,8 @@ EXPORT_SYMBOL_GPL(i3c_master_register);
  */
 void i3c_master_unregister(struct i3c_master_controller *master)
 {
+	i3c_bus_notify(&master->bus, I3C_NOTIFY_BUS_REMOVE);
+
 	i3c_master_i2c_adapter_cleanup(master);
 	i3c_master_unregister_i3c_devs(master);
 	i3c_master_bus_cleanup(master);
diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h
index 0b52da4f2346..db909ef79be4 100644
--- a/include/linux/i3c/master.h
+++ b/include/linux/i3c/master.h
@@ -24,6 +24,12 @@
 
 struct i2c_client;
 
+/* notifier actions. notifier call data is the struct i3c_bus */
+enum {
+	I3C_NOTIFY_BUS_ADD,
+	I3C_NOTIFY_BUS_REMOVE,
+};
+
 struct i3c_master_controller;
 struct i3c_bus;
 struct i3c_device;
@@ -652,4 +658,9 @@ void i3c_master_queue_ibi(struct i3c_dev_desc *dev, struct i3c_ibi_slot *slot);
 
 struct i3c_ibi_slot *i3c_master_get_free_ibi_slot(struct i3c_dev_desc *dev);
 
+void i3c_for_each_bus_locked(int (*fn)(struct i3c_bus *bus, void *data),
+			     void *data);
+int i3c_register_notifier(struct notifier_block *nb);
+int i3c_unregister_notifier(struct notifier_block *nb);
+
 #endif /* I3C_MASTER_H */
-- 
cgit v1.2.3-70-g09d2


From 9a675ba55a96a45a9fb69e6a5c43f80c6682e541 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 16 Oct 2023 14:57:38 +0200
Subject: net, bpf: Add a warning if NAPI cb missed xdp_do_flush().
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A few drivers were missing a xdp_do_flush() invocation after
XDP_REDIRECT.

Add three helper functions each for one of the per-CPU lists. Return
true if the per-CPU list is non-empty and flush the list.

Add xdp_do_check_flushed() which invokes each helper functions and
creates a warning if one of the functions had a non-empty list.

Hide everything behind CONFIG_DEBUG_NET.

Suggested-by: Jesper Dangaard Brouer <hawk@kernel.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20231016125738.Yt79p1uF@linutronix.de
---
 include/linux/bpf.h    |  3 +++
 include/net/xdp_sock.h |  9 +++++++++
 kernel/bpf/cpumap.c    | 10 ++++++++++
 kernel/bpf/devmap.c    | 10 ++++++++++
 net/core/dev.c         |  2 ++
 net/core/dev.h         |  6 ++++++
 net/core/filter.c      | 16 ++++++++++++++++
 net/xdp/xsk.c          | 10 ++++++++++
 8 files changed, 66 insertions(+)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d3c51a507508..b4b40b45962b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2478,6 +2478,9 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
 		     enum bpf_dynptr_type type, u32 offset, u32 size);
 void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr);
 void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr);
+
+bool dev_check_flush(void);
+bool cpu_map_check_flush(void);
 #else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 {
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 69b472604b86..7dd0df2f6f8e 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -109,4 +109,13 @@ static inline void __xsk_map_flush(void)
 
 #endif /* CONFIG_XDP_SOCKETS */
 
+#if defined(CONFIG_XDP_SOCKETS) && defined(CONFIG_DEBUG_NET)
+bool xsk_map_check_flush(void);
+#else
+static inline bool xsk_map_check_flush(void)
+{
+	return false;
+}
+#endif
+
 #endif /* _LINUX_XDP_SOCK_H */
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index e42a1bdb7f53..8a0bb80fe48a 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -764,6 +764,16 @@ void __cpu_map_flush(void)
 	}
 }
 
+#ifdef CONFIG_DEBUG_NET
+bool cpu_map_check_flush(void)
+{
+	if (list_empty(this_cpu_ptr(&cpu_map_flush_list)))
+		return false;
+	__cpu_map_flush();
+	return true;
+}
+#endif
+
 static int __init cpu_map_init(void)
 {
 	int cpu;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 4d42f6ed6c11..a936c704d4e7 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -418,6 +418,16 @@ void __dev_flush(void)
 	}
 }
 
+#ifdef CONFIG_DEBUG_NET
+bool dev_check_flush(void)
+{
+	if (list_empty(this_cpu_ptr(&dev_flush_list)))
+		return false;
+	__dev_flush();
+	return true;
+}
+#endif
+
 /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
  * by local_bh_disable() (from XDP calls inside NAPI). The
  * rcu_read_lock_bh_held() below makes lockdep accept both.
diff --git a/net/core/dev.c b/net/core/dev.c
index 97e7b9833db9..4420831180c6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6535,6 +6535,8 @@ static int __napi_poll(struct napi_struct *n, bool *repoll)
 	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
 		work = n->poll(n, weight);
 		trace_napi_poll(n, work, weight);
+
+		xdp_do_check_flushed(n);
 	}
 
 	if (unlikely(work > weight))
diff --git a/net/core/dev.h b/net/core/dev.h
index e075e198092c..f66125857af7 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -136,4 +136,10 @@ static inline void netif_set_gro_ipv4_max_size(struct net_device *dev,
 }
 
 int rps_cpumask_housekeeping(struct cpumask *mask);
+
+#if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
+void xdp_do_check_flushed(struct napi_struct *napi);
+#else
+static inline void xdp_do_check_flushed(struct napi_struct *napi) { }
+#endif
 #endif
diff --git a/net/core/filter.c b/net/core/filter.c
index cc2e4babc85f..21d75108c2e9 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -83,6 +83,8 @@
 #include <net/netfilter/nf_conntrack_bpf.h>
 #include <linux/un.h>
 
+#include "dev.h"
+
 static const struct bpf_func_proto *
 bpf_sk_base_func_proto(enum bpf_func_id func_id);
 
@@ -4208,6 +4210,20 @@ void xdp_do_flush(void)
 }
 EXPORT_SYMBOL_GPL(xdp_do_flush);
 
+#if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
+void xdp_do_check_flushed(struct napi_struct *napi)
+{
+	bool ret;
+
+	ret = dev_check_flush();
+	ret |= cpu_map_check_flush();
+	ret |= xsk_map_check_flush();
+
+	WARN_ONCE(ret, "Missing xdp_do_flush() invocation after NAPI by %ps\n",
+		  napi->poll);
+}
+#endif
+
 void bpf_clear_redirect_map(struct bpf_map *map)
 {
 	struct bpf_redirect_info *ri;
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index f5e96e0d6e01..ba070fd37d24 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -391,6 +391,16 @@ void __xsk_map_flush(void)
 	}
 }
 
+#ifdef CONFIG_DEBUG_NET
+bool xsk_map_check_flush(void)
+{
+	if (list_empty(this_cpu_ptr(&xskmap_flush_list)))
+		return false;
+	__xsk_map_flush();
+	return true;
+}
+#endif
+
 void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
 {
 	xskq_prod_submit_n(pool->cq, nb_entries);
-- 
cgit v1.2.3-70-g09d2


From ddd1ad68826d8ff61a2e47733959570aa4d39a16 Mon Sep 17 00:00:00 2001
From: Johannes Nixdorf <jnixdorf-oss@avm.de>
Date: Mon, 16 Oct 2023 15:27:22 +0200
Subject: net: bridge: Add netlink knobs for number / max learned FDB entries

The previous patch added accounting and a limit for the number of
dynamically learned FDB entries per bridge. However it did not provide
means to actually configure those bounds or read back the count. This
patch does that.

Two new netlink attributes are added for the accounting and limit of
dynamically learned FDB entries:
 - IFLA_BR_FDB_N_LEARNED (RO) for the number of entries accounted for
   a single bridge.
 - IFLA_BR_FDB_MAX_LEARNED (RW) for the configured limit of entries for
   the bridge.

The new attributes are used like this:

 # ip link add name br up type bridge fdb_max_learned 256
 # ip link add name v1 up master br type veth peer v2
 # ip link set up dev v2
 # mausezahn -a rand -c 1024 v2
 0.01 seconds (90877 packets per second
 # bridge fdb | grep -v permanent | wc -l
 256
 # ip -d link show dev br
 13: br: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 [...]
     [...] fdb_n_learned 256 fdb_max_learned 256

Signed-off-by: Johannes Nixdorf <jnixdorf-oss@avm.de>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Link: https://lore.kernel.org/r/20231016-fdb_limit-v5-3-32cddff87758@avm.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/if_link.h |  2 ++
 net/bridge/br_netlink.c      | 15 ++++++++++++++-
 2 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index fac351a93aed..9f8a3da0f14f 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -510,6 +510,8 @@ enum {
 	IFLA_BR_VLAN_STATS_PER_PORT,
 	IFLA_BR_MULTI_BOOLOPT,
 	IFLA_BR_MCAST_QUERIER_STATE,
+	IFLA_BR_FDB_N_LEARNED,
+	IFLA_BR_FDB_MAX_LEARNED,
 	__IFLA_BR_MAX,
 };
 
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 10f0d33d8ccf..0c3cf6e6dea2 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -1265,6 +1265,8 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = {
 	[IFLA_BR_VLAN_STATS_PER_PORT] = { .type = NLA_U8 },
 	[IFLA_BR_MULTI_BOOLOPT] =
 		NLA_POLICY_EXACT_LEN(sizeof(struct br_boolopt_multi)),
+	[IFLA_BR_FDB_N_LEARNED] = { .type = NLA_REJECT },
+	[IFLA_BR_FDB_MAX_LEARNED] = { .type = NLA_U32 },
 };
 
 static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
@@ -1539,6 +1541,12 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
 			return err;
 	}
 
+	if (data[IFLA_BR_FDB_MAX_LEARNED]) {
+		u32 val = nla_get_u32(data[IFLA_BR_FDB_MAX_LEARNED]);
+
+		WRITE_ONCE(br->fdb_max_learned, val);
+	}
+
 	return 0;
 }
 
@@ -1593,6 +1601,8 @@ static size_t br_get_size(const struct net_device *brdev)
 	       nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_TOPOLOGY_CHANGE_TIMER */
 	       nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_GC_TIMER */
 	       nla_total_size(ETH_ALEN) +       /* IFLA_BR_GROUP_ADDR */
+	       nla_total_size(sizeof(u32)) +    /* IFLA_BR_FDB_N_LEARNED */
+	       nla_total_size(sizeof(u32)) +    /* IFLA_BR_FDB_MAX_LEARNED */
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 	       nla_total_size(sizeof(u8)) +     /* IFLA_BR_MCAST_ROUTER */
 	       nla_total_size(sizeof(u8)) +     /* IFLA_BR_MCAST_SNOOPING */
@@ -1668,7 +1678,10 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
 	    nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE_DETECTED,
 		       br->topology_change_detected) ||
 	    nla_put(skb, IFLA_BR_GROUP_ADDR, ETH_ALEN, br->group_addr) ||
-	    nla_put(skb, IFLA_BR_MULTI_BOOLOPT, sizeof(bm), &bm))
+	    nla_put(skb, IFLA_BR_MULTI_BOOLOPT, sizeof(bm), &bm) ||
+	    nla_put_u32(skb, IFLA_BR_FDB_N_LEARNED,
+			atomic_read(&br->fdb_n_learned)) ||
+	    nla_put_u32(skb, IFLA_BR_FDB_MAX_LEARNED, br->fdb_max_learned))
 		return -EMSGSIZE;
 
 #ifdef CONFIG_BRIDGE_VLAN_FILTERING
-- 
cgit v1.2.3-70-g09d2


From b6f9774719e5601b32f47021b40fee446b356490 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Mon, 16 Oct 2023 16:42:53 +0100
Subject: net: phylink: provide mac_get_caps() method

Provide a new method, mac_get_caps() to get the MAC capabilities for
the specified interface mode. This is for MACs which have special
requirements, such as not supporting half-duplex in certain interface
modes, and will replace the validate() method.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://lore.kernel.org/r/E1qsPk5-009wiX-G5@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/sfp-phylink.rst |  7 +++++++
 drivers/net/phy/phylink.c                | 14 +++++++++++---
 include/linux/phylink.h                  | 15 +++++++++++++++
 3 files changed, 33 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/sfp-phylink.rst b/Documentation/networking/sfp-phylink.rst
index 55b65f607a64..b069d34d7f5c 100644
--- a/Documentation/networking/sfp-phylink.rst
+++ b/Documentation/networking/sfp-phylink.rst
@@ -200,6 +200,13 @@ this documentation.
    when the in-band link state changes - otherwise the link will never
    come up.
 
+   The :c:func:`mac_get_caps` method is optional, and if provided should
+   return the phylink MAC capabilities that are supported for the passed
+   ``interface`` mode. In general, there is no need to implement this method.
+   Phylink will use these capabilities in combination with permissible
+   capabilities for ``interface`` to determine the allowable ethtool link
+   modes.
+
    The :c:func:`validate` method should mask the supplied supported mask,
    and ``state->advertising`` with the supported ethtool link modes.
    These are the new ethtool link modes, so bitmask operations must be
diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 0d7354955d62..f5c2ba15d701 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -657,6 +657,7 @@ static int phylink_validate_mac_and_pcs(struct phylink *pl,
 					unsigned long *supported,
 					struct phylink_link_state *state)
 {
+	unsigned long capabilities;
 	struct phylink_pcs *pcs;
 	int ret;
 
@@ -696,10 +697,17 @@ static int phylink_validate_mac_and_pcs(struct phylink *pl,
 	}
 
 	/* Then validate the link parameters with the MAC */
-	if (pl->mac_ops->validate)
+	if (pl->mac_ops->validate) {
 		pl->mac_ops->validate(pl->config, supported, state);
-	else
-		phylink_generic_validate(pl->config, supported, state);
+	} else {
+		if (pl->mac_ops->mac_get_caps)
+			capabilities = pl->mac_ops->mac_get_caps(pl->config,
+							state->interface);
+		else
+			capabilities = pl->config->mac_capabilities;
+
+		phylink_validate_mask_caps(supported, state, capabilities);
+	}
 
 	return phylink_is_empty_linkmode(supported) ? -EINVAL : 0;
 }
diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 2b886ea654bb..0798198a09ef 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -228,6 +228,7 @@ void phylink_limit_mac_speed(struct phylink_config *config, u32 max_speed);
 /**
  * struct phylink_mac_ops - MAC operations structure.
  * @validate: Validate and update the link configuration.
+ * @mac_get_caps: Get MAC capabilities for interface mode.
  * @mac_select_pcs: Select a PCS for the interface mode.
  * @mac_prepare: prepare for a major reconfiguration of the interface.
  * @mac_config: configure the MAC for the selected mode and state.
@@ -241,6 +242,8 @@ struct phylink_mac_ops {
 	void (*validate)(struct phylink_config *config,
 			 unsigned long *supported,
 			 struct phylink_link_state *state);
+	unsigned long (*mac_get_caps)(struct phylink_config *config,
+				      phy_interface_t interface);
 	struct phylink_pcs *(*mac_select_pcs)(struct phylink_config *config,
 					      phy_interface_t interface);
 	int (*mac_prepare)(struct phylink_config *config, unsigned int mode,
@@ -292,6 +295,18 @@ struct phylink_mac_ops {
  */
 void validate(struct phylink_config *config, unsigned long *supported,
 	      struct phylink_link_state *state);
+/**
+ * mac_get_caps: Get MAC capabilities for interface mode.
+ * @config: a pointer to a &struct phylink_config.
+ * @interface: PHY interface mode.
+ *
+ * Optional method. When not provided, config->mac_capabilities will be used.
+ * When implemented, this returns the MAC capabilities for the specified
+ * interface mode where there is some special handling required by the MAC
+ * driver (e.g. not supporting half-duplex in certain interface modes.)
+ */
+unsigned long mac_get_caps(struct phylink_config *config,
+			   phy_interface_t interface);
 /**
  * mac_select_pcs: Select a PCS for the interface mode.
  * @config: a pointer to a &struct phylink_config.
-- 
cgit v1.2.3-70-g09d2


From da5f6b80ad6417f7cf50d273c1c0ecb885e511e8 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Mon, 16 Oct 2023 16:43:03 +0100
Subject: net: phylink: remove .validate() method

The MAC .validate() method is no longer used, so remove it from the
phylink_mac_ops structure, and remove the callsite in
phylink_validate_mac_and_pcs().

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://lore.kernel.org/r/E1qsPkF-009wij-QM@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/sfp-phylink.rst |  5 -----
 drivers/net/phy/phylink.c                | 16 +++++---------
 include/linux/phylink.h                  | 38 --------------------------------
 3 files changed, 6 insertions(+), 53 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/sfp-phylink.rst b/Documentation/networking/sfp-phylink.rst
index b069d34d7f5c..8054d33f449f 100644
--- a/Documentation/networking/sfp-phylink.rst
+++ b/Documentation/networking/sfp-phylink.rst
@@ -207,11 +207,6 @@ this documentation.
    capabilities for ``interface`` to determine the allowable ethtool link
    modes.
 
-   The :c:func:`validate` method should mask the supplied supported mask,
-   and ``state->advertising`` with the supported ethtool link modes.
-   These are the new ethtool link modes, so bitmask operations must be
-   used. For an example, see ``drivers/net/ethernet/marvell/mvneta.c``.
-
    The :c:func:`mac_link_state` method is used to read the link state
    from the MAC, and report back the settings that the MAC is currently
    using. This is particularly important for in-band negotiation
diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index f5c2ba15d701..1c7e73fa58e4 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -697,17 +697,13 @@ static int phylink_validate_mac_and_pcs(struct phylink *pl,
 	}
 
 	/* Then validate the link parameters with the MAC */
-	if (pl->mac_ops->validate) {
-		pl->mac_ops->validate(pl->config, supported, state);
-	} else {
-		if (pl->mac_ops->mac_get_caps)
-			capabilities = pl->mac_ops->mac_get_caps(pl->config,
-							state->interface);
-		else
-			capabilities = pl->config->mac_capabilities;
+	if (pl->mac_ops->mac_get_caps)
+		capabilities = pl->mac_ops->mac_get_caps(pl->config,
+							 state->interface);
+	else
+		capabilities = pl->config->mac_capabilities;
 
-		phylink_validate_mask_caps(supported, state, capabilities);
-	}
+	phylink_validate_mask_caps(supported, state, capabilities);
 
 	return phylink_is_empty_linkmode(supported) ? -EINVAL : 0;
 }
diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 0798198a09ef..0cf559bae1ff 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -227,7 +227,6 @@ void phylink_limit_mac_speed(struct phylink_config *config, u32 max_speed);
 
 /**
  * struct phylink_mac_ops - MAC operations structure.
- * @validate: Validate and update the link configuration.
  * @mac_get_caps: Get MAC capabilities for interface mode.
  * @mac_select_pcs: Select a PCS for the interface mode.
  * @mac_prepare: prepare for a major reconfiguration of the interface.
@@ -239,9 +238,6 @@ void phylink_limit_mac_speed(struct phylink_config *config, u32 max_speed);
  * The individual methods are described more fully below.
  */
 struct phylink_mac_ops {
-	void (*validate)(struct phylink_config *config,
-			 unsigned long *supported,
-			 struct phylink_link_state *state);
 	unsigned long (*mac_get_caps)(struct phylink_config *config,
 				      phy_interface_t interface);
 	struct phylink_pcs *(*mac_select_pcs)(struct phylink_config *config,
@@ -261,40 +257,6 @@ struct phylink_mac_ops {
 };
 
 #if 0 /* For kernel-doc purposes only. */
-/**
- * validate - Validate and update the link configuration
- * @config: a pointer to a &struct phylink_config.
- * @supported: ethtool bitmask for supported link modes.
- * @state: a pointer to a &struct phylink_link_state.
- *
- * Clear bits in the @supported and @state->advertising masks that
- * are not supportable by the MAC.
- *
- * Note that the PHY may be able to transform from one connection
- * technology to another, so, eg, don't clear 1000BaseX just
- * because the MAC is unable to BaseX mode. This is more about
- * clearing unsupported speeds and duplex settings. The port modes
- * should not be cleared; phylink_set_port_modes() will help with this.
- *
- * When @config->supported_interfaces has been set, phylink will iterate
- * over the supported interfaces to determine the full capability of the
- * MAC. The validation function must not print errors if @state->interface
- * is set to an unexpected value.
- *
- * When @config->supported_interfaces is empty, phylink will call this
- * function with @state->interface set to %PHY_INTERFACE_MODE_NA, and
- * expects the MAC driver to return all supported link modes.
- *
- * If the @state->interface mode is not supported, then the @supported
- * mask must be cleared.
- *
- * This member is optional; if not set, the generic validator will be
- * used making use of @config->mac_capabilities and
- * @config->supported_interfaces to determine which link modes are
- * supported.
- */
-void validate(struct phylink_config *config, unsigned long *supported,
-	      struct phylink_link_state *state);
 /**
  * mac_get_caps: Get MAC capabilities for interface mode.
  * @config: a pointer to a &struct phylink_config.
-- 
cgit v1.2.3-70-g09d2


From 743f6397623edc708eda76ecc20e70032166e573 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Mon, 16 Oct 2023 16:43:08 +0100
Subject: net: phylink: remove a bunch of unused validation methods

Remove exports for phylink_caps_to_linkmodes(),
phylink_get_capabilities(), phylink_validate_mask_caps() and
phylink_generic_validate(). Also, as phylink_generic_validate() is no
longer called, we can remove its implementation as well.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://lore.kernel.org/r/E1qsPkK-009wip-W9@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phylink.c | 35 ++++++++---------------------------
 include/linux/phylink.h   | 11 -----------
 2 files changed, 8 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 1c7e73fa58e4..6712883498bb 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -257,7 +257,8 @@ static int phylink_interface_max_speed(phy_interface_t interface)
  * Set all possible pause, speed and duplex linkmodes in @linkmodes that are
  * supported by the @caps. @linkmodes must have been initialised previously.
  */
-void phylink_caps_to_linkmodes(unsigned long *linkmodes, unsigned long caps)
+static void phylink_caps_to_linkmodes(unsigned long *linkmodes,
+				      unsigned long caps)
 {
 	if (caps & MAC_SYM_PAUSE)
 		__set_bit(ETHTOOL_LINK_MODE_Pause_BIT, linkmodes);
@@ -400,7 +401,6 @@ void phylink_caps_to_linkmodes(unsigned long *linkmodes, unsigned long caps)
 		__set_bit(ETHTOOL_LINK_MODE_400000baseCR4_Full_BIT, linkmodes);
 	}
 }
-EXPORT_SYMBOL_GPL(phylink_caps_to_linkmodes);
 
 static struct {
 	unsigned long mask;
@@ -477,9 +477,9 @@ static unsigned long phylink_cap_from_speed_duplex(int speed,
  * Get the MAC capabilities that are supported by the @interface mode and
  * @mac_capabilities.
  */
-unsigned long phylink_get_capabilities(phy_interface_t interface,
-				       unsigned long mac_capabilities,
-				       int rate_matching)
+static unsigned long phylink_get_capabilities(phy_interface_t interface,
+					      unsigned long mac_capabilities,
+					      int rate_matching)
 {
 	int max_speed = phylink_interface_max_speed(interface);
 	unsigned long caps = MAC_SYM_PAUSE | MAC_ASYM_PAUSE;
@@ -606,7 +606,6 @@ unsigned long phylink_get_capabilities(phy_interface_t interface,
 
 	return (caps & mac_capabilities) | matched_caps;
 }
-EXPORT_SYMBOL_GPL(phylink_get_capabilities);
 
 /**
  * phylink_validate_mask_caps() - Restrict link modes based on caps
@@ -618,9 +617,9 @@ EXPORT_SYMBOL_GPL(phylink_get_capabilities);
  * @supported and @state based on that. Use this function if your capabiliies
  * aren't constant, such as if they vary depending on the interface.
  */
-void phylink_validate_mask_caps(unsigned long *supported,
-				struct phylink_link_state *state,
-				unsigned long mac_capabilities)
+static void phylink_validate_mask_caps(unsigned long *supported,
+				       struct phylink_link_state *state,
+				       unsigned long mac_capabilities)
 {
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, };
 	unsigned long caps;
@@ -634,24 +633,6 @@ void phylink_validate_mask_caps(unsigned long *supported,
 	linkmode_and(supported, supported, mask);
 	linkmode_and(state->advertising, state->advertising, mask);
 }
-EXPORT_SYMBOL_GPL(phylink_validate_mask_caps);
-
-/**
- * phylink_generic_validate() - generic validate() callback implementation
- * @config: a pointer to a &struct phylink_config.
- * @supported: ethtool bitmask for supported link modes.
- * @state: a pointer to a &struct phylink_link_state.
- *
- * Generic implementation of the validate() callback that MAC drivers can
- * use when they pass the range of supported interfaces and MAC capabilities.
- */
-void phylink_generic_validate(struct phylink_config *config,
-			      unsigned long *supported,
-			      struct phylink_link_state *state)
-{
-	phylink_validate_mask_caps(supported, state, config->mac_capabilities);
-}
-EXPORT_SYMBOL_GPL(phylink_generic_validate);
 
 static int phylink_validate_mac_and_pcs(struct phylink *pl,
 					unsigned long *supported,
diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 0cf559bae1ff..875439ab45de 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -613,17 +613,6 @@ void pcs_link_up(struct phylink_pcs *pcs, unsigned int neg_mode,
 		 phy_interface_t interface, int speed, int duplex);
 #endif
 
-void phylink_caps_to_linkmodes(unsigned long *linkmodes, unsigned long caps);
-unsigned long phylink_get_capabilities(phy_interface_t interface,
-				       unsigned long mac_capabilities,
-				       int rate_matching);
-void phylink_validate_mask_caps(unsigned long *supported,
-				struct phylink_link_state *state,
-				unsigned long caps);
-void phylink_generic_validate(struct phylink_config *config,
-			      unsigned long *supported,
-			      struct phylink_link_state *state);
-
 struct phylink *phylink_create(struct phylink_config *,
 			       const struct fwnode_handle *,
 			       phy_interface_t,
-- 
cgit v1.2.3-70-g09d2


From 2034d90ae41ae93e30d492ebcf1f06f97a9cfba6 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Fri, 13 Oct 2023 14:10:23 +0200
Subject: net: treat possible_net_t net pointer as an RCU one and add
 read_pnet_rcu()

Make the net pointer stored in possible_net_t structure annotated as
an RCU pointer. Change the access helpers to treat it as such.
Introduce read_pnet_rcu() helper to allow caller to dereference
the net pointer under RCU read lock.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/net_namespace.h | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index eb6cd43b1746..13b3a4e29fdb 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -368,21 +368,30 @@ static inline void put_net_track(struct net *net, netns_tracker *tracker)
 
 typedef struct {
 #ifdef CONFIG_NET_NS
-	struct net *net;
+	struct net __rcu *net;
 #endif
 } possible_net_t;
 
 static inline void write_pnet(possible_net_t *pnet, struct net *net)
 {
 #ifdef CONFIG_NET_NS
-	pnet->net = net;
+	rcu_assign_pointer(pnet->net, net);
 #endif
 }
 
 static inline struct net *read_pnet(const possible_net_t *pnet)
 {
 #ifdef CONFIG_NET_NS
-	return pnet->net;
+	return rcu_dereference_protected(pnet->net, true);
+#else
+	return &init_net;
+#endif
+}
+
+static inline struct net *read_pnet_rcu(possible_net_t *pnet)
+{
+#ifdef CONFIG_NET_NS
+	return rcu_dereference(pnet->net);
 #else
 	return &init_net;
 #endif
-- 
cgit v1.2.3-70-g09d2


From e0d4593140b01b8da513a0c88c26da28b4906413 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 11 Oct 2023 09:59:38 +0200
Subject: netfilter: make nftables drops visible in net dropmonitor

net_dropmonitor blames core.c:nf_hook_slow.
Add NF_DROP_REASON() helper and use it in nft_do_chain().

The helper releases the skb, so exact drop location becomes
available. Calling code will observe the NF_STOLEN verdict
instead.

Adjust nf_hook_slow so we can embed an erro value wih
NF_STOLEN verdicts, just like we do for NF_DROP.

After this, drop in nftables can be pinpointed to a drop due
to a rule or the chain policy.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/linux/netfilter.h      | 10 ++++++++++
 net/netfilter/core.c           |  6 +++---
 net/netfilter/nf_tables_core.c |  6 +++++-
 3 files changed, 18 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index d68644b7c299..80900d910992 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -22,6 +22,16 @@ static inline int NF_DROP_GETERR(int verdict)
 	return -(verdict >> NF_VERDICT_QBITS);
 }
 
+static __always_inline int
+NF_DROP_REASON(struct sk_buff *skb, enum skb_drop_reason reason, u32 err)
+{
+	BUILD_BUG_ON(err > 0xffff);
+
+	kfree_skb_reason(skb, reason);
+
+	return ((err << 16) | NF_STOLEN);
+}
+
 static inline int nf_inet_addr_cmp(const union nf_inet_addr *a1,
 				   const union nf_inet_addr *a2)
 {
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index ef4e76e5aef9..3126911f5042 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -639,10 +639,10 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
 			if (ret == 1)
 				continue;
 			return ret;
+		case NF_STOLEN:
+			return NF_DROP_GETERR(verdict);
 		default:
-			/* Implicit handling for NF_STOLEN, as well as any other
-			 * non conventional verdicts.
-			 */
+			WARN_ON_ONCE(1);
 			return 0;
 		}
 	}
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 6009b423f60a..8b536d7ef6c2 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -308,10 +308,11 @@ next_rule:
 
 	switch (regs.verdict.code & NF_VERDICT_MASK) {
 	case NF_ACCEPT:
-	case NF_DROP:
 	case NF_QUEUE:
 	case NF_STOLEN:
 		return regs.verdict.code;
+	case NF_DROP:
+		return NF_DROP_REASON(pkt->skb, SKB_DROP_REASON_NETFILTER_DROP, EPERM);
 	}
 
 	switch (regs.verdict.code) {
@@ -342,6 +343,9 @@ next_rule:
 	if (static_branch_unlikely(&nft_counters_enabled))
 		nft_update_chain_stats(basechain, pkt);
 
+	if (nft_base_chain(basechain)->policy == NF_DROP)
+		return NF_DROP_REASON(pkt->skb, SKB_DROP_REASON_NETFILTER_DROP, EPERM);
+
 	return nft_base_chain(basechain)->policy;
 }
 EXPORT_SYMBOL_GPL(nft_do_chain);
-- 
cgit v1.2.3-70-g09d2


From 256001672153af5786c6ca148114693d7d76d836 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 13 Oct 2023 14:18:14 +0200
Subject: netfilter: nf_tables: de-constify set commit ops function argument

The set backend using this already has to work around this via ugly
cast, don't spread this pattern.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/net/netfilter/nf_tables.h | 2 +-
 net/netfilter/nft_set_pipapo.c    | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 9fb16485d08f..8de040d2d2cf 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -462,7 +462,7 @@ struct nft_set_ops {
 					       const struct nft_set *set,
 					       const struct nft_set_elem *elem,
 					       unsigned int flags);
-	void				(*commit)(const struct nft_set *set);
+	void				(*commit)(struct nft_set *set);
 	void				(*abort)(const struct nft_set *set);
 	u64				(*privsize)(const struct nlattr * const nla[],
 						    const struct nft_set_desc *desc);
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index c0dcc40de358..75a9dee353e2 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -1549,12 +1549,11 @@ static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set,
 
 /**
  * pipapo_gc() - Drop expired entries from set, destroy start and end elements
- * @_set:	nftables API set representation
+ * @set:	nftables API set representation
  * @m:		Matching data
  */
-static void pipapo_gc(const struct nft_set *_set, struct nft_pipapo_match *m)
+static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m)
 {
-	struct nft_set *set = (struct nft_set *) _set;
 	struct nft_pipapo *priv = nft_set_priv(set);
 	struct net *net = read_pnet(&set->net);
 	int rules_f0, first_rule = 0;
@@ -1672,7 +1671,7 @@ static void pipapo_reclaim_match(struct rcu_head *rcu)
  * We also need to create a new working copy for subsequent insertions and
  * deletions.
  */
-static void nft_pipapo_commit(const struct nft_set *set)
+static void nft_pipapo_commit(struct nft_set *set)
 {
 	struct nft_pipapo *priv = nft_set_priv(set);
 	struct nft_pipapo_match *new_clone, *old;
-- 
cgit v1.2.3-70-g09d2


From 26c5334d344db7a9bffa1f5ce87d9045b5e90383 Mon Sep 17 00:00:00 2001
From: Paul Greenwalt <paul.greenwalt@intel.com>
Date: Sun, 15 Oct 2023 19:43:02 -0400
Subject: ethtool: Add forced speed to supported link modes maps

The need to map Ethtool forced speeds to Ethtool supported link modes is
common among drivers. To support this, add a common structure for forced
speed maps and a function to init them.  This is solution was originally
introduced in commit 1d4e4ecccb11 ("qede: populate supported link modes
maps on module init") for qede driver.

ethtool_forced_speed_maps_init() should be called during driver init
with an array of struct ethtool_forced_speed_map to populate the mapping.

Definitions for maps themselves are left in the driver code, as the sets
of supported link modes may vary between the devices.

Suggested-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Signed-off-by: Pawel Chmielewski <pawel.chmielewski@intel.com>
Signed-off-by: Paul Greenwalt <paul.greenwalt@intel.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h  | 37 +++++++++++++++++++++++++++++++++++++
 include/linux/linkmode.h | 29 +++++++++++++++--------------
 2 files changed, 52 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 62b61527bcc4..8e91e8b8a693 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -13,6 +13,7 @@
 #ifndef _LINUX_ETHTOOL_H
 #define _LINUX_ETHTOOL_H
 
+#include <linux/linkmode.h>
 #include <linux/bitmap.h>
 #include <linux/compat.h>
 #include <linux/if_ether.h>
@@ -1052,4 +1053,40 @@ static inline int ethtool_mm_frag_size_min_to_add(u32 val_min, u32 *val_add,
  * next string.
  */
 extern __printf(2, 3) void ethtool_sprintf(u8 **data, const char *fmt, ...);
+
+/* Link mode to forced speed capabilities maps */
+struct ethtool_forced_speed_map {
+	u32		speed;
+	__ETHTOOL_DECLARE_LINK_MODE_MASK(caps);
+
+	const u32	*cap_arr;
+	u32		arr_size;
+};
+
+#define ETHTOOL_FORCED_SPEED_MAP(prefix, value)				\
+{									\
+	.speed		= SPEED_##value,				\
+	.cap_arr	= prefix##_##value,				\
+	.arr_size	= ARRAY_SIZE(prefix##_##value),			\
+}
+
+/**
+ * ethtool_forced_speed_maps_init
+ * @maps: Pointer to an array of Ethtool forced speed map
+ * @size: Array size
+ *
+ * Initialize an array of Ethtool forced speed map to Ethtool link modes. This
+ * should be called during driver module init.
+ */
+static inline void
+ethtool_forced_speed_maps_init(struct ethtool_forced_speed_map *maps, u32 size)
+{
+	for (u32 i = 0; i < size; i++) {
+		struct ethtool_forced_speed_map *map = &maps[i];
+
+		linkmode_set_bit_array(map->cap_arr, map->arr_size, map->caps);
+		map->cap_arr = NULL;
+		map->arr_size = 0;
+	}
+}
 #endif /* _LINUX_ETHTOOL_H */
diff --git a/include/linux/linkmode.h b/include/linux/linkmode.h
index 15e0e0209da4..cd38f89553e6 100644
--- a/include/linux/linkmode.h
+++ b/include/linux/linkmode.h
@@ -2,6 +2,21 @@
 #define __LINKMODE_H
 
 #include <linux/bitmap.h>
+
+static inline void linkmode_set_bit(int nr, volatile unsigned long *addr)
+{
+	__set_bit(nr, addr);
+}
+
+static inline void linkmode_set_bit_array(const int *array, int array_size,
+					  unsigned long *addr)
+{
+	int i;
+
+	for (i = 0; i < array_size; i++)
+		linkmode_set_bit(array[i], addr);
+}
+
 #include <linux/ethtool.h>
 #include <uapi/linux/ethtool.h>
 
@@ -38,20 +53,6 @@ static inline int linkmode_andnot(unsigned long *dst, const unsigned long *src1,
 	return bitmap_andnot(dst, src1, src2,  __ETHTOOL_LINK_MODE_MASK_NBITS);
 }
 
-static inline void linkmode_set_bit(int nr, volatile unsigned long *addr)
-{
-	__set_bit(nr, addr);
-}
-
-static inline void linkmode_set_bit_array(const int *array, int array_size,
-					  unsigned long *addr)
-{
-	int i;
-
-	for (i = 0; i < array_size; i++)
-		linkmode_set_bit(array[i], addr);
-}
-
 static inline void linkmode_clear_bit(int nr, volatile unsigned long *addr)
 {
 	__clear_bit(nr, addr);
-- 
cgit v1.2.3-70-g09d2


From 1dbfe73bd648bdc3a2d8c9e5b23ae70e3333b592 Mon Sep 17 00:00:00 2001
From: Johannes Zink <j.zink@pengutronix.de>
Date: Wed, 18 Oct 2023 09:09:55 +0200
Subject: net: stmmac: intel: remove unnecessary field struct
 plat_stmmacenet_data::ext_snapshot_num

Do not store bitmask for enabling AUX_SNAPSHOT0. The previous commit
("net: stmmac: fix PPS capture input index") takes care of calculating
the proper bit mask from the request data's extts.index field, which is
0 if not explicitly specified otherwise.

Signed-off-by: Johannes Zink <j.zink@pengutronix.de>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 1 -
 include/linux/stmmac.h                            | 1 -
 2 files changed, 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
index a3a249c63598..60283543ffc8 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
@@ -605,7 +605,6 @@ static int intel_mgbe_common_data(struct pci_dev *pdev,
 	plat->mdio_bus_data->phy_mask |= 1 << INTEL_MGBE_XPCS_ADDR;
 
 	plat->int_snapshot_num = AUX_SNAPSHOT1;
-	plat->ext_snapshot_num = AUX_SNAPSHOT0;
 
 	plat->crosststamp = intel_crosststamp;
 	plat->flags &= ~STMMAC_FLAG_INT_SNAPSHOT_EN;
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index c0079a7574ae..0b4658a7eceb 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -303,7 +303,6 @@ struct plat_stmmacenet_data {
 	unsigned int eee_usecs_rate;
 	struct pci_dev *pdev;
 	int int_snapshot_num;
-	int ext_snapshot_num;
 	int msi_mac_vec;
 	int msi_wol_vec;
 	int msi_lpi_vec;
-- 
cgit v1.2.3-70-g09d2


From 878d951c6712b655c38e78ac1ee63c35cd913b22 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 18 Oct 2023 09:00:13 +0000
Subject: inet: lock the socket in ip_sock_set_tos()

Christoph Paasch reported a panic in TCP stack [1]

Indeed, we should not call sk_dst_reset() without holding
the socket lock, as __sk_dst_get() callers do not all rely
on bare RCU.

[1]
BUG: kernel NULL pointer dereference, address: 0000000000000000
PGD 12bad6067 P4D 12bad6067 PUD 12bad5067 PMD 0
Oops: 0000 [#1] PREEMPT SMP
CPU: 1 PID: 2750 Comm: syz-executor.5 Not tainted 6.6.0-rc4-g7a5720a344e7 #49
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014
RIP: 0010:tcp_get_metrics+0x118/0x8f0 net/ipv4/tcp_metrics.c:321
Code: c7 44 24 70 02 00 8b 03 89 44 24 48 c7 44 24 4c 00 00 00 00 66 c7 44 24 58 02 00 66 ba 02 00 b1 01 89 4c 24 04 4c 89 7c 24 10 <49> 8b 0f 48 8b 89 50 05 00 00 48 89 4c 24 30 33 81 00 02 00 00 69
RSP: 0018:ffffc90000af79b8 EFLAGS: 00010293
RAX: 000000000100007f RBX: ffff88812ae8f500 RCX: ffff88812b5f8f01
RDX: 0000000000000002 RSI: ffffffff8300f080 RDI: 0000000000000002
RBP: 0000000000000002 R08: 0000000000000003 R09: ffffffff8205eca0
R10: 0000000000000002 R11: ffff88812b5f8f00 R12: ffff88812a9e0580
R13: 0000000000000000 R14: ffff88812ae8fbd2 R15: 0000000000000000
FS: 00007f70a006b640(0000) GS:ffff88813bd00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 000000012bad7003 CR4: 0000000000170ee0
Call Trace:
<TASK>
tcp_fastopen_cache_get+0x32/0x140 net/ipv4/tcp_metrics.c:567
tcp_fastopen_cookie_check+0x28/0x180 net/ipv4/tcp_fastopen.c:419
tcp_connect+0x9c8/0x12a0 net/ipv4/tcp_output.c:3839
tcp_v4_connect+0x645/0x6e0 net/ipv4/tcp_ipv4.c:323
__inet_stream_connect+0x120/0x590 net/ipv4/af_inet.c:676
tcp_sendmsg_fastopen+0x2d6/0x3a0 net/ipv4/tcp.c:1021
tcp_sendmsg_locked+0x1957/0x1b00 net/ipv4/tcp.c:1073
tcp_sendmsg+0x30/0x50 net/ipv4/tcp.c:1336
__sock_sendmsg+0x83/0xd0 net/socket.c:730
__sys_sendto+0x20a/0x2a0 net/socket.c:2194
__do_sys_sendto net/socket.c:2206 [inline]

Fixes: e08d0b3d1723 ("inet: implement lockless IP_TOS")
Reported-by: Christoph Paasch <cpaasch@apple.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20231018090014.345158-1-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/ip.h                                   |  1 +
 net/ipv4/ip_sockglue.c                             | 11 +++++++++--
 net/mptcp/sockopt.c                                |  4 ++--
 tools/testing/selftests/net/mptcp/mptcp_connect.sh |  2 +-
 4 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/ip.h b/include/net/ip.h
index 6fbc0dcf4b97..1fc4c8d69e33 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -810,5 +810,6 @@ int ip_sock_set_mtu_discover(struct sock *sk, int val);
 void ip_sock_set_pktinfo(struct sock *sk);
 void ip_sock_set_recverr(struct sock *sk);
 void ip_sock_set_tos(struct sock *sk, int val);
+void  __ip_sock_set_tos(struct sock *sk, int val);
 
 #endif	/* _IP_H */
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 0b74ac49d6a6..9c68b6b74d9f 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -585,9 +585,9 @@ out:
 	return err;
 }
 
-void ip_sock_set_tos(struct sock *sk, int val)
+void __ip_sock_set_tos(struct sock *sk, int val)
 {
-	u8 old_tos = READ_ONCE(inet_sk(sk)->tos);
+	u8 old_tos = inet_sk(sk)->tos;
 
 	if (sk->sk_type == SOCK_STREAM) {
 		val &= ~INET_ECN_MASK;
@@ -599,6 +599,13 @@ void ip_sock_set_tos(struct sock *sk, int val)
 		sk_dst_reset(sk);
 	}
 }
+
+void ip_sock_set_tos(struct sock *sk, int val)
+{
+	lock_sock(sk);
+	__ip_sock_set_tos(sk, val);
+	release_sock(sk);
+}
 EXPORT_SYMBOL(ip_sock_set_tos);
 
 void ip_sock_set_freebind(struct sock *sk)
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index 18ce624bfde2..59bd5e114392 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -738,7 +738,7 @@ static int mptcp_setsockopt_v4_set_tos(struct mptcp_sock *msk, int optname,
 	mptcp_for_each_subflow(msk, subflow) {
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 
-		ip_sock_set_tos(ssk, val);
+		__ip_sock_set_tos(ssk, val);
 	}
 	release_sock(sk);
 
@@ -1411,7 +1411,7 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk)
 	ssk->sk_bound_dev_if = sk->sk_bound_dev_if;
 	ssk->sk_incoming_cpu = sk->sk_incoming_cpu;
 	ssk->sk_ipv6only = sk->sk_ipv6only;
-	ip_sock_set_tos(ssk, inet_sk(sk)->tos);
+	__ip_sock_set_tos(ssk, inet_sk(sk)->tos);
 
 	if (sk->sk_userlocks & tx_rx_locks) {
 		ssk->sk_userlocks |= sk->sk_userlocks & tx_rx_locks;
diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
index 61a2a1988ce6..b1fc8afd072d 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
@@ -716,7 +716,7 @@ run_test_transparent()
 	# the required infrastructure in MPTCP sockopt code. To support TOS, the
 	# following function has been exported (T). Not great but better than
 	# checking for a specific kernel version.
-	if ! mptcp_lib_kallsyms_has "T ip_sock_set_tos$"; then
+	if ! mptcp_lib_kallsyms_has "T __ip_sock_set_tos$"; then
 		echo "INFO: ${msg} not supported by the kernel: SKIP"
 		mptcp_lib_result_skip "${TEST_GROUP}"
 		return
-- 
cgit v1.2.3-70-g09d2


From 7f3eb2174512fe6c9c0f062e96eccb0d3cc6d5cd Mon Sep 17 00:00:00 2001
From: Christian Marangi <ansuelsmth@gmail.com>
Date: Wed, 18 Oct 2023 14:35:47 +0200
Subject: net: introduce napi_is_scheduled helper

We currently have napi_if_scheduled_mark_missed that can be used to
check if napi is scheduled but that does more thing than simply checking
it and return a bool. Some driver already implement custom function to
check if napi is scheduled.

Drop these custom function and introduce napi_is_scheduled that simply
check if napi is scheduled atomically.

Update any driver and code that implement a similar check and instead
use this new helper.

Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/chelsio/cxgb3/sge.c  |  8 --------
 drivers/net/wireless/realtek/rtw89/core.c |  2 +-
 include/linux/netdevice.h                 | 23 +++++++++++++++++++++++
 net/core/dev.c                            |  2 +-
 4 files changed, 25 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/chelsio/cxgb3/sge.c b/drivers/net/ethernet/chelsio/cxgb3/sge.c
index dfe4e0102960..6268f96cb4aa 100644
--- a/drivers/net/ethernet/chelsio/cxgb3/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb3/sge.c
@@ -2501,14 +2501,6 @@ static int napi_rx_handler(struct napi_struct *napi, int budget)
 	return work_done;
 }
 
-/*
- * Returns true if the device is already scheduled for polling.
- */
-static inline int napi_is_scheduled(struct napi_struct *napi)
-{
-	return test_bit(NAPI_STATE_SCHED, &napi->state);
-}
-
 /**
  *	process_pure_responses - process pure responses from a response queue
  *	@adap: the adapter
diff --git a/drivers/net/wireless/realtek/rtw89/core.c b/drivers/net/wireless/realtek/rtw89/core.c
index 4bfb4188de72..3d75165e48be 100644
--- a/drivers/net/wireless/realtek/rtw89/core.c
+++ b/drivers/net/wireless/realtek/rtw89/core.c
@@ -2005,7 +2005,7 @@ static void rtw89_core_rx_to_mac80211(struct rtw89_dev *rtwdev,
 	struct napi_struct *napi = &rtwdev->napi;
 
 	/* In low power mode, napi isn't scheduled. Receive it to netif. */
-	if (unlikely(!test_bit(NAPI_STATE_SCHED, &napi->state)))
+	if (unlikely(!napi_is_scheduled(napi)))
 		napi = NULL;
 
 	rtw89_core_hw_to_sband_rate(rx_status);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1c7681263d30..b8bf669212cc 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -482,6 +482,29 @@ static inline bool napi_prefer_busy_poll(struct napi_struct *n)
 	return test_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state);
 }
 
+/**
+ * napi_is_scheduled - test if NAPI is scheduled
+ * @n: NAPI context
+ *
+ * This check is "best-effort". With no locking implemented,
+ * a NAPI can be scheduled or terminate right after this check
+ * and produce not precise results.
+ *
+ * NAPI_STATE_SCHED is an internal state, napi_is_scheduled
+ * should not be used normally and napi_schedule should be
+ * used instead.
+ *
+ * Use only if the driver really needs to check if a NAPI
+ * is scheduled for example in the context of delayed timer
+ * that can be skipped if a NAPI is already scheduled.
+ *
+ * Return True if NAPI is scheduled, False otherwise.
+ */
+static inline bool napi_is_scheduled(struct napi_struct *n)
+{
+	return test_bit(NAPI_STATE_SCHED, &n->state);
+}
+
 bool napi_schedule_prep(struct napi_struct *n);
 
 /**
diff --git a/net/core/dev.c b/net/core/dev.c
index 97e7b9833db9..e7f61f5a5322 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6532,7 +6532,7 @@ static int __napi_poll(struct napi_struct *n, bool *repoll)
 	 * accidentally calling ->poll() when NAPI is not scheduled.
 	 */
 	work = 0;
-	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
+	if (napi_is_scheduled(n)) {
 		work = n->poll(n, weight);
 		trace_napi_poll(n, work, weight);
 	}
-- 
cgit v1.2.3-70-g09d2


From 6da88306811b40a207c94c9da9faf07bdb20776e Mon Sep 17 00:00:00 2001
From: Chuyi Zhou <zhouchuyi@bytedance.com>
Date: Wed, 18 Oct 2023 14:17:39 +0800
Subject: cgroup: Prepare for using css_task_iter_*() in BPF

This patch makes some preparations for using css_task_iter_*() in BPF
Program.

1. Flags CSS_TASK_ITER_* are #define-s and it's not easy for bpf prog to
use them. Convert them to enum so bpf prog can take them from vmlinux.h.

2. In the next patch we will add css_task_iter_*() in common kfuncs which
is not safe. Since css_task_iter_*() does spin_unlock_irq() which might
screw up irq flags depending on the context where bpf prog is running.
So we should use irqsave/irqrestore here and the switching is harmless.

Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
Acked-by: Tejun Heo <tj@kernel.org>
Link: https://lore.kernel.org/r/20231018061746.111364-2-zhouchuyi@bytedance.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/cgroup.h | 12 +++++-------
 kernel/cgroup/cgroup.c | 18 ++++++++++++------
 2 files changed, 17 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b307013b9c6c..0ef0af66080e 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -40,13 +40,11 @@ struct kernel_clone_args;
 #define CGROUP_WEIGHT_DFL		100
 #define CGROUP_WEIGHT_MAX		10000
 
-/* walk only threadgroup leaders */
-#define CSS_TASK_ITER_PROCS		(1U << 0)
-/* walk all threaded css_sets in the domain */
-#define CSS_TASK_ITER_THREADED		(1U << 1)
-
-/* internal flags */
-#define CSS_TASK_ITER_SKIPPED		(1U << 16)
+enum {
+	CSS_TASK_ITER_PROCS    = (1U << 0),  /* walk only threadgroup leaders */
+	CSS_TASK_ITER_THREADED = (1U << 1),  /* walk all threaded css_sets in the domain */
+	CSS_TASK_ITER_SKIPPED  = (1U << 16), /* internal flags */
+};
 
 /* a css_task_iter should be treated as an opaque object */
 struct css_task_iter {
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 1fb7f562289d..b6d64f3b8888 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4917,9 +4917,11 @@ repeat:
 void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
 			 struct css_task_iter *it)
 {
+	unsigned long irqflags;
+
 	memset(it, 0, sizeof(*it));
 
-	spin_lock_irq(&css_set_lock);
+	spin_lock_irqsave(&css_set_lock, irqflags);
 
 	it->ss = css->ss;
 	it->flags = flags;
@@ -4933,7 +4935,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
 
 	css_task_iter_advance(it);
 
-	spin_unlock_irq(&css_set_lock);
+	spin_unlock_irqrestore(&css_set_lock, irqflags);
 }
 
 /**
@@ -4946,12 +4948,14 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
  */
 struct task_struct *css_task_iter_next(struct css_task_iter *it)
 {
+	unsigned long irqflags;
+
 	if (it->cur_task) {
 		put_task_struct(it->cur_task);
 		it->cur_task = NULL;
 	}
 
-	spin_lock_irq(&css_set_lock);
+	spin_lock_irqsave(&css_set_lock, irqflags);
 
 	/* @it may be half-advanced by skips, finish advancing */
 	if (it->flags & CSS_TASK_ITER_SKIPPED)
@@ -4964,7 +4968,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
 		css_task_iter_advance(it);
 	}
 
-	spin_unlock_irq(&css_set_lock);
+	spin_unlock_irqrestore(&css_set_lock, irqflags);
 
 	return it->cur_task;
 }
@@ -4977,11 +4981,13 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
  */
 void css_task_iter_end(struct css_task_iter *it)
 {
+	unsigned long irqflags;
+
 	if (it->cur_cset) {
-		spin_lock_irq(&css_set_lock);
+		spin_lock_irqsave(&css_set_lock, irqflags);
 		list_del(&it->iters_node);
 		put_css_set_locked(it->cur_cset);
-		spin_unlock_irq(&css_set_lock);
+		spin_unlock_irqrestore(&css_set_lock, irqflags);
 	}
 
 	if (it->cur_dcset)
-- 
cgit v1.2.3-70-g09d2


From dfab99df147b0d364f0c199f832ff2aedfb2265a Mon Sep 17 00:00:00 2001
From: Chuyi Zhou <zhouchuyi@bytedance.com>
Date: Wed, 18 Oct 2023 14:17:43 +0800
Subject: bpf: teach the verifier to enforce css_iter and task_iter in RCU CS

css_iter and task_iter should be used in rcu section. Specifically, in
sleepable progs explicit bpf_rcu_read_lock() is needed before use these
iters. In normal bpf progs that have implicit rcu_read_lock(), it's OK to
use them directly.

This patch adds a new a KF flag KF_RCU_PROTECTED for bpf_iter_task_new and
bpf_iter_css_new. It means the kfunc should be used in RCU CS. We check
whether we are in rcu cs before we want to invoke this kfunc. If the rcu
protection is guaranteed, we would let st->type = PTR_TO_STACK | MEM_RCU.
Once user do rcu_unlock during the iteration, state MEM_RCU of regs would
be cleared. is_iter_reg_valid_init() will reject if reg->type is UNTRUSTED.

It is worth noting that currently, bpf_rcu_read_unlock does not
clear the state of the STACK_ITER reg, since bpf_for_each_spilled_reg
only considers STACK_SPILL. This patch also let bpf_for_each_spilled_reg
search STACK_ITER.

Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231018061746.111364-6-zhouchuyi@bytedance.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h | 19 ++++++++++-------
 include/linux/btf.h          |  1 +
 kernel/bpf/helpers.c         |  4 ++--
 kernel/bpf/verifier.c        | 50 ++++++++++++++++++++++++++++++++++----------
 4 files changed, 53 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 94ec766432f5..e67cd45a85be 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -386,19 +386,18 @@ struct bpf_verifier_state {
 	u32 jmp_history_cnt;
 };
 
-#define bpf_get_spilled_reg(slot, frame)				\
+#define bpf_get_spilled_reg(slot, frame, mask)				\
 	(((slot < frame->allocated_stack / BPF_REG_SIZE) &&		\
-	  (frame->stack[slot].slot_type[0] == STACK_SPILL))		\
+	  ((1 << frame->stack[slot].slot_type[0]) & (mask))) \
 	 ? &frame->stack[slot].spilled_ptr : NULL)
 
 /* Iterate over 'frame', setting 'reg' to either NULL or a spilled register. */
-#define bpf_for_each_spilled_reg(iter, frame, reg)			\
-	for (iter = 0, reg = bpf_get_spilled_reg(iter, frame);		\
+#define bpf_for_each_spilled_reg(iter, frame, reg, mask)			\
+	for (iter = 0, reg = bpf_get_spilled_reg(iter, frame, mask);		\
 	     iter < frame->allocated_stack / BPF_REG_SIZE;		\
-	     iter++, reg = bpf_get_spilled_reg(iter, frame))
+	     iter++, reg = bpf_get_spilled_reg(iter, frame, mask))
 
-/* Invoke __expr over regsiters in __vst, setting __state and __reg */
-#define bpf_for_each_reg_in_vstate(__vst, __state, __reg, __expr)   \
+#define bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, __mask, __expr)   \
 	({                                                               \
 		struct bpf_verifier_state *___vstate = __vst;            \
 		int ___i, ___j;                                          \
@@ -410,7 +409,7 @@ struct bpf_verifier_state {
 				__reg = &___regs[___j];                  \
 				(void)(__expr);                          \
 			}                                                \
-			bpf_for_each_spilled_reg(___j, __state, __reg) { \
+			bpf_for_each_spilled_reg(___j, __state, __reg, __mask) { \
 				if (!__reg)                              \
 					continue;                        \
 				(void)(__expr);                          \
@@ -418,6 +417,10 @@ struct bpf_verifier_state {
 		}                                                        \
 	})
 
+/* Invoke __expr over regsiters in __vst, setting __state and __reg */
+#define bpf_for_each_reg_in_vstate(__vst, __state, __reg, __expr) \
+	bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, 1 << STACK_SPILL, __expr)
+
 /* linked list of verifier states used to prune search */
 struct bpf_verifier_state_list {
 	struct bpf_verifier_state state;
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 928113a80a95..c2231c64d60b 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -74,6 +74,7 @@
 #define KF_ITER_NEW     (1 << 8) /* kfunc implements BPF iter constructor */
 #define KF_ITER_NEXT    (1 << 9) /* kfunc implements BPF iter next method */
 #define KF_ITER_DESTROY (1 << 10) /* kfunc implements BPF iter destructor */
+#define KF_RCU_PROTECTED (1 << 11) /* kfunc should be protected by rcu cs when they are invoked */
 
 /*
  * Tag marking a kernel function as a kfunc. This is meant to minimize the
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index b1d285ed4796..da058aead20c 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2563,10 +2563,10 @@ BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY)
-BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED)
 BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY)
-BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED)
 BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, bpf_dynptr_adjust)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 974713185269..fcdf2382153a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1173,7 +1173,12 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg
 
 static void __mark_reg_known_zero(struct bpf_reg_state *reg);
 
+static bool in_rcu_cs(struct bpf_verifier_env *env);
+
+static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta);
+
 static int mark_stack_slots_iter(struct bpf_verifier_env *env,
+				 struct bpf_kfunc_call_arg_meta *meta,
 				 struct bpf_reg_state *reg, int insn_idx,
 				 struct btf *btf, u32 btf_id, int nr_slots)
 {
@@ -1194,6 +1199,12 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env,
 
 		__mark_reg_known_zero(st);
 		st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
+		if (is_kfunc_rcu_protected(meta)) {
+			if (in_rcu_cs(env))
+				st->type |= MEM_RCU;
+			else
+				st->type |= PTR_UNTRUSTED;
+		}
 		st->live |= REG_LIVE_WRITTEN;
 		st->ref_obj_id = i == 0 ? id : 0;
 		st->iter.btf = btf;
@@ -1268,7 +1279,7 @@ static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env,
 	return true;
 }
 
-static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+static int is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
 				   struct btf *btf, u32 btf_id, int nr_slots)
 {
 	struct bpf_func_state *state = func(env, reg);
@@ -1276,26 +1287,28 @@ static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_
 
 	spi = iter_get_spi(env, reg, nr_slots);
 	if (spi < 0)
-		return false;
+		return -EINVAL;
 
 	for (i = 0; i < nr_slots; i++) {
 		struct bpf_stack_state *slot = &state->stack[spi - i];
 		struct bpf_reg_state *st = &slot->spilled_ptr;
 
+		if (st->type & PTR_UNTRUSTED)
+			return -EPROTO;
 		/* only main (first) slot has ref_obj_id set */
 		if (i == 0 && !st->ref_obj_id)
-			return false;
+			return -EINVAL;
 		if (i != 0 && st->ref_obj_id)
-			return false;
+			return -EINVAL;
 		if (st->iter.btf != btf || st->iter.btf_id != btf_id)
-			return false;
+			return -EINVAL;
 
 		for (j = 0; j < BPF_REG_SIZE; j++)
 			if (slot->slot_type[j] != STACK_ITER)
-				return false;
+				return -EINVAL;
 	}
 
-	return true;
+	return 0;
 }
 
 /* Check if given stack slot is "special":
@@ -7640,15 +7653,24 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
 				return err;
 		}
 
-		err = mark_stack_slots_iter(env, reg, insn_idx, meta->btf, btf_id, nr_slots);
+		err = mark_stack_slots_iter(env, meta, reg, insn_idx, meta->btf, btf_id, nr_slots);
 		if (err)
 			return err;
 	} else {
 		/* iter_next() or iter_destroy() expect initialized iter state*/
-		if (!is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots)) {
+		err = is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots);
+		switch (err) {
+		case 0:
+			break;
+		case -EINVAL:
 			verbose(env, "expected an initialized iter_%s as arg #%d\n",
 				iter_type_str(meta->btf, btf_id), regno);
-			return -EINVAL;
+			return err;
+		case -EPROTO:
+			verbose(env, "expected an RCU CS when using %s\n", meta->func_name);
+			return err;
+		default:
+			return err;
 		}
 
 		spi = iter_get_spi(env, reg, nr_slots);
@@ -10231,6 +10253,11 @@ static bool is_kfunc_rcu(struct bpf_kfunc_call_arg_meta *meta)
 	return meta->kfunc_flags & KF_RCU;
 }
 
+static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta)
+{
+	return meta->kfunc_flags & KF_RCU_PROTECTED;
+}
+
 static bool __kfunc_param_match_suffix(const struct btf *btf,
 				       const struct btf_param *arg,
 				       const char *suffix)
@@ -11582,6 +11609,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	if (env->cur_state->active_rcu_lock) {
 		struct bpf_func_state *state;
 		struct bpf_reg_state *reg;
+		u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER);
 
 		if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
 			verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n");
@@ -11592,7 +11620,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			verbose(env, "nested rcu read lock (kernel function %s)\n", func_name);
 			return -EINVAL;
 		} else if (rcu_unlock) {
-			bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+			bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, clear_mask, ({
 				if (reg->type & MEM_RCU) {
 					reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL);
 					reg->type |= PTR_UNTRUSTED;
-- 
cgit v1.2.3-70-g09d2


From 0050629cd36a58b568ac0aebeeca60bd2fde3d6d Mon Sep 17 00:00:00 2001
From: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Date: Wed, 18 Oct 2023 22:26:47 +0200
Subject: devlink: convert most of devlink_fmsg_*() to return void

Since struct devlink_fmsg retains error by now (see 1st patch of this
series), there is no longer need to keep returning it in each call.

This is a separate commit to allow per-driver conversion to stop using
those return values.

Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h |  60 ++++++++--------
 net/devlink/health.c  | 188 ++++++++++++++++++++++----------------------------
 2 files changed, 114 insertions(+), 134 deletions(-)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index fad8e36e3d98..9ac394bdfbe4 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1854,36 +1854,36 @@ int devlink_info_version_running_put_ext(struct devlink_info_req *req,
 					 const char *version_value,
 					 enum devlink_info_version_type version_type);
 
-int devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg);
-int devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg);
-
-int devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name);
-int devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg);
-
-int devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg,
-				     const char *name);
-int devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg);
-int devlink_fmsg_binary_pair_nest_start(struct devlink_fmsg *fmsg,
-					const char *name);
-int devlink_fmsg_binary_pair_nest_end(struct devlink_fmsg *fmsg);
-
-int devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value);
-int devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value);
-int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value,
-			    u16 value_len);
-
-int devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name,
-			       bool value);
-int devlink_fmsg_u8_pair_put(struct devlink_fmsg *fmsg, const char *name,
-			     u8 value);
-int devlink_fmsg_u32_pair_put(struct devlink_fmsg *fmsg, const char *name,
-			      u32 value);
-int devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name,
-			      u64 value);
-int devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name,
-				 const char *value);
-int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name,
-				 const void *value, u32 value_len);
+void devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg);
+void devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg);
+
+void devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name);
+void devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg);
+
+void devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg,
+				      const char *name);
+void devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg);
+void devlink_fmsg_binary_pair_nest_start(struct devlink_fmsg *fmsg,
+					 const char *name);
+void devlink_fmsg_binary_pair_nest_end(struct devlink_fmsg *fmsg);
+
+void devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value);
+void devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value);
+void devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value,
+			     u16 value_len);
+
+void devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name,
+				bool value);
+void devlink_fmsg_u8_pair_put(struct devlink_fmsg *fmsg, const char *name,
+			      u8 value);
+void devlink_fmsg_u32_pair_put(struct devlink_fmsg *fmsg, const char *name,
+			       u32 value);
+void devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name,
+			       u64 value);
+void devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name,
+				  const char *value);
+void devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name,
+				  const void *value, u32 value_len);
 
 struct devlink_health_reporter *
 devl_port_health_reporter_create(struct devlink_port *port,
diff --git a/net/devlink/health.c b/net/devlink/health.c
index 3858a436598e..89405e59f45c 100644
--- a/net/devlink/health.c
+++ b/net/devlink/health.c
@@ -566,16 +566,15 @@ static int devlink_health_do_dump(struct devlink_health_reporter *reporter,
 	if (!reporter->dump_fmsg)
 		return -ENOMEM;
 
-	err = devlink_fmsg_obj_nest_start(reporter->dump_fmsg);
-	if (err)
-		goto dump_err;
+	devlink_fmsg_obj_nest_start(reporter->dump_fmsg);
 
 	err = reporter->ops->dump(reporter, reporter->dump_fmsg,
 				  priv_ctx, extack);
 	if (err)
 		goto dump_err;
 
-	err = devlink_fmsg_obj_nest_end(reporter->dump_fmsg);
+	devlink_fmsg_obj_nest_end(reporter->dump_fmsg);
+	err = reporter->dump_fmsg->err;
 	if (err)
 		goto dump_err;
 
@@ -675,63 +674,61 @@ static void devlink_fmsg_err_if_binary(struct devlink_fmsg *fmsg)
 		fmsg->err = -EINVAL;
 }
 
-static int devlink_fmsg_nest_common(struct devlink_fmsg *fmsg, int attrtype)
+static void devlink_fmsg_nest_common(struct devlink_fmsg *fmsg, int attrtype)
 {
 	struct devlink_fmsg_item *item;
 
 	if (fmsg->err)
-		return fmsg->err;
+		return;
 
 	item = kzalloc(sizeof(*item), GFP_KERNEL);
 	if (!item) {
 		fmsg->err = -ENOMEM;
-		return fmsg->err;
+		return;
 	}
 
 	item->attrtype = attrtype;
 	list_add_tail(&item->list, &fmsg->item_list);
-
-	return 0;
 }
 
-int devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg)
+void devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg)
 {
 	devlink_fmsg_err_if_binary(fmsg);
-	return devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_OBJ_NEST_START);
+	devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_OBJ_NEST_START);
 }
 EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_start);
 
-static int devlink_fmsg_nest_end(struct devlink_fmsg *fmsg)
+static void devlink_fmsg_nest_end(struct devlink_fmsg *fmsg)
 {
 	devlink_fmsg_err_if_binary(fmsg);
-	return devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_NEST_END);
+	devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_NEST_END);
 }
 
-int devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg)
+void devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg)
 {
-	return devlink_fmsg_nest_end(fmsg);
+	devlink_fmsg_nest_end(fmsg);
 }
 EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_end);
 
 #define DEVLINK_FMSG_MAX_SIZE (GENLMSG_DEFAULT_SIZE - GENL_HDRLEN - NLA_HDRLEN)
 
-static int devlink_fmsg_put_name(struct devlink_fmsg *fmsg, const char *name)
+static void devlink_fmsg_put_name(struct devlink_fmsg *fmsg, const char *name)
 {
 	struct devlink_fmsg_item *item;
 
 	devlink_fmsg_err_if_binary(fmsg);
 	if (fmsg->err)
-		return fmsg->err;
+		return;
 
 	if (strlen(name) + 1 > DEVLINK_FMSG_MAX_SIZE) {
 		fmsg->err = -EMSGSIZE;
-		return fmsg->err;
+		return;
 	}
 
 	item = kzalloc(sizeof(*item) + strlen(name) + 1, GFP_KERNEL);
 	if (!item) {
 		fmsg->err = -ENOMEM;
-		return fmsg->err;
+		return;
 	}
 
 	item->nla_type = NLA_NUL_STRING;
@@ -739,83 +736,76 @@ static int devlink_fmsg_put_name(struct devlink_fmsg *fmsg, const char *name)
 	item->attrtype = DEVLINK_ATTR_FMSG_OBJ_NAME;
 	memcpy(&item->value, name, item->len);
 	list_add_tail(&item->list, &fmsg->item_list);
-
-	return 0;
 }
 
-int devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name)
+void devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name)
 {
 	devlink_fmsg_err_if_binary(fmsg);
 	devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_PAIR_NEST_START);
-	return devlink_fmsg_put_name(fmsg, name);
+	devlink_fmsg_put_name(fmsg, name);
 }
 EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_start);
 
-int devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg)
+void devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg)
 {
-	return devlink_fmsg_nest_end(fmsg);
+	devlink_fmsg_nest_end(fmsg);
 }
 EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_end);
 
-int devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg,
-				     const char *name)
+void devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg,
+				      const char *name)
 {
 	devlink_fmsg_pair_nest_start(fmsg, name);
-	return devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_ARR_NEST_START);
+	devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_ARR_NEST_START);
 }
 EXPORT_SYMBOL_GPL(devlink_fmsg_arr_pair_nest_start);
 
-int devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg)
+void devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg)
 {
 	devlink_fmsg_nest_end(fmsg);
-	return devlink_fmsg_nest_end(fmsg);
+	devlink_fmsg_nest_end(fmsg);
 }
 EXPORT_SYMBOL_GPL(devlink_fmsg_arr_pair_nest_end);
 
-int devlink_fmsg_binary_pair_nest_start(struct devlink_fmsg *fmsg,
-					const char *name)
+void devlink_fmsg_binary_pair_nest_start(struct devlink_fmsg *fmsg,
+					 const char *name)
 {
-	int err;
-
-	err = devlink_fmsg_arr_pair_nest_start(fmsg, name);
-	if (err)
-		return err;
-
+	devlink_fmsg_arr_pair_nest_start(fmsg, name);
 	fmsg->putting_binary = true;
-	return 0;
 }
 EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_nest_start);
 
-int devlink_fmsg_binary_pair_nest_end(struct devlink_fmsg *fmsg)
+void devlink_fmsg_binary_pair_nest_end(struct devlink_fmsg *fmsg)
 {
 	if (fmsg->err)
-		return fmsg->err;
+		return;
 
-	if (!fmsg->putting_binary) {
+	if (!fmsg->putting_binary)
 		fmsg->err = -EINVAL;
-		return fmsg->err;
-	}
 
 	fmsg->putting_binary = false;
-	return devlink_fmsg_arr_pair_nest_end(fmsg);
+	devlink_fmsg_arr_pair_nest_end(fmsg);
 }
 EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_nest_end);
 
-static int devlink_fmsg_put_value(struct devlink_fmsg *fmsg,
-				  const void *value, u16 value_len,
-				  u8 value_nla_type)
+static void devlink_fmsg_put_value(struct devlink_fmsg *fmsg,
+				   const void *value, u16 value_len,
+				   u8 value_nla_type)
 {
 	struct devlink_fmsg_item *item;
 
+	if (fmsg->err)
+		return;
+
 	if (value_len > DEVLINK_FMSG_MAX_SIZE) {
 		fmsg->err = -EMSGSIZE;
-		return fmsg->err;
+		return;
 	}
 
 	item = kzalloc(sizeof(*item) + value_len, GFP_KERNEL);
 	if (!item) {
 		fmsg->err = -ENOMEM;
-		return fmsg->err;
+		return;
 	}
 
 	item->nla_type = value_nla_type;
@@ -823,125 +813,113 @@ static int devlink_fmsg_put_value(struct devlink_fmsg *fmsg,
 	item->attrtype = DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA;
 	memcpy(&item->value, value, item->len);
 	list_add_tail(&item->list, &fmsg->item_list);
-
-	return 0;
 }
 
-static int devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value)
+static void devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value)
 {
 	devlink_fmsg_err_if_binary(fmsg);
-	return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_FLAG);
+	devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_FLAG);
 }
 
-static int devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value)
+static void devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value)
 {
 	devlink_fmsg_err_if_binary(fmsg);
-	return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U8);
+	devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U8);
 }
 
-int devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value)
+void devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value)
 {
 	devlink_fmsg_err_if_binary(fmsg);
-	return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U32);
+	devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U32);
 }
 EXPORT_SYMBOL_GPL(devlink_fmsg_u32_put);
 
-static int devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value)
+static void devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value)
 {
 	devlink_fmsg_err_if_binary(fmsg);
-	return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U64);
+	devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U64);
 }
 
-int devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value)
+void devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value)
 {
 	devlink_fmsg_err_if_binary(fmsg);
-	return devlink_fmsg_put_value(fmsg, value, strlen(value) + 1,
-				      NLA_NUL_STRING);
+	devlink_fmsg_put_value(fmsg, value, strlen(value) + 1, NLA_NUL_STRING);
 }
 EXPORT_SYMBOL_GPL(devlink_fmsg_string_put);
 
-int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value,
-			    u16 value_len)
+void devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value,
+			     u16 value_len)
 {
-	if (!fmsg->putting_binary)
-		return -EINVAL;
+	if (!fmsg->err && !fmsg->putting_binary)
+		fmsg->err = -EINVAL;
 
-	return devlink_fmsg_put_value(fmsg, value, value_len, NLA_BINARY);
+	devlink_fmsg_put_value(fmsg, value, value_len, NLA_BINARY);
 }
 EXPORT_SYMBOL_GPL(devlink_fmsg_binary_put);
 
-int devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name,
-			       bool value)
+void devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name,
+				bool value)
 {
 	devlink_fmsg_pair_nest_start(fmsg, name);
 	devlink_fmsg_bool_put(fmsg, value);
-	return devlink_fmsg_pair_nest_end(fmsg);
+	devlink_fmsg_pair_nest_end(fmsg);
 }
 EXPORT_SYMBOL_GPL(devlink_fmsg_bool_pair_put);
 
-int devlink_fmsg_u8_pair_put(struct devlink_fmsg *fmsg, const char *name,
-			     u8 value)
+void devlink_fmsg_u8_pair_put(struct devlink_fmsg *fmsg, const char *name,
+			      u8 value)
 {
 	devlink_fmsg_pair_nest_start(fmsg, name);
 	devlink_fmsg_u8_put(fmsg, value);
-	return devlink_fmsg_pair_nest_end(fmsg);
+	devlink_fmsg_pair_nest_end(fmsg);
 }
 EXPORT_SYMBOL_GPL(devlink_fmsg_u8_pair_put);
 
-int devlink_fmsg_u32_pair_put(struct devlink_fmsg *fmsg, const char *name,
-			      u32 value)
+void devlink_fmsg_u32_pair_put(struct devlink_fmsg *fmsg, const char *name,
+			       u32 value)
 {
 	devlink_fmsg_pair_nest_start(fmsg, name);
 	devlink_fmsg_u32_put(fmsg, value);
-	return devlink_fmsg_pair_nest_end(fmsg);
+	devlink_fmsg_pair_nest_end(fmsg);
 }
 EXPORT_SYMBOL_GPL(devlink_fmsg_u32_pair_put);
 
-int devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name,
-			      u64 value)
+void devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name,
+			       u64 value)
 {
 	devlink_fmsg_pair_nest_start(fmsg, name);
 	devlink_fmsg_u64_put(fmsg, value);
-	return devlink_fmsg_pair_nest_end(fmsg);
+	devlink_fmsg_pair_nest_end(fmsg);
 }
 EXPORT_SYMBOL_GPL(devlink_fmsg_u64_pair_put);
 
-int devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name,
-				 const char *value)
+void devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name,
+				  const char *value)
 {
 	devlink_fmsg_pair_nest_start(fmsg, name);
 	devlink_fmsg_string_put(fmsg, value);
-	return devlink_fmsg_pair_nest_end(fmsg);
+	devlink_fmsg_pair_nest_end(fmsg);
 }
 EXPORT_SYMBOL_GPL(devlink_fmsg_string_pair_put);
 
-int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name,
-				 const void *value, u32 value_len)
+void devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name,
+				  const void *value, u32 value_len)
 {
 	u32 data_size;
 	u32 offset;
-	int err;
 
-	err = devlink_fmsg_binary_pair_nest_start(fmsg, name);
-	if (err)
-		return err;
+	devlink_fmsg_binary_pair_nest_start(fmsg, name);
 
 	for (offset = 0; offset < value_len; offset += data_size) {
 		data_size = value_len - offset;
 		if (data_size > DEVLINK_FMSG_MAX_SIZE)
 			data_size = DEVLINK_FMSG_MAX_SIZE;
-		err = devlink_fmsg_binary_put(fmsg, value + offset, data_size);
-		if (err)
-			break;
-		/* Exit from loop with a break (instead of
-		 * return) to make sure putting_binary is turned off
-		 */
+
+		devlink_fmsg_binary_put(fmsg, value + offset, data_size);
 	}
 
-	err = devlink_fmsg_binary_pair_nest_end(fmsg);
+	devlink_fmsg_binary_pair_nest_end(fmsg);
 	fmsg->putting_binary = false;
-
-	return err;
 }
 EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_put);
 
@@ -1051,6 +1029,9 @@ static int devlink_fmsg_snd(struct devlink_fmsg *fmsg,
 	void *hdr;
 	int err;
 
+	if (fmsg->err)
+		return fmsg->err;
+
 	while (!last) {
 		int tmp_index = index;
 
@@ -1104,6 +1085,9 @@ static int devlink_fmsg_dumpit(struct devlink_fmsg *fmsg, struct sk_buff *skb,
 	void *hdr;
 	int err;
 
+	if (fmsg->err)
+		return fmsg->err;
+
 	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
 			  &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI, cmd);
 	if (!hdr) {
@@ -1143,17 +1127,13 @@ int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb,
 	if (!fmsg)
 		return -ENOMEM;
 
-	err = devlink_fmsg_obj_nest_start(fmsg);
-	if (err)
-		goto out;
+	devlink_fmsg_obj_nest_start(fmsg);
 
 	err = reporter->ops->diagnose(reporter, fmsg, info->extack);
 	if (err)
 		goto out;
 
-	err = devlink_fmsg_obj_nest_end(fmsg);
-	if (err)
-		goto out;
+	devlink_fmsg_obj_nest_end(fmsg);
 
 	err = devlink_fmsg_snd(fmsg, info,
 			       DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, 0);
-- 
cgit v1.2.3-70-g09d2


From 374d345d9b5e13380c66d7042f9533a6ac6d1195 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 18 Oct 2023 14:39:20 -0700
Subject: netlink: add variable-length / auto integers

We currently push everyone to use padding to align 64b values
in netlink. Un-padded nla_put_u64() doesn't even exist any more.

The story behind this possibly start with this thread:
https://lore.kernel.org/netdev/20121204.130914.1457976839967676240.davem@davemloft.net/
where DaveM was concerned about the alignment of a structure
containing 64b stats. If user space tries to access such struct
directly:

	struct some_stats *stats = nla_data(attr);
	printf("A: %llu", stats->a);

lack of alignment may become problematic for some architectures.
These days we most often put every single member in a separate
attribute, meaning that the code above would use a helper like
nla_get_u64(), which can deal with alignment internally.
Even for arches which don't have good unaligned access - access
aligned to 4B should be pretty efficient.
Kernel and well known libraries deal with unaligned input already.

Padded 64b is quite space-inefficient (64b + pad means at worst 16B
per attr vs 32b which takes 8B). It is also more typing:

    if (nla_put_u64_pad(rsp, NETDEV_A_SOMETHING_SOMETHING,
                        value, NETDEV_A_SOMETHING_PAD))

Create a new attribute type which will use 32 bits at netlink
level if value is small enough (probably most of the time?),
and (4B-aligned) 64 bits otherwise. Kernel API is just:

    if (nla_put_uint(rsp, NETDEV_A_SOMETHING_SOMETHING, value))

Calling this new type "just" sint / uint with no specific size
will hopefully also make people more comfortable with using it.
Currently telling people "don't use u8, you may need the bits,
and netlink will round up to 4B, anyway" is the #1 comment
we give to newcomers.

In terms of netlink layout it looks like this:

         0       4       8       12      16
32b:     [nlattr][ u32  ]
64b:     [  pad ][nlattr][     u64      ]
uint(32) [nlattr][ u32  ]
uint(64) [nlattr][     u64      ]

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/userspace-api/netlink/specs.rst | 18 ++++++-
 include/net/netlink.h                         | 69 ++++++++++++++++++++++++++-
 include/uapi/linux/netlink.h                  |  5 ++
 lib/nlattr.c                                  | 22 +++++++++
 net/netlink/policy.c                          | 14 ++++--
 5 files changed, 121 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/Documentation/userspace-api/netlink/specs.rst b/Documentation/userspace-api/netlink/specs.rst
index 40dd7442d2c3..c1b951649113 100644
--- a/Documentation/userspace-api/netlink/specs.rst
+++ b/Documentation/userspace-api/netlink/specs.rst
@@ -403,10 +403,21 @@ This section describes the attribute types supported by the ``genetlink``
 compatibility level. Refer to documentation of different levels for additional
 attribute types.
 
-Scalar integer types
+Common integer types
 --------------------
 
-Fixed-width integer types:
+``sint`` and ``uint`` represent signed and unsigned 64 bit integers.
+If the value can fit on 32 bits only 32 bits are carried in netlink
+messages, otherwise full 64 bits are carried. Note that the payload
+is only aligned to 4B, so the full 64 bit value may be unaligned!
+
+Common integer types should be preferred over fix-width types in majority
+of cases.
+
+Fix-width integer types
+-----------------------
+
+Fixed-width integer types include:
 ``u8``, ``u16``, ``u32``, ``u64``, ``s8``, ``s16``, ``s32``, ``s64``.
 
 Note that types smaller than 32 bit should be avoided as using them
@@ -416,6 +427,9 @@ See :ref:`pad_type` for padding of 64 bit attributes.
 The payload of the attribute is the integer in host order unless ``byte-order``
 specifies otherwise.
 
+64 bit values are usually aligned by the kernel but it is recommended
+that the user space is able to deal with unaligned values.
+
 .. _pad_type:
 
 pad
diff --git a/include/net/netlink.h b/include/net/netlink.h
index 8a7cd1170e1f..aba2b162a226 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -128,6 +128,8 @@
  *   nla_len(nla)			length of attribute payload
  *
  * Attribute Payload Access for Basic Types:
+ *   nla_get_uint(nla)			get payload for a uint attribute
+ *   nla_get_sint(nla)			get payload for a sint attribute
  *   nla_get_u8(nla)			get payload for a u8 attribute
  *   nla_get_u16(nla)			get payload for a u16 attribute
  *   nla_get_u32(nla)			get payload for a u32 attribute
@@ -183,6 +185,8 @@ enum {
 	NLA_REJECT,
 	NLA_BE16,
 	NLA_BE32,
+	NLA_SINT,
+	NLA_UINT,
 	__NLA_TYPE_MAX,
 };
 
@@ -229,6 +233,7 @@ enum nla_policy_validation {
  *                         nested header (or empty); len field is used if
  *                         nested_policy is also used, for the max attr
  *                         number in the nested policy.
+ *    NLA_SINT, NLA_UINT,
  *    NLA_U8, NLA_U16,
  *    NLA_U32, NLA_U64,
  *    NLA_S8, NLA_S16,
@@ -260,12 +265,14 @@ enum nla_policy_validation {
  *                         while an array has the nested attributes at another
  *                         level down and the attribute types directly in the
  *                         nesting don't matter.
+ *    NLA_UINT,
  *    NLA_U8,
  *    NLA_U16,
  *    NLA_U32,
  *    NLA_U64,
  *    NLA_BE16,
  *    NLA_BE32,
+ *    NLA_SINT,
  *    NLA_S8,
  *    NLA_S16,
  *    NLA_S32,
@@ -280,6 +287,7 @@ enum nla_policy_validation {
  *                         or NLA_POLICY_FULL_RANGE_SIGNED() macros instead.
  *                         Use the NLA_POLICY_MIN(), NLA_POLICY_MAX() and
  *                         NLA_POLICY_RANGE() macros.
+ *    NLA_UINT,
  *    NLA_U8,
  *    NLA_U16,
  *    NLA_U32,
@@ -288,6 +296,7 @@ enum nla_policy_validation {
  *                         to a struct netlink_range_validation that indicates
  *                         the min/max values.
  *                         Use NLA_POLICY_FULL_RANGE().
+ *    NLA_SINT,
  *    NLA_S8,
  *    NLA_S16,
  *    NLA_S32,
@@ -377,9 +386,11 @@ struct nla_policy {
 
 #define __NLA_IS_UINT_TYPE(tp)					\
 	(tp == NLA_U8 || tp == NLA_U16 || tp == NLA_U32 ||	\
-	 tp == NLA_U64 || tp == NLA_BE16 || tp == NLA_BE32)
+	 tp == NLA_U64 || tp == NLA_UINT ||			\
+	 tp == NLA_BE16 || tp == NLA_BE32)
 #define __NLA_IS_SINT_TYPE(tp)						\
-	(tp == NLA_S8 || tp == NLA_S16 || tp == NLA_S32 || tp == NLA_S64)
+	(tp == NLA_S8 || tp == NLA_S16 || tp == NLA_S32 || tp == NLA_S64 || \
+	 tp == NLA_SINT)
 
 #define __NLA_ENSURE(condition) BUILD_BUG_ON_ZERO(!(condition))
 #define NLA_ENSURE_UINT_TYPE(tp)			\
@@ -1357,6 +1368,22 @@ static inline int nla_put_u32(struct sk_buff *skb, int attrtype, u32 value)
 	return nla_put(skb, attrtype, sizeof(u32), &tmp);
 }
 
+/**
+ * nla_put_uint - Add a variable-size unsigned int to a socket buffer
+ * @skb: socket buffer to add attribute to
+ * @attrtype: attribute type
+ * @value: numeric value
+ */
+static inline int nla_put_uint(struct sk_buff *skb, int attrtype, u64 value)
+{
+	u64 tmp64 = value;
+	u32 tmp32 = value;
+
+	if (tmp64 == tmp32)
+		return nla_put_u32(skb, attrtype, tmp32);
+	return nla_put(skb, attrtype, sizeof(u64), &tmp64);
+}
+
 /**
  * nla_put_be32 - Add a __be32 netlink attribute to a socket buffer
  * @skb: socket buffer to add attribute to
@@ -1511,6 +1538,22 @@ static inline int nla_put_s64(struct sk_buff *skb, int attrtype, s64 value,
 	return nla_put_64bit(skb, attrtype, sizeof(s64), &tmp, padattr);
 }
 
+/**
+ * nla_put_sint - Add a variable-size signed int to a socket buffer
+ * @skb: socket buffer to add attribute to
+ * @attrtype: attribute type
+ * @value: numeric value
+ */
+static inline int nla_put_sint(struct sk_buff *skb, int attrtype, s64 value)
+{
+	s64 tmp64 = value;
+	s32 tmp32 = value;
+
+	if (tmp64 == tmp32)
+		return nla_put_s32(skb, attrtype, tmp32);
+	return nla_put(skb, attrtype, sizeof(s64), &tmp64);
+}
+
 /**
  * nla_put_string - Add a string netlink attribute to a socket buffer
  * @skb: socket buffer to add attribute to
@@ -1667,6 +1710,17 @@ static inline u64 nla_get_u64(const struct nlattr *nla)
 	return tmp;
 }
 
+/**
+ * nla_get_uint - return payload of uint attribute
+ * @nla: uint netlink attribute
+ */
+static inline u64 nla_get_uint(const struct nlattr *nla)
+{
+	if (nla_len(nla) == sizeof(u32))
+		return nla_get_u32(nla);
+	return nla_get_u64(nla);
+}
+
 /**
  * nla_get_be64 - return payload of __be64 attribute
  * @nla: __be64 netlink attribute
@@ -1729,6 +1783,17 @@ static inline s64 nla_get_s64(const struct nlattr *nla)
 	return tmp;
 }
 
+/**
+ * nla_get_sint - return payload of uint attribute
+ * @nla: uint netlink attribute
+ */
+static inline s64 nla_get_sint(const struct nlattr *nla)
+{
+	if (nla_len(nla) == sizeof(s32))
+		return nla_get_s32(nla);
+	return nla_get_s64(nla);
+}
+
 /**
  * nla_get_flag - return payload of flag attribute
  * @nla: flag netlink attribute
diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h
index e2ae82e3f9f7..f87aaf28a649 100644
--- a/include/uapi/linux/netlink.h
+++ b/include/uapi/linux/netlink.h
@@ -298,6 +298,8 @@ struct nla_bitfield32 {
  *	entry has attributes again, the policy for those inner ones
  *	and the corresponding maxtype may be specified.
  * @NL_ATTR_TYPE_BITFIELD32: &struct nla_bitfield32 attribute
+ * @NL_ATTR_TYPE_SINT: 32-bit or 64-bit signed attribute, aligned to 4B
+ * @NL_ATTR_TYPE_UINT: 32-bit or 64-bit unsigned attribute, aligned to 4B
  */
 enum netlink_attribute_type {
 	NL_ATTR_TYPE_INVALID,
@@ -322,6 +324,9 @@ enum netlink_attribute_type {
 	NL_ATTR_TYPE_NESTED_ARRAY,
 
 	NL_ATTR_TYPE_BITFIELD32,
+
+	NL_ATTR_TYPE_SINT,
+	NL_ATTR_TYPE_UINT,
 };
 
 /**
diff --git a/lib/nlattr.c b/lib/nlattr.c
index 7a2b6c38fd59..dc15e7888fc1 100644
--- a/lib/nlattr.c
+++ b/lib/nlattr.c
@@ -134,6 +134,7 @@ void nla_get_range_unsigned(const struct nla_policy *pt,
 		range->max = U32_MAX;
 		break;
 	case NLA_U64:
+	case NLA_UINT:
 	case NLA_MSECS:
 		range->max = U64_MAX;
 		break;
@@ -183,6 +184,9 @@ static int nla_validate_range_unsigned(const struct nla_policy *pt,
 	case NLA_U64:
 		value = nla_get_u64(nla);
 		break;
+	case NLA_UINT:
+		value = nla_get_uint(nla);
+		break;
 	case NLA_MSECS:
 		value = nla_get_u64(nla);
 		break;
@@ -248,6 +252,7 @@ void nla_get_range_signed(const struct nla_policy *pt,
 		range->max = S32_MAX;
 		break;
 	case NLA_S64:
+	case NLA_SINT:
 		range->min = S64_MIN;
 		range->max = S64_MAX;
 		break;
@@ -295,6 +300,9 @@ static int nla_validate_int_range_signed(const struct nla_policy *pt,
 	case NLA_S64:
 		value = nla_get_s64(nla);
 		break;
+	case NLA_SINT:
+		value = nla_get_sint(nla);
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -320,6 +328,7 @@ static int nla_validate_int_range(const struct nla_policy *pt,
 	case NLA_U16:
 	case NLA_U32:
 	case NLA_U64:
+	case NLA_UINT:
 	case NLA_MSECS:
 	case NLA_BINARY:
 	case NLA_BE16:
@@ -329,6 +338,7 @@ static int nla_validate_int_range(const struct nla_policy *pt,
 	case NLA_S16:
 	case NLA_S32:
 	case NLA_S64:
+	case NLA_SINT:
 		return nla_validate_int_range_signed(pt, nla, extack);
 	default:
 		WARN_ON(1);
@@ -355,6 +365,9 @@ static int nla_validate_mask(const struct nla_policy *pt,
 	case NLA_U64:
 		value = nla_get_u64(nla);
 		break;
+	case NLA_UINT:
+		value = nla_get_uint(nla);
+		break;
 	case NLA_BE16:
 		value = ntohs(nla_get_be16(nla));
 		break;
@@ -433,6 +446,15 @@ static int validate_nla(const struct nlattr *nla, int maxtype,
 			goto out_err;
 		break;
 
+	case NLA_SINT:
+	case NLA_UINT:
+		if (attrlen != sizeof(u32) && attrlen != sizeof(u64)) {
+			NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
+						"invalid attribute length");
+			return -EINVAL;
+		}
+		break;
+
 	case NLA_BITFIELD32:
 		if (attrlen != sizeof(struct nla_bitfield32))
 			goto out_err;
diff --git a/net/netlink/policy.c b/net/netlink/policy.c
index e2f111edf66c..1f8909c16f14 100644
--- a/net/netlink/policy.c
+++ b/net/netlink/policy.c
@@ -230,6 +230,8 @@ int netlink_policy_dump_attr_size_estimate(const struct nla_policy *pt)
 	case NLA_S16:
 	case NLA_S32:
 	case NLA_S64:
+	case NLA_SINT:
+	case NLA_UINT:
 		/* maximum is common, u64 min/max with padding */
 		return common +
 		       2 * (nla_attr_size(0) + nla_attr_size(sizeof(u64)));
@@ -288,6 +290,7 @@ __netlink_policy_dump_write_attr(struct netlink_policy_dump_state *state,
 	case NLA_U16:
 	case NLA_U32:
 	case NLA_U64:
+	case NLA_UINT:
 	case NLA_MSECS: {
 		struct netlink_range_validation range;
 
@@ -297,8 +300,10 @@ __netlink_policy_dump_write_attr(struct netlink_policy_dump_state *state,
 			type = NL_ATTR_TYPE_U16;
 		else if (pt->type == NLA_U32)
 			type = NL_ATTR_TYPE_U32;
-		else
+		else if (pt->type == NLA_U64)
 			type = NL_ATTR_TYPE_U64;
+		else
+			type = NL_ATTR_TYPE_UINT;
 
 		if (pt->validation_type == NLA_VALIDATE_MASK) {
 			if (nla_put_u64_64bit(skb, NL_POLICY_TYPE_ATTR_MASK,
@@ -320,7 +325,8 @@ __netlink_policy_dump_write_attr(struct netlink_policy_dump_state *state,
 	case NLA_S8:
 	case NLA_S16:
 	case NLA_S32:
-	case NLA_S64: {
+	case NLA_S64:
+	case NLA_SINT: {
 		struct netlink_range_validation_signed range;
 
 		if (pt->type == NLA_S8)
@@ -329,8 +335,10 @@ __netlink_policy_dump_write_attr(struct netlink_policy_dump_state *state,
 			type = NL_ATTR_TYPE_S16;
 		else if (pt->type == NLA_S32)
 			type = NL_ATTR_TYPE_S32;
-		else
+		else if (pt->type == NLA_S64)
 			type = NL_ATTR_TYPE_S64;
+		else
+			type = NL_ATTR_TYPE_SINT;
 
 		nla_get_range_signed(pt, &range);
 
-- 
cgit v1.2.3-70-g09d2


From f600bb612b06adf70ccdefbd3294c71275b650c2 Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Thu, 19 Oct 2023 13:14:58 +0200
Subject: net: dsa: microchip: ksz8: Enable MIIM PHY Control reg access

Provide access to MIIM PHY Control register (Reg. 31) through
ksz8_r_phy_ctrl() and ksz8_w_phy_ctrl() functions. Necessary for
upcoming micrel.c patch to address forced link mode configuration.

Closes: https://lore.kernel.org/oe-kbuild-all/202310112224.iYgvjBUy-lkp@intel.com/
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/microchip/ksz8795.c | 86 +++++++++++++++++++++++++++++++++++--
 include/linux/micrel_phy.h          |  4 ++
 2 files changed, 87 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/microchip/ksz8795.c b/drivers/net/dsa/microchip/ksz8795.c
index 91aba470fb2f..4bf4d67557dc 100644
--- a/drivers/net/dsa/microchip/ksz8795.c
+++ b/drivers/net/dsa/microchip/ksz8795.c
@@ -632,6 +632,50 @@ static void ksz8_w_vlan_table(struct ksz_device *dev, u16 vid, u16 vlan)
 	ksz8_w_table(dev, TABLE_VLAN, addr, buf);
 }
 
+/**
+ * ksz8_r_phy_ctrl - Translates and reads from the SMI interface to a MIIM PHY
+ *		     Control register (Reg. 31).
+ * @dev: The KSZ device instance.
+ * @port: The port number to be read.
+ * @val: The value read from the SMI interface.
+ *
+ * This function reads the SMI interface and translates the hardware register
+ * bit values into their corresponding control settings for a MIIM PHY Control
+ * register.
+ *
+ * Return: 0 on success, error code on failure.
+ */
+static int ksz8_r_phy_ctrl(struct ksz_device *dev, int port, u16 *val)
+{
+	const u16 *regs = dev->info->regs;
+	u8 reg_val;
+	int ret;
+
+	*val = 0;
+
+	ret = ksz_pread8(dev, port, regs[P_LINK_STATUS], &reg_val);
+	if (ret < 0)
+		return ret;
+
+	if (reg_val & PORT_MDIX_STATUS)
+		*val |= KSZ886X_CTRL_MDIX_STAT;
+
+	ret = ksz_pread8(dev, port, REG_PORT_LINK_MD_CTRL, &reg_val);
+	if (ret < 0)
+		return ret;
+
+	if (reg_val & PORT_FORCE_LINK)
+		*val |= KSZ886X_CTRL_FORCE_LINK;
+
+	if (reg_val & PORT_POWER_SAVING)
+		*val |= KSZ886X_CTRL_PWRSAVE;
+
+	if (reg_val & PORT_PHY_REMOTE_LOOPBACK)
+		*val |= KSZ886X_CTRL_REMOTE_LOOPBACK;
+
+	return 0;
+}
+
 int ksz8_r_phy(struct ksz_device *dev, u16 phy, u16 reg, u16 *val)
 {
 	u8 restart, speed, ctrl, link;
@@ -769,12 +813,10 @@ int ksz8_r_phy(struct ksz_device *dev, u16 phy, u16 reg, u16 *val)
 				FIELD_GET(PORT_CABLE_FAULT_COUNTER_L, val2));
 		break;
 	case PHY_REG_PHY_CTRL:
-		ret = ksz_pread8(dev, p, regs[P_LINK_STATUS], &link);
+		ret = ksz8_r_phy_ctrl(dev, p, &data);
 		if (ret)
 			return ret;
 
-		if (link & PORT_MDIX_STATUS)
-			data |= KSZ886X_CTRL_MDIX_STAT;
 		break;
 	default:
 		processed = false;
@@ -786,6 +828,38 @@ int ksz8_r_phy(struct ksz_device *dev, u16 phy, u16 reg, u16 *val)
 	return 0;
 }
 
+/**
+ * ksz8_w_phy_ctrl - Translates and writes to the SMI interface from a MIIM PHY
+ *		     Control register (Reg. 31).
+ * @dev: The KSZ device instance.
+ * @port: The port number to be configured.
+ * @val: The register value to be written.
+ *
+ * This function translates control settings from a MIIM PHY Control register
+ * into their corresponding hardware register bit values for the SMI
+ * interface.
+ *
+ * Return: 0 on success, error code on failure.
+ */
+static int ksz8_w_phy_ctrl(struct ksz_device *dev, int port, u16 val)
+{
+	u8 reg_val = 0;
+	int ret;
+
+	if (val & KSZ886X_CTRL_FORCE_LINK)
+		reg_val |= PORT_FORCE_LINK;
+
+	if (val & KSZ886X_CTRL_PWRSAVE)
+		reg_val |= PORT_POWER_SAVING;
+
+	if (val & KSZ886X_CTRL_REMOTE_LOOPBACK)
+		reg_val |= PORT_PHY_REMOTE_LOOPBACK;
+
+	ret = ksz_prmw8(dev, port, REG_PORT_LINK_MD_CTRL, PORT_FORCE_LINK |
+			PORT_POWER_SAVING | PORT_PHY_REMOTE_LOOPBACK, reg_val);
+	return ret;
+}
+
 int ksz8_w_phy(struct ksz_device *dev, u16 phy, u16 reg, u16 val)
 {
 	u8 restart, speed, ctrl, data;
@@ -926,6 +1000,12 @@ int ksz8_w_phy(struct ksz_device *dev, u16 phy, u16 reg, u16 val)
 		if (val & PHY_START_CABLE_DIAG)
 			ksz_port_cfg(dev, p, REG_PORT_LINK_MD_CTRL, PORT_START_CABLE_DIAG, true);
 		break;
+
+	case PHY_REG_PHY_CTRL:
+		ret = ksz8_w_phy_ctrl(dev, p, val);
+		if (ret)
+			return ret;
+		break;
 	default:
 		break;
 	}
diff --git a/include/linux/micrel_phy.h b/include/linux/micrel_phy.h
index 4e27ca7c49de..591bf5b5e8dc 100644
--- a/include/linux/micrel_phy.h
+++ b/include/linux/micrel_phy.h
@@ -64,6 +64,10 @@
 #define KSZ886X_BMCR_DISABLE_TRANSMIT		BIT(1)
 #define KSZ886X_BMCR_DISABLE_LED		BIT(0)
 
+/* PHY Special Control/Status Register (Reg 31) */
 #define KSZ886X_CTRL_MDIX_STAT			BIT(4)
+#define KSZ886X_CTRL_FORCE_LINK			BIT(3)
+#define KSZ886X_CTRL_PWRSAVE			BIT(2)
+#define KSZ886X_CTRL_REMOTE_LOOPBACK		BIT(1)
 
 #endif /* _MICREL_PHY_H */
-- 
cgit v1.2.3-70-g09d2


From b4a11b2033b7d3dfdd46592f7036a775b18cecd1 Mon Sep 17 00:00:00 2001
From: Heng Guo <heng.guo@windriver.com>
Date: Thu, 19 Oct 2023 09:20:53 +0800
Subject: net: fix IPSTATS_MIB_OUTPKGS increment in OutForwDatagrams.

Reproduce environment:
network with 3 VM linuxs is connected as below:
VM1<---->VM2(latest kernel 6.5.0-rc7)<---->VM3
VM1: eth0 ip: 192.168.122.207 MTU 1500
VM2: eth0 ip: 192.168.122.208, eth1 ip: 192.168.123.224 MTU 1500
VM3: eth0 ip: 192.168.123.240 MTU 1500

Reproduce:
VM1 send 1400 bytes UDP data to VM3 using tools scapy with flags=0.
scapy command:
send(IP(dst="192.168.123.240",flags=0)/UDP()/str('0'*1400),count=1,
inter=1.000000)

Result:
Before IP data is sent.
----------------------------------------------------------------------
root@qemux86-64:~# cat /proc/net/snmp
Ip: Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors
  ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests
  OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails
  FragOKs FragFails FragCreates
Ip: 1 64 11 0 3 4 0 0 4 7 0 0 0 0 0 0 0 0 0
......
----------------------------------------------------------------------
After IP data is sent.
----------------------------------------------------------------------
root@qemux86-64:~# cat /proc/net/snmp
Ip: Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors
  ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests
  OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails
  FragOKs FragFails FragCreates
Ip: 1 64 12 0 3 5 0 0 4 8 0 0 0 0 0 0 0 0 0
......
----------------------------------------------------------------------
"ForwDatagrams" increase from 4 to 5 and "OutRequests" also increase
from 7 to 8.

Issue description and patch:
IPSTATS_MIB_OUTPKTS("OutRequests") is counted with IPSTATS_MIB_OUTOCTETS
("OutOctets") in ip_finish_output2().
According to RFC 4293, it is "OutOctets" counted with "OutTransmits" but
not "OutRequests". "OutRequests" does not include any datagrams counted
in "ForwDatagrams".
ipSystemStatsOutOctets OBJECT-TYPE
    DESCRIPTION
           "The total number of octets in IP datagrams delivered to the
            lower layers for transmission.  Octets from datagrams
            counted in ipIfStatsOutTransmits MUST be counted here.
ipSystemStatsOutRequests OBJECT-TYPE
    DESCRIPTION
           "The total number of IP datagrams that local IP user-
            protocols (including ICMP) supplied to IP in requests for
            transmission.  Note that this counter does not include any
            datagrams counted in ipSystemStatsOutForwDatagrams.
So do patch to define IPSTATS_MIB_OUTPKTS to "OutTransmits" and add
IPSTATS_MIB_OUTREQUESTS for "OutRequests".
Add IPSTATS_MIB_OUTREQUESTS counter in __ip_local_out() for ipv4 and add
IPSTATS_MIB_OUT counter in ip6_finish_output2() for ipv6.

Test result with patch:
Before IP data is sent.
----------------------------------------------------------------------
root@qemux86-64:~# cat /proc/net/snmp
Ip: Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors
  ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests
  OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails
  FragOKs FragFails FragCreates OutTransmits
Ip: 1 64 9 0 5 1 0 0 3 3 0 0 0 0 0 0 0 0 0 4
......
root@qemux86-64:~# cat /proc/net/netstat
......
IpExt: InNoRoutes InTruncatedPkts InMcastPkts OutMcastPkts InBcastPkts
  OutBcastPkts InOctets OutOctets InMcastOctets OutMcastOctets
  InBcastOctets OutBcastOctets InCsumErrors InNoECTPkts InECT1Pkts
  InECT0Pkts InCEPkts ReasmOverlaps
IpExt: 0 0 0 0 0 0 2976 1896 0 0 0 0 0 9 0 0 0 0
----------------------------------------------------------------------
After IP data is sent.
----------------------------------------------------------------------
root@qemux86-64:~# cat /proc/net/snmp
Ip: Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors
  ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests
  OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails
  FragOKs FragFails FragCreates OutTransmits
Ip: 1 64 10 0 5 2 0 0 3 3 0 0 0 0 0 0 0 0 0 5
......
root@qemux86-64:~# cat /proc/net/netstat
......
IpExt: InNoRoutes InTruncatedPkts InMcastPkts OutMcastPkts InBcastPkts
  OutBcastPkts InOctets OutOctets InMcastOctets OutMcastOctets
  InBcastOctets OutBcastOctets InCsumErrors InNoECTPkts InECT1Pkts
  InECT0Pkts InCEPkts ReasmOverlaps
IpExt: 0 0 0 0 0 0 4404 3324 0 0 0 0 0 10 0 0 0 0
----------------------------------------------------------------------
"ForwDatagrams" increase from 1 to 2 and "OutRequests" is keeping 3.
"OutTransmits" increase from 4 to 5 and "OutOctets" increase 1428.

Signed-off-by: Heng Guo <heng.guo@windriver.com>
Reviewed-by: Kun Song <Kun.Song@windriver.com>
Reviewed-by: Filip Pudak <filip.pudak@windriver.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/snmp.h | 3 ++-
 net/ipv4/ip_output.c      | 2 ++
 net/ipv4/proc.c           | 3 ++-
 net/ipv6/ip6_output.c     | 6 ++++--
 net/ipv6/mcast.c          | 5 ++---
 net/ipv6/ndisc.c          | 2 +-
 net/ipv6/proc.c           | 3 ++-
 net/ipv6/raw.c            | 2 +-
 8 files changed, 16 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index 26f33a4c253d..b2b72886cb6d 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -24,7 +24,7 @@ enum
 	IPSTATS_MIB_INOCTETS,			/* InOctets */
 	IPSTATS_MIB_INDELIVERS,			/* InDelivers */
 	IPSTATS_MIB_OUTFORWDATAGRAMS,		/* OutForwDatagrams */
-	IPSTATS_MIB_OUTPKTS,			/* OutRequests */
+	IPSTATS_MIB_OUTREQUESTS,		/* OutRequests */
 	IPSTATS_MIB_OUTOCTETS,			/* OutOctets */
 /* other fields */
 	IPSTATS_MIB_INHDRERRORS,		/* InHdrErrors */
@@ -57,6 +57,7 @@ enum
 	IPSTATS_MIB_ECT0PKTS,			/* InECT0Pkts */
 	IPSTATS_MIB_CEPKTS,			/* InCEPkts */
 	IPSTATS_MIB_REASM_OVERLAPS,		/* ReasmOverlaps */
+	IPSTATS_MIB_OUTPKTS,			/* OutTransmits */
 	__IPSTATS_MIB_MAX
 };
 
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 89e62ed08dad..b06f678b03a1 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -101,6 +101,8 @@ int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	struct iphdr *iph = ip_hdr(skb);
 
+	IP_INC_STATS(net, IPSTATS_MIB_OUTREQUESTS);
+
 	iph_set_totlen(iph, skb->len);
 	ip_send_check(iph);
 
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index eaf1d3113b62..a85b0aba3646 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -83,7 +83,7 @@ static const struct snmp_mib snmp4_ipstats_list[] = {
 	SNMP_MIB_ITEM("InUnknownProtos", IPSTATS_MIB_INUNKNOWNPROTOS),
 	SNMP_MIB_ITEM("InDiscards", IPSTATS_MIB_INDISCARDS),
 	SNMP_MIB_ITEM("InDelivers", IPSTATS_MIB_INDELIVERS),
-	SNMP_MIB_ITEM("OutRequests", IPSTATS_MIB_OUTPKTS),
+	SNMP_MIB_ITEM("OutRequests", IPSTATS_MIB_OUTREQUESTS),
 	SNMP_MIB_ITEM("OutDiscards", IPSTATS_MIB_OUTDISCARDS),
 	SNMP_MIB_ITEM("OutNoRoutes", IPSTATS_MIB_OUTNOROUTES),
 	SNMP_MIB_ITEM("ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT),
@@ -93,6 +93,7 @@ static const struct snmp_mib snmp4_ipstats_list[] = {
 	SNMP_MIB_ITEM("FragOKs", IPSTATS_MIB_FRAGOKS),
 	SNMP_MIB_ITEM("FragFails", IPSTATS_MIB_FRAGFAILS),
 	SNMP_MIB_ITEM("FragCreates", IPSTATS_MIB_FRAGCREATES),
+	SNMP_MIB_ITEM("OutTransmits", IPSTATS_MIB_OUTPKTS),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index a471c7e91761..571c10fb00b1 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -117,6 +117,8 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
 			return res;
 	}
 
+	IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
+
 	rcu_read_lock();
 	nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
 	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
@@ -328,7 +330,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 
 	mtu = dst_mtu(dst);
 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
-		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
+		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
 
 		/* if egress device is enslaved to an L3 master device pass the
 		 * skb to its handler for processing
@@ -1987,7 +1989,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
 	skb->tstamp = cork->base.transmit_time;
 
 	ip6_cork_steal_dst(skb, cork);
-	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
+	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
 	if (proto == IPPROTO_ICMPV6) {
 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 		u8 icmp6_type;
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 99e28b444a4c..b75d3c9d41bb 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -1789,7 +1789,7 @@ static void mld_sendpack(struct sk_buff *skb)
 
 	rcu_read_lock();
 	idev = __in6_dev_get(skb->dev);
-	IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
+	IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
 
 	payload_len = (skb_tail_pointer(skb) - skb_network_header(skb)) -
 		sizeof(*pip6);
@@ -2147,8 +2147,7 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
 	full_len = sizeof(struct ipv6hdr) + payload_len;
 
 	rcu_read_lock();
-	IP6_UPD_PO_STATS(net, __in6_dev_get(dev),
-		      IPSTATS_MIB_OUT, full_len);
+	IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_OUTREQUESTS);
 	rcu_read_unlock();
 
 	skb = sock_alloc_send_skb(sk, hlen + tlen + full_len, 1, &err);
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 679443d7ecb5..a19999b30bc0 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -504,7 +504,7 @@ void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr,
 
 	rcu_read_lock();
 	idev = __in6_dev_get(dst->dev);
-	IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
+	IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
 
 	err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
 		      net, sk, skb, NULL, dst->dev,
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index e20b3705c2d2..6d1d9221649d 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -61,7 +61,7 @@ static const struct snmp_mib snmp6_ipstats_list[] = {
 	SNMP_MIB_ITEM("Ip6InDiscards", IPSTATS_MIB_INDISCARDS),
 	SNMP_MIB_ITEM("Ip6InDelivers", IPSTATS_MIB_INDELIVERS),
 	SNMP_MIB_ITEM("Ip6OutForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS),
-	SNMP_MIB_ITEM("Ip6OutRequests", IPSTATS_MIB_OUTPKTS),
+	SNMP_MIB_ITEM("Ip6OutRequests", IPSTATS_MIB_OUTREQUESTS),
 	SNMP_MIB_ITEM("Ip6OutDiscards", IPSTATS_MIB_OUTDISCARDS),
 	SNMP_MIB_ITEM("Ip6OutNoRoutes", IPSTATS_MIB_OUTNOROUTES),
 	SNMP_MIB_ITEM("Ip6ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT),
@@ -84,6 +84,7 @@ static const struct snmp_mib snmp6_ipstats_list[] = {
 	SNMP_MIB_ITEM("Ip6InECT1Pkts", IPSTATS_MIB_ECT1PKTS),
 	SNMP_MIB_ITEM("Ip6InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
 	SNMP_MIB_ITEM("Ip6InCEPkts", IPSTATS_MIB_CEPKTS),
+	SNMP_MIB_ITEM("Ip6OutTransmits", IPSTATS_MIB_OUTPKTS),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index a2aa54a2baae..dd0a4e73e602 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -651,7 +651,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
 	 * have been queued for deletion.
 	 */
 	rcu_read_lock();
-	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
+	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
 	err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb,
 		      NULL, rt->dst.dev, dst_output);
 	if (err > 0)
-- 
cgit v1.2.3-70-g09d2


From 20c6e05bd33deaa6fa890252d7ffc5ad54a0942c Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 19 Oct 2023 08:28:15 -0700
Subject: ethtool: untangle the linkmode and ethtool headers

Commit 26c5334d344d ("ethtool: Add forced speed to supported link
modes maps") added a dependency between ethtool.h and linkmode.h.
The dependency in the opposite direction already exists so the
new code was inserted in an awkward place.

The reason for ethtool.h to include linkmode.h, is that
ethtool_forced_speed_maps_init() is a static inline helper.
That's not really necessary.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Paul Greenwalt <paul.greenwalt@intel.com>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h  | 22 ++--------------------
 include/linux/linkmode.h | 29 ++++++++++++++---------------
 net/ethtool/common.c     | 21 +++++++++++++++++++++
 3 files changed, 37 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 8e91e8b8a693..226a36ed5aa1 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -13,7 +13,6 @@
 #ifndef _LINUX_ETHTOOL_H
 #define _LINUX_ETHTOOL_H
 
-#include <linux/linkmode.h>
 #include <linux/bitmap.h>
 #include <linux/compat.h>
 #include <linux/if_ether.h>
@@ -1070,23 +1069,6 @@ struct ethtool_forced_speed_map {
 	.arr_size	= ARRAY_SIZE(prefix##_##value),			\
 }
 
-/**
- * ethtool_forced_speed_maps_init
- * @maps: Pointer to an array of Ethtool forced speed map
- * @size: Array size
- *
- * Initialize an array of Ethtool forced speed map to Ethtool link modes. This
- * should be called during driver module init.
- */
-static inline void
-ethtool_forced_speed_maps_init(struct ethtool_forced_speed_map *maps, u32 size)
-{
-	for (u32 i = 0; i < size; i++) {
-		struct ethtool_forced_speed_map *map = &maps[i];
-
-		linkmode_set_bit_array(map->cap_arr, map->arr_size, map->caps);
-		map->cap_arr = NULL;
-		map->arr_size = 0;
-	}
-}
+void
+ethtool_forced_speed_maps_init(struct ethtool_forced_speed_map *maps, u32 size);
 #endif /* _LINUX_ETHTOOL_H */
diff --git a/include/linux/linkmode.h b/include/linux/linkmode.h
index cd38f89553e6..7303b4bc2ce0 100644
--- a/include/linux/linkmode.h
+++ b/include/linux/linkmode.h
@@ -2,21 +2,6 @@
 #define __LINKMODE_H
 
 #include <linux/bitmap.h>
-
-static inline void linkmode_set_bit(int nr, volatile unsigned long *addr)
-{
-	__set_bit(nr, addr);
-}
-
-static inline void linkmode_set_bit_array(const int *array, int array_size,
-					  unsigned long *addr)
-{
-	int i;
-
-	for (i = 0; i < array_size; i++)
-		linkmode_set_bit(array[i], addr);
-}
-
 #include <linux/ethtool.h>
 #include <uapi/linux/ethtool.h>
 
@@ -53,6 +38,11 @@ static inline int linkmode_andnot(unsigned long *dst, const unsigned long *src1,
 	return bitmap_andnot(dst, src1, src2,  __ETHTOOL_LINK_MODE_MASK_NBITS);
 }
 
+static inline void linkmode_set_bit(int nr, volatile unsigned long *addr)
+{
+	__set_bit(nr, addr);
+}
+
 static inline void linkmode_clear_bit(int nr, volatile unsigned long *addr)
 {
 	__clear_bit(nr, addr);
@@ -72,6 +62,15 @@ static inline int linkmode_test_bit(int nr, const volatile unsigned long *addr)
 	return test_bit(nr, addr);
 }
 
+static inline void linkmode_set_bit_array(const int *array, int array_size,
+					  unsigned long *addr)
+{
+	int i;
+
+	for (i = 0; i < array_size; i++)
+		linkmode_set_bit(array[i], addr);
+}
+
 static inline int linkmode_equal(const unsigned long *src1,
 				 const unsigned long *src2)
 {
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index f5598c5f50de..b4419fb6df6a 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -685,3 +685,24 @@ ethtool_params_from_link_mode(struct ethtool_link_ksettings *link_ksettings,
 	link_ksettings->base.duplex = link_info->duplex;
 }
 EXPORT_SYMBOL_GPL(ethtool_params_from_link_mode);
+
+/**
+ * ethtool_forced_speed_maps_init
+ * @maps: Pointer to an array of Ethtool forced speed map
+ * @size: Array size
+ *
+ * Initialize an array of Ethtool forced speed map to Ethtool link modes. This
+ * should be called during driver module init.
+ */
+void
+ethtool_forced_speed_maps_init(struct ethtool_forced_speed_map *maps, u32 size)
+{
+	for (u32 i = 0; i < size; i++) {
+		struct ethtool_forced_speed_map *map = &maps[i];
+
+		linkmode_set_bit_array(map->cap_arr, map->arr_size, map->caps);
+		map->cap_arr = NULL;
+		map->arr_size = 0;
+	}
+}
+EXPORT_SYMBOL_GPL(ethtool_forced_speed_maps_init);
-- 
cgit v1.2.3-70-g09d2


From b460bc8302f222d346f0c15bba980eb8c36d6278 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 20 Oct 2023 21:31:57 +0800
Subject: mm/percpu.c: introduce pcpu_alloc_size()

Introduce pcpu_alloc_size() to get the size of the dynamic per-cpu
area. It will be used by bpf memory allocator in the following patches.
BPF memory allocator maintains per-cpu area caches for multiple area
sizes and its free API only has the to-be-freed per-cpu pointer, so it
needs the size of dynamic per-cpu area to select the corresponding cache
when bpf program frees the dynamic per-cpu pointer.

Acked-by: Dennis Zhou <dennis@kernel.org>
Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231020133202.4043247-3-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/percpu.h |  1 +
 mm/percpu.c            | 31 +++++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+)

(limited to 'include')

diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 68fac2e7cbe6..8c677f185901 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -132,6 +132,7 @@ extern void __init setup_per_cpu_areas(void);
 extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __alloc_size(1);
 extern void __percpu *__alloc_percpu(size_t size, size_t align) __alloc_size(1);
 extern void free_percpu(void __percpu *__pdata);
+extern size_t pcpu_alloc_size(void __percpu *__pdata);
 
 DEFINE_FREE(free_percpu, void __percpu *, free_percpu(_T))
 
diff --git a/mm/percpu.c b/mm/percpu.c
index ea607078368d..60ed078e4cd0 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -2244,6 +2244,37 @@ static void pcpu_balance_workfn(struct work_struct *work)
 	mutex_unlock(&pcpu_alloc_mutex);
 }
 
+/**
+ * pcpu_alloc_size - the size of the dynamic percpu area
+ * @ptr: pointer to the dynamic percpu area
+ *
+ * Returns the size of the @ptr allocation.  This is undefined for statically
+ * defined percpu variables as there is no corresponding chunk->bound_map.
+ *
+ * RETURNS:
+ * The size of the dynamic percpu area.
+ *
+ * CONTEXT:
+ * Can be called from atomic context.
+ */
+size_t pcpu_alloc_size(void __percpu *ptr)
+{
+	struct pcpu_chunk *chunk;
+	unsigned long bit_off, end;
+	void *addr;
+
+	if (!ptr)
+		return 0;
+
+	addr = __pcpu_ptr_to_addr(ptr);
+	/* No pcpu_lock here: ptr has not been freed, so chunk is still alive */
+	chunk = pcpu_chunk_addr_search(addr);
+	bit_off = (addr - chunk->base_addr) / PCPU_MIN_ALLOC_SIZE;
+	end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
+			    bit_off + 1);
+	return (end - bit_off) * PCPU_MIN_ALLOC_SIZE;
+}
+
 /**
  * free_percpu - free percpu area
  * @ptr: pointer to area to free
-- 
cgit v1.2.3-70-g09d2


From 3f2189e4f77b7a3e979d143dc4ff586488c7e8a5 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 20 Oct 2023 21:31:59 +0800
Subject: bpf: Use pcpu_alloc_size() in bpf_mem_free{_rcu}()

For bpf_global_percpu_ma, the pointer passed to bpf_mem_free_rcu() is
allocated by kmalloc() and its size is fixed (16-bytes on x86-64). So
no matter which cache allocates the dynamic per-cpu area, on x86-64
cache[2] will always be used to free the per-cpu area.

Fix the unbalance by checking whether the bpf memory allocator is
per-cpu or not and use pcpu_alloc_size() instead of ksize() to
find the correct cache for per-cpu free.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231020133202.4043247-5-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_mem_alloc.h |  1 +
 kernel/bpf/memalloc.c         | 16 ++++++++++++++--
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
index d644bbb298af..bb1223b21308 100644
--- a/include/linux/bpf_mem_alloc.h
+++ b/include/linux/bpf_mem_alloc.h
@@ -11,6 +11,7 @@ struct bpf_mem_caches;
 struct bpf_mem_alloc {
 	struct bpf_mem_caches __percpu *caches;
 	struct bpf_mem_cache __percpu *cache;
+	bool percpu;
 	struct work_struct work;
 };
 
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 776bdf5ffd80..5308e386380a 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -525,6 +525,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 	/* room for llist_node and per-cpu pointer */
 	if (percpu)
 		percpu_size = LLIST_NODE_SZ + sizeof(void *);
+	ma->percpu = percpu;
 
 	if (size) {
 		pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
@@ -874,6 +875,17 @@ void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
 	return !ret ? NULL : ret + LLIST_NODE_SZ;
 }
 
+static notrace int bpf_mem_free_idx(void *ptr, bool percpu)
+{
+	size_t size;
+
+	if (percpu)
+		size = pcpu_alloc_size(*((void **)ptr));
+	else
+		size = ksize(ptr - LLIST_NODE_SZ);
+	return bpf_mem_cache_idx(size);
+}
+
 void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
 {
 	int idx;
@@ -881,7 +893,7 @@ void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
 	if (!ptr)
 		return;
 
-	idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
+	idx = bpf_mem_free_idx(ptr, ma->percpu);
 	if (idx < 0)
 		return;
 
@@ -895,7 +907,7 @@ void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
 	if (!ptr)
 		return;
 
-	idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
+	idx = bpf_mem_free_idx(ptr, ma->percpu);
 	if (idx < 0)
 		return;
 
-- 
cgit v1.2.3-70-g09d2


From e581a3461de3f129cfe888a67d9f31086328271f Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 20 Oct 2023 21:32:00 +0800
Subject: bpf: Move the declaration of __bpf_obj_drop_impl() to bpf.h

both syscall.c and helpers.c have the declaration of
__bpf_obj_drop_impl(), so just move it to a common header file.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231020133202.4043247-6-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  | 1 +
 kernel/bpf/helpers.c | 2 --
 kernel/bpf/syscall.c | 2 --
 3 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b4b40b45962b..ebd412179771 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2058,6 +2058,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec);
 bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b);
 void bpf_obj_free_timer(const struct btf_record *rec, void *obj);
 void bpf_obj_free_fields(const struct btf_record *rec, void *obj);
+void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
 
 struct bpf_map *bpf_map_get(u32 ufd);
 struct bpf_map *bpf_map_get_with_uref(u32 ufd);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index da058aead20c..c814bb44d2d1 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1811,8 +1811,6 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 	}
 }
 
-void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
-
 void bpf_list_head_free(const struct btf_field *field, void *list_head,
 			struct bpf_spin_lock *spin_lock)
 {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 341f8cb4405c..69998f84f7c8 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -626,8 +626,6 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
 	bpf_timer_cancel_and_free(obj + rec->timer_off);
 }
 
-extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
-
 void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 {
 	const struct btf_field *fields;
-- 
cgit v1.2.3-70-g09d2


From e383a45902337356d9ccad797094a27c6b2150f9 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 20 Oct 2023 21:32:01 +0800
Subject: bpf: Use bpf_global_percpu_ma for per-cpu kptr in
 __bpf_obj_drop_impl()

The following warning was reported when running "./test_progs -t
test_bpf_ma/percpu_free_through_map_free":

  ------------[ cut here ]------------
  WARNING: CPU: 1 PID: 68 at kernel/bpf/memalloc.c:342
  CPU: 1 PID: 68 Comm: kworker/u16:2 Not tainted 6.6.0-rc2+ #222
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996)
  Workqueue: events_unbound bpf_map_free_deferred
  RIP: 0010:bpf_mem_refill+0x21c/0x2a0
  ......
  Call Trace:
   <IRQ>
   ? bpf_mem_refill+0x21c/0x2a0
   irq_work_single+0x27/0x70
   irq_work_run_list+0x2a/0x40
   irq_work_run+0x18/0x40
   __sysvec_irq_work+0x1c/0xc0
   sysvec_irq_work+0x73/0x90
   </IRQ>
   <TASK>
   asm_sysvec_irq_work+0x1b/0x20
  RIP: 0010:unit_free+0x50/0x80
   ......
   bpf_mem_free+0x46/0x60
   __bpf_obj_drop_impl+0x40/0x90
   bpf_obj_free_fields+0x17d/0x1a0
   array_map_free+0x6b/0x170
   bpf_map_free_deferred+0x54/0xa0
   process_scheduled_works+0xba/0x370
   worker_thread+0x16d/0x2e0
   kthread+0x105/0x140
   ret_from_fork+0x39/0x60
   ret_from_fork_asm+0x1b/0x30
   </TASK>
  ---[ end trace 0000000000000000 ]---

The reason is simple: __bpf_obj_drop_impl() does not know the freeing
field is a per-cpu pointer and it uses bpf_global_ma to free the
pointer. Because bpf_global_ma is not a per-cpu allocator, so ksize() is
used to select the corresponding cache. The bpf_mem_cache with 16-bytes
unit_size will always be selected to do the unmatched free and it will
trigger the warning in free_bulk() eventually.

Because per-cpu kptr doesn't support list or rb-tree now, so fix the
problem by only checking whether or not the type of kptr is per-cpu in
bpf_obj_free_fields(), and using bpf_global_percpu_ma to these kptrs.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231020133202.4043247-7-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  |  2 +-
 kernel/bpf/helpers.c | 22 ++++++++++++++--------
 kernel/bpf/syscall.c |  4 ++--
 3 files changed, 17 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ebd412179771..b4825d3cdb29 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2058,7 +2058,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec);
 bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b);
 void bpf_obj_free_timer(const struct btf_record *rec, void *obj);
 void bpf_obj_free_fields(const struct btf_record *rec, void *obj);
-void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
+void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu);
 
 struct bpf_map *bpf_map_get(u32 ufd);
 struct bpf_map *bpf_map_get_with_uref(u32 ufd);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index c814bb44d2d1..e46ac288a108 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1842,7 +1842,7 @@ unlock:
 		 * bpf_list_head which needs to be freed.
 		 */
 		migrate_disable();
-		__bpf_obj_drop_impl(obj, field->graph_root.value_rec);
+		__bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
 		migrate_enable();
 	}
 }
@@ -1881,7 +1881,7 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
 
 
 		migrate_disable();
-		__bpf_obj_drop_impl(obj, field->graph_root.value_rec);
+		__bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
 		migrate_enable();
 	}
 }
@@ -1913,8 +1913,10 @@ __bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign)
 }
 
 /* Must be called under migrate_disable(), as required by bpf_mem_free */
-void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
+void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu)
 {
+	struct bpf_mem_alloc *ma;
+
 	if (rec && rec->refcount_off >= 0 &&
 	    !refcount_dec_and_test((refcount_t *)(p + rec->refcount_off))) {
 		/* Object is refcounted and refcount_dec didn't result in 0
@@ -1926,10 +1928,14 @@ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
 	if (rec)
 		bpf_obj_free_fields(rec, p);
 
+	if (percpu)
+		ma = &bpf_global_percpu_ma;
+	else
+		ma = &bpf_global_ma;
 	if (rec && rec->refcount_off >= 0)
-		bpf_mem_free_rcu(&bpf_global_ma, p);
+		bpf_mem_free_rcu(ma, p);
 	else
-		bpf_mem_free(&bpf_global_ma, p);
+		bpf_mem_free(ma, p);
 }
 
 __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
@@ -1937,7 +1943,7 @@ __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
 	struct btf_struct_meta *meta = meta__ign;
 	void *p = p__alloc;
 
-	__bpf_obj_drop_impl(p, meta ? meta->record : NULL);
+	__bpf_obj_drop_impl(p, meta ? meta->record : NULL, false);
 }
 
 __bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign)
@@ -1981,7 +1987,7 @@ static int __bpf_list_add(struct bpf_list_node_kern *node,
 	 */
 	if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
 		/* Only called from BPF prog, no need to migrate_disable */
-		__bpf_obj_drop_impl((void *)n - off, rec);
+		__bpf_obj_drop_impl((void *)n - off, rec, false);
 		return -EINVAL;
 	}
 
@@ -2080,7 +2086,7 @@ static int __bpf_rbtree_add(struct bpf_rb_root *root,
 	 */
 	if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
 		/* Only called from BPF prog, no need to migrate_disable */
-		__bpf_obj_drop_impl((void *)n - off, rec);
+		__bpf_obj_drop_impl((void *)n - off, rec, false);
 		return -EINVAL;
 	}
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 69998f84f7c8..a9b2cb500bf7 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -660,8 +660,8 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 									   field->kptr.btf_id);
 				migrate_disable();
 				__bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
-								 pointee_struct_meta->record :
-								 NULL);
+								 pointee_struct_meta->record : NULL,
+								 fields[i].type == BPF_KPTR_PERCPU);
 				migrate_enable();
 			} else {
 				field->kptr.dtor(xchgd_field);
-- 
cgit v1.2.3-70-g09d2


From fc47e86dbfb75a864c0c9dd8e78affb6506296bb Mon Sep 17 00:00:00 2001
From: Beniamino Galvani <b.galvani@gmail.com>
Date: Fri, 20 Oct 2023 13:55:25 +0200
Subject: ipv6: rename and move ip6_dst_lookup_tunnel()

At the moment ip6_dst_lookup_tunnel() is used only by bareudp.
Ideally, other UDP tunnel implementations should use it, but to do so
the function needs to accept new parameters that are specific for UDP
tunnels, such as the ports.

Prepare for these changes by renaming the function to
udp_tunnel6_dst_lookup() and move it to file
net/ipv6/ip6_udp_tunnel.c.

This is similar to what already done for IPv4 in commit bf3fcbf7e7a0
("ipv4: rename and move ip_route_output_tunnel()").

Suggested-by: Guillaume Nault <gnault@redhat.com>
Signed-off-by: Beniamino Galvani <b.galvani@gmail.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bareudp.c     | 10 +++----
 include/net/ipv6.h        |  6 -----
 include/net/udp_tunnel.h  |  7 +++++
 net/ipv6/ip6_output.c     | 68 ----------------------------------------------
 net/ipv6/ip6_udp_tunnel.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 81 insertions(+), 79 deletions(-)

(limited to 'include')

diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c
index 47a9c2a5583c..9a0a1a9f6cfe 100644
--- a/drivers/net/bareudp.c
+++ b/drivers/net/bareudp.c
@@ -371,8 +371,8 @@ static int bareudp6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	if (!sock)
 		return -ESHUTDOWN;
 
-	dst = ip6_dst_lookup_tunnel(skb, dev, bareudp->net, sock, &saddr, info,
-				    IPPROTO_UDP, use_cache);
+	dst = udp_tunnel6_dst_lookup(skb, dev, bareudp->net, sock, &saddr, info,
+				     IPPROTO_UDP, use_cache);
 	if (IS_ERR(dst))
 		return PTR_ERR(dst);
 
@@ -498,9 +498,9 @@ static int bareudp_fill_metadata_dst(struct net_device *dev,
 		struct in6_addr saddr;
 		struct socket *sock = rcu_dereference(bareudp->sock);
 
-		dst = ip6_dst_lookup_tunnel(skb, dev, bareudp->net, sock,
-					    &saddr, info, IPPROTO_UDP,
-					    use_cache);
+		dst = udp_tunnel6_dst_lookup(skb, dev, bareudp->net, sock,
+					     &saddr, info, IPPROTO_UDP,
+					     use_cache);
 		if (IS_ERR(dst))
 			return PTR_ERR(dst);
 
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index b3444c8a6f74..78d38dd88aba 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -1133,12 +1133,6 @@ struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, st
 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
 					 const struct in6_addr *final_dst,
 					 bool connected);
-struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
-					struct net_device *dev,
-					struct net *net, struct socket *sock,
-					struct in6_addr *saddr,
-					const struct ip_tunnel_info *info,
-					u8 protocol, bool use_cache);
 struct dst_entry *ip6_blackhole_route(struct net *net,
 				      struct dst_entry *orig_dst);
 
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 4d0578fab01a..1dac296d8449 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -169,6 +169,13 @@ struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb,
 				     const struct ip_tunnel_key *key,
 				     __be16 sport, __be16 dport, u8 tos,
 				     struct dst_cache *dst_cache);
+struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb,
+					 struct net_device *dev,
+					 struct net *net,
+					 struct socket *sock,
+					 struct in6_addr *saddr,
+					 const struct ip_tunnel_info *info,
+					 u8 protocol, bool use_cache);
 
 struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family,
 				    __be16 flags, __be64 tunnel_id,
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 571c10fb00b1..3c7de89d6755 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1283,74 +1283,6 @@ struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
 }
 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
 
-/**
- *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
- *      @skb: Packet for which lookup is done
- *      @dev: Tunnel device
- *      @net: Network namespace of tunnel device
- *      @sock: Socket which provides route info
- *      @saddr: Memory to store the src ip address
- *      @info: Tunnel information
- *      @protocol: IP protocol
- *      @use_cache: Flag to enable cache usage
- *      This function performs a route lookup on a tunnel
- *
- *      It returns a valid dst pointer and stores src address to be used in
- *      tunnel in param saddr on success, else a pointer encoded error code.
- */
-
-struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
-					struct net_device *dev,
-					struct net *net,
-					struct socket *sock,
-					struct in6_addr *saddr,
-					const struct ip_tunnel_info *info,
-					u8 protocol,
-					bool use_cache)
-{
-	struct dst_entry *dst = NULL;
-#ifdef CONFIG_DST_CACHE
-	struct dst_cache *dst_cache;
-#endif
-	struct flowi6 fl6;
-	__u8 prio;
-
-#ifdef CONFIG_DST_CACHE
-	dst_cache = (struct dst_cache *)&info->dst_cache;
-	if (use_cache) {
-		dst = dst_cache_get_ip6(dst_cache, saddr);
-		if (dst)
-			return dst;
-	}
-#endif
-	memset(&fl6, 0, sizeof(fl6));
-	fl6.flowi6_mark = skb->mark;
-	fl6.flowi6_proto = protocol;
-	fl6.daddr = info->key.u.ipv6.dst;
-	fl6.saddr = info->key.u.ipv6.src;
-	prio = info->key.tos;
-	fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);
-
-	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
-					      NULL);
-	if (IS_ERR(dst)) {
-		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
-		return ERR_PTR(-ENETUNREACH);
-	}
-	if (dst->dev == dev) { /* is this necessary? */
-		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
-		dst_release(dst);
-		return ERR_PTR(-ELOOP);
-	}
-#ifdef CONFIG_DST_CACHE
-	if (use_cache)
-		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
-#endif
-	*saddr = fl6.saddr;
-	return dst;
-}
-EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
-
 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
 					       gfp_t gfp)
 {
diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
index 70d38705c92f..fc122abf6b75 100644
--- a/net/ipv6/ip6_udp_tunnel.c
+++ b/net/ipv6/ip6_udp_tunnel.c
@@ -1,3 +1,4 @@
+
 // SPDX-License-Identifier: GPL-2.0-only
 #include <linux/module.h>
 #include <linux/errno.h>
@@ -112,4 +113,72 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
 }
 EXPORT_SYMBOL_GPL(udp_tunnel6_xmit_skb);
 
+/**
+ *      udp_tunnel6_dst_lookup - perform route lookup on UDP tunnel
+ *      @skb: Packet for which lookup is done
+ *      @dev: Tunnel device
+ *      @net: Network namespace of tunnel device
+ *      @sock: Socket which provides route info
+ *      @saddr: Memory to store the src ip address
+ *      @info: Tunnel information
+ *      @protocol: IP protocol
+ *      @use_cache: Flag to enable cache usage
+ *      This function performs a route lookup on a UDP tunnel
+ *
+ *      It returns a valid dst pointer and stores src address to be used in
+ *      tunnel in param saddr on success, else a pointer encoded error code.
+ */
+
+struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb,
+					 struct net_device *dev,
+					 struct net *net,
+					 struct socket *sock,
+					 struct in6_addr *saddr,
+					 const struct ip_tunnel_info *info,
+					 u8 protocol,
+					 bool use_cache)
+{
+	struct dst_entry *dst = NULL;
+#ifdef CONFIG_DST_CACHE
+	struct dst_cache *dst_cache;
+#endif
+	struct flowi6 fl6;
+	__u8 prio;
+
+#ifdef CONFIG_DST_CACHE
+	dst_cache = (struct dst_cache *)&info->dst_cache;
+	if (use_cache) {
+		dst = dst_cache_get_ip6(dst_cache, saddr);
+		if (dst)
+			return dst;
+	}
+#endif
+	memset(&fl6, 0, sizeof(fl6));
+	fl6.flowi6_mark = skb->mark;
+	fl6.flowi6_proto = protocol;
+	fl6.daddr = info->key.u.ipv6.dst;
+	fl6.saddr = info->key.u.ipv6.src;
+	prio = info->key.tos;
+	fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);
+
+	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
+					      NULL);
+	if (IS_ERR(dst)) {
+		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
+		return ERR_PTR(-ENETUNREACH);
+	}
+	if (dst->dev == dev) { /* is this necessary? */
+		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
+		dst_release(dst);
+		return ERR_PTR(-ELOOP);
+	}
+#ifdef CONFIG_DST_CACHE
+	if (use_cache)
+		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
+#endif
+	*saddr = fl6.saddr;
+	return dst;
+}
+EXPORT_SYMBOL_GPL(udp_tunnel6_dst_lookup);
+
 MODULE_LICENSE("GPL");
-- 
cgit v1.2.3-70-g09d2


From 7e937dcf96d0489b3cdd1cff9dfd049617d28492 Mon Sep 17 00:00:00 2001
From: Beniamino Galvani <b.galvani@gmail.com>
Date: Fri, 20 Oct 2023 13:55:26 +0200
Subject: ipv6: remove "proto" argument from udp_tunnel6_dst_lookup()

The function is now UDP-specific, the protocol is always IPPROTO_UDP.

This is similar to what already done for IPv4 in commit 78f3655adcb5
("ipv4: remove "proto" argument from udp_tunnel_dst_lookup()").

Suggested-by: Guillaume Nault <gnault@redhat.com>
Signed-off-by: Beniamino Galvani <b.galvani@gmail.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bareudp.c     | 5 ++---
 include/net/udp_tunnel.h  | 2 +-
 net/ipv6/ip6_udp_tunnel.c | 4 +---
 3 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c
index 9a0a1a9f6cfe..9eb5e11c09b4 100644
--- a/drivers/net/bareudp.c
+++ b/drivers/net/bareudp.c
@@ -372,7 +372,7 @@ static int bareudp6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 		return -ESHUTDOWN;
 
 	dst = udp_tunnel6_dst_lookup(skb, dev, bareudp->net, sock, &saddr, info,
-				     IPPROTO_UDP, use_cache);
+				     use_cache);
 	if (IS_ERR(dst))
 		return PTR_ERR(dst);
 
@@ -499,8 +499,7 @@ static int bareudp_fill_metadata_dst(struct net_device *dev,
 		struct socket *sock = rcu_dereference(bareudp->sock);
 
 		dst = udp_tunnel6_dst_lookup(skb, dev, bareudp->net, sock,
-					     &saddr, info, IPPROTO_UDP,
-					     use_cache);
+					     &saddr, info, use_cache);
 		if (IS_ERR(dst))
 			return PTR_ERR(dst);
 
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 1dac296d8449..583867643bd1 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -175,7 +175,7 @@ struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb,
 					 struct socket *sock,
 					 struct in6_addr *saddr,
 					 const struct ip_tunnel_info *info,
-					 u8 protocol, bool use_cache);
+					 bool use_cache);
 
 struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family,
 				    __be16 flags, __be64 tunnel_id,
diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
index fc122abf6b75..b9c906518ce2 100644
--- a/net/ipv6/ip6_udp_tunnel.c
+++ b/net/ipv6/ip6_udp_tunnel.c
@@ -121,7 +121,6 @@ EXPORT_SYMBOL_GPL(udp_tunnel6_xmit_skb);
  *      @sock: Socket which provides route info
  *      @saddr: Memory to store the src ip address
  *      @info: Tunnel information
- *      @protocol: IP protocol
  *      @use_cache: Flag to enable cache usage
  *      This function performs a route lookup on a UDP tunnel
  *
@@ -135,7 +134,6 @@ struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb,
 					 struct socket *sock,
 					 struct in6_addr *saddr,
 					 const struct ip_tunnel_info *info,
-					 u8 protocol,
 					 bool use_cache)
 {
 	struct dst_entry *dst = NULL;
@@ -155,7 +153,7 @@ struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb,
 #endif
 	memset(&fl6, 0, sizeof(fl6));
 	fl6.flowi6_mark = skb->mark;
-	fl6.flowi6_proto = protocol;
+	fl6.flowi6_proto = IPPROTO_UDP;
 	fl6.daddr = info->key.u.ipv6.dst;
 	fl6.saddr = info->key.u.ipv6.src;
 	prio = info->key.tos;
-- 
cgit v1.2.3-70-g09d2


From 946fcfdbc5b97e26d31339ebca2d9a51a4f975ff Mon Sep 17 00:00:00 2001
From: Beniamino Galvani <b.galvani@gmail.com>
Date: Fri, 20 Oct 2023 13:55:27 +0200
Subject: ipv6: add new arguments to udp_tunnel6_dst_lookup()

We want to make the function more generic so that it can be used by
other UDP tunnel implementations such as geneve and vxlan. To do that,
add the following arguments:

 - source and destination UDP port;
 - ifindex of the output interface, needed by vxlan;
 - the tos, because in some cases it is not taken from struct
   ip_tunnel_info (for example, when it's inherited from the inner
   packet);
 - the dst cache, because not all tunnel types (e.g. vxlan) want to
   use the one from struct ip_tunnel_info.

With these parameters, the function no longer needs the full struct
ip_tunnel_info as argument and we can pass only the relevant part of
it (struct ip_tunnel_key).

This is similar to what already done for IPv4 in commit 72fc68c6356b
("ipv4: add new arguments to udp_tunnel_dst_lookup()").

Suggested-by: Guillaume Nault <gnault@redhat.com>
Signed-off-by: Beniamino Galvani <b.galvani@gmail.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bareudp.c     | 10 +++++++---
 include/net/udp_tunnel.h  |  7 ++++---
 net/ipv6/ip6_udp_tunnel.c | 33 ++++++++++++++++++---------------
 3 files changed, 29 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c
index 9eb5e11c09b4..9c11a0d0273b 100644
--- a/drivers/net/bareudp.c
+++ b/drivers/net/bareudp.c
@@ -371,8 +371,10 @@ static int bareudp6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	if (!sock)
 		return -ESHUTDOWN;
 
-	dst = udp_tunnel6_dst_lookup(skb, dev, bareudp->net, sock, &saddr, info,
-				     use_cache);
+	dst = udp_tunnel6_dst_lookup(skb, dev, bareudp->net, sock, 0, &saddr,
+				     key, 0, 0, key->tos,
+				     use_cache ?
+				     (struct dst_cache *) &info->dst_cache : NULL);
 	if (IS_ERR(dst))
 		return PTR_ERR(dst);
 
@@ -499,7 +501,9 @@ static int bareudp_fill_metadata_dst(struct net_device *dev,
 		struct socket *sock = rcu_dereference(bareudp->sock);
 
 		dst = udp_tunnel6_dst_lookup(skb, dev, bareudp->net, sock,
-					     &saddr, info, use_cache);
+					     0, &saddr, &info->key,
+					     0, 0, info->key.tos,
+					     use_cache ? &info->dst_cache : NULL);
 		if (IS_ERR(dst))
 			return PTR_ERR(dst);
 
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 583867643bd1..d716214fe03d 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -172,10 +172,11 @@ struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb,
 struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb,
 					 struct net_device *dev,
 					 struct net *net,
-					 struct socket *sock,
+					 struct socket *sock, int oif,
 					 struct in6_addr *saddr,
-					 const struct ip_tunnel_info *info,
-					 bool use_cache);
+					 const struct ip_tunnel_key *key,
+					 __be16 sport, __be16 dport, u8 dsfield,
+					 struct dst_cache *dst_cache);
 
 struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family,
 				    __be16 flags, __be64 tunnel_id,
diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
index b9c906518ce2..a7bf0327b380 100644
--- a/net/ipv6/ip6_udp_tunnel.c
+++ b/net/ipv6/ip6_udp_tunnel.c
@@ -119,9 +119,13 @@ EXPORT_SYMBOL_GPL(udp_tunnel6_xmit_skb);
  *      @dev: Tunnel device
  *      @net: Network namespace of tunnel device
  *      @sock: Socket which provides route info
+ *      @oif: Index of the output interface
  *      @saddr: Memory to store the src ip address
- *      @info: Tunnel information
- *      @use_cache: Flag to enable cache usage
+ *      @key: Tunnel information
+ *      @sport: UDP source port
+ *      @dport: UDP destination port
+ *      @dsfield: The traffic class field
+ *      @dst_cache: The dst cache to use for lookup
  *      This function performs a route lookup on a UDP tunnel
  *
  *      It returns a valid dst pointer and stores src address to be used in
@@ -132,20 +136,17 @@ struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb,
 					 struct net_device *dev,
 					 struct net *net,
 					 struct socket *sock,
+					 int oif,
 					 struct in6_addr *saddr,
-					 const struct ip_tunnel_info *info,
-					 bool use_cache)
+					 const struct ip_tunnel_key *key,
+					 __be16 sport, __be16 dport, u8 dsfield,
+					 struct dst_cache *dst_cache)
 {
 	struct dst_entry *dst = NULL;
-#ifdef CONFIG_DST_CACHE
-	struct dst_cache *dst_cache;
-#endif
 	struct flowi6 fl6;
-	__u8 prio;
 
 #ifdef CONFIG_DST_CACHE
-	dst_cache = (struct dst_cache *)&info->dst_cache;
-	if (use_cache) {
+	if (dst_cache) {
 		dst = dst_cache_get_ip6(dst_cache, saddr);
 		if (dst)
 			return dst;
@@ -154,10 +155,12 @@ struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb,
 	memset(&fl6, 0, sizeof(fl6));
 	fl6.flowi6_mark = skb->mark;
 	fl6.flowi6_proto = IPPROTO_UDP;
-	fl6.daddr = info->key.u.ipv6.dst;
-	fl6.saddr = info->key.u.ipv6.src;
-	prio = info->key.tos;
-	fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);
+	fl6.flowi6_oif = oif;
+	fl6.daddr = key->u.ipv6.dst;
+	fl6.saddr = key->u.ipv6.src;
+	fl6.fl6_sport = sport;
+	fl6.fl6_dport = dport;
+	fl6.flowlabel = ip6_make_flowinfo(dsfield, key->label);
 
 	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
 					      NULL);
@@ -171,7 +174,7 @@ struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb,
 		return ERR_PTR(-ELOOP);
 	}
 #ifdef CONFIG_DST_CACHE
-	if (use_cache)
+	if (dst_cache)
 		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
 #endif
 	*saddr = fl6.saddr;
-- 
cgit v1.2.3-70-g09d2


From 73ed8e03388d16c12fc577e5c700b58a29045a15 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 20 Oct 2023 12:57:37 +0000
Subject: tcp: fix cookie_init_timestamp() overflows

cookie_init_timestamp() is supposed to return a 64bit timestamp
suitable for both TSval determination and setting of skb->tstamp.

Unfortunately it uses 32bit fields and overflows after
2^32 * 10^6 nsec (~49 days) of uptime.

Generated TSval are still correct, but skb->tstamp might be set
far away in the past, potentially confusing other layers.

tcp_ns_to_ts() is changed to return a full 64bit value,
ts and ts_now variables are changed to u64 type,
and TSMASK is removed in favor of shifts operations.

While we are at it, change this sequence:
		ts >>= TSBITS;
		ts--;
		ts <<= TSBITS;
		ts |= options;
to:
		ts -= (1UL << TSBITS);

Fixes: 9a568de4818d ("tcp: switch TCP TS option (RFC 7323) to 1ms clock")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     |  2 +-
 net/ipv4/syncookies.c | 20 +++++++-------------
 2 files changed, 8 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index bad304d173a5..d47a57a47b50 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -805,7 +805,7 @@ static inline u32 tcp_time_stamp(const struct tcp_sock *tp)
 }
 
 /* Convert a nsec timestamp into TCP TSval timestamp (ms based currently) */
-static inline u32 tcp_ns_to_ts(u64 ns)
+static inline u64 tcp_ns_to_ts(u64 ns)
 {
 	return div_u64(ns, NSEC_PER_SEC / TCP_TS_HZ);
 }
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index dc478a0574cb..3b4dafefb4b0 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -41,7 +41,6 @@ static siphash_aligned_key_t syncookie_secret[2];
  * requested/supported by the syn/synack exchange.
  */
 #define TSBITS	6
-#define TSMASK	(((__u32)1 << TSBITS) - 1)
 
 static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
 		       u32 count, int c)
@@ -62,27 +61,22 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
  */
 u64 cookie_init_timestamp(struct request_sock *req, u64 now)
 {
-	struct inet_request_sock *ireq;
-	u32 ts, ts_now = tcp_ns_to_ts(now);
+	const struct inet_request_sock *ireq = inet_rsk(req);
+	u64 ts, ts_now = tcp_ns_to_ts(now);
 	u32 options = 0;
 
-	ireq = inet_rsk(req);
-
 	options = ireq->wscale_ok ? ireq->snd_wscale : TS_OPT_WSCALE_MASK;
 	if (ireq->sack_ok)
 		options |= TS_OPT_SACK;
 	if (ireq->ecn_ok)
 		options |= TS_OPT_ECN;
 
-	ts = ts_now & ~TSMASK;
+	ts = (ts_now >> TSBITS) << TSBITS;
 	ts |= options;
-	if (ts > ts_now) {
-		ts >>= TSBITS;
-		ts--;
-		ts <<= TSBITS;
-		ts |= options;
-	}
-	return (u64)ts * (NSEC_PER_SEC / TCP_TS_HZ);
+	if (ts > ts_now)
+		ts -= (1UL << TSBITS);
+
+	return ts * (NSEC_PER_SEC / TCP_TS_HZ);
 }
 
 
-- 
cgit v1.2.3-70-g09d2


From 99d679556d737a14391c68e562d94076c2983252 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 20 Oct 2023 12:57:38 +0000
Subject: tcp: add tcp_time_stamp_ms() helper

In preparation of adding usec TCP TS values, add tcp_time_stamp_ms()
for contexts needing ms based values.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h    | 5 +++++
 net/ipv4/tcp_input.c | 2 +-
 net/ipv4/tcp_timer.c | 2 +-
 3 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index d47a57a47b50..9fc6dc4ba9e2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -804,6 +804,11 @@ static inline u32 tcp_time_stamp(const struct tcp_sock *tp)
 	return div_u64(tp->tcp_mstamp, USEC_PER_SEC / TCP_TS_HZ);
 }
 
+static inline u32 tcp_time_stamp_ms(const struct tcp_sock *tp)
+{
+	return div_u64(tp->tcp_mstamp, USEC_PER_MSEC);
+}
+
 /* Convert a nsec timestamp into TCP TSval timestamp (ms based currently) */
 static inline u64 tcp_ns_to_ts(u64 ns)
 {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ab87f0285b72..ffce17545b62 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2856,7 +2856,7 @@ void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 static void tcp_update_rto_time(struct tcp_sock *tp)
 {
 	if (tp->rto_stamp) {
-		tp->total_rto_time += tcp_time_stamp(tp) - tp->rto_stamp;
+		tp->total_rto_time += tcp_time_stamp_ms(tp) - tp->rto_stamp;
 		tp->rto_stamp = 0;
 	}
 }
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 0862b73dd3b5..63247c78dc13 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -422,7 +422,7 @@ static void tcp_update_rto_stats(struct sock *sk)
 
 	if (!icsk->icsk_retransmits) {
 		tp->total_rto_recoveries++;
-		tp->rto_stamp = tcp_time_stamp(tp);
+		tp->rto_stamp = tcp_time_stamp_ms(tp);
 	}
 	icsk->icsk_retransmits++;
 	tp->total_rto++;
-- 
cgit v1.2.3-70-g09d2


From 2a7c8d291ffeba69a47d8528987156f625cc05b0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 20 Oct 2023 12:57:39 +0000
Subject: tcp: introduce tcp_clock_ms()

It delivers current TCP time stamp in ms unit, and is used
in place of confusing tcp_time_stamp_raw()

It is the same family than tcp_clock_ns() and tcp_clock_ms().

tcp_time_stamp_raw() will be replaced later for TSval
contexts with a more descriptive name.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h                                     | 5 +++++
 net/ipv4/tcp.c                                        | 6 ++----
 net/ipv4/tcp_minisocks.c                              | 4 ++--
 net/netfilter/nf_synproxy_core.c                      | 2 +-
 tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c | 4 ++--
 5 files changed, 12 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 9fc6dc4ba9e2..3bdf1141f5a2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -798,6 +798,11 @@ static inline u64 tcp_clock_us(void)
 	return div_u64(tcp_clock_ns(), NSEC_PER_USEC);
 }
 
+static inline u64 tcp_clock_ms(void)
+{
+	return div_u64(tcp_clock_ns(), NSEC_PER_MSEC);
+}
+
 /* This should only be used in contexts where tp->tcp_mstamp is up to date */
 static inline u32 tcp_time_stamp(const struct tcp_sock *tp)
 {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 56a8d936000f..5b034b0356ec 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3817,10 +3817,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	info->tcpi_total_rto = tp->total_rto;
 	info->tcpi_total_rto_recoveries = tp->total_rto_recoveries;
 	info->tcpi_total_rto_time = tp->total_rto_time;
-	if (tp->rto_stamp) {
-		info->tcpi_total_rto_time += tcp_time_stamp_raw() -
-						tp->rto_stamp;
-	}
+	if (tp->rto_stamp)
+		info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp;
 
 	unlock_sock_fast(sk, slow);
 }
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 3f87611077ef..a9fdba897a28 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -567,8 +567,8 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 					       USEC_PER_SEC / TCP_TS_HZ);
 		newtp->total_rto = req->num_timeout;
 		newtp->total_rto_recoveries = 1;
-		newtp->total_rto_time = tcp_time_stamp_raw() -
-						newtp->retrans_stamp;
+		newtp->total_rto_time = tcp_clock_ms() -
+					newtp->retrans_stamp;
 	}
 	newtp->tsoffset = treq->ts_off;
 #ifdef CONFIG_TCP_MD5SIG
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index 16915f8eef2b..467671f2d42f 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -153,7 +153,7 @@ void synproxy_init_timestamp_cookie(const struct nf_synproxy_info *info,
 				    struct synproxy_options *opts)
 {
 	opts->tsecr = opts->tsval;
-	opts->tsval = tcp_time_stamp_raw() & ~0x3f;
+	opts->tsval = tcp_clock_ms() & ~0x3f;
 
 	if (opts->options & NF_SYNPROXY_OPT_WSCALE) {
 		opts->tsval |= opts->wscale;
diff --git a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c
index 07d786329105..e959336c7a73 100644
--- a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c
+++ b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c
@@ -177,7 +177,7 @@ static __always_inline __u32 tcp_ns_to_ts(__u64 ns)
 	return ns / (NSEC_PER_SEC / TCP_TS_HZ);
 }
 
-static __always_inline __u32 tcp_time_stamp_raw(void)
+static __always_inline __u32 tcp_clock_ms(void)
 {
 	return tcp_ns_to_ts(tcp_clock_ns());
 }
@@ -274,7 +274,7 @@ static __always_inline bool tscookie_init(struct tcphdr *tcp_header,
 	if (!loop_ctx.option_timestamp)
 		return false;
 
-	cookie = tcp_time_stamp_raw() & ~TSMASK;
+	cookie = tcp_clock_ms() & ~TSMASK;
 	cookie |= loop_ctx.wscale & TS_OPT_WSCALE_MASK;
 	if (loop_ctx.option_sack)
 		cookie |= TS_OPT_SACK;
-- 
cgit v1.2.3-70-g09d2


From 16cf6477741bdaa287d5e4531a1a503618a41a22 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 20 Oct 2023 12:57:40 +0000
Subject: tcp: replace tcp_time_stamp_raw()

In preparation of usec TCP TS support, remove tcp_time_stamp_raw()
in favor of tcp_clock_ts() helper. This helper will return a suitable
32bit result to feed TS values, depending on a socket field.

Also add tcp_tw_tsval() and tcp_rsk_tsval() helpers to factorize
the details.

We do not yet support usec timestamps.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h   | 25 +++++++++++++++++++------
 net/ipv4/tcp.c      |  4 ++--
 net/ipv4/tcp_ipv4.c |  4 ++--
 net/ipv6/tcp_ipv6.c |  4 ++--
 4 files changed, 25 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3bdf1141f5a2..0534526a535d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -803,6 +803,16 @@ static inline u64 tcp_clock_ms(void)
 	return div_u64(tcp_clock_ns(), NSEC_PER_MSEC);
 }
 
+/* TCP Timestamp included in TS option (RFC 1323) can either use ms
+ * or usec resolution. Each socket carries a flag to select one or other
+ * resolution, as the route attribute could change anytime.
+ * Each flow must stick to initial resolution.
+ */
+static inline u32 tcp_clock_ts(bool usec_ts)
+{
+	return usec_ts ? tcp_clock_us() : tcp_clock_ms();
+}
+
 /* This should only be used in contexts where tp->tcp_mstamp is up to date */
 static inline u32 tcp_time_stamp(const struct tcp_sock *tp)
 {
@@ -820,12 +830,6 @@ static inline u64 tcp_ns_to_ts(u64 ns)
 	return div_u64(ns, NSEC_PER_SEC / TCP_TS_HZ);
 }
 
-/* Could use tcp_clock_us() / 1000, but this version uses a single divide */
-static inline u32 tcp_time_stamp_raw(void)
-{
-	return tcp_ns_to_ts(tcp_clock_ns());
-}
-
 void tcp_mstamp_refresh(struct tcp_sock *tp);
 
 static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
@@ -844,6 +848,15 @@ static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb)
 	return div_u64(skb->skb_mstamp_ns, NSEC_PER_USEC);
 }
 
+static inline u32 tcp_tw_tsval(const struct tcp_timewait_sock *tcptw)
+{
+	return tcp_clock_ts(false) + tcptw->tw_ts_offset;
+}
+
+static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq)
+{
+	return tcp_clock_ts(false) + treq->ts_off;
+}
 
 #define tcp_flag_byte(th) (((u_int8_t *)th)[13])
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5b034b0356ec..805f8341064f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3632,7 +3632,7 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
 		if (!tp->repair)
 			err = -EPERM;
 		else
-			WRITE_ONCE(tp->tsoffset, val - tcp_time_stamp_raw());
+			WRITE_ONCE(tp->tsoffset, val - tcp_clock_ts(false));
 		break;
 	case TCP_REPAIR_WINDOW:
 		err = tcp_repair_set_window(tp, optval, optlen);
@@ -4143,7 +4143,7 @@ int do_tcp_getsockopt(struct sock *sk, int level,
 		break;
 
 	case TCP_TIMESTAMP:
-		val = tcp_time_stamp_raw() + READ_ONCE(tp->tsoffset);
+		val = tcp_clock_ts(false) + READ_ONCE(tp->tsoffset);
 		break;
 	case TCP_NOTSENT_LOWAT:
 		val = READ_ONCE(tp->notsent_lowat);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a67a5de86253..cdd65cc594bc 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -954,7 +954,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 	tcp_v4_send_ack(sk, skb,
 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
-			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
+			tcp_tw_tsval(tcptw),
 			tcptw->tw_ts_recent,
 			tw->tw_bound_dev_if,
 			tcp_twsk_md5_key(tcptw),
@@ -988,7 +988,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 	tcp_v4_send_ack(sk, skb, seq,
 			tcp_rsk(req)->rcv_nxt,
 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
-			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
+			tcp_rsk_tsval(tcp_rsk(req)),
 			READ_ONCE(req->ts_recent),
 			0,
 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d410703bb5a1..1ee6517e9b2f 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1096,7 +1096,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
 
 	tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
-			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
+			tcp_tw_tsval(tcptw),
 			tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw),
 			tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), tw->tw_priority,
 			tw->tw_txhash);
@@ -1123,7 +1123,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
 			tcp_rsk(req)->rcv_nxt,
 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
-			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
+			tcp_rsk_tsval(tcp_rsk(req)),
 			READ_ONCE(req->ts_recent), sk->sk_bound_dev_if,
 			tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr, l3index),
 			ipv6_get_dsfield(ipv6_hdr(skb)), 0,
-- 
cgit v1.2.3-70-g09d2


From d1a02ed66fe62aa2edd77bd54e270ebc33bd12ff Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 20 Oct 2023 12:57:41 +0000
Subject: tcp: rename tcp_skb_timestamp()

This helper returns a 32bit TCP TSval from skb->tstamp.

As we are going to support usec or ms units soon, rename it
to tcp_skb_timestamp_ts() and add a boolean to select the unit.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     | 14 +++++++++-----
 net/ipv4/tcp_input.c  |  2 +-
 net/ipv4/tcp_output.c |  8 ++++----
 net/ipv4/tcp_timer.c  |  4 ++--
 4 files changed, 16 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0534526a535d..493f8550055b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -837,17 +837,21 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
 	return max_t(s64, t1 - t0, 0);
 }
 
-static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
-{
-	return tcp_ns_to_ts(skb->skb_mstamp_ns);
-}
-
 /* provide the departure time in us unit */
 static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb)
 {
 	return div_u64(skb->skb_mstamp_ns, NSEC_PER_USEC);
 }
 
+/* Provide skb TSval in usec or ms unit */
+static inline u32 tcp_skb_timestamp_ts(bool usec_ts, const struct sk_buff *skb)
+{
+	if (usec_ts)
+		return tcp_skb_timestamp_us(skb);
+
+	return div_u64(skb->skb_mstamp_ns, NSEC_PER_MSEC);
+}
+
 static inline u32 tcp_tw_tsval(const struct tcp_timewait_sock *tcptw)
 {
 	return tcp_clock_ts(false) + tcptw->tw_ts_offset;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ffce17545b62..de68cad82d19 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2442,7 +2442,7 @@ static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
 				     const struct sk_buff *skb)
 {
 	return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) &&
-	       tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb));
+	       tcp_tsopt_ecr_before(tp, tcp_skb_timestamp_ts(false, skb));
 }
 
 /* Nothing was retransmitted or returned timestamp is less
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 909f85aefd74..03a2a9fc0dc1 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -799,7 +799,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 
 	if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps) && !*md5)) {
 		opts->options |= OPTION_TS;
-		opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
+		opts->tsval = tcp_skb_timestamp_ts(false, skb) + tp->tsoffset;
 		opts->tsecr = tp->rx_opt.ts_recent;
 		remaining -= TCPOLEN_TSTAMP_ALIGNED;
 	}
@@ -884,7 +884,7 @@ static unsigned int tcp_synack_options(const struct sock *sk,
 	}
 	if (likely(ireq->tstamp_ok)) {
 		opts->options |= OPTION_TS;
-		opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off;
+		opts->tsval = tcp_skb_timestamp_ts(false, skb) + tcp_rsk(req)->ts_off;
 		opts->tsecr = READ_ONCE(req->ts_recent);
 		remaining -= TCPOLEN_TSTAMP_ALIGNED;
 	}
@@ -943,7 +943,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
 
 	if (likely(tp->rx_opt.tstamp_ok)) {
 		opts->options |= OPTION_TS;
-		opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0;
+		opts->tsval = skb ? tcp_skb_timestamp_ts(false, skb) + tp->tsoffset : 0;
 		opts->tsecr = tp->rx_opt.ts_recent;
 		size += TCPOLEN_TSTAMP_ALIGNED;
 	}
@@ -3379,7 +3379,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 
 	/* Save stamp of the first (attempted) retransmit. */
 	if (!tp->retrans_stamp)
-		tp->retrans_stamp = tcp_skb_timestamp(skb);
+		tp->retrans_stamp = tcp_skb_timestamp_ts(false, skb);
 
 	if (tp->undo_retrans < 0)
 		tp->undo_retrans = 0;
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 63247c78dc13..8764a9a2dc21 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -479,7 +479,7 @@ static bool tcp_rtx_probe0_timed_out(const struct sock *sk,
 		return false;
 
 	rtx_delta = (u32)msecs_to_jiffies(tcp_time_stamp(tp) -
-			(tp->retrans_stamp ?: tcp_skb_timestamp(skb)));
+			(tp->retrans_stamp ?: tcp_skb_timestamp_ts(false, skb)));
 
 	return rtx_delta > timeout;
 }
@@ -534,7 +534,7 @@ void tcp_retransmit_timer(struct sock *sk)
 		struct inet_sock *inet = inet_sk(sk);
 		u32 rtx_delta;
 
-		rtx_delta = tcp_time_stamp(tp) - (tp->retrans_stamp ?: tcp_skb_timestamp(skb));
+		rtx_delta = tcp_time_stamp(tp) - (tp->retrans_stamp ?: tcp_skb_timestamp_ts(false, skb));
 		if (sk->sk_family == AF_INET) {
 			net_dbg_ratelimited("Probing zero-window on %pI4:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n",
 				&inet->inet_daddr, ntohs(inet->inet_dport),
-- 
cgit v1.2.3-70-g09d2


From 003e07a1e48e9423647d2fef1c86b4caab3a94be Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 20 Oct 2023 12:57:42 +0000
Subject: tcp: move tcp_ns_to_ts() to net/ipv4/syncookies.c

tcp_ns_to_ts() is only used once from cookie_init_timestamp().

Also add the 'bool usec_ts' parameter to enable usec TS later.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     |  6 ------
 net/ipv4/syncookies.c | 10 +++++++++-
 2 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 493f8550055b..b86abf1fbe46 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -824,12 +824,6 @@ static inline u32 tcp_time_stamp_ms(const struct tcp_sock *tp)
 	return div_u64(tp->tcp_mstamp, USEC_PER_MSEC);
 }
 
-/* Convert a nsec timestamp into TCP TSval timestamp (ms based currently) */
-static inline u64 tcp_ns_to_ts(u64 ns)
-{
-	return div_u64(ns, NSEC_PER_SEC / TCP_TS_HZ);
-}
-
 void tcp_mstamp_refresh(struct tcp_sock *tp);
 
 static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 3b4dafefb4b0..62395fdb0ca5 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -51,6 +51,14 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
 			    count, &syncookie_secret[c]);
 }
 
+/* Convert one nsec 64bit timestamp to ts (ms or usec resolution) */
+static u64 tcp_ns_to_ts(bool usec_ts, u64 val)
+{
+	if (usec_ts)
+		return div_u64(val, NSEC_PER_USEC);
+
+	return div_u64(val, NSEC_PER_MSEC);
+}
 
 /*
  * when syncookies are in effect and tcp timestamps are enabled we encode
@@ -62,7 +70,7 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
 u64 cookie_init_timestamp(struct request_sock *req, u64 now)
 {
 	const struct inet_request_sock *ireq = inet_rsk(req);
-	u64 ts, ts_now = tcp_ns_to_ts(now);
+	u64 ts, ts_now = tcp_ns_to_ts(false, now);
 	u32 options = 0;
 
 	options = ireq->wscale_ok ? ireq->snd_wscale : TS_OPT_WSCALE_MASK;
-- 
cgit v1.2.3-70-g09d2


From 9d0c00f5ca05be9e89649c156f9d5b9421fc534e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 20 Oct 2023 12:57:43 +0000
Subject: tcp: rename tcp_time_stamp() to tcp_time_stamp_ts()

This helper returns a TSval from a TCP socket.

It currently calls tcp_time_stamp_ms() but will soon
be able to return a usec based TSval, depending
on an upcoming tp->tcp_usec_ts field.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     |  9 ++++-----
 net/ipv4/tcp_input.c  |  6 +++---
 net/ipv4/tcp_lp.c     |  2 +-
 net/ipv4/tcp_output.c |  2 +-
 net/ipv4/tcp_timer.c  | 10 +++++-----
 5 files changed, 14 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index b86abf1fbe46..af72c1dc37f3 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -813,15 +813,14 @@ static inline u32 tcp_clock_ts(bool usec_ts)
 	return usec_ts ? tcp_clock_us() : tcp_clock_ms();
 }
 
-/* This should only be used in contexts where tp->tcp_mstamp is up to date */
-static inline u32 tcp_time_stamp(const struct tcp_sock *tp)
+static inline u32 tcp_time_stamp_ms(const struct tcp_sock *tp)
 {
-	return div_u64(tp->tcp_mstamp, USEC_PER_SEC / TCP_TS_HZ);
+	return div_u64(tp->tcp_mstamp, USEC_PER_MSEC);
 }
 
-static inline u32 tcp_time_stamp_ms(const struct tcp_sock *tp)
+static inline u32 tcp_time_stamp_ts(const struct tcp_sock *tp)
 {
-	return div_u64(tp->tcp_mstamp, USEC_PER_MSEC);
+	return tcp_time_stamp_ms(tp);
 }
 
 void tcp_mstamp_refresh(struct tcp_sock *tp);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index de68cad82d19..e7e38fc1d62f 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -704,7 +704,7 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
 
 	if (TCP_SKB_CB(skb)->end_seq -
 	    TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) {
-		u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
+		u32 delta = tcp_time_stamp_ts(tp) - tp->rx_opt.rcv_tsecr;
 		u32 delta_us;
 
 		if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
@@ -3148,7 +3148,7 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
 	 */
 	if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
 	    flag & FLAG_ACKED) {
-		u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
+		u32 delta = tcp_time_stamp_ts(tp) - tp->rx_opt.rcv_tsecr;
 
 		if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
 			if (!delta)
@@ -6293,7 +6293,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 
 		if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
 		    !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
-			     tcp_time_stamp(tp))) {
+			     tcp_time_stamp_ts(tp))) {
 			NET_INC_STATS(sock_net(sk),
 					LINUX_MIB_PAWSACTIVEREJECTED);
 			goto reset_and_undo;
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index ae36780977d2..52fe17167460 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -272,7 +272,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct lp *lp = inet_csk_ca(sk);
-	u32 now = tcp_time_stamp(tp);
+	u32 now = tcp_time_stamp_ts(tp);
 	u32 delta;
 
 	if (sample->rtt_us > 0)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 03a2a9fc0dc1..a1fec8be9ac3 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3961,7 +3961,7 @@ int tcp_connect(struct sock *sk)
 
 	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
 	tcp_mstamp_refresh(tp);
-	tp->retrans_stamp = tcp_time_stamp(tp);
+	tp->retrans_stamp = tcp_time_stamp_ts(tp);
 	tcp_connect_queue_skb(sk, buff);
 	tcp_ecn_send_syn(sk, buff);
 	tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 8764a9a2dc21..bfcf3fe44c72 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -33,7 +33,7 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
 	user_timeout = READ_ONCE(icsk->icsk_user_timeout);
 	if (!user_timeout)
 		return icsk->icsk_rto;
-	elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts;
+	elapsed = tcp_time_stamp_ts(tcp_sk(sk)) - start_ts;
 	remaining = user_timeout - elapsed;
 	if (remaining <= 0)
 		return 1; /* user timeout has passed; fire ASAP */
@@ -226,7 +226,7 @@ static bool retransmits_timed_out(struct sock *sk,
 		timeout = tcp_model_timeout(sk, boundary, rto_base);
 	}
 
-	return (s32)(tcp_time_stamp(tcp_sk(sk)) - start_ts - timeout) >= 0;
+	return (s32)(tcp_time_stamp_ts(tcp_sk(sk)) - start_ts - timeout) >= 0;
 }
 
 /* A write timeout has occurred. Process the after effects. */
@@ -462,7 +462,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
 	req->num_timeout++;
 	tcp_update_rto_stats(sk);
 	if (!tp->retrans_stamp)
-		tp->retrans_stamp = tcp_time_stamp(tp);
+		tp->retrans_stamp = tcp_time_stamp_ts(tp);
 	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 			  req->timeout << req->num_timeout, TCP_RTO_MAX);
 }
@@ -478,7 +478,7 @@ static bool tcp_rtx_probe0_timed_out(const struct sock *sk,
 	if (rcv_delta <= timeout)
 		return false;
 
-	rtx_delta = (u32)msecs_to_jiffies(tcp_time_stamp(tp) -
+	rtx_delta = (u32)msecs_to_jiffies(tcp_time_stamp_ts(tp) -
 			(tp->retrans_stamp ?: tcp_skb_timestamp_ts(false, skb)));
 
 	return rtx_delta > timeout;
@@ -534,7 +534,7 @@ void tcp_retransmit_timer(struct sock *sk)
 		struct inet_sock *inet = inet_sk(sk);
 		u32 rtx_delta;
 
-		rtx_delta = tcp_time_stamp(tp) - (tp->retrans_stamp ?: tcp_skb_timestamp_ts(false, skb));
+		rtx_delta = tcp_time_stamp_ts(tp) - (tp->retrans_stamp ?: tcp_skb_timestamp_ts(false, skb));
 		if (sk->sk_family == AF_INET) {
 			net_dbg_ratelimited("Probing zero-window on %pI4:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n",
 				&inet->inet_daddr, ntohs(inet->inet_dport),
-- 
cgit v1.2.3-70-g09d2


From 3d44de9a10ea2b1658dfaed8ea6d3d7b6e0defbb Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 20 Oct 2023 12:57:45 +0000
Subject: tcp: add RTAX_FEATURE_TCP_USEC_TS

This new dst feature flag will be used to allow TCP to use usec
based timestamps instead of msec ones.

ip route .... feature tcp_usec_ts

Also document that RTAX_FEATURE_SACK and RTAX_FEATURE_TIMESTAMP
are unused.

RTAX_FEATURE_ALLFRAG is also going away soon.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h            |  5 +++++
 include/uapi/linux/rtnetlink.h | 18 +++++++++++-------
 2 files changed, 16 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index e15452df9804..04a0e647ef74 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -576,4 +576,9 @@ void tcp_sock_set_quickack(struct sock *sk, int val);
 int tcp_sock_set_syncnt(struct sock *sk, int val);
 int tcp_sock_set_user_timeout(struct sock *sk, int val);
 
+static inline bool dst_tcp_usec_ts(const struct dst_entry *dst)
+{
+	return dst_feature(dst, RTAX_FEATURE_TCP_USEC_TS);
+}
+
 #endif	/* _LINUX_TCP_H */
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 51c13cf9c5ae..aa2482a0614a 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -502,13 +502,17 @@ enum {
 
 #define RTAX_MAX (__RTAX_MAX - 1)
 
-#define RTAX_FEATURE_ECN	(1 << 0)
-#define RTAX_FEATURE_SACK	(1 << 1)
-#define RTAX_FEATURE_TIMESTAMP	(1 << 2)
-#define RTAX_FEATURE_ALLFRAG	(1 << 3)
-
-#define RTAX_FEATURE_MASK	(RTAX_FEATURE_ECN | RTAX_FEATURE_SACK | \
-				 RTAX_FEATURE_TIMESTAMP | RTAX_FEATURE_ALLFRAG)
+#define RTAX_FEATURE_ECN		(1 << 0)
+#define RTAX_FEATURE_SACK		(1 << 1) /* unused */
+#define RTAX_FEATURE_TIMESTAMP		(1 << 2) /* unused */
+#define RTAX_FEATURE_ALLFRAG		(1 << 3)
+#define RTAX_FEATURE_TCP_USEC_TS	(1 << 4)
+
+#define RTAX_FEATURE_MASK	(RTAX_FEATURE_ECN |		\
+				 RTAX_FEATURE_SACK |		\
+				 RTAX_FEATURE_TIMESTAMP |	\
+				 RTAX_FEATURE_ALLFRAG |		\
+				 RTAX_FEATURE_TCP_USEC_TS)
 
 struct rta_session {
 	__u8	proto;
-- 
cgit v1.2.3-70-g09d2


From af7721448a609d1912b57c825194ef6e17fc71a4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 20 Oct 2023 12:57:46 +0000
Subject: tcp: introduce TCP_PAWS_WRAP

tcp_paws_check() uses TCP_PAWS_24DAYS constant to detect if TCP TS
values might have wrapped after a long idle period.

This mechanism is described in RFC 7323 5.5 (Outdated Timestamps)

TCP_PAWS_24DAYS value was based on the assumption of a clock
of 1 Khz.

As we want to adopt a 1 Mhz clock in the future, we reduce
this constant.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index af72c1dc37f3..0ab577869d7a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -166,7 +166,12 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX);
 #define MAX_TCP_KEEPCNT		127
 #define MAX_TCP_SYNCNT		127
 
-#define TCP_PAWS_24DAYS	(60 * 60 * 24 * 24)
+/* Ensure that TCP PAWS checks are relaxed after ~2147 seconds
+ * to avoid overflows. This assumes a clock smaller than 1 Mhz.
+ * Default clock is 1 Khz, tcp_usec_ts uses 1 Mhz.
+ */
+#define TCP_PAWS_WRAP (INT_MAX / USEC_PER_SEC)
+
 #define TCP_PAWS_MSL	60		/* Per-host timestamps are invalidated
 					 * after this time. It should be equal
 					 * (or greater than) TCP_TIMEWAIT_LEN
@@ -1619,7 +1624,7 @@ static inline bool tcp_paws_check(const struct tcp_options_received *rx_opt,
 	if ((s32)(rx_opt->ts_recent - rx_opt->rcv_tsval) <= paws_win)
 		return true;
 	if (unlikely(!time_before32(ktime_get_seconds(),
-				    rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS)))
+				    rx_opt->ts_recent_stamp + TCP_PAWS_WRAP)))
 		return true;
 	/*
 	 * Some OSes send SYN and SYNACK messages with tsval=0 tsecr=0,
-- 
cgit v1.2.3-70-g09d2


From 614e8316aa4cafba3e204cb8ee48bd12b92f3d93 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 20 Oct 2023 12:57:47 +0000
Subject: tcp: add support for usec resolution in TCP TS values

Back in 2015, Van Jacobson suggested to use usec resolution in TCP TS values.
This has been implemented in our private kernels.

Goals were :

1) better observability of delays in networking stacks.
2) better disambiguation of events based on TSval/ecr values.
3) building block for congestion control modules needing usec resolution.

Back then we implemented a schem based on private SYN options
to negotiate the feature.

For upstream submission, we chose to use a route attribute,
because this feature is probably going to be used in private
networks [1] [2].

ip route add 10/8 ... features tcp_usec_ts

Note that RFC 7323 recommends a
  "timestamp clock frequency in the range 1 ms to 1 sec per tick.",
but also mentions
  "the maximum acceptable clock frequency is one tick every 59 ns."

[1] Unfortunately RFC 7323 5.5 (Outdated Timestamps) suggests
to invalidate TS.Recent values after a flow was idle for more
than 24 days. This is the part making usec_ts a problem
for peers following this recommendation for long living
idle flows.

[2] Attempts to standardize usec ts went nowhere:

https://www.ietf.org/proceedings/97/slides/slides-97-tcpm-tcp-options-for-low-latency-00.pdf
https://datatracker.ietf.org/doc/draft-wang-tcpm-low-latency-opt/

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h              |  4 +++-
 include/net/inet_timewait_sock.h |  3 ++-
 include/net/tcp.h                |  6 ++++--
 net/ipv4/syncookies.c            |  6 +++++-
 net/ipv4/tcp.c                   | 18 ++++++++++++++----
 net/ipv4/tcp_input.c             |  5 ++++-
 net/ipv4/tcp_ipv4.c              |  1 +
 net/ipv4/tcp_minisocks.c         | 19 ++++++++++++++-----
 net/ipv4/tcp_output.c            | 12 ++++++++----
 net/ipv4/tcp_timer.c             | 40 ++++++++++++++++++++++++++--------------
 net/ipv6/tcp_ipv6.c              |  1 +
 11 files changed, 82 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 04a0e647ef74..6df715b6e51d 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -152,6 +152,7 @@ struct tcp_request_sock {
 	u64				snt_synack; /* first SYNACK sent time */
 	bool				tfo_listener;
 	bool				is_mptcp;
+	s8				req_usec_ts;
 #if IS_ENABLED(CONFIG_MPTCP)
 	bool				drop_req;
 #endif
@@ -257,7 +258,8 @@ struct tcp_sock {
 	u8	compressed_ack;
 	u8	dup_ack_counter:2,
 		tlp_retrans:1,	/* TLP is a retransmission */
-		unused:5;
+		tcp_usec_ts:1, /* TSval values in usec */
+		unused:4;
 	u32	chrono_start;	/* Start time in jiffies of a TCP chrono */
 	u32	chrono_stat[3];	/* Time in jiffies for chrono_stat stats */
 	u8	chrono_type:2,	/* current chronograph type */
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 4a8e578405cb..b14999ff55db 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -67,7 +67,8 @@ struct inet_timewait_sock {
 	/* And these are ours. */
 	unsigned int		tw_transparent  : 1,
 				tw_flowlabel	: 20,
-				tw_pad		: 3,	/* 3 bits hole */
+				tw_usec_ts	: 1,
+				tw_pad		: 2,	/* 2 bits hole */
 				tw_tos		: 8;
 	u32			tw_txhash;
 	u32			tw_priority;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0ab577869d7a..39b731c900dd 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -825,6 +825,8 @@ static inline u32 tcp_time_stamp_ms(const struct tcp_sock *tp)
 
 static inline u32 tcp_time_stamp_ts(const struct tcp_sock *tp)
 {
+	if (tp->tcp_usec_ts)
+		return tp->tcp_mstamp;
 	return tcp_time_stamp_ms(tp);
 }
 
@@ -852,12 +854,12 @@ static inline u32 tcp_skb_timestamp_ts(bool usec_ts, const struct sk_buff *skb)
 
 static inline u32 tcp_tw_tsval(const struct tcp_timewait_sock *tcptw)
 {
-	return tcp_clock_ts(false) + tcptw->tw_ts_offset;
+	return tcp_clock_ts(tcptw->tw_sk.tw_usec_ts) + tcptw->tw_ts_offset;
 }
 
 static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq)
 {
-	return tcp_clock_ts(false) + treq->ts_off;
+	return tcp_clock_ts(treq->req_usec_ts) + treq->ts_off;
 }
 
 #define tcp_flag_byte(th) (((u_int8_t *)th)[13])
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 62395fdb0ca5..c64334363230 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -84,7 +84,9 @@ u64 cookie_init_timestamp(struct request_sock *req, u64 now)
 	if (ts > ts_now)
 		ts -= (1UL << TSBITS);
 
-	return ts * (NSEC_PER_SEC / TCP_TS_HZ);
+	if (tcp_rsk(req)->req_usec_ts)
+		return ts * NSEC_PER_USEC;
+	return ts * NSEC_PER_MSEC;
 }
 
 
@@ -304,6 +306,8 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
 	treq->af_specific = af_ops;
 
 	treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
+	treq->req_usec_ts = -1;
+
 #if IS_ENABLED(CONFIG_MPTCP)
 	treq->is_mptcp = sk_is_mptcp(sk);
 	if (treq->is_mptcp) {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 805f8341064f..b961364b4961 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3629,10 +3629,16 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
 			tp->fastopen_no_cookie = val;
 		break;
 	case TCP_TIMESTAMP:
-		if (!tp->repair)
+		if (!tp->repair) {
 			err = -EPERM;
-		else
-			WRITE_ONCE(tp->tsoffset, val - tcp_clock_ts(false));
+			break;
+		}
+		/* val is an opaque field,
+		 * and low order bit contains usec_ts enable bit.
+		 * Its a best effort, and we do not care if user makes an error.
+		 */
+		tp->tcp_usec_ts = val & 1;
+		WRITE_ONCE(tp->tsoffset, val - tcp_clock_ts(tp->tcp_usec_ts));
 		break;
 	case TCP_REPAIR_WINDOW:
 		err = tcp_repair_set_window(tp, optval, optlen);
@@ -4143,7 +4149,11 @@ int do_tcp_getsockopt(struct sock *sk, int level,
 		break;
 
 	case TCP_TIMESTAMP:
-		val = tcp_clock_ts(false) + READ_ONCE(tp->tsoffset);
+		val = tcp_clock_ts(tp->tcp_usec_ts) + READ_ONCE(tp->tsoffset);
+		if (tp->tcp_usec_ts)
+			val |= 1;
+		else
+			val &= ~1;
 		break;
 	case TCP_NOTSENT_LOWAT:
 		val = READ_ONCE(tp->notsent_lowat);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5666f6137167..18b858597af4 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -698,6 +698,8 @@ static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp)
 	u32 delta, delta_us;
 
 	delta = tcp_time_stamp_ts(tp) - tp->rx_opt.rcv_tsecr;
+	if (tp->tcp_usec_ts)
+		return delta;
 
 	if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
 		if (!delta)
@@ -2452,7 +2454,7 @@ static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
 				     const struct sk_buff *skb)
 {
 	return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) &&
-	       tcp_tsopt_ecr_before(tp, tcp_skb_timestamp_ts(false, skb));
+	       tcp_tsopt_ecr_before(tp, tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb));
 }
 
 /* Nothing was retransmitted or returned timestamp is less
@@ -7045,6 +7047,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	req->syncookie = want_cookie;
 	tcp_rsk(req)->af_specific = af_ops;
 	tcp_rsk(req)->ts_off = 0;
+	tcp_rsk(req)->req_usec_ts = -1;
 #if IS_ENABLED(CONFIG_MPTCP)
 	tcp_rsk(req)->is_mptcp = 0;
 #endif
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index cdd65cc594bc..7583d4e34c8c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -296,6 +296,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		rt = NULL;
 		goto failure;
 	}
+	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
 	/* OK, now commit destination to socket.  */
 	sk->sk_gso_type = SKB_GSO_TCPV4;
 	sk_setup_caps(sk, &rt->dst);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index a9fdba897a28..ace806c5bd0c 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -300,6 +300,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 		tcptw->tw_ts_recent	= tp->rx_opt.ts_recent;
 		tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
 		tcptw->tw_ts_offset	= tp->tsoffset;
+		tw->tw_usec_ts		= tp->tcp_usec_ts;
 		tcptw->tw_last_oow_ack_time = 0;
 		tcptw->tw_tx_delay	= tp->tcp_tx_delay;
 		tw->tw_txhash		= sk->sk_txhash;
@@ -554,21 +555,29 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 	newtp->max_window = newtp->snd_wnd;
 
 	if (newtp->rx_opt.tstamp_ok) {
+		newtp->tcp_usec_ts = treq->req_usec_ts;
 		newtp->rx_opt.ts_recent = READ_ONCE(req->ts_recent);
 		newtp->rx_opt.ts_recent_stamp = ktime_get_seconds();
 		newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
 	} else {
+		newtp->tcp_usec_ts = 0;
 		newtp->rx_opt.ts_recent_stamp = 0;
 		newtp->tcp_header_len = sizeof(struct tcphdr);
 	}
 	if (req->num_timeout) {
-		newtp->undo_marker = treq->snt_isn;
-		newtp->retrans_stamp = div_u64(treq->snt_synack,
-					       USEC_PER_SEC / TCP_TS_HZ);
 		newtp->total_rto = req->num_timeout;
+		newtp->undo_marker = treq->snt_isn;
+		if (newtp->tcp_usec_ts) {
+			newtp->retrans_stamp = treq->snt_synack;
+			newtp->total_rto_time = (u32)(tcp_clock_us() -
+						      newtp->retrans_stamp) / USEC_PER_MSEC;
+		} else {
+			newtp->retrans_stamp = div_u64(treq->snt_synack,
+						       USEC_PER_SEC / TCP_TS_HZ);
+			newtp->total_rto_time = tcp_clock_ms() -
+						newtp->retrans_stamp;
+		}
 		newtp->total_rto_recoveries = 1;
-		newtp->total_rto_time = tcp_clock_ms() -
-					newtp->retrans_stamp;
 	}
 	newtp->tsoffset = treq->ts_off;
 #ifdef CONFIG_TCP_MD5SIG
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a1fec8be9ac3..2866ccbccde0 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -799,7 +799,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 
 	if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps) && !*md5)) {
 		opts->options |= OPTION_TS;
-		opts->tsval = tcp_skb_timestamp_ts(false, skb) + tp->tsoffset;
+		opts->tsval = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) + tp->tsoffset;
 		opts->tsecr = tp->rx_opt.ts_recent;
 		remaining -= TCPOLEN_TSTAMP_ALIGNED;
 	}
@@ -884,7 +884,8 @@ static unsigned int tcp_synack_options(const struct sock *sk,
 	}
 	if (likely(ireq->tstamp_ok)) {
 		opts->options |= OPTION_TS;
-		opts->tsval = tcp_skb_timestamp_ts(false, skb) + tcp_rsk(req)->ts_off;
+		opts->tsval = tcp_skb_timestamp_ts(tcp_rsk(req)->req_usec_ts, skb) +
+			      tcp_rsk(req)->ts_off;
 		opts->tsecr = READ_ONCE(req->ts_recent);
 		remaining -= TCPOLEN_TSTAMP_ALIGNED;
 	}
@@ -943,7 +944,8 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
 
 	if (likely(tp->rx_opt.tstamp_ok)) {
 		opts->options |= OPTION_TS;
-		opts->tsval = skb ? tcp_skb_timestamp_ts(false, skb) + tp->tsoffset : 0;
+		opts->tsval = skb ? tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) +
+				tp->tsoffset : 0;
 		opts->tsecr = tp->rx_opt.ts_recent;
 		size += TCPOLEN_TSTAMP_ALIGNED;
 	}
@@ -3379,7 +3381,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 
 	/* Save stamp of the first (attempted) retransmit. */
 	if (!tp->retrans_stamp)
-		tp->retrans_stamp = tcp_skb_timestamp_ts(false, skb);
+		tp->retrans_stamp = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb);
 
 	if (tp->undo_retrans < 0)
 		tp->undo_retrans = 0;
@@ -3665,6 +3667,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
 
 	memset(&opts, 0, sizeof(opts));
+	if (tcp_rsk(req)->req_usec_ts < 0)
+		tcp_rsk(req)->req_usec_ts = dst_tcp_usec_ts(dst);
 	now = tcp_clock_ns();
 #ifdef CONFIG_SYN_COOKIES
 	if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok))
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index bfcf3fe44c72..1f9f6c1c196b 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -26,14 +26,18 @@
 static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
-	u32 elapsed, start_ts, user_timeout;
+	const struct tcp_sock *tp = tcp_sk(sk);
+	u32 elapsed, user_timeout;
 	s32 remaining;
 
-	start_ts = tcp_sk(sk)->retrans_stamp;
 	user_timeout = READ_ONCE(icsk->icsk_user_timeout);
 	if (!user_timeout)
 		return icsk->icsk_rto;
-	elapsed = tcp_time_stamp_ts(tcp_sk(sk)) - start_ts;
+
+	elapsed = tcp_time_stamp_ts(tp) - tp->retrans_stamp;
+	if (tp->tcp_usec_ts)
+		elapsed /= USEC_PER_MSEC;
+
 	remaining = user_timeout - elapsed;
 	if (remaining <= 0)
 		return 1; /* user timeout has passed; fire ASAP */
@@ -212,12 +216,13 @@ static bool retransmits_timed_out(struct sock *sk,
 				  unsigned int boundary,
 				  unsigned int timeout)
 {
-	unsigned int start_ts;
+	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned int start_ts, delta;
 
 	if (!inet_csk(sk)->icsk_retransmits)
 		return false;
 
-	start_ts = tcp_sk(sk)->retrans_stamp;
+	start_ts = tp->retrans_stamp;
 	if (likely(timeout == 0)) {
 		unsigned int rto_base = TCP_RTO_MIN;
 
@@ -226,7 +231,12 @@ static bool retransmits_timed_out(struct sock *sk,
 		timeout = tcp_model_timeout(sk, boundary, rto_base);
 	}
 
-	return (s32)(tcp_time_stamp_ts(tcp_sk(sk)) - start_ts - timeout) >= 0;
+	if (tp->tcp_usec_ts) {
+		/* delta maybe off up to a jiffy due to timer granularity. */
+		delta = tp->tcp_mstamp - start_ts + jiffies_to_usecs(1);
+		return (s32)(delta - timeout * USEC_PER_MSEC) >= 0;
+	}
+	return (s32)(tcp_time_stamp_ts(tp) - start_ts - timeout) >= 0;
 }
 
 /* A write timeout has occurred. Process the after effects. */
@@ -468,20 +478,18 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
 }
 
 static bool tcp_rtx_probe0_timed_out(const struct sock *sk,
-				     const struct sk_buff *skb)
+				     const struct sk_buff *skb,
+				     u32 rtx_delta)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	const int timeout = TCP_RTO_MAX * 2;
-	u32 rcv_delta, rtx_delta;
+	u32 rcv_delta;
 
 	rcv_delta = inet_csk(sk)->icsk_timeout - tp->rcv_tstamp;
 	if (rcv_delta <= timeout)
 		return false;
 
-	rtx_delta = (u32)msecs_to_jiffies(tcp_time_stamp_ts(tp) -
-			(tp->retrans_stamp ?: tcp_skb_timestamp_ts(false, skb)));
-
-	return rtx_delta > timeout;
+	return msecs_to_jiffies(rtx_delta) > timeout;
 }
 
 /**
@@ -534,7 +542,11 @@ void tcp_retransmit_timer(struct sock *sk)
 		struct inet_sock *inet = inet_sk(sk);
 		u32 rtx_delta;
 
-		rtx_delta = tcp_time_stamp_ts(tp) - (tp->retrans_stamp ?: tcp_skb_timestamp_ts(false, skb));
+		rtx_delta = tcp_time_stamp_ts(tp) - (tp->retrans_stamp ?: 
+				tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb));
+		if (tp->tcp_usec_ts)
+			rtx_delta /= USEC_PER_MSEC;
+
 		if (sk->sk_family == AF_INET) {
 			net_dbg_ratelimited("Probing zero-window on %pI4:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n",
 				&inet->inet_daddr, ntohs(inet->inet_dport),
@@ -551,7 +563,7 @@ void tcp_retransmit_timer(struct sock *sk)
 				rtx_delta);
 		}
 #endif
-		if (tcp_rtx_probe0_timed_out(sk, skb)) {
+		if (tcp_rtx_probe0_timed_out(sk, skb, rtx_delta)) {
 			tcp_write_err(sk);
 			goto out;
 		}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 1ee6517e9b2f..0c8a14ba104f 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -286,6 +286,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 		goto failure;
 	}
 
+	tp->tcp_usec_ts = dst_tcp_usec_ts(dst);
 	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 
 	if (!saddr) {
-- 
cgit v1.2.3-70-g09d2


From a77a0f5c7f23a8a4981a2a3ff47baa91ceaf1f53 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 20 Oct 2023 12:57:48 +0000
Subject: tcp: add TCPI_OPT_USEC_TS

Add the ability to report in tcp_info.tcpi_options if
a flow is using usec resolution in TCP TS val.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tcp.h | 1 +
 net/ipv4/tcp.c           | 2 ++
 2 files changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index d1d08da6331a..8aa3916e14f6 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -170,6 +170,7 @@ enum tcp_fastopen_client_fail {
 #define TCPI_OPT_ECN		8 /* ECN was negociated at TCP session init */
 #define TCPI_OPT_ECN_SEEN	16 /* we received at least one packet with ECT */
 #define TCPI_OPT_SYN_DATA	32 /* SYN-ACK acked data in SYN sent or rcvd */
+#define TCPI_OPT_USEC_TS	64 /* usec timestamps */
 
 /*
  * Sender's congestion state indicating normal or abnormal situations
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b961364b4961..a86d8200a1e8 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3760,6 +3760,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 		info->tcpi_options |= TCPI_OPT_ECN_SEEN;
 	if (tp->syn_data_acked)
 		info->tcpi_options |= TCPI_OPT_SYN_DATA;
+	if (tp->tcp_usec_ts)
+		info->tcpi_options |= TCPI_OPT_USEC_TS;
 
 	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
 	info->tcpi_ato = jiffies_to_usecs(min_t(u32, icsk->icsk_ack.ato,
-- 
cgit v1.2.3-70-g09d2


From a1f5dcb1c0c1e26a7e158ce9fc28355041f26909 Mon Sep 17 00:00:00 2001
From: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Date: Thu, 28 Sep 2023 17:35:25 +0300
Subject: wifi: mac80211: add a driver callback to add vif debugfs

Add a callback which the driver can use to add the vif debugfs.
We used to have this back until commit d260ff12e776 ("mac80211:
remove vif debugfs driver callbacks") where we thought that it
will be easier to just add them during interface add/remove.

However, now with multi-link, we want to have proper debugfs
for drivers for multi-link where some files might be in the
netdev for non-MLO connections, and in the links for MLO ones,
so we need to do some reconstruction when switching the mode.

Moving to this new call enables that and MLO drivers will have
to use it for proper debugfs operation.

Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230928172905.ac38913f6ab7.Iee731d746bb08fcc628fa776f337016a12dc62ac@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h    |  6 ++++++
 net/mac80211/driver-ops.c | 11 ++++++++---
 net/mac80211/driver-ops.h | 23 +++++++++++++++++++++++
 3 files changed, 37 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 72375eceb786..a9b73e357462 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -3869,6 +3869,10 @@ struct ieee80211_prep_tx_info {
  *	the station. See @sta_pre_rcu_remove if needed.
  *	This callback can sleep.
  *
+ * @vif_add_debugfs: Drivers can use this callback to add a debugfs vif
+ *	directory with its files. This callback should be within a
+ *	CONFIG_MAC80211_DEBUGFS conditional. This callback can sleep.
+ *
  * @link_add_debugfs: Drivers can use this callback to add debugfs files
  *	when a link is added to a mac80211 vif. This callback should be within
  *	a CONFIG_MAC80211_DEBUGFS conditional. This callback can sleep.
@@ -4368,6 +4372,8 @@ struct ieee80211_ops {
 	int (*sta_remove)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 			  struct ieee80211_sta *sta);
 #ifdef CONFIG_MAC80211_DEBUGFS
+	void (*vif_add_debugfs)(struct ieee80211_hw *hw,
+				struct ieee80211_vif *vif);
 	void (*link_add_debugfs)(struct ieee80211_hw *hw,
 				 struct ieee80211_vif *vif,
 				 struct ieee80211_bss_conf *link_conf,
diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c
index 9112715a749a..08861ec61be9 100644
--- a/net/mac80211/driver-ops.c
+++ b/net/mac80211/driver-ops.c
@@ -72,10 +72,15 @@ int drv_add_interface(struct ieee80211_local *local,
 	ret = local->ops->add_interface(&local->hw, &sdata->vif);
 	trace_drv_return_int(local, ret);
 
-	if (ret == 0)
-		sdata->flags |= IEEE80211_SDATA_IN_DRIVER;
+	if (ret)
+		return ret;
 
-	return ret;
+	sdata->flags |= IEEE80211_SDATA_IN_DRIVER;
+
+	if (!local->in_reconfig)
+		drv_vif_add_debugfs(local, sdata);
+
+	return 0;
 }
 
 int drv_change_interface(struct ieee80211_local *local,
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 77048b9065e6..e07e65da15ee 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -489,6 +489,23 @@ static inline void drv_sta_remove(struct ieee80211_local *local,
 }
 
 #ifdef CONFIG_MAC80211_DEBUGFS
+static inline void drv_vif_add_debugfs(struct ieee80211_local *local,
+				       struct ieee80211_sub_if_data *sdata)
+{
+	might_sleep();
+
+	if (sdata->vif.type == NL80211_IFTYPE_MONITOR ||
+	    WARN_ON(!sdata->vif.debugfs_dir))
+		return;
+
+	sdata = get_bss_sdata(sdata);
+	if (!check_sdata_in_driver(sdata))
+		return;
+
+	if (local->ops->vif_add_debugfs)
+		local->ops->vif_add_debugfs(&local->hw, &sdata->vif);
+}
+
 static inline void drv_link_add_debugfs(struct ieee80211_local *local,
 					struct ieee80211_sub_if_data *sdata,
 					struct ieee80211_bss_conf *link_conf,
@@ -539,6 +556,12 @@ static inline void drv_link_sta_add_debugfs(struct ieee80211_local *local,
 		local->ops->link_sta_add_debugfs(&local->hw, &sdata->vif,
 						 link_sta, dir);
 }
+#else
+static inline void drv_vif_add_debugfs(struct ieee80211_local *local,
+				       struct ieee80211_sub_if_data *sdata)
+{
+	might_sleep();
+}
 #endif
 
 static inline void drv_sta_pre_rcu_remove(struct ieee80211_local *local,
-- 
cgit v1.2.3-70-g09d2


From 00f823b68ecee865bb2e4e6d5b7f4359eef0b3e3 Mon Sep 17 00:00:00 2001
From: Ilan Peer <ilan.peer@intel.com>
Date: Thu, 28 Sep 2023 17:35:27 +0300
Subject: wifi: mac80211: Rename and update IEEE80211_VIF_DISABLE_SMPS_OVERRIDE

EMLSR operation and SMPS operation cannot coexist. Thus, when EMLSR is
enabled, all SMPS signaling towards the AP should be stopped (it is
expected that the AP will consider SMPS to be off).

Rename IEEE80211_VIF_DISABLE_SMPS_OVERRIDE to IEEE80211_VIF_EML_ACTIVE
and use the flag as an indication from the driver that EMLSR is enabled.
When EMLSR is enabled SMPS flows towards the AP MLD should be stopped.

Signed-off-by: Ilan Peer <ilan.peer@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230928172905.fb2c2f9a0645.If6df5357568abd623a081f0f33b07e63fb8bba99@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c | 6 +++---
 include/net/mac80211.h                                | 6 +++---
 net/mac80211/cfg.c                                    | 6 ++++++
 net/mac80211/debugfs_netdev.c                         | 5 ++++-
 4 files changed, 16 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c
index 46e207211f21..6b4b32b5350b 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c
@@ -240,8 +240,8 @@ static int iwl_mvm_esr_mode_active(struct iwl_mvm *mvm,
 
 	mvmvif->esr_active = true;
 
-	/* Disable SMPS overrideing by user */
-	vif->driver_flags |= IEEE80211_VIF_DISABLE_SMPS_OVERRIDE;
+	/* Indicate to mac80211 that EML is enabled */
+	vif->driver_flags |= IEEE80211_VIF_EML_ACTIVE;
 
 	iwl_mvm_update_smps_on_active_links(mvm, vif, IWL_MVM_SMPS_REQ_FW,
 					    IEEE80211_SMPS_OFF);
@@ -399,7 +399,7 @@ static int iwl_mvm_esr_mode_inactive(struct iwl_mvm *mvm,
 
 	mvmvif->esr_active = false;
 
-	vif->driver_flags &= ~IEEE80211_VIF_DISABLE_SMPS_OVERRIDE;
+	vif->driver_flags &= ~IEEE80211_VIF_EML_ACTIVE;
 
 	iwl_mvm_update_smps_on_active_links(mvm, vif, IWL_MVM_SMPS_REQ_FW,
 					    IEEE80211_SMPS_AUTOMATIC);
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index a9b73e357462..7dae9aac089c 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1767,15 +1767,15 @@ struct ieee80211_channel_switch {
  * @IEEE80211_VIF_GET_NOA_UPDATE: request to handle NOA attributes
  *	and send P2P_PS notification to the driver if NOA changed, even
  *	this is not pure P2P vif.
- * @IEEE80211_VIF_DISABLE_SMPS_OVERRIDE: disable user configuration of
- *	SMPS mode via debugfs.
+ * @IEEE80211_VIF_EML_ACTIVE: The driver indicates that EML operation is
+ *      enabled for the interface.
  */
 enum ieee80211_vif_flags {
 	IEEE80211_VIF_BEACON_FILTER		= BIT(0),
 	IEEE80211_VIF_SUPPORTS_CQM_RSSI		= BIT(1),
 	IEEE80211_VIF_SUPPORTS_UAPSD		= BIT(2),
 	IEEE80211_VIF_GET_NOA_UPDATE		= BIT(3),
-	IEEE80211_VIF_DISABLE_SMPS_OVERRIDE	= BIT(4),
+	IEEE80211_VIF_EML_ACTIVE	        = BIT(4),
 };
 
 
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 5cec0c251e86..606b1b2e4123 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -3158,6 +3158,12 @@ int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata,
 	old_req = link->u.mgd.req_smps;
 	link->u.mgd.req_smps = smps_mode;
 
+	/* The driver indicated that EML is enabled for the interface, which
+	 * implies that SMPS flows towards the AP should be stopped.
+	 */
+	if (sdata->vif.driver_flags & IEEE80211_VIF_EML_ACTIVE)
+		return 0;
+
 	if (old_req == smps_mode &&
 	    smps_mode != IEEE80211_SMPS_AUTOMATIC)
 		return 0;
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index b383dad18841..ec91e131b29e 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -323,7 +323,10 @@ static int ieee80211_set_smps(struct ieee80211_link_data *link,
 	struct ieee80211_sub_if_data *sdata = link->sdata;
 	struct ieee80211_local *local = sdata->local;
 
-	if (sdata->vif.driver_flags & IEEE80211_VIF_DISABLE_SMPS_OVERRIDE)
+	/* The driver indicated that EML is enabled for the interface, thus do
+	 * not allow to override the SMPS state.
+	 */
+	if (sdata->vif.driver_flags & IEEE80211_VIF_EML_ACTIVE)
 		return -EOPNOTSUPP;
 
 	if (!(local->hw.wiphy->features & NL80211_FEATURE_STATIC_SMPS) &&
-- 
cgit v1.2.3-70-g09d2


From ec06bdb22500c8f10f17326d6db3f8ad3e2347b1 Mon Sep 17 00:00:00 2001
From: Ilan Peer <ilan.peer@intel.com>
Date: Thu, 28 Sep 2023 17:35:32 +0300
Subject: wifi: cfg80211: Fix typo in documentation

Fix a small typo in a comment.

Signed-off-by: Ilan Peer <ilan.peer@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230928172905.9dce226e393f.I929bfb9371e31c9e8d2bb1c1a96e9b1f3d02f2d0@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index a9ef43b33748..ebc0d733f876 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2661,7 +2661,7 @@ static inline void get_random_mask_addr(u8 *buf, const u8 *addr, const u8 *mask)
  *	or no match (RSSI only)
  * @rssi_thold: don't report scan results below this threshold (in s32 dBm)
  * @per_band_rssi_thold: Minimum rssi threshold for each band to be applied
- *	for filtering out scan results received. Drivers advertize this support
+ *	for filtering out scan results received. Drivers advertise this support
  *	of band specific rssi based filtering through the feature capability
  *	%NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD. These band
  *	specific rssi thresholds take precedence over rssi_thold, if specified.
-- 
cgit v1.2.3-70-g09d2


From 271d14b37fa5f2f9bd9e22711c3ba6b1532c8de1 Mon Sep 17 00:00:00 2001
From: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Date: Thu, 28 Sep 2023 17:35:35 +0300
Subject: wifi: mac80211: make mgd_protect_tdls_discover MLO-aware

Since userspace can choose now what link to establish the
TDLS on, we should know on what channel to do session protection.
Add a link id parameter to this callback.

Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230928172905.ef12ce3eb835.If864f406cfd9e24f36a2b88fd13a37328633fcf9@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/mvm.h  | 3 ++-
 drivers/net/wireless/intel/iwlwifi/mvm/tdls.c | 5 +++--
 include/net/mac80211.h                        | 3 ++-
 net/mac80211/driver-ops.h                     | 8 ++++++--
 net/mac80211/tdls.c                           | 2 +-
 5 files changed, 14 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h
index 66d9de0f1511..74cb2f863472 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h
@@ -2345,7 +2345,8 @@ void iwl_mvm_teardown_tdls_peers(struct iwl_mvm *mvm);
 void iwl_mvm_recalc_tdls_state(struct iwl_mvm *mvm, struct ieee80211_vif *vif,
 			       bool sta_added);
 void iwl_mvm_mac_mgd_protect_tdls_discover(struct ieee80211_hw *hw,
-					   struct ieee80211_vif *vif);
+					   struct ieee80211_vif *vif,
+					   unsigned int link_id);
 int iwl_mvm_tdls_channel_switch(struct ieee80211_hw *hw,
 				struct ieee80211_vif *vif,
 				struct ieee80211_sta *sta, u8 oper_class,
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tdls.c b/drivers/net/wireless/intel/iwlwifi/mvm/tdls.c
index dae6f2a1aad9..fac992af3ddb 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/tdls.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/tdls.c
@@ -2,7 +2,7 @@
 /*
  * Copyright (C) 2014 Intel Mobile Communications GmbH
  * Copyright (C) 2017 Intel Deutschland GmbH
- * Copyright (C) 2018-2020, 2022 Intel Corporation
+ * Copyright (C) 2018-2020, 2022-2023 Intel Corporation
  */
 #include <linux/etherdevice.h>
 #include "mvm.h"
@@ -144,7 +144,8 @@ void iwl_mvm_recalc_tdls_state(struct iwl_mvm *mvm, struct ieee80211_vif *vif,
 }
 
 void iwl_mvm_mac_mgd_protect_tdls_discover(struct ieee80211_hw *hw,
-					   struct ieee80211_vif *vif)
+					   struct ieee80211_vif *vif,
+					   unsigned int link_id)
 {
 	struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw);
 	u32 duration = 2 * vif->bss_conf.dtim_period * vif->bss_conf.beacon_int;
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 7dae9aac089c..0ce5b0831884 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -4522,7 +4522,8 @@ struct ieee80211_ops {
 				   struct ieee80211_prep_tx_info *info);
 
 	void	(*mgd_protect_tdls_discover)(struct ieee80211_hw *hw,
-					     struct ieee80211_vif *vif);
+					     struct ieee80211_vif *vif,
+					     unsigned int link_id);
 
 	int (*add_chanctx)(struct ieee80211_hw *hw,
 			   struct ieee80211_chanctx_conf *ctx);
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index e07e65da15ee..d92de4cd960b 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -955,7 +955,8 @@ static inline void drv_mgd_complete_tx(struct ieee80211_local *local,
 
 static inline void
 drv_mgd_protect_tdls_discover(struct ieee80211_local *local,
-			      struct ieee80211_sub_if_data *sdata)
+			      struct ieee80211_sub_if_data *sdata,
+			      int link_id)
 {
 	might_sleep();
 	lockdep_assert_wiphy(local->hw.wiphy);
@@ -964,9 +965,12 @@ drv_mgd_protect_tdls_discover(struct ieee80211_local *local,
 		return;
 	WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION);
 
+	link_id = link_id > 0 ? link_id : 0;
+
 	trace_drv_mgd_protect_tdls_discover(local, sdata);
 	if (local->ops->mgd_protect_tdls_discover)
-		local->ops->mgd_protect_tdls_discover(&local->hw, &sdata->vif);
+		local->ops->mgd_protect_tdls_discover(&local->hw, &sdata->vif,
+						      link_id);
 	trace_drv_return_void(local);
 }
 
diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c
index f3fd66d30b84..05a7dff69fe9 100644
--- a/net/mac80211/tdls.c
+++ b/net/mac80211/tdls.c
@@ -1318,7 +1318,7 @@ int ieee80211_tdls_mgmt(struct wiphy *wiphy, struct net_device *dev,
 		 * response frame. It is transmitted directly and not buffered
 		 * by the AP.
 		 */
-		drv_mgd_protect_tdls_discover(sdata->local, sdata);
+		drv_mgd_protect_tdls_discover(sdata->local, sdata, link_id);
 		fallthrough;
 	case WLAN_TDLS_SETUP_CONFIRM:
 	case WLAN_PUB_ACTION_TDLS_DISCOVER_RES:
-- 
cgit v1.2.3-70-g09d2


From e76f3b4a73ea60ef098c5762b2aef4d11e094a04 Mon Sep 17 00:00:00 2001
From: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Date: Thu, 28 Sep 2023 17:35:34 +0300
Subject: wifi: mac80211: add link id to mgd_prepare_tx()

As we are moving to MLO and links terms, also the airtime protection
will be done for a link rather than for a vif. Thus, some
drivers will need to know for which link to protect airtime.
Add link id as a parameter to the mgd_prepare_tx() callback.

Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230928172905.c7fc59a6780b.Ic88a5037d31e184a2dce0b031ece1a0a93a3a9da@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h    | 3 +++
 net/mac80211/driver-ops.h | 1 +
 net/mac80211/mlme.c       | 9 ++++++++-
 3 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 0ce5b0831884..c839a04ad9db 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -3641,11 +3641,14 @@ enum ieee80211_reconfig_type {
  * @success: whether the frame exchange was successful, only
  *	used with the mgd_complete_tx() method, and then only
  *	valid for auth and (re)assoc.
+ * @link_id: the link id on which the frame will be TX'ed.
+ *	Only used with the mgd_prepare_tx() method.
  */
 struct ieee80211_prep_tx_info {
 	u16 duration;
 	u16 subtype;
 	u8 success:1;
+	int link_id;
 };
 
 /**
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index d92de4cd960b..568633b38c47 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -928,6 +928,7 @@ static inline void drv_mgd_prepare_tx(struct ieee80211_local *local,
 		return;
 	WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION);
 
+	info->link_id = info->link_id < 0 ? 0 : info->link_id;
 	trace_drv_mgd_prepare_tx(local, sdata, info->duration,
 				 info->subtype, info->success);
 	if (local->ops->mgd_prepare_tx)
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index e71c5129cc8b..54a9f6db799e 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1584,6 +1584,7 @@ static int ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
 
 	ifmgd->assoc_req_ies_len = pos - ie_start;
 
+	info.link_id = assoc_data->assoc_link_id;
 	drv_mgd_prepare_tx(local, sdata, &info);
 
 	IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
@@ -2948,8 +2949,10 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 				if (link->u.mgd.have_beacon)
 					break;
 			}
-			if (link_id == IEEE80211_MLD_MAX_NUM_LINKS)
+			if (link_id == IEEE80211_MLD_MAX_NUM_LINKS) {
+				info.link_id = ffs(sdata->vif.active_links) - 1;
 				drv_mgd_prepare_tx(sdata->local, sdata, &info);
+			}
 		}
 
 		ieee80211_send_deauth_disassoc(sdata, sdata->vif.cfg.ap_addr,
@@ -3577,6 +3580,7 @@ static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata,
 	u32 tx_flags = 0;
 	struct ieee80211_prep_tx_info info = {
 		.subtype = IEEE80211_STYPE_AUTH,
+		.link_id = auth_data->link_id,
 	};
 
 	pos = mgmt->u.auth.variable;
@@ -6569,6 +6573,7 @@ static int ieee80211_auth(struct ieee80211_sub_if_data *sdata)
 	if (auth_data->algorithm == WLAN_AUTH_SAE)
 		info.duration = jiffies_to_msecs(IEEE80211_AUTH_TIMEOUT_SAE);
 
+	info.link_id = auth_data->link_id;
 	drv_mgd_prepare_tx(local, sdata, &info);
 
 	sdata_info(sdata, "send auth to %pM (try %d/%d)\n",
@@ -7989,6 +7994,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
 			   req->bssid, req->reason_code,
 			   ieee80211_get_reason_code_string(req->reason_code));
 
+		info.link_id = ifmgd->auth_data->link_id;
 		drv_mgd_prepare_tx(sdata->local, sdata, &info);
 		ieee80211_send_deauth_disassoc(sdata, req->bssid, req->bssid,
 					       IEEE80211_STYPE_DEAUTH,
@@ -8009,6 +8015,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
 			   req->bssid, req->reason_code,
 			   ieee80211_get_reason_code_string(req->reason_code));
 
+		info.link_id = ifmgd->assoc_data->assoc_link_id;
 		drv_mgd_prepare_tx(sdata->local, sdata, &info);
 		ieee80211_send_deauth_disassoc(sdata, req->bssid, req->bssid,
 					       IEEE80211_STYPE_DEAUTH,
-- 
cgit v1.2.3-70-g09d2


From 98e0c7f85773ab161d804103649b33f8a97cf1b3 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sun, 1 Oct 2023 12:16:31 -0700
Subject: wifi: cfg80211: fix header kernel-doc typos

Correct spelling of several words.

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: Kalle Valo <kvalo@kernel.org>
Cc: linux-wireless@vger.kernel.org
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: netdev@vger.kernel.org
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20231001191633.19090-1-rdunlap@infradead.org
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index ebc0d733f876..ff4ab88393c3 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2443,7 +2443,7 @@ struct mesh_config {
  * @user_mpm: userspace handles all MPM functions
  * @dtim_period: DTIM period to use
  * @beacon_interval: beacon interval to use
- * @mcast_rate: multicat rate for Mesh Node [6Mbps is the default for 802.11a]
+ * @mcast_rate: multicast rate for Mesh Node [6Mbps is the default for 802.11a]
  * @basic_rates: basic rates to use when creating the mesh
  * @beacon_rate: bitrate to be used for beacons
  * @userspace_handles_dfs: whether user space controls DFS operation, i.e.
@@ -2713,7 +2713,7 @@ struct cfg80211_bss_select_adjust {
  * @match_sets: sets of parameters to be matched for a scan result
  *	entry to be considered valid and to be passed to the host
  *	(others are filtered out).
- *	If ommited, all results are passed.
+ *	If omitted, all results are passed.
  * @n_match_sets: number of match sets
  * @report_results: indicates that results were reported for this request
  * @wiphy: the wiphy this was for
@@ -2747,7 +2747,7 @@ struct cfg80211_bss_select_adjust {
  *	to the specified band while deciding whether a better BSS is reported
  *	using @relative_rssi. If delta is a negative number, the BSSs that
  *	belong to the specified band will be penalized by delta dB in relative
- *	comparisions.
+ *	comparisons.
  */
 struct cfg80211_sched_scan_request {
 	u64 reqid;
@@ -3587,7 +3587,7 @@ struct cfg80211_update_ft_ies_params {
  * This structure provides information needed to transmit a mgmt frame
  *
  * @chan: channel to use
- * @offchan: indicates wether off channel operation is required
+ * @offchan: indicates whether off channel operation is required
  * @wait: duration for ROC
  * @buf: buffer to transmit
  * @len: buffer length
@@ -3705,7 +3705,7 @@ struct cfg80211_nan_func_filter {
  * @publish_bcast: if true, the solicited publish should be broadcasted
  * @subscribe_active: if true, the subscribe is active
  * @followup_id: the instance ID for follow up
- * @followup_reqid: the requestor instance ID for follow up
+ * @followup_reqid: the requester instance ID for follow up
  * @followup_dest: MAC address of the recipient of the follow up
  * @ttl: time to live counter in DW.
  * @serv_spec_info: Service Specific Info
@@ -6714,7 +6714,7 @@ static inline const u8 *cfg80211_find_ie(u8 eid, const u8 *ies, int len)
  * @ies: data consisting of IEs
  * @len: length of data
  *
- * Return: %NULL if the etended element could not be found or if
+ * Return: %NULL if the extended element could not be found or if
  * the element is invalid (claims to be longer than the given
  * data) or if the byte array doesn't match; otherwise return the
  * requested element struct.
@@ -6861,7 +6861,7 @@ int regulatory_hint(struct wiphy *wiphy, const char *alpha2);
 /**
  * regulatory_set_wiphy_regd - set regdom info for self managed drivers
  * @wiphy: the wireless device we want to process the regulatory domain on
- * @rd: the regulatory domain informatoin to use for this wiphy
+ * @rd: the regulatory domain information to use for this wiphy
  *
  * Set the regulatory domain information for self-managed wiphys, only they
  * may use this function. See %REGULATORY_WIPHY_SELF_MANAGED for more
@@ -6952,7 +6952,7 @@ bool regulatory_pre_cac_allowed(struct wiphy *wiphy);
  * Regulatory self-managed driver can use it to proactively
  *
  * @alpha2: the ISO/IEC 3166 alpha2 wmm rule to be queried.
- * @freq: the freqency(in MHz) to be queried.
+ * @freq: the frequency (in MHz) to be queried.
  * @rule: pointer to store the wmm rule from the regulatory db.
  *
  * Self-managed wireless drivers can use this function to  query
@@ -9129,9 +9129,9 @@ bool cfg80211_iftype_allowed(struct wiphy *wiphy, enum nl80211_iftype iftype,
 
 /**
  * cfg80211_assoc_comeback - notification of association that was
- * temporarly rejected with a comeback
+ * temporarily rejected with a comeback
  * @netdev: network device
- * @ap_addr: AP (MLD) address that rejected the assocation
+ * @ap_addr: AP (MLD) address that rejected the association
  * @timeout: timeout interval value TUs.
  *
  * this function may sleep. the caller must hold the corresponding wdev's mutex.
-- 
cgit v1.2.3-70-g09d2


From cbb56fbaafd95e7e8d7e95d3753477a9069e795f Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sun, 1 Oct 2023 12:16:32 -0700
Subject: wifi: mac80211: fix header kernel-doc typos

Correct typos and fix run-on sentences.

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: Kalle Valo <kvalo@kernel.org>
Cc: linux-wireless@vger.kernel.org
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: netdev@vger.kernel.org
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20231001191633.19090-2-rdunlap@infradead.org
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index c839a04ad9db..511d5d1c042f 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -79,7 +79,7 @@
  * helpers for sanity checking. Drivers must ensure all work added onto the
  * mac80211 workqueue should be cancelled on the driver stop() callback.
  *
- * mac80211 will flushed the workqueue upon interface removal and during
+ * mac80211 will flush the workqueue upon interface removal and during
  * suspend.
  *
  * All work performed on the mac80211 workqueue must not acquire the RTNL lock.
@@ -138,7 +138,7 @@
  * field to the frame RX timestamp and report the ack TX timestamp in the
  * ieee80211_rx_status struct.
  *
- * Similarly, To report hardware timestamps for Timing Measurement or Fine
+ * Similarly, to report hardware timestamps for Timing Measurement or Fine
  * Timing Measurement frame TX, the driver should set the SKB's hwtstamp field
  * to the frame TX timestamp and report the ack RX timestamp in the
  * ieee80211_tx_status struct.
@@ -3062,7 +3062,7 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb);
  * The set_key() call for the %SET_KEY command should return 0 if
  * the key is now in use, -%EOPNOTSUPP or -%ENOSPC if it couldn't be
  * added; if you return 0 then hw_key_idx must be assigned to the
- * hardware key index, you are free to use the full u8 range.
+ * hardware key index. You are free to use the full u8 range.
  *
  * Note that in the case that the @IEEE80211_HW_SW_CRYPTO_CONTROL flag is
  * set, mac80211 will not automatically fall back to software crypto if
@@ -3072,7 +3072,7 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb);
  * When the cmd is %DISABLE_KEY then it must succeed.
  *
  * Note that it is permissible to not decrypt a frame even if a key
- * for it has been uploaded to hardware, the stack will not make any
+ * for it has been uploaded to hardware. The stack will not make any
  * decision based on whether a key has been uploaded or not but rather
  * based on the receive flags.
  *
@@ -3087,7 +3087,7 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb);
  * The update_tkip_key() call updates the driver with the new phase 1 key.
  * This happens every time the iv16 wraps around (every 65536 packets). The
  * set_key() call will happen only once for each key (unless the AP did
- * rekeying), it will not include a valid phase 1 key. The valid phase 1 key is
+ * rekeying); it will not include a valid phase 1 key. The valid phase 1 key is
  * provided by update_tkip_key only. The trigger that makes mac80211 call this
  * handler is software decryption with wrap around of iv16.
  *
@@ -3114,7 +3114,7 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb);
  *
  * mac80211 has support for various powersave implementations.
  *
- * First, it can support hardware that handles all powersaving by itself,
+ * First, it can support hardware that handles all powersaving by itself;
  * such hardware should simply set the %IEEE80211_HW_SUPPORTS_PS hardware
  * flag. In that case, it will be told about the desired powersave mode
  * with the %IEEE80211_CONF_PS flag depending on the association status.
@@ -3139,12 +3139,12 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb);
  * %IEEE80211_HW_PS_NULLFUNC_STACK flags. The hardware is of course still
  * required to pass up beacons. The hardware is still required to handle
  * waking up for multicast traffic; if it cannot the driver must handle that
- * as best as it can, mac80211 is too slow to do that.
+ * as best as it can; mac80211 is too slow to do that.
  *
  * Dynamic powersave is an extension to normal powersave in which the
  * hardware stays awake for a user-specified period of time after sending a
  * frame so that reply frames need not be buffered and therefore delayed to
- * the next wakeup. It's compromise of getting good enough latency when
+ * the next wakeup. It's a compromise of getting good enough latency when
  * there's data traffic and still saving significantly power in idle
  * periods.
  *
@@ -3209,7 +3209,7 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb);
  * Note that change, for the sake of simplification, also includes information
  * elements appearing or disappearing from the beacon.
  *
- * Some hardware supports an "ignore list" instead, just make sure nothing
+ * Some hardware supports an "ignore list" instead. Just make sure nothing
  * that was requested is on the ignore list, and include commonly changing
  * information element IDs in the ignore list, for example 11 (BSS load) and
  * the various vendor-assigned IEs with unknown contents (128, 129, 133-136,
@@ -3220,7 +3220,7 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb);
  * In addition to these capabilities, hardware should support notifying the
  * host of changes in the beacon RSSI. This is relevant to implement roaming
  * when no traffic is flowing (when traffic is flowing we see the RSSI of
- * the received data packets). This can consist in notifying the host when
+ * the received data packets). This can consist of notifying the host when
  * the RSSI changes significantly or when it drops below or rises above
  * configurable thresholds. In the future these thresholds will also be
  * configured by mac80211 (which gets them from userspace) to implement
@@ -3367,8 +3367,8 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb);
  * period starts for any reason, @release_buffered_frames is called
  * with the number of frames to be released and which TIDs they are
  * to come from. In this case, the driver is responsible for setting
- * the EOSP (for uAPSD) and MORE_DATA bits in the released frames,
- * to help the @more_data parameter is passed to tell the driver if
+ * the EOSP (for uAPSD) and MORE_DATA bits in the released frames.
+ * To help the @more_data parameter is passed to tell the driver if
  * there is more data on other TIDs -- the TIDs to release frames
  * from are ignored since mac80211 doesn't know how many frames the
  * buffers for those TIDs contain.
@@ -3417,7 +3417,7 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb);
  * Additionally, the driver has to then use these HW queue IDs for the queue
  * management functions (ieee80211_stop_queue() et al.)
  *
- * The driver is free to set up the queue mappings as needed, multiple virtual
+ * The driver is free to set up the queue mappings as needed; multiple virtual
  * interfaces may map to the same hardware queues if needed. The setup has to
  * happen during add_interface or change_interface callbacks. For example, a
  * driver supporting station+station and station+AP modes might decide to have
@@ -7266,7 +7266,7 @@ ieee80211_return_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq,
  *
  * This function is used to check whether given txq is allowed to transmit by
  * the airtime scheduler, and can be used by drivers to access the airtime
- * fairness accounting without going using the scheduling order enfored by
+ * fairness accounting without using the scheduling order enforced by
  * next_txq().
  *
  * Returns %true if the airtime scheduler thinks the TXQ should be allowed to
-- 
cgit v1.2.3-70-g09d2


From 8c90b8b4e8eb671929dc684b96e059cd862c067f Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sun, 1 Oct 2023 12:16:33 -0700
Subject: wifi: nl80211: fix doc typos

Correct some typos.

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: Kalle Valo <kvalo@kernel.org>
Cc: linux-wireless@vger.kernel.org
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: netdev@vger.kernel.org
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20231001191633.19090-3-rdunlap@infradead.org
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 0c64994b118e..dced2c49daec 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -167,7 +167,7 @@
  * following events occur.
  * a) Expiration of hardware timer whose expiration time is set to maximum
  * coalescing delay of matching coalesce rule.
- * b) Coalescing buffer in hardware reaches it's limit.
+ * b) Coalescing buffer in hardware reaches its limit.
  * c) Packet doesn't match any of the configured coalesce rules.
  *
  * User needs to configure following parameters for creating a coalesce
@@ -326,7 +326,7 @@
 /**
  * DOC: Multi-Link Operation
  *
- * In Multi-Link Operation, a connection between to MLDs utilizes multiple
+ * In Multi-Link Operation, a connection between two MLDs utilizes multiple
  * links. To use this in nl80211, various commands and responses now need
  * to or will include the new %NL80211_ATTR_MLO_LINKS attribute.
  * Additionally, various commands that need to operate on a specific link
-- 
cgit v1.2.3-70-g09d2


From 89141f965325de9aac3805cd506df788179b2809 Mon Sep 17 00:00:00 2001
From: Dmitry Antipov <dmantipov@yandex.ru>
Date: Wed, 4 Oct 2023 18:30:29 +0300
Subject: wifi: remove unused argument of ieee80211_get_tdls_action()

Remove unused 'hdr_size' argument of 'ieee80211_get_tdls_action()'
and adjust 'ieee80211_report_used_skb()' accordingly.

Signed-off-by: Dmitry Antipov <dmantipov@yandex.ru>
Link: https://lore.kernel.org/r/20231004153032.206134-1-dmantipov@yandex.ru
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 3 +--
 net/mac80211/status.c     | 5 +----
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 62b4469c6866..045a776ee547 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -4490,12 +4490,11 @@ static inline bool ieee80211_check_tim(const struct ieee80211_tim_ie *tim,
 /**
  * ieee80211_get_tdls_action - get tdls packet action (or -1, if not tdls packet)
  * @skb: the skb containing the frame, length will not be checked
- * @hdr_size: the size of the ieee80211_hdr that starts at skb->data
  *
  * This function assumes the frame is a data frame, and that the network header
  * is in the correct place.
  */
-static inline int ieee80211_get_tdls_action(struct sk_buff *skb, u32 hdr_size)
+static inline int ieee80211_get_tdls_action(struct sk_buff *skb)
 {
 	if (!skb_is_nonlinear(skb) &&
 	    skb->len > (skb_network_offset(skb) + 2)) {
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index f67eafada741..807cdab38d5e 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -731,12 +731,9 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local,
 		if (!sdata) {
 			skb->dev = NULL;
 		} else if (!dropped) {
-			unsigned int hdr_size =
-				ieee80211_hdrlen(hdr->frame_control);
-
 			/* Check to see if packet is a TDLS teardown packet */
 			if (ieee80211_is_data(hdr->frame_control) &&
-			    (ieee80211_get_tdls_action(skb, hdr_size) ==
+			    (ieee80211_get_tdls_action(skb) ==
 			     WLAN_TDLS_TEARDOWN)) {
 				ieee80211_tdls_td_tx_handle(local, sdata, skb,
 							    info->flags);
-- 
cgit v1.2.3-70-g09d2


From 2703bc8513996e848b5aefa2deb1ff3baae5d79b Mon Sep 17 00:00:00 2001
From: Kalle Valo <quic_kvalo@quicinc.com>
Date: Thu, 12 Oct 2023 14:42:28 +0300
Subject: wifi: mac80211: rename ieee80211_tx_status() to
 ieee80211_tx_status_skb()

make htmldocs warns:

Documentation/driver-api/80211/mac80211:109: ./include/net/mac80211.h:5170: WARNING: Duplicate C declaration, also defined at mac80211:1117.
Declaration is '.. c:function:: void ieee80211_tx_status (struct ieee80211_hw *hw, struct sk_buff *skb)'.

This is because there's a function named ieee80211_tx_status() and a struct named
ieee80211_tx_status. This has been discussed previously but no solution found:

https://lore.kernel.org/all/20220521114629.6ee9fc06@coco.lan/

There's also a bug open for three years with no solution in sight:

https://github.com/sphinx-doc/sphinx/pull/8313

So I guess we have no other solution than to a workaround this in the code,
for example to rename the function to ieee80211_tx_status_skb() to avoid the
name conflict. I got the idea for the name from ieee80211_tx_status_noskb() in
which the skb is not provided as an argument, instead with
ieee80211_tx_status_skb() the skb is provided.

Compile tested only.

Signed-off-by: Kalle Valo <quic_kvalo@quicinc.com>
Link: https://lore.kernel.org/r/20231012114229.2931808-2-kvalo@kernel.org
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 Documentation/driver-api/80211/mac80211.rst    |  2 +-
 drivers/net/wireless/ath/ath12k/dp_tx.c        |  4 ++--
 drivers/net/wireless/ath/ath5k/base.c          |  2 +-
 drivers/net/wireless/ath/ath9k/htc_drv_txrx.c  |  2 +-
 drivers/net/wireless/ath/ath9k/xmit.c          |  2 +-
 drivers/net/wireless/broadcom/b43/dma.c        |  4 ++--
 drivers/net/wireless/broadcom/b43/pio.c        |  2 +-
 drivers/net/wireless/intel/iwlwifi/dvm/tx.c    |  4 ++--
 drivers/net/wireless/intel/iwlwifi/mvm/tx.c    |  4 ++--
 drivers/net/wireless/mediatek/mt7601u/tx.c     |  2 +-
 drivers/net/wireless/ralink/rt2x00/rt2x00dev.c |  2 +-
 drivers/net/wireless/st/cw1200/txrx.c          |  2 +-
 drivers/net/wireless/ti/wl1251/tx.c            |  6 +++---
 include/net/mac80211.h                         | 30 +++++++++++++-------------
 net/mac80211/main.c                            |  2 +-
 net/mac80211/status.c                          |  4 ++--
 16 files changed, 37 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-api/80211/mac80211.rst b/Documentation/driver-api/80211/mac80211.rst
index 67d2e58b45e4..e38a220401f5 100644
--- a/Documentation/driver-api/80211/mac80211.rst
+++ b/Documentation/driver-api/80211/mac80211.rst
@@ -120,7 +120,7 @@ functions/definitions
 	ieee80211_rx
 	ieee80211_rx_ni
 	ieee80211_rx_irqsafe
-	ieee80211_tx_status
+	ieee80211_tx_status_skb
 	ieee80211_tx_status_ni
 	ieee80211_tx_status_irqsafe
 	ieee80211_rts_get
diff --git a/drivers/net/wireless/ath/ath12k/dp_tx.c b/drivers/net/wireless/ath/ath12k/dp_tx.c
index f5e0f5426226..492ca6ce6714 100644
--- a/drivers/net/wireless/ath/ath12k/dp_tx.c
+++ b/drivers/net/wireless/ath/ath12k/dp_tx.c
@@ -401,7 +401,7 @@ ath12k_dp_tx_htt_tx_complete_buf(struct ath12k_base *ab,
 		}
 	}
 
-	ieee80211_tx_status(ar->hw, msdu);
+	ieee80211_tx_status_skb(ar->hw, msdu);
 }
 
 static void
@@ -498,7 +498,7 @@ static void ath12k_dp_tx_complete_msdu(struct ath12k *ar,
 	 * Might end up reporting it out-of-band from HTT stats.
 	 */
 
-	ieee80211_tx_status(ar->hw, msdu);
+	ieee80211_tx_status_skb(ar->hw, msdu);
 
 exit:
 	rcu_read_unlock();
diff --git a/drivers/net/wireless/ath/ath5k/base.c b/drivers/net/wireless/ath/ath5k/base.c
index 597d1f916dfd..9f534ed2fbb3 100644
--- a/drivers/net/wireless/ath/ath5k/base.c
+++ b/drivers/net/wireless/ath/ath5k/base.c
@@ -1770,7 +1770,7 @@ ath5k_tx_frame_completed(struct ath5k_hw *ah, struct sk_buff *skb,
 		ah->stats.antenna_tx[0]++; /* invalid */
 
 	trace_ath5k_tx_complete(ah, skb, txq, ts);
-	ieee80211_tx_status(ah->hw, skb);
+	ieee80211_tx_status_skb(ah->hw, skb);
 }
 
 static void
diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_txrx.c b/drivers/net/wireless/ath/ath9k/htc_drv_txrx.c
index 672789e3c55d..800177021baf 100644
--- a/drivers/net/wireless/ath/ath9k/htc_drv_txrx.c
+++ b/drivers/net/wireless/ath/ath9k/htc_drv_txrx.c
@@ -523,7 +523,7 @@ send_mac80211:
 	}
 
 	/* Send status to mac80211 */
-	ieee80211_tx_status(priv->hw, skb);
+	ieee80211_tx_status_skb(priv->hw, skb);
 }
 
 static inline void ath9k_htc_tx_drainq(struct ath9k_htc_priv *priv,
diff --git a/drivers/net/wireless/ath/ath9k/xmit.c b/drivers/net/wireless/ath/ath9k/xmit.c
index 4e939dcac1c9..f15684379b03 100644
--- a/drivers/net/wireless/ath/ath9k/xmit.c
+++ b/drivers/net/wireless/ath/ath9k/xmit.c
@@ -94,7 +94,7 @@ static void ath_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
 
 	if (info->flags & (IEEE80211_TX_CTL_REQ_TX_STATUS |
 			   IEEE80211_TX_STATUS_EOSP)) {
-		ieee80211_tx_status(hw, skb);
+		ieee80211_tx_status_skb(hw, skb);
 		return;
 	}
 
diff --git a/drivers/net/wireless/broadcom/b43/dma.c b/drivers/net/wireless/broadcom/b43/dma.c
index 9a7c62bd5e43..760d1a28edc6 100644
--- a/drivers/net/wireless/broadcom/b43/dma.c
+++ b/drivers/net/wireless/broadcom/b43/dma.c
@@ -1531,9 +1531,9 @@ void b43_dma_handle_txstatus(struct b43_wldev *dev,
 				ring->nr_failed_tx_packets++;
 			ring->nr_total_packet_tries += status->frame_count;
 #endif /* DEBUG */
-			ieee80211_tx_status(dev->wl->hw, meta->skb);
+			ieee80211_tx_status_skb(dev->wl->hw, meta->skb);
 
-			/* skb will be freed by ieee80211_tx_status().
+			/* skb will be freed by ieee80211_tx_status_skb().
 			 * Poison our pointer. */
 			meta->skb = B43_DMA_PTR_POISON;
 		} else {
diff --git a/drivers/net/wireless/broadcom/b43/pio.c b/drivers/net/wireless/broadcom/b43/pio.c
index 8c28a9250cd1..0cf70fdb60a6 100644
--- a/drivers/net/wireless/broadcom/b43/pio.c
+++ b/drivers/net/wireless/broadcom/b43/pio.c
@@ -582,7 +582,7 @@ void b43_pio_handle_txstatus(struct b43_wldev *dev,
 	q->buffer_used -= total_len;
 	q->free_packet_slots += 1;
 
-	ieee80211_tx_status(dev->wl->hw, pack->skb);
+	ieee80211_tx_status_skb(dev->wl->hw, pack->skb);
 	pack->skb = NULL;
 	list_add(&pack->list, &q->packets_list);
 
diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/tx.c b/drivers/net/wireless/intel/iwlwifi/dvm/tx.c
index 60a7b61d59aa..b0322af8e081 100644
--- a/drivers/net/wireless/intel/iwlwifi/dvm/tx.c
+++ b/drivers/net/wireless/intel/iwlwifi/dvm/tx.c
@@ -1247,7 +1247,7 @@ void iwlagn_rx_reply_tx(struct iwl_priv *priv, struct iwl_rx_cmd_buffer *rxb)
 
 	while (!skb_queue_empty(&skbs)) {
 		skb = __skb_dequeue(&skbs);
-		ieee80211_tx_status(priv->hw, skb);
+		ieee80211_tx_status_skb(priv->hw, skb);
 	}
 }
 
@@ -1384,6 +1384,6 @@ void iwlagn_rx_reply_compressed_ba(struct iwl_priv *priv,
 
 	while (!skb_queue_empty(&reclaimed_skbs)) {
 		skb = __skb_dequeue(&reclaimed_skbs);
-		ieee80211_tx_status(priv->hw, skb);
+		ieee80211_tx_status_skb(priv->hw, skb);
 	}
 }
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
index b0f3d51a7613..73c5f1094a75 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
@@ -1724,7 +1724,7 @@ static void iwl_mvm_rx_tx_cmd_single(struct iwl_mvm *mvm,
 			RS_DRV_DATA_PACK(lq_color, tx_resp->reduced_tpc);
 
 		if (likely(!iwl_mvm_time_sync_frame(mvm, skb, hdr->addr1)))
-			ieee80211_tx_status(mvm->hw, skb);
+			ieee80211_tx_status_skb(mvm->hw, skb);
 	}
 
 	/* This is an aggregation queue or might become one, so we use
@@ -2080,7 +2080,7 @@ out:
 
 	while (!skb_queue_empty(&reclaimed_skbs)) {
 		skb = __skb_dequeue(&reclaimed_skbs);
-		ieee80211_tx_status(mvm->hw, skb);
+		ieee80211_tx_status_skb(mvm->hw, skb);
 	}
 }
 
diff --git a/drivers/net/wireless/mediatek/mt7601u/tx.c b/drivers/net/wireless/mediatek/mt7601u/tx.c
index 51d977ffc52f..5aeeac0dd9fe 100644
--- a/drivers/net/wireless/mediatek/mt7601u/tx.c
+++ b/drivers/net/wireless/mediatek/mt7601u/tx.c
@@ -110,7 +110,7 @@ void mt7601u_tx_status(struct mt7601u_dev *dev, struct sk_buff *skb)
 	info->flags |= IEEE80211_TX_STAT_ACK;
 
 	spin_lock_bh(&dev->mac_lock);
-	ieee80211_tx_status(dev->hw, skb);
+	ieee80211_tx_status_skb(dev->hw, skb);
 	spin_unlock_bh(&dev->mac_lock);
 }
 
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c b/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c
index 9a9cfd0ce402..c88ce446e117 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c
@@ -533,7 +533,7 @@ void rt2x00lib_txdone(struct queue_entry *entry,
 	 */
 	if (!(skbdesc_flags & SKBDESC_NOT_MAC80211)) {
 		if (rt2x00_has_cap_flag(rt2x00dev, REQUIRE_TASKLET_CONTEXT))
-			ieee80211_tx_status(rt2x00dev->hw, entry->skb);
+			ieee80211_tx_status_skb(rt2x00dev->hw, entry->skb);
 		else
 			ieee80211_tx_status_ni(rt2x00dev->hw, entry->skb);
 	} else {
diff --git a/drivers/net/wireless/st/cw1200/txrx.c b/drivers/net/wireless/st/cw1200/txrx.c
index e16e9ae90d20..084d52b11f5b 100644
--- a/drivers/net/wireless/st/cw1200/txrx.c
+++ b/drivers/net/wireless/st/cw1200/txrx.c
@@ -994,7 +994,7 @@ void cw1200_skb_dtor(struct cw1200_common *priv,
 					  txpriv->raw_link_id, txpriv->tid);
 		tx_policy_put(priv, txpriv->rate_id);
 	}
-	ieee80211_tx_status(priv->hw, skb);
+	ieee80211_tx_status_skb(priv->hw, skb);
 }
 
 void cw1200_rx_cb(struct cw1200_common *priv,
diff --git a/drivers/net/wireless/ti/wl1251/tx.c b/drivers/net/wireless/ti/wl1251/tx.c
index e9dc3c72bb11..474b603c121c 100644
--- a/drivers/net/wireless/ti/wl1251/tx.c
+++ b/drivers/net/wireless/ti/wl1251/tx.c
@@ -434,7 +434,7 @@ static void wl1251_tx_packet_cb(struct wl1251 *wl,
 		     result->status, wl1251_tx_parse_status(result->status));
 
 
-	ieee80211_tx_status(wl->hw, skb);
+	ieee80211_tx_status_skb(wl->hw, skb);
 
 	wl->tx_frames[result->id] = NULL;
 }
@@ -566,7 +566,7 @@ void wl1251_tx_flush(struct wl1251 *wl)
 		if (!(info->flags & IEEE80211_TX_CTL_REQ_TX_STATUS))
 				continue;
 
-		ieee80211_tx_status(wl->hw, skb);
+		ieee80211_tx_status_skb(wl->hw, skb);
 	}
 
 	for (i = 0; i < FW_TX_CMPLT_BLOCK_SIZE; i++)
@@ -577,7 +577,7 @@ void wl1251_tx_flush(struct wl1251 *wl)
 			if (!(info->flags & IEEE80211_TX_CTL_REQ_TX_STATUS))
 				continue;
 
-			ieee80211_tx_status(wl->hw, skb);
+			ieee80211_tx_status_skb(wl->hw, skb);
 			wl->tx_frames[i] = NULL;
 		}
 }
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 511d5d1c042f..580781ff9dcf 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -4911,7 +4911,7 @@ void ieee80211_restart_hw(struct ieee80211_hw *hw);
  * for a single hardware must be synchronized against each other. Calls to
  * this function, ieee80211_rx_ni() and ieee80211_rx_irqsafe() may not be
  * mixed for a single hardware. Must not run concurrently with
- * ieee80211_tx_status() or ieee80211_tx_status_ni().
+ * ieee80211_tx_status_skb() or ieee80211_tx_status_ni().
  *
  * This function must be called with BHs disabled and RCU read lock
  *
@@ -4936,7 +4936,7 @@ void ieee80211_rx_list(struct ieee80211_hw *hw, struct ieee80211_sta *sta,
  * for a single hardware must be synchronized against each other. Calls to
  * this function, ieee80211_rx_ni() and ieee80211_rx_irqsafe() may not be
  * mixed for a single hardware. Must not run concurrently with
- * ieee80211_tx_status() or ieee80211_tx_status_ni().
+ * ieee80211_tx_status_skb() or ieee80211_tx_status_ni().
  *
  * This function must be called with BHs disabled.
  *
@@ -4961,7 +4961,7 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *sta,
  * for a single hardware must be synchronized against each other. Calls to
  * this function, ieee80211_rx_ni() and ieee80211_rx_irqsafe() may not be
  * mixed for a single hardware. Must not run concurrently with
- * ieee80211_tx_status() or ieee80211_tx_status_ni().
+ * ieee80211_tx_status_skb() or ieee80211_tx_status_ni().
  *
  * In process context use instead ieee80211_rx_ni().
  *
@@ -4981,7 +4981,7 @@ static inline void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb)
  *
  * Calls to this function, ieee80211_rx() or ieee80211_rx_ni() may not
  * be mixed for a single hardware.Must not run concurrently with
- * ieee80211_tx_status() or ieee80211_tx_status_ni().
+ * ieee80211_tx_status_skb() or ieee80211_tx_status_ni().
  *
  * @hw: the hardware this frame came in on
  * @skb: the buffer to receive, owned by mac80211 after this call
@@ -4996,7 +4996,7 @@ void ieee80211_rx_irqsafe(struct ieee80211_hw *hw, struct sk_buff *skb);
  *
  * Calls to this function, ieee80211_rx() and ieee80211_rx_irqsafe() may
  * not be mixed for a single hardware. Must not run concurrently with
- * ieee80211_tx_status() or ieee80211_tx_status_ni().
+ * ieee80211_tx_status_skb() or ieee80211_tx_status_ni().
  *
  * @hw: the hardware this frame came in on
  * @skb: the buffer to receive, owned by mac80211 after this call
@@ -5172,7 +5172,7 @@ void ieee80211_tx_rate_update(struct ieee80211_hw *hw,
 			      struct ieee80211_tx_info *info);
 
 /**
- * ieee80211_tx_status - transmit status callback
+ * ieee80211_tx_status_skb - transmit status callback
  *
  * Call this function for all transmitted frames after they have been
  * transmitted. It is permissible to not call this function for
@@ -5187,13 +5187,13 @@ void ieee80211_tx_rate_update(struct ieee80211_hw *hw,
  * @hw: the hardware the frame was transmitted by
  * @skb: the frame that was transmitted, owned by mac80211 after this call
  */
-void ieee80211_tx_status(struct ieee80211_hw *hw,
-			 struct sk_buff *skb);
+void ieee80211_tx_status_skb(struct ieee80211_hw *hw,
+			     struct sk_buff *skb);
 
 /**
  * ieee80211_tx_status_ext - extended transmit status callback
  *
- * This function can be used as a replacement for ieee80211_tx_status
+ * This function can be used as a replacement for ieee80211_tx_status_skb()
  * in drivers that may want to provide extra information that does not
  * fit into &struct ieee80211_tx_info.
  *
@@ -5210,7 +5210,7 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
 /**
  * ieee80211_tx_status_noskb - transmit status callback without skb
  *
- * This function can be used as a replacement for ieee80211_tx_status
+ * This function can be used as a replacement for ieee80211_tx_status_skb()
  * in drivers that cannot reliably map tx status information back to
  * specific skbs.
  *
@@ -5238,9 +5238,9 @@ static inline void ieee80211_tx_status_noskb(struct ieee80211_hw *hw,
 /**
  * ieee80211_tx_status_ni - transmit status callback (in process context)
  *
- * Like ieee80211_tx_status() but can be called in process context.
+ * Like ieee80211_tx_status_skb() but can be called in process context.
  *
- * Calls to this function, ieee80211_tx_status() and
+ * Calls to this function, ieee80211_tx_status_skb() and
  * ieee80211_tx_status_irqsafe() may not be mixed
  * for a single hardware.
  *
@@ -5251,17 +5251,17 @@ static inline void ieee80211_tx_status_ni(struct ieee80211_hw *hw,
 					  struct sk_buff *skb)
 {
 	local_bh_disable();
-	ieee80211_tx_status(hw, skb);
+	ieee80211_tx_status_skb(hw, skb);
 	local_bh_enable();
 }
 
 /**
  * ieee80211_tx_status_irqsafe - IRQ-safe transmit status callback
  *
- * Like ieee80211_tx_status() but can be called in IRQ context
+ * Like ieee80211_tx_status_skb() but can be called in IRQ context
  * (internally defers to a tasklet.)
  *
- * Calls to this function, ieee80211_tx_status() and
+ * Calls to this function, ieee80211_tx_status_skb() and
  * ieee80211_tx_status_ni() may not be mixed for a single hardware.
  *
  * @hw: the hardware the frame was transmitted by
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index b46f4d733c5d..033a5261ac3a 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -319,7 +319,7 @@ static void ieee80211_tasklet_handler(struct tasklet_struct *t)
 			break;
 		case IEEE80211_TX_STATUS_MSG:
 			skb->pkt_type = 0;
-			ieee80211_tx_status(&local->hw, skb);
+			ieee80211_tx_status_skb(&local->hw, skb);
 			break;
 		default:
 			WARN(1, "mac80211: Packet is of unknown type %d\n",
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index 807cdab38d5e..1708b33cdc5e 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -1092,7 +1092,7 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw,
 			     send_to_cooked, status);
 }
 
-void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
+void ieee80211_tx_status_skb(struct ieee80211_hw *hw, struct sk_buff *skb)
 {
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
 	struct ieee80211_local *local = hw_to_local(hw);
@@ -1111,7 +1111,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
 	ieee80211_tx_status_ext(hw, &status);
 	rcu_read_unlock();
 }
-EXPORT_SYMBOL(ieee80211_tx_status);
+EXPORT_SYMBOL(ieee80211_tx_status_skb);
 
 void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
 			     struct ieee80211_tx_status *status)
-- 
cgit v1.2.3-70-g09d2


From 8e4687f6061ec00d16b06cb2ed6d2593cae19c46 Mon Sep 17 00:00:00 2001
From: Kalle Valo <quic_kvalo@quicinc.com>
Date: Thu, 12 Oct 2023 14:42:29 +0300
Subject: wifi: mac80211: rename struct cfg80211_rx_assoc_resp to
 cfg80211_rx_assoc_resp_data

make htmldocs warns:

Documentation/driver-api/80211/cfg80211:48: ./include/net/cfg80211.h:7290: WARNING: Duplicate C declaration, also defined at cfg80211:7251.
Declaration is '.. c:function:: void cfg80211_rx_assoc_resp (struct net_device *dev, struct cfg80211_rx_assoc_resp *data)'.

This is because there's a function named cfg80211_rx_assoc_resp() and a struct
named cfg80211_rx_assoc_resp, see previous patch for more info.

To workaround this rename the struct to cfg80211_rx_assoc_resp_data. The
parameter for the function is named 'data' anyway so the naming here is
consistent.

Compile tested only.

Signed-off-by: Kalle Valo <quic_kvalo@quicinc.com>
Link: https://lore.kernel.org/r/20231012114229.2931808-3-kvalo@kernel.org
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 8 ++++----
 net/mac80211/mlme.c    | 2 +-
 net/wireless/mlme.c    | 2 +-
 net/wireless/nl80211.c | 2 +-
 net/wireless/nl80211.h | 2 +-
 net/wireless/trace.h   | 2 +-
 6 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index ff4ab88393c3..0e7abb083a77 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -7269,7 +7269,7 @@ void cfg80211_rx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len);
 void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr);
 
 /**
- * struct cfg80211_rx_assoc_resp - association response data
+ * struct cfg80211_rx_assoc_resp_data - association response data
  * @bss: the BSS that association was requested with, ownership of the pointer
  *	moves to cfg80211 in the call to cfg80211_rx_assoc_resp()
  * @buf: (Re)Association Response frame (header + body)
@@ -7284,7 +7284,7 @@ void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr);
  * @links.status: Set this (along with a BSS pointer) for links that
  *	were rejected by the AP.
  */
-struct cfg80211_rx_assoc_resp {
+struct cfg80211_rx_assoc_resp_data {
 	const u8 *buf;
 	size_t len;
 	const u8 *req_ies;
@@ -7301,7 +7301,7 @@ struct cfg80211_rx_assoc_resp {
 /**
  * cfg80211_rx_assoc_resp - notification of processed association response
  * @dev: network device
- * @data: association response data, &struct cfg80211_rx_assoc_resp
+ * @data: association response data, &struct cfg80211_rx_assoc_resp_data
  *
  * After being asked to associate via cfg80211_ops::assoc() the driver must
  * call either this function or cfg80211_auth_timeout().
@@ -7309,7 +7309,7 @@ struct cfg80211_rx_assoc_resp {
  * This function may sleep. The caller must hold the corresponding wdev's mutex.
  */
 void cfg80211_rx_assoc_resp(struct net_device *dev,
-			    struct cfg80211_rx_assoc_resp *data);
+			    struct cfg80211_rx_assoc_resp_data *data);
 
 /**
  * struct cfg80211_assoc_failure - association failure data
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 54a9f6db799e..887b496f2b81 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -5281,7 +5281,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
 		.u.mlme.data = ASSOC_EVENT,
 	};
 	struct ieee80211_prep_tx_info info = {};
-	struct cfg80211_rx_assoc_resp resp = {
+	struct cfg80211_rx_assoc_resp_data resp = {
 		.uapsd_queues = -1,
 	};
 	u8 ap_mld_addr[ETH_ALEN] __aligned(2);
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index 579fea2f3548..f90f58c65688 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -22,7 +22,7 @@
 
 
 void cfg80211_rx_assoc_resp(struct net_device *dev,
-			    struct cfg80211_rx_assoc_resp *data)
+			    struct cfg80211_rx_assoc_resp_data *data)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	struct wiphy *wiphy = wdev->wiphy;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 2650543dcebe..d468e015d828 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -17799,7 +17799,7 @@ void nl80211_send_rx_auth(struct cfg80211_registered_device *rdev,
 
 void nl80211_send_rx_assoc(struct cfg80211_registered_device *rdev,
 			   struct net_device *netdev,
-			   struct cfg80211_rx_assoc_resp *data)
+			   struct cfg80211_rx_assoc_resp_data *data)
 {
 	nl80211_send_mlme_event(rdev, netdev, data->buf, data->len,
 				NL80211_CMD_ASSOCIATE, GFP_KERNEL,
diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h
index b4af53f9b227..3edd53c38912 100644
--- a/net/wireless/nl80211.h
+++ b/net/wireless/nl80211.h
@@ -60,7 +60,7 @@ void nl80211_send_rx_auth(struct cfg80211_registered_device *rdev,
 			  const u8 *buf, size_t len, gfp_t gfp);
 void nl80211_send_rx_assoc(struct cfg80211_registered_device *rdev,
 			   struct net_device *netdev,
-			   struct cfg80211_rx_assoc_resp *data);
+			   struct cfg80211_rx_assoc_resp_data *data);
 void nl80211_send_deauth(struct cfg80211_registered_device *rdev,
 			 struct net_device *netdev,
 			 const u8 *buf, size_t len,
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index f6667bf3fd12..30cd1bd58aac 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -2928,7 +2928,7 @@ DEFINE_EVENT(netdev_evt_only, cfg80211_send_rx_auth,
 
 TRACE_EVENT(cfg80211_send_rx_assoc,
 	TP_PROTO(struct net_device *netdev,
-		 struct cfg80211_rx_assoc_resp *data),
+		 struct cfg80211_rx_assoc_resp_data *data),
 	TP_ARGS(netdev, data),
 	TP_STRUCT__entry(
 		NETDEV_ENTRY
-- 
cgit v1.2.3-70-g09d2


From e4e7e3af73694380f0d9a742d13b80598a3393e9 Mon Sep 17 00:00:00 2001
From: Vinayak Yadawad <vinayak.yadawad@broadcom.com>
Date: Tue, 26 Sep 2023 16:02:22 +0530
Subject: wifi: cfg80211: Allow AP/P2PGO to indicate port authorization to peer
 STA/P2PClient

In 4way handshake offload, cfg80211_port_authorized enables driver
to indicate successful 4way handshake to cfg80211 layer. Currently
this path of port authorization is restricted to interface type
NL80211_IFTYPE_STATION and NL80211_IFTYPE_P2P_CLIENT. This patch
extends the support for NL80211_IFTYPE_AP and NL80211_IFTYPE_P2P_GO
interfaces to authorize peer STA/P2P_CLIENT, whenever authentication
is offloaded on the AP/P2P_GO interface.

Signed-off-by: Vinayak Yadawad <vinayak.yadawad@broadcom.com>
Link: https://lore.kernel.org/r/dee3b0a2b4f617e932c90bff4504a89389273632.1695721435.git.vinayak.yadawad@broadcom.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h |  8 ++++++--
 net/wireless/core.h    |  4 ++--
 net/wireless/nl80211.c |  4 ++--
 net/wireless/nl80211.h |  5 ++++-
 net/wireless/sme.c     | 23 ++++++++++++++---------
 net/wireless/util.c    |  2 +-
 6 files changed, 29 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 0e7abb083a77..b137a33a1b68 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -8028,7 +8028,8 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info,
  * cfg80211_port_authorized - notify cfg80211 of successful security association
  *
  * @dev: network device
- * @bssid: the BSSID of the AP
+ * @peer_addr: BSSID of the AP/P2P GO in case of STA/GC or STA/GC MAC address
+ *	in case of AP/P2P GO
  * @td_bitmap: transition disable policy
  * @td_bitmap_len: Length of transition disable policy
  * @gfp: allocation flags
@@ -8039,8 +8040,11 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info,
  * should be preceded with a call to cfg80211_connect_result(),
  * cfg80211_connect_done(), cfg80211_connect_bss() or cfg80211_roamed() to
  * indicate the 802.11 association.
+ * This function can also be called by AP/P2P GO driver that supports
+ * authentication offload. In this case the peer_mac passed is that of
+ * associated STA/GC.
  */
-void cfg80211_port_authorized(struct net_device *dev, const u8 *bssid,
+void cfg80211_port_authorized(struct net_device *dev, const u8 *peer_addr,
 			      const u8* td_bitmap, u8 td_bitmap_len, gfp_t gfp);
 
 /**
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 79b1c6d17847..4c692c7faf30 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -270,7 +270,7 @@ struct cfg80211_event {
 			struct ieee80211_channel *channel;
 		} ij;
 		struct {
-			u8 bssid[ETH_ALEN];
+			u8 peer_addr[ETH_ALEN];
 			const u8 *td_bitmap;
 			u8 td_bitmap_len;
 		} pa;
@@ -404,7 +404,7 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev,
 			bool wextev);
 void __cfg80211_roamed(struct wireless_dev *wdev,
 		       struct cfg80211_roam_info *info);
-void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid,
+void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *peer_addr,
 				const u8 *td_bitmap, u8 td_bitmap_len);
 int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev,
 			      struct wireless_dev *wdev);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index d468e015d828..56fd7cf2563b 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -18124,7 +18124,7 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
 }
 
 void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev,
-				  struct net_device *netdev, const u8 *bssid,
+				  struct net_device *netdev, const u8 *peer_addr,
 				  const u8 *td_bitmap, u8 td_bitmap_len)
 {
 	struct sk_buff *msg;
@@ -18142,7 +18142,7 @@ void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev,
 
 	if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
 	    nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) ||
-	    nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid))
+	    nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, peer_addr))
 		goto nla_put_failure;
 
 	if ((td_bitmap_len > 0) && td_bitmap)
diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h
index 3edd53c38912..aad40240d9cb 100644
--- a/net/wireless/nl80211.h
+++ b/net/wireless/nl80211.h
@@ -82,8 +82,11 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
 void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
 			 struct net_device *netdev,
 			 struct cfg80211_roam_info *info, gfp_t gfp);
+/* For STA/GC, indicate port authorized with AP/GO bssid.
+ * For GO/AP, use peer GC/STA mac_addr.
+ */
 void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev,
-				  struct net_device *netdev, const u8 *bssid,
+				  struct net_device *netdev, const u8 *peer_addr,
 				  const u8 *td_bitmap, u8 td_bitmap_len);
 void nl80211_send_disconnected(struct cfg80211_registered_device *rdev,
 			       struct net_device *netdev, u16 reason,
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 50fcb27e6dab..acfe66da7109 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -1281,24 +1281,29 @@ out:
 }
 EXPORT_SYMBOL(cfg80211_roamed);
 
-void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid,
+void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *peer_addr,
 					const u8 *td_bitmap, u8 td_bitmap_len)
 {
 	lockdep_assert_wiphy(wdev->wiphy);
 
 	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION &&
-		    wdev->iftype != NL80211_IFTYPE_P2P_CLIENT))
+		wdev->iftype != NL80211_IFTYPE_P2P_CLIENT &&
+		wdev->iftype != NL80211_IFTYPE_AP &&
+		wdev->iftype != NL80211_IFTYPE_P2P_GO))
 		return;
 
-	if (WARN_ON(!wdev->connected) ||
-	    WARN_ON(!ether_addr_equal(wdev->u.client.connected_addr, bssid)))
-		return;
+	if (wdev->iftype == NL80211_IFTYPE_STATION ||
+		wdev->iftype == NL80211_IFTYPE_P2P_CLIENT) {
+		if (WARN_ON(!wdev->connected) ||
+			WARN_ON(!ether_addr_equal(wdev->u.client.connected_addr, peer_addr)))
+			return;
+	}
 
 	nl80211_send_port_authorized(wiphy_to_rdev(wdev->wiphy), wdev->netdev,
-				     bssid, td_bitmap, td_bitmap_len);
+				     peer_addr, td_bitmap, td_bitmap_len);
 }
 
-void cfg80211_port_authorized(struct net_device *dev, const u8 *bssid,
+void cfg80211_port_authorized(struct net_device *dev, const u8 *peer_addr,
 			      const u8 *td_bitmap, u8 td_bitmap_len, gfp_t gfp)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
@@ -1306,7 +1311,7 @@ void cfg80211_port_authorized(struct net_device *dev, const u8 *bssid,
 	struct cfg80211_event *ev;
 	unsigned long flags;
 
-	if (WARN_ON(!bssid))
+	if (WARN_ON(!peer_addr))
 		return;
 
 	ev = kzalloc(sizeof(*ev) + td_bitmap_len, gfp);
@@ -1314,7 +1319,7 @@ void cfg80211_port_authorized(struct net_device *dev, const u8 *bssid,
 		return;
 
 	ev->type = EVENT_PORT_AUTHORIZED;
-	memcpy(ev->pa.bssid, bssid, ETH_ALEN);
+	memcpy(ev->pa.peer_addr, peer_addr, ETH_ALEN);
 	ev->pa.td_bitmap = ((u8 *)ev) + sizeof(*ev);
 	ev->pa.td_bitmap_len = td_bitmap_len;
 	memcpy((void *)ev->pa.td_bitmap, td_bitmap, td_bitmap_len);
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 0893b7f57832..626b858b4b35 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -1062,7 +1062,7 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev)
 			cfg80211_leave(wiphy_to_rdev(wdev->wiphy), wdev);
 			break;
 		case EVENT_PORT_AUTHORIZED:
-			__cfg80211_port_authorized(wdev, ev->pa.bssid,
+			__cfg80211_port_authorized(wdev, ev->pa.peer_addr,
 						   ev->pa.td_bitmap,
 						   ev->pa.td_bitmap_len);
 			break;
-- 
cgit v1.2.3-70-g09d2


From a254b90c9aac3d3d938a07e019773e35a977451b Mon Sep 17 00:00:00 2001
From: Iulia Tanasescu <iulia.tanasescu@nxp.com>
Date: Wed, 6 Sep 2023 16:59:54 +0300
Subject: Bluetooth: ISO: Fix BIS cleanup

This fixes the master BIS cleanup procedure - as opposed to CIS cleanup,
no HCI disconnect command should be issued. A master BIS should only be
terminated by disabling periodic and extended advertising, and terminating
the BIG.

In case of a Broadcast Receiver, all BIS and PA connections can be
cleaned up by calling hci_conn_failed, since it contains all function
calls that are necessary for successful cleanup.

Signed-off-by: Iulia Tanasescu <iulia.tanasescu@nxp.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_sync.h |  2 ++
 net/bluetooth/hci_conn.c         |  7 +++++++
 net/bluetooth/hci_sync.c         | 28 ++++++++++++----------------
 3 files changed, 21 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h
index 57eeb07aeb25..6efbc2152146 100644
--- a/include/net/bluetooth/hci_sync.h
+++ b/include/net/bluetooth/hci_sync.h
@@ -80,6 +80,8 @@ int hci_start_per_adv_sync(struct hci_dev *hdev, u8 instance, u8 data_len,
 			   u8 *data, u32 flags, u16 min_interval,
 			   u16 max_interval, u16 sync_interval);
 
+int hci_disable_per_advertising_sync(struct hci_dev *hdev, u8 instance);
+
 int hci_remove_advertising_sync(struct hci_dev *hdev, struct sock *sk,
 				u8 instance, bool force);
 int hci_disable_advertising_sync(struct hci_dev *hdev);
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 73470cc3518a..973c9de01a04 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -759,6 +759,7 @@ static int terminate_big_sync(struct hci_dev *hdev, void *data)
 
 	bt_dev_dbg(hdev, "big 0x%2.2x bis 0x%2.2x", d->big, d->bis);
 
+	hci_disable_per_advertising_sync(hdev, d->bis);
 	hci_remove_ext_adv_instance_sync(hdev, d->bis, NULL);
 
 	/* Only terminate BIG if it has been created */
@@ -1247,6 +1248,12 @@ void hci_conn_failed(struct hci_conn *conn, u8 status)
 		break;
 	}
 
+	/* In case of BIG/PA sync failed, clear conn flags so that
+	 * the conns will be correctly cleaned up by ISO layer
+	 */
+	test_and_clear_bit(HCI_CONN_BIG_SYNC_FAILED, &conn->flags);
+	test_and_clear_bit(HCI_CONN_PA_SYNC_FAILED, &conn->flags);
+
 	conn->state = BT_CLOSED;
 	hci_connect_cfm(conn, status);
 	hci_conn_del(conn);
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index a15ab0b874a9..3485602518fe 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -1312,7 +1312,7 @@ int hci_start_ext_adv_sync(struct hci_dev *hdev, u8 instance)
 	return hci_enable_ext_advertising_sync(hdev, instance);
 }
 
-static int hci_disable_per_advertising_sync(struct hci_dev *hdev, u8 instance)
+int hci_disable_per_advertising_sync(struct hci_dev *hdev, u8 instance)
 {
 	struct hci_cp_le_set_per_adv_enable cp;
 	struct adv_info *adv = NULL;
@@ -5232,6 +5232,17 @@ static int hci_disconnect_sync(struct hci_dev *hdev, struct hci_conn *conn,
 	if (conn->type == AMP_LINK)
 		return hci_disconnect_phy_link_sync(hdev, conn->handle, reason);
 
+	if (test_bit(HCI_CONN_BIG_CREATED, &conn->flags)) {
+		/* This is a BIS connection, hci_conn_del will
+		 * do the necessary cleanup.
+		 */
+		hci_dev_lock(hdev);
+		hci_conn_failed(conn, reason);
+		hci_dev_unlock(hdev);
+
+		return 0;
+	}
+
 	memset(&cp, 0, sizeof(cp));
 	cp.handle = cpu_to_le16(conn->handle);
 	cp.reason = reason;
@@ -5384,21 +5395,6 @@ int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason)
 		err = hci_reject_conn_sync(hdev, conn, reason);
 		break;
 	case BT_OPEN:
-		hci_dev_lock(hdev);
-
-		/* Cleanup bis or pa sync connections */
-		if (test_and_clear_bit(HCI_CONN_BIG_SYNC_FAILED, &conn->flags) ||
-		    test_and_clear_bit(HCI_CONN_PA_SYNC_FAILED, &conn->flags)) {
-			hci_conn_failed(conn, reason);
-		} else if (test_bit(HCI_CONN_PA_SYNC, &conn->flags) ||
-			   test_bit(HCI_CONN_BIG_SYNC, &conn->flags)) {
-			conn->state = BT_CLOSED;
-			hci_disconn_cfm(conn, reason);
-			hci_conn_del(conn);
-		}
-
-		hci_dev_unlock(hdev);
-		return 0;
 	case BT_BOUND:
 		break;
 	default:
-- 
cgit v1.2.3-70-g09d2


From 1d11d70d1f6b23e7d3fc00396c17b90b876162a4 Mon Sep 17 00:00:00 2001
From: Iulia Tanasescu <iulia.tanasescu@nxp.com>
Date: Wed, 6 Sep 2023 17:01:03 +0300
Subject: Bluetooth: ISO: Pass BIG encryption info through QoS

This enables a broadcast sink to be informed if the PA
it has synced with is associated with an encrypted BIG,
by retrieving the socket QoS and checking the encryption
field.

After PA sync has been successfully established and the
first BIGInfo advertising report is received, a new hcon
is added and notified to the ISO layer. The ISO layer
sets the encryption field of the socket and hcon QoS
according to the encryption parameter of the BIGInfo
advertising report event.

After that, the userspace is woken up, and the QoS of the
new PA sync socket can be read, to inspect the encryption
field and follow up accordingly.

Signed-off-by: Iulia Tanasescu <iulia.tanasescu@nxp.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci.h      |  3 +++
 include/net/bluetooth/hci_core.h | 25 ++++++++++++++++++-
 net/bluetooth/hci_conn.c         |  1 +
 net/bluetooth/hci_event.c        | 54 +++++++++++++++++++++++++++++-----------
 net/bluetooth/iso.c              | 19 ++++++++++----
 5 files changed, 82 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 87d92accc26e..bdee5d649cc6 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -1,6 +1,7 @@
 /*
    BlueZ - Bluetooth protocol stack for Linux
    Copyright (C) 2000-2001 Qualcomm Incorporated
+   Copyright 2023 NXP
 
    Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
 
@@ -673,6 +674,8 @@ enum {
 #define HCI_TX_POWER_INVALID	127
 #define HCI_RSSI_INVALID	127
 
+#define HCI_SYNC_HANDLE_INVALID	0xffff
+
 #define HCI_ROLE_MASTER		0x00
 #define HCI_ROLE_SLAVE		0x01
 
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index c33348ba1657..f36c1fd5d64e 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1314,7 +1314,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_big_any_dst(struct hci_dev *
 }
 
 static inline struct hci_conn *
-hci_conn_hash_lookup_pa_sync(struct hci_dev *hdev, __u8 big)
+hci_conn_hash_lookup_pa_sync_big_handle(struct hci_dev *hdev, __u8 big)
 {
 	struct hci_conn_hash *h = &hdev->conn_hash;
 	struct hci_conn  *c;
@@ -1336,6 +1336,29 @@ hci_conn_hash_lookup_pa_sync(struct hci_dev *hdev, __u8 big)
 	return NULL;
 }
 
+static inline struct hci_conn *
+hci_conn_hash_lookup_pa_sync_handle(struct hci_dev *hdev, __u16 sync_handle)
+{
+	struct hci_conn_hash *h = &hdev->conn_hash;
+	struct hci_conn  *c;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(c, &h->list, list) {
+		if (c->type != ISO_LINK ||
+			!test_bit(HCI_CONN_PA_SYNC, &c->flags))
+			continue;
+
+		if (c->sync_handle == sync_handle) {
+			rcu_read_unlock();
+			return c;
+		}
+	}
+	rcu_read_unlock();
+
+	return NULL;
+}
+
 static inline struct hci_conn *hci_conn_hash_lookup_state(struct hci_dev *hdev,
 							__u8 type, __u16 state)
 {
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 973c9de01a04..271da46e7428 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -974,6 +974,7 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
 	conn->rssi = HCI_RSSI_INVALID;
 	conn->tx_power = HCI_TX_POWER_INVALID;
 	conn->max_tx_power = HCI_TX_POWER_INVALID;
+	conn->sync_handle = HCI_SYNC_HANDLE_INVALID;
 
 	set_bit(HCI_CONN_POWER_SAVE, &conn->flags);
 	conn->disc_timeout = HCI_DISCONN_TIMEOUT;
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 1e1c9147356c..9b34c9f8ee02 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -6603,7 +6603,7 @@ static void hci_le_pa_sync_estabilished_evt(struct hci_dev *hdev, void *data,
 	struct hci_ev_le_pa_sync_established *ev = data;
 	int mask = hdev->link_mode;
 	__u8 flags = 0;
-	struct hci_conn *bis;
+	struct hci_conn *pa_sync;
 
 	bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
 
@@ -6620,20 +6620,19 @@ static void hci_le_pa_sync_estabilished_evt(struct hci_dev *hdev, void *data,
 	if (!(flags & HCI_PROTO_DEFER))
 		goto unlock;
 
-	/* Add connection to indicate the PA sync event */
-	bis = hci_conn_add(hdev, ISO_LINK, BDADDR_ANY,
-			   HCI_ROLE_SLAVE);
+	if (ev->status) {
+		/* Add connection to indicate the failed PA sync event */
+		pa_sync = hci_conn_add(hdev, ISO_LINK, BDADDR_ANY,
+				       HCI_ROLE_SLAVE);
 
-	if (!bis)
-		goto unlock;
+		if (!pa_sync)
+			goto unlock;
 
-	if (ev->status)
-		set_bit(HCI_CONN_PA_SYNC_FAILED, &bis->flags);
-	else
-		set_bit(HCI_CONN_PA_SYNC, &bis->flags);
+		set_bit(HCI_CONN_PA_SYNC_FAILED, &pa_sync->flags);
 
-	/* Notify connection to iso layer */
-	hci_connect_cfm(bis, ev->status);
+		/* Notify iso layer */
+		hci_connect_cfm(pa_sync, ev->status);
+	}
 
 unlock:
 	hci_dev_unlock(hdev);
@@ -7125,7 +7124,7 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data,
 	hci_dev_lock(hdev);
 
 	if (!ev->status) {
-		pa_sync = hci_conn_hash_lookup_pa_sync(hdev, ev->handle);
+		pa_sync = hci_conn_hash_lookup_pa_sync_big_handle(hdev, ev->handle);
 		if (pa_sync)
 			/* Also mark the BIG sync established event on the
 			 * associated PA sync hcon
@@ -7186,15 +7185,42 @@ static void hci_le_big_info_adv_report_evt(struct hci_dev *hdev, void *data,
 	struct hci_evt_le_big_info_adv_report *ev = data;
 	int mask = hdev->link_mode;
 	__u8 flags = 0;
+	struct hci_conn *pa_sync;
 
 	bt_dev_dbg(hdev, "sync_handle 0x%4.4x", le16_to_cpu(ev->sync_handle));
 
 	hci_dev_lock(hdev);
 
 	mask |= hci_proto_connect_ind(hdev, BDADDR_ANY, ISO_LINK, &flags);
-	if (!(mask & HCI_LM_ACCEPT))
+	if (!(mask & HCI_LM_ACCEPT)) {
 		hci_le_pa_term_sync(hdev, ev->sync_handle);
+		goto unlock;
+	}
 
+	if (!(flags & HCI_PROTO_DEFER))
+		goto unlock;
+
+	pa_sync = hci_conn_hash_lookup_pa_sync_handle
+			(hdev,
+			le16_to_cpu(ev->sync_handle));
+
+	if (pa_sync)
+		goto unlock;
+
+	/* Add connection to indicate the PA sync event */
+	pa_sync = hci_conn_add(hdev, ISO_LINK, BDADDR_ANY,
+			       HCI_ROLE_SLAVE);
+
+	if (!pa_sync)
+		goto unlock;
+
+	pa_sync->sync_handle = le16_to_cpu(ev->sync_handle);
+	set_bit(HCI_CONN_PA_SYNC, &pa_sync->flags);
+
+	/* Notify iso layer */
+	hci_connect_cfm(pa_sync, 0x00);
+
+unlock:
 	hci_dev_unlock(hdev);
 }
 
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index 71248163ce9a..2132a16be93c 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -77,6 +77,7 @@ static struct bt_iso_qos default_qos;
 static bool check_ucast_qos(struct bt_iso_qos *qos);
 static bool check_bcast_qos(struct bt_iso_qos *qos);
 static bool iso_match_sid(struct sock *sk, void *data);
+static bool iso_match_sync_handle(struct sock *sk, void *data);
 static void iso_sock_disconn(struct sock *sk);
 
 /* ---- ISO timers ---- */
@@ -1202,7 +1203,6 @@ static int iso_sock_recvmsg(struct socket *sock, struct msghdr *msg,
 			    test_bit(HCI_CONN_PA_SYNC, &pi->conn->hcon->flags)) {
 				iso_conn_big_sync(sk);
 				sk->sk_state = BT_LISTEN;
-				set_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags);
 			} else {
 				iso_conn_defer_accept(pi->conn->hcon);
 				sk->sk_state = BT_CONFIG;
@@ -1579,6 +1579,7 @@ static void iso_conn_ready(struct iso_conn *conn)
 	struct sock *sk = conn->sk;
 	struct hci_ev_le_big_sync_estabilished *ev = NULL;
 	struct hci_ev_le_pa_sync_established *ev2 = NULL;
+	struct hci_evt_le_big_info_adv_report *ev3 = NULL;
 	struct hci_conn *hcon;
 
 	BT_DBG("conn %p", conn);
@@ -1603,14 +1604,20 @@ static void iso_conn_ready(struct iso_conn *conn)
 				parent = iso_get_sock_listen(&hcon->src,
 							     &hcon->dst,
 							     iso_match_big, ev);
-		} else if (test_bit(HCI_CONN_PA_SYNC, &hcon->flags) ||
-				test_bit(HCI_CONN_PA_SYNC_FAILED, &hcon->flags)) {
+		} else if (test_bit(HCI_CONN_PA_SYNC_FAILED, &hcon->flags)) {
 			ev2 = hci_recv_event_data(hcon->hdev,
 						  HCI_EV_LE_PA_SYNC_ESTABLISHED);
 			if (ev2)
 				parent = iso_get_sock_listen(&hcon->src,
 							     &hcon->dst,
 							     iso_match_sid, ev2);
+		} else if (test_bit(HCI_CONN_PA_SYNC, &hcon->flags)) {
+			ev3 = hci_recv_event_data(hcon->hdev,
+						  HCI_EVT_LE_BIG_INFO_ADV_REPORT);
+			if (ev3)
+				parent = iso_get_sock_listen(&hcon->src,
+							     &hcon->dst,
+							     iso_match_sync_handle, ev3);
 		}
 
 		if (!parent)
@@ -1650,11 +1657,13 @@ static void iso_conn_ready(struct iso_conn *conn)
 			hcon->sync_handle = iso_pi(parent)->sync_handle;
 		}
 
-		if (ev2 && !ev2->status) {
-			iso_pi(sk)->sync_handle = iso_pi(parent)->sync_handle;
+		if (ev3) {
 			iso_pi(sk)->qos = iso_pi(parent)->qos;
+			iso_pi(sk)->qos.bcast.encryption = ev3->encryption;
+			hcon->iso_qos = iso_pi(sk)->qos;
 			iso_pi(sk)->bc_num_bis = iso_pi(parent)->bc_num_bis;
 			memcpy(iso_pi(sk)->bc_bis, iso_pi(parent)->bc_bis, ISO_MAX_NUM_BIS);
+			set_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags);
 		}
 
 		bacpy(&iso_pi(sk)->dst, &hcon->dst);
-- 
cgit v1.2.3-70-g09d2


From fcb89f120376c054005e6d7a82bb89f95b3b04ef Mon Sep 17 00:00:00 2001
From: Iulia Tanasescu <iulia.tanasescu@nxp.com>
Date: Wed, 11 Oct 2023 17:24:07 +0300
Subject: Bluetooth: ISO: Fix bcast listener cleanup

This fixes the cleanup callback for slave bis and pa sync hcons.

Closing all bis hcons will trigger BIG Terminate Sync, while closing
all bises and the pa sync hcon will also trigger PA Terminate Sync.

Signed-off-by: Iulia Tanasescu <iulia.tanasescu@nxp.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h | 43 +++++++++++++++++++---------------------
 net/bluetooth/hci_conn.c         | 38 +++++++++++++++++++++++++++--------
 net/bluetooth/hci_event.c        | 10 ----------
 3 files changed, 50 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index f36c1fd5d64e..99865c23e461 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1290,29 +1290,6 @@ static inline struct hci_conn *hci_conn_hash_lookup_big(struct hci_dev *hdev,
 	return NULL;
 }
 
-static inline struct hci_conn *hci_conn_hash_lookup_big_any_dst(struct hci_dev *hdev,
-							__u8 handle)
-{
-	struct hci_conn_hash *h = &hdev->conn_hash;
-	struct hci_conn  *c;
-
-	rcu_read_lock();
-
-	list_for_each_entry_rcu(c, &h->list, list) {
-		if (c->type != ISO_LINK)
-			continue;
-
-		if (handle != BT_ISO_QOS_BIG_UNSET && handle == c->iso_qos.bcast.big) {
-			rcu_read_unlock();
-			return c;
-		}
-	}
-
-	rcu_read_unlock();
-
-	return NULL;
-}
-
 static inline struct hci_conn *
 hci_conn_hash_lookup_pa_sync_big_handle(struct hci_dev *hdev, __u8 big)
 {
@@ -1400,6 +1377,26 @@ static inline void hci_conn_hash_list_state(struct hci_dev *hdev,
 	rcu_read_unlock();
 }
 
+static inline void hci_conn_hash_list_flag(struct hci_dev *hdev,
+					    hci_conn_func_t func, __u8 type,
+					    __u8 flag, void *data)
+{
+	struct hci_conn_hash *h = &hdev->conn_hash;
+	struct hci_conn  *c;
+
+	if (!func)
+		return;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(c, &h->list, list) {
+		if (c->type == type && test_bit(flag, &c->flags))
+			func(c, data);
+	}
+
+	rcu_read_unlock();
+}
+
 static inline struct hci_conn *hci_lookup_le_connect(struct hci_dev *hdev)
 {
 	struct hci_conn_hash *h = &hdev->conn_hash;
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 974631e652c1..7727fe30e5c3 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -815,6 +815,17 @@ static int big_terminate_sync(struct hci_dev *hdev, void *data)
 	return 0;
 }
 
+static void find_bis(struct hci_conn *conn, void *data)
+{
+	struct iso_list_data *d = data;
+
+	/* Ignore if BIG doesn't match */
+	if (d->big != conn->iso_qos.bcast.big)
+		return;
+
+	d->count++;
+}
+
 static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, struct hci_conn *conn)
 {
 	struct iso_list_data *d;
@@ -826,10 +837,27 @@ static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, struct hci_conn *c
 	if (!d)
 		return -ENOMEM;
 
+	memset(d, 0, sizeof(*d));
 	d->big = big;
 	d->sync_handle = conn->sync_handle;
-	d->pa_sync_term = test_and_clear_bit(HCI_CONN_PA_SYNC, &conn->flags);
-	d->big_sync_term = test_and_clear_bit(HCI_CONN_BIG_SYNC, &conn->flags);
+
+	if (test_and_clear_bit(HCI_CONN_PA_SYNC, &conn->flags)) {
+		hci_conn_hash_list_flag(hdev, find_bis, ISO_LINK,
+					HCI_CONN_PA_SYNC, d);
+
+		if (!d->count)
+			d->pa_sync_term = true;
+
+		d->count = 0;
+	}
+
+	if (test_and_clear_bit(HCI_CONN_BIG_SYNC, &conn->flags)) {
+		hci_conn_hash_list_flag(hdev, find_bis, ISO_LINK,
+					HCI_CONN_BIG_SYNC, d);
+
+		if (!d->count)
+			d->big_sync_term = true;
+	}
 
 	ret = hci_cmd_sync_queue(hdev, big_terminate_sync, d,
 				 terminate_big_destroy);
@@ -865,12 +893,6 @@ static void bis_cleanup(struct hci_conn *conn)
 
 		hci_le_terminate_big(hdev, conn);
 	} else {
-		bis = hci_conn_hash_lookup_big_any_dst(hdev,
-						       conn->iso_qos.bcast.big);
-
-		if (bis)
-			return;
-
 		hci_le_big_terminate(hdev, conn->iso_qos.bcast.big,
 				     conn);
 	}
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 9b34c9f8ee02..32fb2f102a12 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -7112,7 +7112,6 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data,
 {
 	struct hci_evt_le_big_sync_estabilished *ev = data;
 	struct hci_conn *bis;
-	struct hci_conn *pa_sync;
 	int i;
 
 	bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
@@ -7123,15 +7122,6 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data,
 
 	hci_dev_lock(hdev);
 
-	if (!ev->status) {
-		pa_sync = hci_conn_hash_lookup_pa_sync_big_handle(hdev, ev->handle);
-		if (pa_sync)
-			/* Also mark the BIG sync established event on the
-			 * associated PA sync hcon
-			 */
-			set_bit(HCI_CONN_BIG_SYNC, &pa_sync->flags);
-	}
-
 	for (i = 0; i < ev->num_bis; i++) {
 		u16 handle = le16_to_cpu(ev->bis[i]);
 		__le32 interval;
-- 
cgit v1.2.3-70-g09d2


From 181a42edddf51d5d9697ecdf365d72ebeab5afb0 Mon Sep 17 00:00:00 2001
From: Ziyang Xuan <william.xuanziyang@huawei.com>
Date: Wed, 11 Oct 2023 17:57:31 +0800
Subject: Bluetooth: Make handle of hci_conn be unique

The handle of new hci_conn is always HCI_CONN_HANDLE_MAX + 1 if
the handle of the first hci_conn entry in hci_dev->conn_hash->list
is not HCI_CONN_HANDLE_MAX + 1. Use ida to manage the allocation of
hci_conn->handle to make it be unique.

Fixes: 9f78191cc9f1 ("Bluetooth: hci_conn: Always allocate unique handles")
Signed-off-by: Ziyang Xuan <william.xuanziyang@huawei.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h |  6 ++++-
 net/bluetooth/amp.c              |  3 +--
 net/bluetooth/hci_conn.c         | 57 ++++++++++++++++++++++------------------
 net/bluetooth/hci_core.c         |  3 +++
 net/bluetooth/hci_event.c        | 38 +++++++++++----------------
 5 files changed, 56 insertions(+), 51 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 99865c23e461..20988623c5cc 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -350,6 +350,8 @@ struct hci_dev {
 	struct list_head list;
 	struct mutex	lock;
 
+	struct ida	unset_handle_ida;
+
 	const char	*name;
 	unsigned long	flags;
 	__u16		id;
@@ -1446,7 +1448,9 @@ int hci_le_create_cis_pending(struct hci_dev *hdev);
 int hci_conn_check_create_cis(struct hci_conn *conn);
 
 struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
-			      u8 role);
+			      u8 role, u16 handle);
+struct hci_conn *hci_conn_add_unset(struct hci_dev *hdev, int type,
+				    bdaddr_t *dst, u8 role);
 void hci_conn_del(struct hci_conn *conn);
 void hci_conn_hash_flush(struct hci_dev *hdev);
 void hci_conn_check_pending(struct hci_dev *hdev);
diff --git a/net/bluetooth/amp.c b/net/bluetooth/amp.c
index 2134f92bd7ac..5d698f19868c 100644
--- a/net/bluetooth/amp.c
+++ b/net/bluetooth/amp.c
@@ -109,7 +109,7 @@ struct hci_conn *phylink_add(struct hci_dev *hdev, struct amp_mgr *mgr,
 	struct hci_conn *hcon;
 	u8 role = out ? HCI_ROLE_MASTER : HCI_ROLE_SLAVE;
 
-	hcon = hci_conn_add(hdev, AMP_LINK, dst, role);
+	hcon = hci_conn_add(hdev, AMP_LINK, dst, role, __next_handle(mgr));
 	if (!hcon)
 		return NULL;
 
@@ -117,7 +117,6 @@ struct hci_conn *phylink_add(struct hci_dev *hdev, struct amp_mgr *mgr,
 
 	hcon->state = BT_CONNECT;
 	hcon->attempt++;
-	hcon->handle = __next_handle(mgr);
 	hcon->remote_id = remote_id;
 	hcon->amp_mgr = amp_mgr_get(mgr);
 
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 7727fe30e5c3..3735764b3169 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -153,6 +153,9 @@ static void hci_conn_cleanup(struct hci_conn *conn)
 
 	hci_conn_hash_del(hdev, conn);
 
+	if (HCI_CONN_HANDLE_UNSET(conn->handle))
+		ida_free(&hdev->unset_handle_ida, conn->handle);
+
 	if (conn->cleanup)
 		conn->cleanup(conn);
 
@@ -951,31 +954,18 @@ static void cis_cleanup(struct hci_conn *conn)
 	hci_le_remove_cig(hdev, conn->iso_qos.ucast.cig);
 }
 
-static u16 hci_conn_hash_alloc_unset(struct hci_dev *hdev)
+static int hci_conn_hash_alloc_unset(struct hci_dev *hdev)
 {
-	struct hci_conn_hash *h = &hdev->conn_hash;
-	struct hci_conn  *c;
-	u16 handle = HCI_CONN_HANDLE_MAX + 1;
-
-	rcu_read_lock();
-
-	list_for_each_entry_rcu(c, &h->list, list) {
-		/* Find the first unused handle */
-		if (handle == 0xffff || c->handle != handle)
-			break;
-		handle++;
-	}
-	rcu_read_unlock();
-
-	return handle;
+	return ida_alloc_range(&hdev->unset_handle_ida, HCI_CONN_HANDLE_MAX + 1,
+			       U16_MAX, GFP_ATOMIC);
 }
 
 struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
-			      u8 role)
+			      u8 role, u16 handle)
 {
 	struct hci_conn *conn;
 
-	BT_DBG("%s dst %pMR", hdev->name, dst);
+	bt_dev_dbg(hdev, "dst %pMR handle 0x%4.4x", dst, handle);
 
 	conn = kzalloc(sizeof(*conn), GFP_KERNEL);
 	if (!conn)
@@ -983,7 +973,7 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
 
 	bacpy(&conn->dst, dst);
 	bacpy(&conn->src, &hdev->bdaddr);
-	conn->handle = hci_conn_hash_alloc_unset(hdev);
+	conn->handle = handle;
 	conn->hdev  = hdev;
 	conn->type  = type;
 	conn->role  = role;
@@ -1068,6 +1058,20 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
 	return conn;
 }
 
+struct hci_conn *hci_conn_add_unset(struct hci_dev *hdev, int type,
+				    bdaddr_t *dst, u8 role)
+{
+	int handle;
+
+	bt_dev_dbg(hdev, "dst %pMR", dst);
+
+	handle = hci_conn_hash_alloc_unset(hdev);
+	if (unlikely(handle < 0))
+		return NULL;
+
+	return hci_conn_add(hdev, type, dst, role, handle);
+}
+
 static void hci_conn_cleanup_child(struct hci_conn *conn, u8 reason)
 {
 	if (!reason)
@@ -1304,6 +1308,9 @@ u8 hci_conn_set_handle(struct hci_conn *conn, u16 handle)
 	if (conn->abort_reason)
 		return conn->abort_reason;
 
+	if (HCI_CONN_HANDLE_UNSET(conn->handle))
+		ida_free(&hdev->unset_handle_ida, conn->handle);
+
 	conn->handle = handle;
 
 	return 0;
@@ -1411,7 +1418,7 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst,
 	if (conn) {
 		bacpy(&conn->dst, dst);
 	} else {
-		conn = hci_conn_add(hdev, LE_LINK, dst, role);
+		conn = hci_conn_add_unset(hdev, LE_LINK, dst, role);
 		if (!conn)
 			return ERR_PTR(-ENOMEM);
 		hci_conn_hold(conn);
@@ -1588,7 +1595,7 @@ static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst,
 		     memcmp(conn->le_per_adv_data, base, base_len)))
 		return ERR_PTR(-EADDRINUSE);
 
-	conn = hci_conn_add(hdev, ISO_LINK, dst, HCI_ROLE_MASTER);
+	conn = hci_conn_add_unset(hdev, ISO_LINK, dst, HCI_ROLE_MASTER);
 	if (!conn)
 		return ERR_PTR(-ENOMEM);
 
@@ -1632,7 +1639,7 @@ struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst,
 
 	BT_DBG("requesting refresh of dst_addr");
 
-	conn = hci_conn_add(hdev, LE_LINK, dst, HCI_ROLE_MASTER);
+	conn = hci_conn_add_unset(hdev, LE_LINK, dst, HCI_ROLE_MASTER);
 	if (!conn)
 		return ERR_PTR(-ENOMEM);
 
@@ -1680,7 +1687,7 @@ struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst,
 
 	acl = hci_conn_hash_lookup_ba(hdev, ACL_LINK, dst);
 	if (!acl) {
-		acl = hci_conn_add(hdev, ACL_LINK, dst, HCI_ROLE_MASTER);
+		acl = hci_conn_add_unset(hdev, ACL_LINK, dst, HCI_ROLE_MASTER);
 		if (!acl)
 			return ERR_PTR(-ENOMEM);
 	}
@@ -1740,7 +1747,7 @@ struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst,
 
 	sco = hci_conn_hash_lookup_ba(hdev, type, dst);
 	if (!sco) {
-		sco = hci_conn_add(hdev, type, dst, HCI_ROLE_MASTER);
+		sco = hci_conn_add_unset(hdev, type, dst, HCI_ROLE_MASTER);
 		if (!sco) {
 			hci_conn_drop(acl);
 			return ERR_PTR(-ENOMEM);
@@ -1932,7 +1939,7 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst,
 	cis = hci_conn_hash_lookup_cis(hdev, dst, dst_type, qos->ucast.cig,
 				       qos->ucast.cis);
 	if (!cis) {
-		cis = hci_conn_add(hdev, ISO_LINK, dst, HCI_ROLE_MASTER);
+		cis = hci_conn_add_unset(hdev, ISO_LINK, dst, HCI_ROLE_MASTER);
 		if (!cis)
 			return ERR_PTR(-ENOMEM);
 		cis->cleanup = cis_cleanup;
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 195aea2198a9..65601aa52e0d 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -2535,6 +2535,8 @@ struct hci_dev *hci_alloc_dev_priv(int sizeof_priv)
 	mutex_init(&hdev->lock);
 	mutex_init(&hdev->req_lock);
 
+	ida_init(&hdev->unset_handle_ida);
+
 	INIT_LIST_HEAD(&hdev->mesh_pending);
 	INIT_LIST_HEAD(&hdev->mgmt_pending);
 	INIT_LIST_HEAD(&hdev->reject_list);
@@ -2789,6 +2791,7 @@ void hci_release_dev(struct hci_dev *hdev)
 	hci_codec_list_clear(&hdev->local_codecs);
 	hci_dev_unlock(hdev);
 
+	ida_destroy(&hdev->unset_handle_ida);
 	ida_simple_remove(&hci_index_ida, hdev->id);
 	kfree_skb(hdev->sent_cmd);
 	kfree_skb(hdev->recv_event);
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 32fb2f102a12..0849e0dafa95 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -2335,8 +2335,8 @@ static void hci_cs_create_conn(struct hci_dev *hdev, __u8 status)
 		}
 	} else {
 		if (!conn) {
-			conn = hci_conn_add(hdev, ACL_LINK, &cp->bdaddr,
-					    HCI_ROLE_MASTER);
+			conn = hci_conn_add_unset(hdev, ACL_LINK, &cp->bdaddr,
+						  HCI_ROLE_MASTER);
 			if (!conn)
 				bt_dev_err(hdev, "no memory for new connection");
 		}
@@ -3151,8 +3151,8 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data,
 		    hci_bdaddr_list_lookup_with_flags(&hdev->accept_list,
 						      &ev->bdaddr,
 						      BDADDR_BREDR)) {
-			conn = hci_conn_add(hdev, ev->link_type, &ev->bdaddr,
-					    HCI_ROLE_SLAVE);
+			conn = hci_conn_add_unset(hdev, ev->link_type,
+						  &ev->bdaddr, HCI_ROLE_SLAVE);
 			if (!conn) {
 				bt_dev_err(hdev, "no memory for new conn");
 				goto unlock;
@@ -3317,8 +3317,8 @@ static void hci_conn_request_evt(struct hci_dev *hdev, void *data,
 	conn = hci_conn_hash_lookup_ba(hdev, ev->link_type,
 			&ev->bdaddr);
 	if (!conn) {
-		conn = hci_conn_add(hdev, ev->link_type, &ev->bdaddr,
-				    HCI_ROLE_SLAVE);
+		conn = hci_conn_add_unset(hdev, ev->link_type, &ev->bdaddr,
+					  HCI_ROLE_SLAVE);
 		if (!conn) {
 			bt_dev_err(hdev, "no memory for new connection");
 			goto unlock;
@@ -5890,7 +5890,7 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status,
 		if (status)
 			goto unlock;
 
-		conn = hci_conn_add(hdev, LE_LINK, bdaddr, role);
+		conn = hci_conn_add_unset(hdev, LE_LINK, bdaddr, role);
 		if (!conn) {
 			bt_dev_err(hdev, "no memory for new connection");
 			goto unlock;
@@ -5952,17 +5952,11 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status,
 
 	conn->dst_type = ev_bdaddr_type(hdev, conn->dst_type, NULL);
 
-	if (handle > HCI_CONN_HANDLE_MAX) {
-		bt_dev_err(hdev, "Invalid handle: 0x%4.4x > 0x%4.4x", handle,
-			   HCI_CONN_HANDLE_MAX);
-		status = HCI_ERROR_INVALID_PARAMETERS;
-	}
-
 	/* All connection failure handling is taken care of by the
 	 * hci_conn_failed function which is triggered by the HCI
 	 * request completion callbacks used for connecting.
 	 */
-	if (status)
+	if (status || hci_conn_set_handle(conn, handle))
 		goto unlock;
 
 	/* Drop the connection if it has been aborted */
@@ -5986,7 +5980,6 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status,
 		mgmt_device_connected(hdev, conn, NULL, 0);
 
 	conn->sec_level = BT_SECURITY_LOW;
-	conn->handle = handle;
 	conn->state = BT_CONFIG;
 
 	/* Store current advertising instance as connection advertising instance
@@ -6622,8 +6615,8 @@ static void hci_le_pa_sync_estabilished_evt(struct hci_dev *hdev, void *data,
 
 	if (ev->status) {
 		/* Add connection to indicate the failed PA sync event */
-		pa_sync = hci_conn_add(hdev, ISO_LINK, BDADDR_ANY,
-				       HCI_ROLE_SLAVE);
+		pa_sync = hci_conn_add_unset(hdev, ISO_LINK, BDADDR_ANY,
+					     HCI_ROLE_SLAVE);
 
 		if (!pa_sync)
 			goto unlock;
@@ -7019,12 +7012,12 @@ static void hci_le_cis_req_evt(struct hci_dev *hdev, void *data,
 
 	cis = hci_conn_hash_lookup_handle(hdev, cis_handle);
 	if (!cis) {
-		cis = hci_conn_add(hdev, ISO_LINK, &acl->dst, HCI_ROLE_SLAVE);
+		cis = hci_conn_add(hdev, ISO_LINK, &acl->dst, HCI_ROLE_SLAVE,
+				   cis_handle);
 		if (!cis) {
 			hci_le_reject_cis(hdev, ev->cis_handle);
 			goto unlock;
 		}
-		cis->handle = cis_handle;
 	}
 
 	cis->iso_qos.ucast.cig = ev->cig_id;
@@ -7129,10 +7122,9 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data,
 		bis = hci_conn_hash_lookup_handle(hdev, handle);
 		if (!bis) {
 			bis = hci_conn_add(hdev, ISO_LINK, BDADDR_ANY,
-					   HCI_ROLE_SLAVE);
+					   HCI_ROLE_SLAVE, handle);
 			if (!bis)
 				continue;
-			bis->handle = handle;
 		}
 
 		if (ev->status != 0x42)
@@ -7198,8 +7190,8 @@ static void hci_le_big_info_adv_report_evt(struct hci_dev *hdev, void *data,
 		goto unlock;
 
 	/* Add connection to indicate the PA sync event */
-	pa_sync = hci_conn_add(hdev, ISO_LINK, BDADDR_ANY,
-			       HCI_ROLE_SLAVE);
+	pa_sync = hci_conn_add_unset(hdev, ISO_LINK, BDADDR_ANY,
+				     HCI_ROLE_SLAVE);
 
 	if (!pa_sync)
 		goto unlock;
-- 
cgit v1.2.3-70-g09d2


From b63dadd6f97522513fe9497b5fde84a154e39a0b Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 23 Oct 2023 20:50:15 +0200
Subject: bpf, tcx: Get rid of tcx_link_const

Small clean up to get rid of the extra tcx_link_const() and only retain
the tcx_link().

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/r/20231023185015.21152-1-daniel@iogearbox.net
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/net/tcx.h | 7 +------
 kernel/bpf/tcx.c  | 4 ++--
 2 files changed, 3 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/tcx.h b/include/net/tcx.h
index 264f147953ba..04be9377785d 100644
--- a/include/net/tcx.h
+++ b/include/net/tcx.h
@@ -38,16 +38,11 @@ static inline struct tcx_entry *tcx_entry(struct bpf_mprog_entry *entry)
 	return container_of(bundle, struct tcx_entry, bundle);
 }
 
-static inline struct tcx_link *tcx_link(struct bpf_link *link)
+static inline struct tcx_link *tcx_link(const struct bpf_link *link)
 {
 	return container_of(link, struct tcx_link, link);
 }
 
-static inline const struct tcx_link *tcx_link_const(const struct bpf_link *link)
-{
-	return tcx_link((struct bpf_link *)link);
-}
-
 void tcx_inc(void);
 void tcx_dec(void);
 
diff --git a/kernel/bpf/tcx.c b/kernel/bpf/tcx.c
index 1338a13a8b64..2e4885e7781f 100644
--- a/kernel/bpf/tcx.c
+++ b/kernel/bpf/tcx.c
@@ -250,7 +250,7 @@ static void tcx_link_dealloc(struct bpf_link *link)
 
 static void tcx_link_fdinfo(const struct bpf_link *link, struct seq_file *seq)
 {
-	const struct tcx_link *tcx = tcx_link_const(link);
+	const struct tcx_link *tcx = tcx_link(link);
 	u32 ifindex = 0;
 
 	rtnl_lock();
@@ -267,7 +267,7 @@ static void tcx_link_fdinfo(const struct bpf_link *link, struct seq_file *seq)
 static int tcx_link_fill_info(const struct bpf_link *link,
 			      struct bpf_link_info *info)
 {
-	const struct tcx_link *tcx = tcx_link_const(link);
+	const struct tcx_link *tcx = tcx_link(link);
 	u32 ifindex = 0;
 
 	rtnl_lock();
-- 
cgit v1.2.3-70-g09d2


From e3570f040836b2f6332ce5cb4db0fd070448aeb5 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Sat, 21 Oct 2023 13:27:07 +0200
Subject: devlink: make devlink_flash_overwrite enum named one

Since this enum is going to be used in generated userspace file, name it
properly.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Link: https://lore.kernel.org/r/20231021112711.660606-7-jiri@resnulli.us
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/devlink.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index cd4b82458d1b..b3c8383d342d 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -265,7 +265,7 @@ enum {
  * Documentation/networking/devlink/devlink-flash.rst
  *
  */
-enum {
+enum devlink_flash_overwrite {
 	DEVLINK_FLASH_OVERWRITE_SETTINGS_BIT,
 	DEVLINK_FLASH_OVERWRITE_IDENTIFIERS_BIT,
 
-- 
cgit v1.2.3-70-g09d2


From 58d53d8f7da63dd13903bec0a40b3009a841b61b Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Fri, 20 Oct 2023 17:59:48 +0800
Subject: page_pool: unify frag_count handling in page_pool_is_last_frag()

Currently when page_pool_create() is called with
PP_FLAG_PAGE_FRAG flag, page_pool_alloc_pages() is only
allowed to be called under the below constraints:
1. page_pool_fragment_page() need to be called to setup
   page->pp_frag_count immediately.
2. page_pool_defrag_page() often need to be called to drain
   the page->pp_frag_count when there is no more user will
   be holding on to that page.

Those constraints exist in order to support a page to be
split into multi fragments.

And those constraints have some overhead because of the
cache line dirtying/bouncing and atomic update.

Those constraints are unavoidable for case when we need a
page to be split into more than one fragment, but there is
also case that we want to avoid the above constraints and
their overhead when a page can't be split as it can only
hold a fragment as requested by user, depending on different
use cases:
use case 1: allocate page without page splitting.
use case 2: allocate page with page splitting.
use case 3: allocate page with or without page splitting
            depending on the fragment size.

Currently page pool only provide page_pool_alloc_pages() and
page_pool_alloc_frag() API to enable the 1 & 2 separately,
so we can not use a combination of 1 & 2 to enable 3, it is
not possible yet because of the per page_pool flag
PP_FLAG_PAGE_FRAG.

So in order to allow allocating unsplit page without the
overhead of split page while still allow allocating split
page we need to remove the per page_pool flag in
page_pool_is_last_frag(), as best as I can think of, it seems
there are two methods as below:
1. Add per page flag/bit to indicate a page is split or
   not, which means we might need to update that flag/bit
   everytime the page is recycled, dirtying the cache line
   of 'struct page' for use case 1.
2. Unify the page->pp_frag_count handling for both split and
   unsplit page by assuming all pages in the page pool is split
   into a big fragment initially.

As page pool already supports use case 1 without dirtying the
cache line of 'struct page' whenever a page is recyclable, we
need to support the above use case 3 with minimal overhead,
especially not adding any noticeable overhead for use case 1,
and we are already doing an optimization by not updating
pp_frag_count in page_pool_defrag_page() for the last fragment
user, this patch chooses to unify the pp_frag_count handling
to support the above use case 3.

There is no noticeable performance degradation and some
justification for unifying the frag_count handling with this
patch applied using a micro-benchmark testing in [1].

1. https://lore.kernel.org/all/bf2591f8-7b3c-4480-bb2c-31dc9da1d6ac@huawei.com/

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
CC: Lorenzo Bianconi <lorenzo@kernel.org>
CC: Alexander Duyck <alexander.duyck@gmail.com>
CC: Liang Chen <liangchen.linux@gmail.com>
CC: Alexander Lobakin <aleksander.lobakin@intel.com>
Link: https://lore.kernel.org/r/20231020095952.11055-2-linyunsheng@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/page_pool/helpers.h | 47 +++++++++++++++++++++++++++++------------
 net/core/page_pool.c            | 10 ++++++++-
 2 files changed, 43 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h
index 8f64adf86f5b..759489c037c7 100644
--- a/include/net/page_pool/helpers.h
+++ b/include/net/page_pool/helpers.h
@@ -115,28 +115,49 @@ static inline long page_pool_defrag_page(struct page *page, long nr)
 	long ret;
 
 	/* If nr == pp_frag_count then we have cleared all remaining
-	 * references to the page. No need to actually overwrite it, instead
-	 * we can leave this to be overwritten by the calling function.
+	 * references to the page:
+	 * 1. 'n == 1': no need to actually overwrite it.
+	 * 2. 'n != 1': overwrite it with one, which is the rare case
+	 *              for pp_frag_count draining.
 	 *
-	 * The main advantage to doing this is that an atomic_read is
-	 * generally a much cheaper operation than an atomic update,
-	 * especially when dealing with a page that may be partitioned
-	 * into only 2 or 3 pieces.
+	 * The main advantage to doing this is that not only we avoid a atomic
+	 * update, as an atomic_read is generally a much cheaper operation than
+	 * an atomic update, especially when dealing with a page that may be
+	 * partitioned into only 2 or 3 pieces; but also unify the pp_frag_count
+	 * handling by ensuring all pages have partitioned into only 1 piece
+	 * initially, and only overwrite it when the page is partitioned into
+	 * more than one piece.
 	 */
-	if (atomic_long_read(&page->pp_frag_count) == nr)
+	if (atomic_long_read(&page->pp_frag_count) == nr) {
+		/* As we have ensured nr is always one for constant case using
+		 * the BUILD_BUG_ON(), only need to handle the non-constant case
+		 * here for pp_frag_count draining, which is a rare case.
+		 */
+		BUILD_BUG_ON(__builtin_constant_p(nr) && nr != 1);
+		if (!__builtin_constant_p(nr))
+			atomic_long_set(&page->pp_frag_count, 1);
+
 		return 0;
+	}
 
 	ret = atomic_long_sub_return(nr, &page->pp_frag_count);
 	WARN_ON(ret < 0);
+
+	/* We are the last user here too, reset pp_frag_count back to 1 to
+	 * ensure all pages have been partitioned into 1 piece initially,
+	 * this should be the rare case when the last two fragment users call
+	 * page_pool_defrag_page() currently.
+	 */
+	if (unlikely(!ret))
+		atomic_long_set(&page->pp_frag_count, 1);
+
 	return ret;
 }
 
-static inline bool page_pool_is_last_frag(struct page_pool *pool,
-					  struct page *page)
+static inline bool page_pool_is_last_frag(struct page *page)
 {
-	/* If fragments aren't enabled or count is 0 we were the last user */
-	return !(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
-	       (page_pool_defrag_page(page, 1) == 0);
+	/* If page_pool_defrag_page() returns 0, we were the last user */
+	return page_pool_defrag_page(page, 1) == 0;
 }
 
 /**
@@ -161,7 +182,7 @@ static inline void page_pool_put_page(struct page_pool *pool,
 	 * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
 	 */
 #ifdef CONFIG_PAGE_POOL
-	if (!page_pool_is_last_frag(pool, page))
+	if (!page_pool_is_last_frag(page))
 		return;
 
 	page_pool_put_defragged_page(pool, page, dma_sync_size, allow_direct);
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 8a9868ea5067..953535cab081 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -376,6 +376,14 @@ static void page_pool_set_pp_info(struct page_pool *pool,
 {
 	page->pp = pool;
 	page->pp_magic |= PP_SIGNATURE;
+
+	/* Ensuring all pages have been split into one fragment initially:
+	 * page_pool_set_pp_info() is only called once for every page when it
+	 * is allocated from the page allocator and page_pool_fragment_page()
+	 * is dirtying the same cache line as the page->pp_magic above, so
+	 * the overhead is negligible.
+	 */
+	page_pool_fragment_page(page, 1);
 	if (pool->p.init_callback)
 		pool->p.init_callback(page, pool->p.init_arg);
 }
@@ -672,7 +680,7 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
 		struct page *page = virt_to_head_page(data[i]);
 
 		/* It is not the last user for the page frag case */
-		if (!page_pool_is_last_frag(pool, page))
+		if (!page_pool_is_last_frag(page))
 			continue;
 
 		page = __page_pool_put_page(pool, page, -1, false);
-- 
cgit v1.2.3-70-g09d2


From 09d96ee5674a0eaa800c664353756ecc45c4a87f Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Fri, 20 Oct 2023 17:59:49 +0800
Subject: page_pool: remove PP_FLAG_PAGE_FRAG

PP_FLAG_PAGE_FRAG is not really needed after pp_frag_count
handling is unified and page_pool_alloc_frag() is supported
in 32-bit arch with 64-bit DMA, so remove it.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
CC: Lorenzo Bianconi <lorenzo@kernel.org>
CC: Alexander Duyck <alexander.duyck@gmail.com>
CC: Liang Chen <liangchen.linux@gmail.com>
CC: Alexander Lobakin <aleksander.lobakin@intel.com>
Link: https://lore.kernel.org/r/20231020095952.11055-3-linyunsheng@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c                | 2 --
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c          | 3 +--
 drivers/net/ethernet/intel/idpf/idpf_txrx.c              | 3 ---
 drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c        | 2 +-
 drivers/net/wireless/mediatek/mt76/mac80211.c            | 2 +-
 include/net/page_pool/types.h                            | 6 ++----
 net/core/page_pool.c                                     | 3 +--
 net/core/skbuff.c                                        | 2 +-
 9 files changed, 8 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 5d7a29f99401..d0359b569afe 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -3302,8 +3302,6 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
 	pp.dma_dir = bp->rx_dir;
 	pp.max_len = PAGE_SIZE;
 	pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
-	if (PAGE_SIZE > BNXT_RX_PAGE_SIZE)
-		pp.flags |= PP_FLAG_PAGE_FRAG;
 
 	rxr->page_pool = page_pool_create(&pp);
 	if (IS_ERR(rxr->page_pool)) {
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index cf50368441b7..06117502001f 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -4940,8 +4940,7 @@ static void hns3_put_ring_config(struct hns3_nic_priv *priv)
 static void hns3_alloc_page_pool(struct hns3_enet_ring *ring)
 {
 	struct page_pool_params pp_params = {
-		.flags = PP_FLAG_DMA_MAP | PP_FLAG_PAGE_FRAG |
-				PP_FLAG_DMA_SYNC_DEV,
+		.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
 		.order = hns3_page_order(ring),
 		.pool_size = ring->desc_num * hns3_buf_size(ring) /
 				(PAGE_SIZE << hns3_page_order(ring)),
diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c
index 58c5412d3173..5e1ef70d54fe 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c
@@ -595,9 +595,6 @@ static struct page_pool *idpf_rx_create_page_pool(struct idpf_queue *rxbufq)
 		.offset		= 0,
 	};
 
-	if (rxbufq->rx_buf_size == IDPF_RX_BUF_2048)
-		pp.flags |= PP_FLAG_PAGE_FRAG;
-
 	return page_pool_create(&pp);
 }
 
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
index 818ce76185b2..1a42bfded872 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
@@ -1404,7 +1404,7 @@ int otx2_pool_init(struct otx2_nic *pfvf, u16 pool_id,
 	}
 
 	pp_params.order = get_order(buf_size);
-	pp_params.flags = PP_FLAG_PAGE_FRAG | PP_FLAG_DMA_MAP;
+	pp_params.flags = PP_FLAG_DMA_MAP;
 	pp_params.pool_size = min(OTX2_PAGE_POOL_SZ, numptrs);
 	pp_params.nid = NUMA_NO_NODE;
 	pp_params.dev = pfvf->dev;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 9325b8f00af0..ea58c6917433 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -897,7 +897,7 @@ static int mlx5e_alloc_rq(struct mlx5e_params *params,
 		struct page_pool_params pp_params = { 0 };
 
 		pp_params.order     = 0;
-		pp_params.flags     = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV | PP_FLAG_PAGE_FRAG;
+		pp_params.flags     = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
 		pp_params.pool_size = pool_size;
 		pp_params.nid       = node;
 		pp_params.dev       = rq->pdev;
diff --git a/drivers/net/wireless/mediatek/mt76/mac80211.c b/drivers/net/wireless/mediatek/mt76/mac80211.c
index cb76053973aa..51a767121b0d 100644
--- a/drivers/net/wireless/mediatek/mt76/mac80211.c
+++ b/drivers/net/wireless/mediatek/mt76/mac80211.c
@@ -570,7 +570,7 @@ int mt76_create_page_pool(struct mt76_dev *dev, struct mt76_queue *q)
 {
 	struct page_pool_params pp_params = {
 		.order = 0,
-		.flags = PP_FLAG_PAGE_FRAG,
+		.flags = 0,
 		.nid = NUMA_NO_NODE,
 		.dev = dev->dma_dev,
 	};
diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
index 887e7946a597..6fc5134095ed 100644
--- a/include/net/page_pool/types.h
+++ b/include/net/page_pool/types.h
@@ -17,10 +17,8 @@
 					* Please note DMA-sync-for-CPU is still
 					* device driver responsibility
 					*/
-#define PP_FLAG_PAGE_FRAG	BIT(2) /* for page frag feature */
 #define PP_FLAG_ALL		(PP_FLAG_DMA_MAP |\
-				 PP_FLAG_DMA_SYNC_DEV |\
-				 PP_FLAG_PAGE_FRAG)
+				 PP_FLAG_DMA_SYNC_DEV)
 
 /*
  * Fast allocation side cache array/stack
@@ -45,7 +43,7 @@ struct pp_alloc_cache {
 
 /**
  * struct page_pool_params - page pool parameters
- * @flags:	PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV, PP_FLAG_PAGE_FRAG
+ * @flags:	PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV
  * @order:	2^order pages on allocation
  * @pool_size:	size of the ptr_ring
  * @nid:	NUMA node id to allocate from pages from
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 953535cab081..2a3671c97ca7 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -756,8 +756,7 @@ struct page *page_pool_alloc_frag(struct page_pool *pool,
 	unsigned int max_size = PAGE_SIZE << pool->p.order;
 	struct page *page = pool->frag_page;
 
-	if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
-		    size > max_size))
+	if (WARN_ON(size > max_size))
 		return NULL;
 
 	size = ALIGN(size, dma_get_cache_alignment());
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 975c9a6ffb4a..c52ddd6891d9 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5765,7 +5765,7 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
 	/* In general, avoid mixing page_pool and non-page_pool allocated
 	 * pages within the same SKB. Additionally avoid dealing with clones
 	 * with page_pool pages, in case the SKB is using page_pool fragment
-	 * references (PP_FLAG_PAGE_FRAG). Since we only take full page
+	 * references (page_pool_alloc_frag()). Since we only take full page
 	 * references for cloned SKBs at the moment that would result in
 	 * inconsistent reference counts.
 	 * In theory we could take full references if @from is cloned and
-- 
cgit v1.2.3-70-g09d2


From de97502e16fc406a74edee8359612e518986cf59 Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Fri, 20 Oct 2023 17:59:50 +0800
Subject: page_pool: introduce page_pool_alloc() API

Currently page pool supports the below use cases:
use case 1: allocate page without page splitting using
            page_pool_alloc_pages() API if the driver knows
            that the memory it need is always bigger than
            half of the page allocated from page pool.
use case 2: allocate page frag with page splitting using
            page_pool_alloc_frag() API if the driver knows
            that the memory it need is always smaller than
            or equal to the half of the page allocated from
            page pool.

There is emerging use case [1] & [2] that is a mix of the
above two case: the driver doesn't know the size of memory it
need beforehand, so the driver may use something like below to
allocate memory with least memory utilization and performance
penalty:

if (size << 1 > max_size)
	page = page_pool_alloc_pages();
else
	page = page_pool_alloc_frag();

To avoid the driver doing something like above, add the
page_pool_alloc() API to support the above use case, and update
the true size of memory that is acctually allocated by updating
'*size' back to the driver in order to avoid exacerbating
truesize underestimate problem.

Rename page_pool_free() which is used in the destroy process to
__page_pool_destroy() to avoid confusion with the newly added
API.

1. https://lore.kernel.org/all/d3ae6bd3537fbce379382ac6a42f67e22f27ece2.1683896626.git.lorenzo@kernel.org/
2. https://lore.kernel.org/all/20230526054621.18371-3-liangchen.linux@gmail.com/

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
CC: Lorenzo Bianconi <lorenzo@kernel.org>
CC: Alexander Duyck <alexander.duyck@gmail.com>
CC: Liang Chen <liangchen.linux@gmail.com>
CC: Alexander Lobakin <aleksander.lobakin@intel.com>
Link: https://lore.kernel.org/r/20231020095952.11055-4-linyunsheng@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/page_pool/helpers.h | 66 +++++++++++++++++++++++++++++++++++++++++
 net/core/page_pool.c            |  4 +--
 2 files changed, 68 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h
index 759489c037c7..1b76e05dc4d2 100644
--- a/include/net/page_pool/helpers.h
+++ b/include/net/page_pool/helpers.h
@@ -82,6 +82,66 @@ static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool,
 	return page_pool_alloc_frag(pool, offset, size, gfp);
 }
 
+static inline struct page *page_pool_alloc(struct page_pool *pool,
+					   unsigned int *offset,
+					   unsigned int *size, gfp_t gfp)
+{
+	unsigned int max_size = PAGE_SIZE << pool->p.order;
+	struct page *page;
+
+	if ((*size << 1) > max_size) {
+		*size = max_size;
+		*offset = 0;
+		return page_pool_alloc_pages(pool, gfp);
+	}
+
+	page = page_pool_alloc_frag(pool, offset, *size, gfp);
+	if (unlikely(!page))
+		return NULL;
+
+	/* There is very likely not enough space for another fragment, so append
+	 * the remaining size to the current fragment to avoid truesize
+	 * underestimate problem.
+	 */
+	if (pool->frag_offset + *size > max_size) {
+		*size = max_size - *offset;
+		pool->frag_offset = max_size;
+	}
+
+	return page;
+}
+
+static inline struct page *page_pool_dev_alloc(struct page_pool *pool,
+					       unsigned int *offset,
+					       unsigned int *size)
+{
+	gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);
+
+	return page_pool_alloc(pool, offset, size, gfp);
+}
+
+static inline void *page_pool_alloc_va(struct page_pool *pool,
+				       unsigned int *size, gfp_t gfp)
+{
+	unsigned int offset;
+	struct page *page;
+
+	/* Mask off __GFP_HIGHMEM to ensure we can use page_address() */
+	page = page_pool_alloc(pool, &offset, size, gfp & ~__GFP_HIGHMEM);
+	if (unlikely(!page))
+		return NULL;
+
+	return page_address(page) + offset;
+}
+
+static inline void *page_pool_dev_alloc_va(struct page_pool *pool,
+					   unsigned int *size)
+{
+	gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);
+
+	return page_pool_alloc_va(pool, size, gfp);
+}
+
 /**
  * page_pool_get_dma_dir() - Retrieve the stored DMA direction.
  * @pool:	pool from which page was allocated
@@ -221,6 +281,12 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
 #define PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA	\
 		(sizeof(dma_addr_t) > sizeof(unsigned long))
 
+static inline void page_pool_free_va(struct page_pool *pool, void *va,
+				     bool allow_direct)
+{
+	page_pool_put_page(pool, virt_to_head_page(va), -1, allow_direct);
+}
+
 /**
  * page_pool_get_dma_addr() - Retrieve the stored DMA address.
  * @page:	page allocated from a page pool
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 2a3671c97ca7..5e409b98aba0 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -809,7 +809,7 @@ static void page_pool_empty_ring(struct page_pool *pool)
 	}
 }
 
-static void page_pool_free(struct page_pool *pool)
+static void __page_pool_destroy(struct page_pool *pool)
 {
 	if (pool->disconnect)
 		pool->disconnect(pool);
@@ -860,7 +860,7 @@ static int page_pool_release(struct page_pool *pool)
 	page_pool_scrub(pool);
 	inflight = page_pool_inflight(pool);
 	if (!inflight)
-		page_pool_free(pool);
+		__page_pool_destroy(pool);
 
 	return inflight;
 }
-- 
cgit v1.2.3-70-g09d2


From 8ab32fa1c7947f4807b1d98af2d411a2587bb841 Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Fri, 20 Oct 2023 17:59:51 +0800
Subject: page_pool: update document about fragment API

As more drivers begin to use the fragment API, update the
document about how to decide which API to use for the
driver author.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
CC: Lorenzo Bianconi <lorenzo@kernel.org>
CC: Alexander Duyck <alexander.duyck@gmail.com>
CC: Liang Chen <liangchen.linux@gmail.com>
CC: Alexander Lobakin <aleksander.lobakin@intel.com>
CC: Dima Tisnek <dimaqq@gmail.com>
Link: https://lore.kernel.org/r/20231020095952.11055-5-linyunsheng@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/page_pool.rst |  4 +-
 include/net/page_pool/helpers.h        | 93 +++++++++++++++++++++++++++++-----
 2 files changed, 83 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/page_pool.rst b/Documentation/networking/page_pool.rst
index 215ebc92752c..60993cb56b32 100644
--- a/Documentation/networking/page_pool.rst
+++ b/Documentation/networking/page_pool.rst
@@ -58,7 +58,9 @@ a page will cause no race conditions is enough.
 
 .. kernel-doc:: include/net/page_pool/helpers.h
    :identifiers: page_pool_put_page page_pool_put_full_page
-		 page_pool_recycle_direct page_pool_dev_alloc_pages
+		 page_pool_recycle_direct page_pool_free_va
+		 page_pool_dev_alloc_pages page_pool_dev_alloc_frag
+		 page_pool_dev_alloc page_pool_dev_alloc_va
 		 page_pool_get_dma_addr page_pool_get_dma_dir
 
 .. kernel-doc:: net/core/page_pool.c
diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h
index 1b76e05dc4d2..4ebd544ae977 100644
--- a/include/net/page_pool/helpers.h
+++ b/include/net/page_pool/helpers.h
@@ -8,23 +8,46 @@
 /**
  * DOC: page_pool allocator
  *
- * The page_pool allocator is optimized for the XDP mode that
- * uses one frame per-page, but it can fallback on the
- * regular page allocator APIs.
+ * The page_pool allocator is optimized for recycling page or page fragment used
+ * by skb packet and xdp frame.
  *
- * Basic use involves replacing alloc_pages() calls with the
- * page_pool_alloc_pages() call.  Drivers should use
- * page_pool_dev_alloc_pages() replacing dev_alloc_pages().
+ * Basic use involves replacing and alloc_pages() calls with page_pool_alloc(),
+ * which allocate memory with or without page splitting depending on the
+ * requested memory size.
  *
- * The API keeps track of in-flight pages, in order to let API users know
- * when it is safe to free a page_pool object.  Thus, API users
- * must call page_pool_put_page() to free the page, or attach
- * the page to a page_pool-aware object like skbs marked with
+ * If the driver knows that it always requires full pages or its allocations are
+ * always smaller than half a page, it can use one of the more specific API
+ * calls:
+ *
+ * 1. page_pool_alloc_pages(): allocate memory without page splitting when
+ * driver knows that the memory it need is always bigger than half of the page
+ * allocated from page pool. There is no cache line dirtying for 'struct page'
+ * when a page is recycled back to the page pool.
+ *
+ * 2. page_pool_alloc_frag(): allocate memory with page splitting when driver
+ * knows that the memory it need is always smaller than or equal to half of the
+ * page allocated from page pool. Page splitting enables memory saving and thus
+ * avoids TLB/cache miss for data access, but there also is some cost to
+ * implement page splitting, mainly some cache line dirtying/bouncing for
+ * 'struct page' and atomic operation for page->pp_frag_count.
+ *
+ * The API keeps track of in-flight pages, in order to let API users know when
+ * it is safe to free a page_pool object, the API users must call
+ * page_pool_put_page() or page_pool_free_va() to free the page_pool object, or
+ * attach the page_pool object to a page_pool-aware object like skbs marked with
  * skb_mark_for_recycle().
  *
- * API users must call page_pool_put_page() once on a page, as it
- * will either recycle the page, or in case of refcnt > 1, it will
- * release the DMA mapping and in-flight state accounting.
+ * page_pool_put_page() may be called multi times on the same page if a page is
+ * split into multi fragments. For the last fragment, it will either recycle the
+ * page, or in case of page->_refcount > 1, it will release the DMA mapping and
+ * in-flight state accounting.
+ *
+ * dma_sync_single_range_for_device() is only called for the last fragment when
+ * page_pool is created with PP_FLAG_DMA_SYNC_DEV flag, so it depends on the
+ * last freed fragment to do the sync_for_device operation for all fragments in
+ * the same page when a page is split, the API user must setup pool->p.max_len
+ * and pool->p.offset correctly and ensure that page_pool_put_page() is called
+ * with dma_sync_size being -1 for fragment API.
  */
 #ifndef _NET_PAGE_POOL_HELPERS_H
 #define _NET_PAGE_POOL_HELPERS_H
@@ -73,6 +96,17 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)
 	return page_pool_alloc_pages(pool, gfp);
 }
 
+/**
+ * page_pool_dev_alloc_frag() - allocate a page fragment.
+ * @pool: pool from which to allocate
+ * @offset: offset to the allocated page
+ * @size: requested size
+ *
+ * Get a page fragment from the page allocator or page_pool caches.
+ *
+ * Return:
+ * Return allocated page fragment, otherwise return NULL.
+ */
 static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool,
 						    unsigned int *offset,
 						    unsigned int size)
@@ -111,6 +145,19 @@ static inline struct page *page_pool_alloc(struct page_pool *pool,
 	return page;
 }
 
+/**
+ * page_pool_dev_alloc() - allocate a page or a page fragment.
+ * @pool: pool from which to allocate
+ * @offset: offset to the allocated page
+ * @size: in as the requested size, out as the allocated size
+ *
+ * Get a page or a page fragment from the page allocator or page_pool caches
+ * depending on the requested size in order to allocate memory with least memory
+ * utilization and performance penalty.
+ *
+ * Return:
+ * Return allocated page or page fragment, otherwise return NULL.
+ */
 static inline struct page *page_pool_dev_alloc(struct page_pool *pool,
 					       unsigned int *offset,
 					       unsigned int *size)
@@ -134,6 +181,18 @@ static inline void *page_pool_alloc_va(struct page_pool *pool,
 	return page_address(page) + offset;
 }
 
+/**
+ * page_pool_dev_alloc_va() - allocate a page or a page fragment and return its
+ *			      va.
+ * @pool: pool from which to allocate
+ * @size: in as the requested size, out as the allocated size
+ *
+ * This is just a thin wrapper around the page_pool_alloc() API, and
+ * it returns va of the allocated page or page fragment.
+ *
+ * Return:
+ * Return the va for the allocated page or page fragment, otherwise return NULL.
+ */
 static inline void *page_pool_dev_alloc_va(struct page_pool *pool,
 					   unsigned int *size)
 {
@@ -281,6 +340,14 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
 #define PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA	\
 		(sizeof(dma_addr_t) > sizeof(unsigned long))
 
+/**
+ * page_pool_free_va() - free a va into the page_pool
+ * @pool: pool from which va was allocated
+ * @va: va to be freed
+ * @allow_direct: freed by the consumer, allow lockless caching
+ *
+ * Free a va allocated from page_pool_allo_va().
+ */
 static inline void page_pool_free_va(struct page_pool *pool, void *va,
 				     bool allow_direct)
 {
-- 
cgit v1.2.3-70-g09d2


From 2793a8b015f7f1caadb9bce9c63dc659f7522676 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Tue, 24 Oct 2023 03:09:13 +0300
Subject: bpf: exact states comparison for iterator convergence checks

Convergence for open coded iterators is computed in is_state_visited()
by examining states with branches count > 1 and using states_equal().
states_equal() computes sub-state relation using read and precision marks.
Read and precision marks are propagated from children states,
thus are not guaranteed to be complete inside a loop when branches
count > 1. This could be demonstrated using the following unsafe program:

     1. r7 = -16
     2. r6 = bpf_get_prandom_u32()
     3. while (bpf_iter_num_next(&fp[-8])) {
     4.   if (r6 != 42) {
     5.     r7 = -32
     6.     r6 = bpf_get_prandom_u32()
     7.     continue
     8.   }
     9.   r0 = r10
    10.   r0 += r7
    11.   r8 = *(u64 *)(r0 + 0)
    12.   r6 = bpf_get_prandom_u32()
    13. }

Here verifier would first visit path 1-3, create a checkpoint at 3
with r7=-16, continue to 4-7,3 with r7=-32.

Because instructions at 9-12 had not been visitied yet existing
checkpoint at 3 does not have read or precision mark for r7.
Thus states_equal() would return true and verifier would discard
current state, thus unsafe memory access at 11 would not be caught.

This commit fixes this loophole by introducing exact state comparisons
for iterator convergence logic:
- registers are compared using regs_exact() regardless of read or
  precision marks;
- stack slots have to have identical type.

Unfortunately, this is too strict even for simple programs like below:

    i = 0;
    while(iter_next(&it))
      i++;

At each iteration step i++ would produce a new distinct state and
eventually instruction processing limit would be reached.

To avoid such behavior speculatively forget (widen) range for
imprecise scalar registers, if those registers were not precise at the
end of the previous iteration and do not match exactly.

This a conservative heuristic that allows to verify wide range of
programs, however it precludes verification of programs that conjure
an imprecise value on the first loop iteration and use it as precise
on the second.

Test case iter_task_vma_for_each() presents one of such cases:

        unsigned int seen = 0;
        ...
        bpf_for_each(task_vma, vma, task, 0) {
                if (seen >= 1000)
                        break;
                ...
                seen++;
        }

Here clang generates the following code:

<LBB0_4>:
      24:       r8 = r6                          ; stash current value of
                ... body ...                       'seen'
      29:       r1 = r10
      30:       r1 += -0x8
      31:       call bpf_iter_task_vma_next
      32:       r6 += 0x1                        ; seen++;
      33:       if r0 == 0x0 goto +0x2 <LBB0_6>  ; exit on next() == NULL
      34:       r7 += 0x10
      35:       if r8 < 0x3e7 goto -0xc <LBB0_4> ; loop on seen < 1000

<LBB0_6>:
      ... exit ...

Note that counter in r6 is copied to r8 and then incremented,
conditional jump is done using r8. Because of this precision mark for
r6 lags one state behind of precision mark on r8 and widening logic
kicks in.

Adding barrier_var(seen) after conditional is sufficient to force
clang use the same register for both counting and conditional jump.

This issue was discussed in the thread [1] which was started by
Andrew Werner <awerner32@gmail.com> demonstrating a similar bug
in callback functions handling. The callbacks would be addressed
in a followup patch.

[1] https://lore.kernel.org/bpf/97a90da09404c65c8e810cf83c94ac703705dc0e.camel@gmail.com/

Co-developed-by: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Co-developed-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20231024000917.12153-4-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h                       |   1 +
 kernel/bpf/verifier.c                              | 218 ++++++++++++++++++---
 tools/testing/selftests/bpf/progs/iters_task_vma.c |   1 +
 3 files changed, 189 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index e67cd45a85be..38b788228594 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -384,6 +384,7 @@ struct bpf_verifier_state {
 	 */
 	struct bpf_idx_pair *jmp_history;
 	u32 jmp_history_cnt;
+	u32 dfs_depth;
 };
 
 #define bpf_get_spilled_reg(slot, frame, mask)				\
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 366029e484a0..6573e8c77329 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1802,6 +1802,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	dst_state->parent = src->parent;
 	dst_state->first_insn_idx = src->first_insn_idx;
 	dst_state->last_insn_idx = src->last_insn_idx;
+	dst_state->dfs_depth = src->dfs_depth;
 	for (i = 0; i <= src->curframe; i++) {
 		dst = dst_state->frame[i];
 		if (!dst) {
@@ -7723,6 +7724,81 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
 	return 0;
 }
 
+/* Look for a previous loop entry at insn_idx: nearest parent state
+ * stopped at insn_idx with callsites matching those in cur->frame.
+ */
+static struct bpf_verifier_state *find_prev_entry(struct bpf_verifier_env *env,
+						  struct bpf_verifier_state *cur,
+						  int insn_idx)
+{
+	struct bpf_verifier_state_list *sl;
+	struct bpf_verifier_state *st;
+
+	/* Explored states are pushed in stack order, most recent states come first */
+	sl = *explored_state(env, insn_idx);
+	for (; sl; sl = sl->next) {
+		/* If st->branches != 0 state is a part of current DFS verification path,
+		 * hence cur & st for a loop.
+		 */
+		st = &sl->state;
+		if (st->insn_idx == insn_idx && st->branches && same_callsites(st, cur) &&
+		    st->dfs_depth < cur->dfs_depth)
+			return st;
+	}
+
+	return NULL;
+}
+
+static void reset_idmap_scratch(struct bpf_verifier_env *env);
+static bool regs_exact(const struct bpf_reg_state *rold,
+		       const struct bpf_reg_state *rcur,
+		       struct bpf_idmap *idmap);
+
+static void maybe_widen_reg(struct bpf_verifier_env *env,
+			    struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
+			    struct bpf_idmap *idmap)
+{
+	if (rold->type != SCALAR_VALUE)
+		return;
+	if (rold->type != rcur->type)
+		return;
+	if (rold->precise || rcur->precise || regs_exact(rold, rcur, idmap))
+		return;
+	__mark_reg_unknown(env, rcur);
+}
+
+static int widen_imprecise_scalars(struct bpf_verifier_env *env,
+				   struct bpf_verifier_state *old,
+				   struct bpf_verifier_state *cur)
+{
+	struct bpf_func_state *fold, *fcur;
+	int i, fr;
+
+	reset_idmap_scratch(env);
+	for (fr = old->curframe; fr >= 0; fr--) {
+		fold = old->frame[fr];
+		fcur = cur->frame[fr];
+
+		for (i = 0; i < MAX_BPF_REG; i++)
+			maybe_widen_reg(env,
+					&fold->regs[i],
+					&fcur->regs[i],
+					&env->idmap_scratch);
+
+		for (i = 0; i < fold->allocated_stack / BPF_REG_SIZE; i++) {
+			if (!is_spilled_reg(&fold->stack[i]) ||
+			    !is_spilled_reg(&fcur->stack[i]))
+				continue;
+
+			maybe_widen_reg(env,
+					&fold->stack[i].spilled_ptr,
+					&fcur->stack[i].spilled_ptr,
+					&env->idmap_scratch);
+		}
+	}
+	return 0;
+}
+
 /* process_iter_next_call() is called when verifier gets to iterator's next
  * "method" (e.g., bpf_iter_num_next() for numbers iterator) call. We'll refer
  * to it as just "iter_next()" in comments below.
@@ -7764,25 +7840,47 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
  * is some statically known limit on number of iterations (e.g., if there is
  * an explicit `if n > 100 then break;` statement somewhere in the loop).
  *
- * One very subtle but very important aspect is that we *always* simulate NULL
- * condition first (as the current state) before we simulate non-NULL case.
- * This has to do with intricacies of scalar precision tracking. By simulating
- * "exit condition" of iter_next() returning NULL first, we make sure all the
- * relevant precision marks *that will be set **after** we exit iterator loop*
- * are propagated backwards to common parent state of NULL and non-NULL
- * branches. Thanks to that, state equivalence checks done later in forked
- * state, when reaching iter_next() for ACTIVE iterator, can assume that
- * precision marks are finalized and won't change. Because simulating another
- * ACTIVE iterator iteration won't change them (because given same input
- * states we'll end up with exactly same output states which we are currently
- * comparing; and verification after the loop already propagated back what
- * needs to be **additionally** tracked as precise). It's subtle, grok
- * precision tracking for more intuitive understanding.
+ * Iteration convergence logic in is_state_visited() relies on exact
+ * states comparison, which ignores read and precision marks.
+ * This is necessary because read and precision marks are not finalized
+ * while in the loop. Exact comparison might preclude convergence for
+ * simple programs like below:
+ *
+ *     i = 0;
+ *     while(iter_next(&it))
+ *       i++;
+ *
+ * At each iteration step i++ would produce a new distinct state and
+ * eventually instruction processing limit would be reached.
+ *
+ * To avoid such behavior speculatively forget (widen) range for
+ * imprecise scalar registers, if those registers were not precise at the
+ * end of the previous iteration and do not match exactly.
+ *
+ * This is a conservative heuristic that allows to verify wide range of programs,
+ * however it precludes verification of programs that conjure an
+ * imprecise value on the first loop iteration and use it as precise on a second.
+ * For example, the following safe program would fail to verify:
+ *
+ *     struct bpf_num_iter it;
+ *     int arr[10];
+ *     int i = 0, a = 0;
+ *     bpf_iter_num_new(&it, 0, 10);
+ *     while (bpf_iter_num_next(&it)) {
+ *       if (a == 0) {
+ *         a = 1;
+ *         i = 7; // Because i changed verifier would forget
+ *                // it's range on second loop entry.
+ *       } else {
+ *         arr[i] = 42; // This would fail to verify.
+ *       }
+ *     }
+ *     bpf_iter_num_destroy(&it);
  */
 static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
 				  struct bpf_kfunc_call_arg_meta *meta)
 {
-	struct bpf_verifier_state *cur_st = env->cur_state, *queued_st;
+	struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st;
 	struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr;
 	struct bpf_reg_state *cur_iter, *queued_iter;
 	int iter_frameno = meta->iter.frameno;
@@ -7800,6 +7898,19 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
 	}
 
 	if (cur_iter->iter.state == BPF_ITER_STATE_ACTIVE) {
+		/* Because iter_next() call is a checkpoint is_state_visitied()
+		 * should guarantee parent state with same call sites and insn_idx.
+		 */
+		if (!cur_st->parent || cur_st->parent->insn_idx != insn_idx ||
+		    !same_callsites(cur_st->parent, cur_st)) {
+			verbose(env, "bug: bad parent state for iter next call");
+			return -EFAULT;
+		}
+		/* Note cur_st->parent in the call below, it is necessary to skip
+		 * checkpoint created for cur_st by is_state_visited()
+		 * right at this instruction.
+		 */
+		prev_st = find_prev_entry(env, cur_st->parent, insn_idx);
 		/* branch out active iter state */
 		queued_st = push_stack(env, insn_idx + 1, insn_idx, false);
 		if (!queued_st)
@@ -7808,6 +7919,8 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
 		queued_iter = &queued_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
 		queued_iter->iter.state = BPF_ITER_STATE_ACTIVE;
 		queued_iter->iter.depth++;
+		if (prev_st)
+			widen_imprecise_scalars(env, prev_st, queued_st);
 
 		queued_fr = queued_st->frame[queued_st->curframe];
 		mark_ptr_not_null_reg(&queued_fr->regs[BPF_REG_0]);
@@ -15948,8 +16061,11 @@ static bool regs_exact(const struct bpf_reg_state *rold,
 
 /* Returns true if (rold safe implies rcur safe) */
 static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
-		    struct bpf_reg_state *rcur, struct bpf_idmap *idmap)
+		    struct bpf_reg_state *rcur, struct bpf_idmap *idmap, bool exact)
 {
+	if (exact)
+		return regs_exact(rold, rcur, idmap);
+
 	if (!(rold->live & REG_LIVE_READ))
 		/* explored state didn't use this */
 		return true;
@@ -16066,7 +16182,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
 }
 
 static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
-		      struct bpf_func_state *cur, struct bpf_idmap *idmap)
+		      struct bpf_func_state *cur, struct bpf_idmap *idmap, bool exact)
 {
 	int i, spi;
 
@@ -16079,7 +16195,12 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
 
 		spi = i / BPF_REG_SIZE;
 
-		if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) {
+		if (exact &&
+		    old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
+		    cur->stack[spi].slot_type[i % BPF_REG_SIZE])
+			return false;
+
+		if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ) && !exact) {
 			i += BPF_REG_SIZE - 1;
 			/* explored state didn't use this */
 			continue;
@@ -16129,7 +16250,7 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
 			 * return false to continue verification of this path
 			 */
 			if (!regsafe(env, &old->stack[spi].spilled_ptr,
-				     &cur->stack[spi].spilled_ptr, idmap))
+				     &cur->stack[spi].spilled_ptr, idmap, exact))
 				return false;
 			break;
 		case STACK_DYNPTR:
@@ -16211,16 +16332,16 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur,
  * the current state will reach 'bpf_exit' instruction safely
  */
 static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old,
-			      struct bpf_func_state *cur)
+			      struct bpf_func_state *cur, bool exact)
 {
 	int i;
 
 	for (i = 0; i < MAX_BPF_REG; i++)
 		if (!regsafe(env, &old->regs[i], &cur->regs[i],
-			     &env->idmap_scratch))
+			     &env->idmap_scratch, exact))
 			return false;
 
-	if (!stacksafe(env, old, cur, &env->idmap_scratch))
+	if (!stacksafe(env, old, cur, &env->idmap_scratch, exact))
 		return false;
 
 	if (!refsafe(old, cur, &env->idmap_scratch))
@@ -16229,17 +16350,23 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
 	return true;
 }
 
+static void reset_idmap_scratch(struct bpf_verifier_env *env)
+{
+	env->idmap_scratch.tmp_id_gen = env->id_gen;
+	memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map));
+}
+
 static bool states_equal(struct bpf_verifier_env *env,
 			 struct bpf_verifier_state *old,
-			 struct bpf_verifier_state *cur)
+			 struct bpf_verifier_state *cur,
+			 bool exact)
 {
 	int i;
 
 	if (old->curframe != cur->curframe)
 		return false;
 
-	env->idmap_scratch.tmp_id_gen = env->id_gen;
-	memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map));
+	reset_idmap_scratch(env);
 
 	/* Verification state from speculative execution simulation
 	 * must never prune a non-speculative execution one.
@@ -16269,7 +16396,7 @@ static bool states_equal(struct bpf_verifier_env *env,
 	for (i = 0; i <= old->curframe; i++) {
 		if (old->frame[i]->callsite != cur->frame[i]->callsite)
 			return false;
-		if (!func_states_equal(env, old->frame[i], cur->frame[i]))
+		if (!func_states_equal(env, old->frame[i], cur->frame[i], exact))
 			return false;
 	}
 	return true;
@@ -16524,7 +16651,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	struct bpf_verifier_state_list *new_sl;
 	struct bpf_verifier_state_list *sl, **pprev;
 	struct bpf_verifier_state *cur = env->cur_state, *new;
-	int i, j, err, states_cnt = 0;
+	int i, j, n, err, states_cnt = 0;
 	bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx);
 	bool add_new_state = force_new_state;
 
@@ -16579,9 +16706,33 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 			 * It's safe to assume that iterator loop will finish, taking into
 			 * account iter_next() contract of eventually returning
 			 * sticky NULL result.
+			 *
+			 * Note, that states have to be compared exactly in this case because
+			 * read and precision marks might not be finalized inside the loop.
+			 * E.g. as in the program below:
+			 *
+			 *     1. r7 = -16
+			 *     2. r6 = bpf_get_prandom_u32()
+			 *     3. while (bpf_iter_num_next(&fp[-8])) {
+			 *     4.   if (r6 != 42) {
+			 *     5.     r7 = -32
+			 *     6.     r6 = bpf_get_prandom_u32()
+			 *     7.     continue
+			 *     8.   }
+			 *     9.   r0 = r10
+			 *    10.   r0 += r7
+			 *    11.   r8 = *(u64 *)(r0 + 0)
+			 *    12.   r6 = bpf_get_prandom_u32()
+			 *    13. }
+			 *
+			 * Here verifier would first visit path 1-3, create a checkpoint at 3
+			 * with r7=-16, continue to 4-7,3. Existing checkpoint at 3 does
+			 * not have read or precision mark for r7 yet, thus inexact states
+			 * comparison would discard current state with r7=-32
+			 * => unsafe memory access at 11 would not be caught.
 			 */
 			if (is_iter_next_insn(env, insn_idx)) {
-				if (states_equal(env, &sl->state, cur)) {
+				if (states_equal(env, &sl->state, cur, true)) {
 					struct bpf_func_state *cur_frame;
 					struct bpf_reg_state *iter_state, *iter_reg;
 					int spi;
@@ -16604,7 +16755,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 			}
 			/* attempt to detect infinite loop to avoid unnecessary doomed work */
 			if (states_maybe_looping(&sl->state, cur) &&
-			    states_equal(env, &sl->state, cur) &&
+			    states_equal(env, &sl->state, cur, false) &&
 			    !iter_active_depths_differ(&sl->state, cur)) {
 				verbose_linfo(env, insn_idx, "; ");
 				verbose(env, "infinite loop detected at insn %d\n", insn_idx);
@@ -16629,7 +16780,7 @@ skip_inf_loop_check:
 				add_new_state = false;
 			goto miss;
 		}
-		if (states_equal(env, &sl->state, cur)) {
+		if (states_equal(env, &sl->state, cur, false)) {
 hit:
 			sl->hit_cnt++;
 			/* reached equivalent register/stack state,
@@ -16668,8 +16819,12 @@ miss:
 		 * to keep checking from state equivalence point of view.
 		 * Higher numbers increase max_states_per_insn and verification time,
 		 * but do not meaningfully decrease insn_processed.
+		 * 'n' controls how many times state could miss before eviction.
+		 * Use bigger 'n' for checkpoints because evicting checkpoint states
+		 * too early would hinder iterator convergence.
 		 */
-		if (sl->miss_cnt > sl->hit_cnt * 3 + 3) {
+		n = is_force_checkpoint(env, insn_idx) && sl->state.branches > 0 ? 64 : 3;
+		if (sl->miss_cnt > sl->hit_cnt * n + n) {
 			/* the state is unlikely to be useful. Remove it to
 			 * speed up verification
 			 */
@@ -16743,6 +16898,7 @@ next:
 
 	cur->parent = new;
 	cur->first_insn_idx = insn_idx;
+	cur->dfs_depth = new->dfs_depth + 1;
 	clear_jmp_history(cur);
 	new_sl->next = *explored_state(env, insn_idx);
 	*explored_state(env, insn_idx) = new_sl;
diff --git a/tools/testing/selftests/bpf/progs/iters_task_vma.c b/tools/testing/selftests/bpf/progs/iters_task_vma.c
index 44edecfdfaee..e085a51d153e 100644
--- a/tools/testing/selftests/bpf/progs/iters_task_vma.c
+++ b/tools/testing/selftests/bpf/progs/iters_task_vma.c
@@ -30,6 +30,7 @@ int iter_task_vma_for_each(const void *ctx)
 	bpf_for_each(task_vma, vma, task, 0) {
 		if (seen >= 1000)
 			break;
+		barrier_var(seen);
 
 		vm_ranges[seen].vm_start = vma->vm_start;
 		vm_ranges[seen].vm_end = vma->vm_end;
-- 
cgit v1.2.3-70-g09d2


From 2a0992829ea3864939d917a5c7b48be6629c6217 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Tue, 24 Oct 2023 03:09:15 +0300
Subject: bpf: correct loop detection for iterators convergence

It turns out that .branches > 0 in is_state_visited() is not a
sufficient condition to identify if two verifier states form a loop
when iterators convergence is computed. This commit adds logic to
distinguish situations like below:

 (I)            initial       (II)            initial
                  |                             |
                  V                             V
     .---------> hdr                           ..
     |            |                             |
     |            V                             V
     |    .------...                    .------..
     |    |       |                     |       |
     |    V       V                     V       V
     |   ...     ...               .-> hdr     ..
     |    |       |                |    |       |
     |    V       V                |    V       V
     |   succ <- cur               |   succ <- cur
     |    |                        |    |
     |    V                        |    V
     |   ...                       |   ...
     |    |                        |    |
     '----'                        '----'

For both (I) and (II) successor 'succ' of the current state 'cur' was
previously explored and has branches count at 0. However, loop entry
'hdr' corresponding to 'succ' might be a part of current DFS path.
If that is the case 'succ' and 'cur' are members of the same loop
and have to be compared exactly.

Co-developed-by: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Co-developed-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Reviewed-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20231024000917.12153-6-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  15 ++++
 kernel/bpf/verifier.c        | 207 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 218 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 38b788228594..24213a99cc79 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -373,10 +373,25 @@ struct bpf_verifier_state {
 	struct bpf_active_lock active_lock;
 	bool speculative;
 	bool active_rcu_lock;
+	/* If this state was ever pointed-to by other state's loop_entry field
+	 * this flag would be set to true. Used to avoid freeing such states
+	 * while they are still in use.
+	 */
+	bool used_as_loop_entry;
 
 	/* first and last insn idx of this verifier state */
 	u32 first_insn_idx;
 	u32 last_insn_idx;
+	/* If this state is a part of states loop this field points to some
+	 * parent of this state such that:
+	 * - it is also a member of the same states loop;
+	 * - DFS states traversal starting from initial state visits loop_entry
+	 *   state before this state.
+	 * Used to compute topmost loop entry for state loops.
+	 * State loops might appear because of open coded iterators logic.
+	 * See get_loop_entry() for more information.
+	 */
+	struct bpf_verifier_state *loop_entry;
 	/* jmp history recorded from first to last.
 	 * backtracking is using it to go from last to first.
 	 * For most states jmp_history_cnt is [0-3].
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6573e8c77329..f23fbfe82c59 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1803,6 +1803,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	dst_state->first_insn_idx = src->first_insn_idx;
 	dst_state->last_insn_idx = src->last_insn_idx;
 	dst_state->dfs_depth = src->dfs_depth;
+	dst_state->used_as_loop_entry = src->used_as_loop_entry;
 	for (i = 0; i <= src->curframe; i++) {
 		dst = dst_state->frame[i];
 		if (!dst) {
@@ -1845,11 +1846,176 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta
 	return true;
 }
 
+/* Open coded iterators allow back-edges in the state graph in order to
+ * check unbounded loops that iterators.
+ *
+ * In is_state_visited() it is necessary to know if explored states are
+ * part of some loops in order to decide whether non-exact states
+ * comparison could be used:
+ * - non-exact states comparison establishes sub-state relation and uses
+ *   read and precision marks to do so, these marks are propagated from
+ *   children states and thus are not guaranteed to be final in a loop;
+ * - exact states comparison just checks if current and explored states
+ *   are identical (and thus form a back-edge).
+ *
+ * Paper "A New Algorithm for Identifying Loops in Decompilation"
+ * by Tao Wei, Jian Mao, Wei Zou and Yu Chen [1] presents a convenient
+ * algorithm for loop structure detection and gives an overview of
+ * relevant terminology. It also has helpful illustrations.
+ *
+ * [1] https://api.semanticscholar.org/CorpusID:15784067
+ *
+ * We use a similar algorithm but because loop nested structure is
+ * irrelevant for verifier ours is significantly simpler and resembles
+ * strongly connected components algorithm from Sedgewick's textbook.
+ *
+ * Define topmost loop entry as a first node of the loop traversed in a
+ * depth first search starting from initial state. The goal of the loop
+ * tracking algorithm is to associate topmost loop entries with states
+ * derived from these entries.
+ *
+ * For each step in the DFS states traversal algorithm needs to identify
+ * the following situations:
+ *
+ *          initial                     initial                   initial
+ *            |                           |                         |
+ *            V                           V                         V
+ *           ...                         ...           .---------> hdr
+ *            |                           |            |            |
+ *            V                           V            |            V
+ *           cur                     .-> succ          |    .------...
+ *            |                      |    |            |    |       |
+ *            V                      |    V            |    V       V
+ *           succ                    '-- cur           |   ...     ...
+ *                                                     |    |       |
+ *                                                     |    V       V
+ *                                                     |   succ <- cur
+ *                                                     |    |
+ *                                                     |    V
+ *                                                     |   ...
+ *                                                     |    |
+ *                                                     '----'
+ *
+ *  (A) successor state of cur   (B) successor state of cur or it's entry
+ *      not yet traversed            are in current DFS path, thus cur and succ
+ *                                   are members of the same outermost loop
+ *
+ *                      initial                  initial
+ *                        |                        |
+ *                        V                        V
+ *                       ...                      ...
+ *                        |                        |
+ *                        V                        V
+ *                .------...               .------...
+ *                |       |                |       |
+ *                V       V                V       V
+ *           .-> hdr     ...              ...     ...
+ *           |    |       |                |       |
+ *           |    V       V                V       V
+ *           |   succ <- cur              succ <- cur
+ *           |    |                        |
+ *           |    V                        V
+ *           |   ...                      ...
+ *           |    |                        |
+ *           '----'                       exit
+ *
+ * (C) successor state of cur is a part of some loop but this loop
+ *     does not include cur or successor state is not in a loop at all.
+ *
+ * Algorithm could be described as the following python code:
+ *
+ *     traversed = set()   # Set of traversed nodes
+ *     entries = {}        # Mapping from node to loop entry
+ *     depths = {}         # Depth level assigned to graph node
+ *     path = set()        # Current DFS path
+ *
+ *     # Find outermost loop entry known for n
+ *     def get_loop_entry(n):
+ *         h = entries.get(n, None)
+ *         while h in entries and entries[h] != h:
+ *             h = entries[h]
+ *         return h
+ *
+ *     # Update n's loop entry if h's outermost entry comes
+ *     # before n's outermost entry in current DFS path.
+ *     def update_loop_entry(n, h):
+ *         n1 = get_loop_entry(n) or n
+ *         h1 = get_loop_entry(h) or h
+ *         if h1 in path and depths[h1] <= depths[n1]:
+ *             entries[n] = h1
+ *
+ *     def dfs(n, depth):
+ *         traversed.add(n)
+ *         path.add(n)
+ *         depths[n] = depth
+ *         for succ in G.successors(n):
+ *             if succ not in traversed:
+ *                 # Case A: explore succ and update cur's loop entry
+ *                 #         only if succ's entry is in current DFS path.
+ *                 dfs(succ, depth + 1)
+ *                 h = get_loop_entry(succ)
+ *                 update_loop_entry(n, h)
+ *             else:
+ *                 # Case B or C depending on `h1 in path` check in update_loop_entry().
+ *                 update_loop_entry(n, succ)
+ *         path.remove(n)
+ *
+ * To adapt this algorithm for use with verifier:
+ * - use st->branch == 0 as a signal that DFS of succ had been finished
+ *   and cur's loop entry has to be updated (case A), handle this in
+ *   update_branch_counts();
+ * - use st->branch > 0 as a signal that st is in the current DFS path;
+ * - handle cases B and C in is_state_visited();
+ * - update topmost loop entry for intermediate states in get_loop_entry().
+ */
+static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_state *st)
+{
+	struct bpf_verifier_state *topmost = st->loop_entry, *old;
+
+	while (topmost && topmost->loop_entry && topmost != topmost->loop_entry)
+		topmost = topmost->loop_entry;
+	/* Update loop entries for intermediate states to avoid this
+	 * traversal in future get_loop_entry() calls.
+	 */
+	while (st && st->loop_entry != topmost) {
+		old = st->loop_entry;
+		st->loop_entry = topmost;
+		st = old;
+	}
+	return topmost;
+}
+
+static void update_loop_entry(struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr)
+{
+	struct bpf_verifier_state *cur1, *hdr1;
+
+	cur1 = get_loop_entry(cur) ?: cur;
+	hdr1 = get_loop_entry(hdr) ?: hdr;
+	/* The head1->branches check decides between cases B and C in
+	 * comment for get_loop_entry(). If hdr1->branches == 0 then
+	 * head's topmost loop entry is not in current DFS path,
+	 * hence 'cur' and 'hdr' are not in the same loop and there is
+	 * no need to update cur->loop_entry.
+	 */
+	if (hdr1->branches && hdr1->dfs_depth <= cur1->dfs_depth) {
+		cur->loop_entry = hdr;
+		hdr->used_as_loop_entry = true;
+	}
+}
+
 static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
 {
 	while (st) {
 		u32 br = --st->branches;
 
+		/* br == 0 signals that DFS exploration for 'st' is finished,
+		 * thus it is necessary to update parent's loop entry if it
+		 * turned out that st is a part of some loop.
+		 * This is a part of 'case A' in get_loop_entry() comment.
+		 */
+		if (br == 0 && st->parent && st->loop_entry)
+			update_loop_entry(st->parent, st->loop_entry);
+
 		/* WARN_ON(br > 1) technically makes sense here,
 		 * but see comment in push_stack(), hence:
 		 */
@@ -16650,10 +16816,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 {
 	struct bpf_verifier_state_list *new_sl;
 	struct bpf_verifier_state_list *sl, **pprev;
-	struct bpf_verifier_state *cur = env->cur_state, *new;
+	struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry;
 	int i, j, n, err, states_cnt = 0;
 	bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx);
 	bool add_new_state = force_new_state;
+	bool force_exact;
 
 	/* bpf progs typically have pruning point every 4 instructions
 	 * http://vger.kernel.org/bpfconf2019.html#session-1
@@ -16748,8 +16915,10 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 					 */
 					spi = __get_spi(iter_reg->off + iter_reg->var_off.value);
 					iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr;
-					if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE)
+					if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) {
+						update_loop_entry(cur, &sl->state);
 						goto hit;
+					}
 				}
 				goto skip_inf_loop_check;
 			}
@@ -16780,7 +16949,36 @@ skip_inf_loop_check:
 				add_new_state = false;
 			goto miss;
 		}
-		if (states_equal(env, &sl->state, cur, false)) {
+		/* If sl->state is a part of a loop and this loop's entry is a part of
+		 * current verification path then states have to be compared exactly.
+		 * 'force_exact' is needed to catch the following case:
+		 *
+		 *                initial     Here state 'succ' was processed first,
+		 *                  |         it was eventually tracked to produce a
+		 *                  V         state identical to 'hdr'.
+		 *     .---------> hdr        All branches from 'succ' had been explored
+		 *     |            |         and thus 'succ' has its .branches == 0.
+		 *     |            V
+		 *     |    .------...        Suppose states 'cur' and 'succ' correspond
+		 *     |    |       |         to the same instruction + callsites.
+		 *     |    V       V         In such case it is necessary to check
+		 *     |   ...     ...        if 'succ' and 'cur' are states_equal().
+		 *     |    |       |         If 'succ' and 'cur' are a part of the
+		 *     |    V       V         same loop exact flag has to be set.
+		 *     |   succ <- cur        To check if that is the case, verify
+		 *     |    |                 if loop entry of 'succ' is in current
+		 *     |    V                 DFS path.
+		 *     |   ...
+		 *     |    |
+		 *     '----'
+		 *
+		 * Additional details are in the comment before get_loop_entry().
+		 */
+		loop_entry = get_loop_entry(&sl->state);
+		force_exact = loop_entry && loop_entry->branches > 0;
+		if (states_equal(env, &sl->state, cur, force_exact)) {
+			if (force_exact)
+				update_loop_entry(cur, loop_entry);
 hit:
 			sl->hit_cnt++;
 			/* reached equivalent register/stack state,
@@ -16829,7 +17027,8 @@ miss:
 			 * speed up verification
 			 */
 			*pprev = sl->next;
-			if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) {
+			if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE &&
+			    !sl->state.used_as_loop_entry) {
 				u32 br = sl->state.branches;
 
 				WARN_ONCE(br,
-- 
cgit v1.2.3-70-g09d2


From 99b29a499b5fdfb7ab274835b8e4d4c11df2f6d7 Mon Sep 17 00:00:00 2001
From: Albert Huang <huangjie.albert@bytedance.com>
Date: Mon, 23 Oct 2023 20:57:31 +0800
Subject: xsk: Avoid starving the xsk further down the list

In the previous implementation, when multiple xsk sockets were
associated with a single xsk_buff_pool, a situation could arise
where the xsk_tx_list maintained data at the front for one xsk
socket while starving the xsk sockets at the back of the list.
This could result in issues such as the inability to transmit packets,
increased latency, and jitter. To address this problem, we introduce
a new variable called tx_budget_spent, which limits each xsk to transmit
a maximum of MAX_PER_SOCKET_BUDGET tx descriptors. This allocation ensures
equitable opportunities for subsequent xsk sockets to send tx descriptors.
The value of MAX_PER_SOCKET_BUDGET is set to 32.

Signed-off-by: Albert Huang <huangjie.albert@bytedance.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
Link: https://lore.kernel.org/bpf/20231023125732.82261-1-huangjie.albert@bytedance.com
---
 include/net/xdp_sock.h |  7 +++++++
 net/xdp/xsk.c          | 18 ++++++++++++++++++
 2 files changed, 25 insertions(+)

(limited to 'include')

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 7dd0df2f6f8e..f83128007fb0 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -63,6 +63,13 @@ struct xdp_sock {
 
 	struct xsk_queue *tx ____cacheline_aligned_in_smp;
 	struct list_head tx_list;
+	/* record the number of tx descriptors sent by this xsk and
+	 * when it exceeds MAX_PER_SOCKET_BUDGET, an opportunity needs
+	 * to be given to other xsks for sending tx descriptors, thereby
+	 * preventing other XSKs from being starved.
+	 */
+	u32 tx_budget_spent;
+
 	/* Protects generic receive. */
 	spinlock_t rx_lock;
 
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index ba070fd37d24..ae9f8cb611f6 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -33,6 +33,7 @@
 #include "xsk.h"
 
 #define TX_BATCH_SIZE 32
+#define MAX_PER_SOCKET_BUDGET (TX_BATCH_SIZE)
 
 static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);
 
@@ -423,16 +424,25 @@ EXPORT_SYMBOL(xsk_tx_release);
 
 bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
 {
+	bool budget_exhausted = false;
 	struct xdp_sock *xs;
 
 	rcu_read_lock();
+again:
 	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
+		if (xs->tx_budget_spent >= MAX_PER_SOCKET_BUDGET) {
+			budget_exhausted = true;
+			continue;
+		}
+
 		if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
 			if (xskq_has_descs(xs->tx))
 				xskq_cons_release(xs->tx);
 			continue;
 		}
 
+		xs->tx_budget_spent++;
+
 		/* This is the backpressure mechanism for the Tx path.
 		 * Reserve space in the completion queue and only proceed
 		 * if there is space in it. This avoids having to implement
@@ -446,6 +456,14 @@ bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
 		return true;
 	}
 
+	if (budget_exhausted) {
+		list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list)
+			xs->tx_budget_spent = 0;
+
+		budget_exhausted = false;
+		goto again;
+	}
+
 out:
 	rcu_read_unlock();
 	return false;
-- 
cgit v1.2.3-70-g09d2


From 643d1260366424412e8269caead410d333e3263f Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 20 Oct 2023 14:38:15 +0200
Subject: netfilter: conntrack: switch connlabels to atomic_t

The spinlock is back from the day when connabels did not have
a fixed size and reallocation had to be supported.

Remove it.  This change also allows to call the helpers from
softirq or timers without deadlocks.

Also add WARN()s to catch refcounting imbalances.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_labels.h |  2 +-
 include/net/netns/conntrack.h               |  2 +-
 net/netfilter/nf_conntrack_labels.c         | 17 ++++++++---------
 3 files changed, 10 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack_labels.h b/include/net/netfilter/nf_conntrack_labels.h
index fcb19a4e8f2b..6903f72bcc15 100644
--- a/include/net/netfilter/nf_conntrack_labels.h
+++ b/include/net/netfilter/nf_conntrack_labels.h
@@ -39,7 +39,7 @@ static inline struct nf_conn_labels *nf_ct_labels_ext_add(struct nf_conn *ct)
 #ifdef CONFIG_NF_CONNTRACK_LABELS
 	struct net *net = nf_ct_net(ct);
 
-	if (net->ct.labels_used == 0)
+	if (atomic_read(&net->ct.labels_used) == 0)
 		return NULL;
 
 	return nf_ct_ext_add(ct, NF_CT_EXT_LABELS, GFP_ATOMIC);
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index 1f463b3957c7..bae914815aa3 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -107,7 +107,7 @@ struct netns_ct {
 	struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
 	struct nf_ip_net	nf_ct_proto;
 #if defined(CONFIG_NF_CONNTRACK_LABELS)
-	unsigned int		labels_used;
+	atomic_t		labels_used;
 #endif
 };
 #endif
diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c
index 6e70e137a0a6..6c46aad23313 100644
--- a/net/netfilter/nf_conntrack_labels.c
+++ b/net/netfilter/nf_conntrack_labels.c
@@ -11,8 +11,6 @@
 #include <net/netfilter/nf_conntrack_ecache.h>
 #include <net/netfilter/nf_conntrack_labels.h>
 
-static DEFINE_SPINLOCK(nf_connlabels_lock);
-
 static int replace_u32(u32 *address, u32 mask, u32 new)
 {
 	u32 old, tmp;
@@ -60,23 +58,24 @@ EXPORT_SYMBOL_GPL(nf_connlabels_replace);
 
 int nf_connlabels_get(struct net *net, unsigned int bits)
 {
+	int v;
+
 	if (BIT_WORD(bits) >= NF_CT_LABELS_MAX_SIZE / sizeof(long))
 		return -ERANGE;
 
-	spin_lock(&nf_connlabels_lock);
-	net->ct.labels_used++;
-	spin_unlock(&nf_connlabels_lock);
-
 	BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE / sizeof(long) >= U8_MAX);
 
+	v = atomic_inc_return_relaxed(&net->ct.labels_used);
+	WARN_ON_ONCE(v <= 0);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(nf_connlabels_get);
 
 void nf_connlabels_put(struct net *net)
 {
-	spin_lock(&nf_connlabels_lock);
-	net->ct.labels_used--;
-	spin_unlock(&nf_connlabels_lock);
+	int v = atomic_dec_return_relaxed(&net->ct.labels_used);
+
+	WARN_ON_ONCE(v < 0);
 }
 EXPORT_SYMBOL_GPL(nf_connlabels_put);
-- 
cgit v1.2.3-70-g09d2


From 6509a2e410c3cb36c78a0a85c6102debe171337e Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 18 Oct 2023 22:20:23 +0200
Subject: netfilter: nf_tables: set backend .flush always succeeds

.flush is always successful since this results from iterating over the
set elements to toggle mark the element as inactive in the next
generation.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 2 +-
 net/netfilter/nf_tables_api.c     | 9 +--------
 net/netfilter/nft_set_bitmap.c    | 4 +---
 net/netfilter/nft_set_hash.c      | 7 ++-----
 net/netfilter/nft_set_pipapo.c    | 4 +---
 net/netfilter/nft_set_rbtree.c    | 4 +---
 6 files changed, 7 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 8de040d2d2cf..d0f5c477c254 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -449,7 +449,7 @@ struct nft_set_ops {
 	void *				(*deactivate)(const struct net *net,
 						      const struct nft_set *set,
 						      const struct nft_set_elem *elem);
-	bool				(*flush)(const struct net *net,
+	void				(*flush)(const struct net *net,
 						 const struct nft_set *set,
 						 void *priv);
 	void				(*remove)(const struct net *net,
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index ecb251f6c6a6..35db40857bc6 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -7146,17 +7146,13 @@ static int nft_setelem_flush(const struct nft_ctx *ctx,
 			     struct nft_set_elem *elem)
 {
 	struct nft_trans *trans;
-	int err;
 
 	trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM,
 				    sizeof(struct nft_trans_elem), GFP_ATOMIC);
 	if (!trans)
 		return -ENOMEM;
 
-	if (!set->ops->flush(ctx->net, set, elem->priv)) {
-		err = -ENOENT;
-		goto err1;
-	}
+	set->ops->flush(ctx->net, set, elem->priv);
 	set->ndeact++;
 
 	nft_setelem_data_deactivate(ctx->net, set, elem);
@@ -7165,9 +7161,6 @@ static int nft_setelem_flush(const struct nft_ctx *ctx,
 	nft_trans_commit_list_add_tail(ctx->net, trans);
 
 	return 0;
-err1:
-	kfree(trans);
-	return err;
 }
 
 static int __nft_set_catchall_flush(const struct nft_ctx *ctx,
diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
index 1e5e7a181e0b..2ee6e3672b41 100644
--- a/net/netfilter/nft_set_bitmap.c
+++ b/net/netfilter/nft_set_bitmap.c
@@ -174,7 +174,7 @@ static void nft_bitmap_activate(const struct net *net,
 	nft_set_elem_change_active(net, set, &be->ext);
 }
 
-static bool nft_bitmap_flush(const struct net *net,
+static void nft_bitmap_flush(const struct net *net,
 			     const struct nft_set *set, void *_be)
 {
 	struct nft_bitmap *priv = nft_set_priv(set);
@@ -186,8 +186,6 @@ static bool nft_bitmap_flush(const struct net *net,
 	/* Enter 10 state, similar to deactivation. */
 	priv->bitmap[idx] &= ~(genmask << off);
 	nft_set_elem_change_active(net, set, &be->ext);
-
-	return true;
 }
 
 static void *nft_bitmap_deactivate(const struct net *net,
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index 2013de934cef..e758b887ad86 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -192,14 +192,12 @@ static void nft_rhash_activate(const struct net *net, const struct nft_set *set,
 	nft_set_elem_change_active(net, set, &he->ext);
 }
 
-static bool nft_rhash_flush(const struct net *net,
+static void nft_rhash_flush(const struct net *net,
 			    const struct nft_set *set, void *priv)
 {
 	struct nft_rhash_elem *he = priv;
 
 	nft_set_elem_change_active(net, set, &he->ext);
-
-	return true;
 }
 
 static void *nft_rhash_deactivate(const struct net *net,
@@ -590,13 +588,12 @@ static void nft_hash_activate(const struct net *net, const struct nft_set *set,
 	nft_set_elem_change_active(net, set, &he->ext);
 }
 
-static bool nft_hash_flush(const struct net *net,
+static void nft_hash_flush(const struct net *net,
 			   const struct nft_set *set, void *priv)
 {
 	struct nft_hash_elem *he = priv;
 
 	nft_set_elem_change_active(net, set, &he->ext);
-	return true;
 }
 
 static void *nft_hash_deactivate(const struct net *net,
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index bea63aa2df4b..dba073aa9ad6 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -1809,14 +1809,12 @@ static void *nft_pipapo_deactivate(const struct net *net,
  *
  * Return: true if element was found and deactivated.
  */
-static bool nft_pipapo_flush(const struct net *net, const struct nft_set *set,
+static void nft_pipapo_flush(const struct net *net, const struct nft_set *set,
 			     void *elem)
 {
 	struct nft_pipapo_elem *e = elem;
 
 	nft_set_elem_change_active(net, set, &e->ext);
-
-	return true;
 }
 
 /**
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 7d1004f9e7d2..60ff591eb265 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -535,14 +535,12 @@ static void nft_rbtree_activate(const struct net *net,
 	nft_set_elem_change_active(net, set, &rbe->ext);
 }
 
-static bool nft_rbtree_flush(const struct net *net,
+static void nft_rbtree_flush(const struct net *net,
 			     const struct nft_set *set, void *priv)
 {
 	struct nft_rbtree_elem *rbe = priv;
 
 	nft_set_elem_change_active(net, set, &rbe->ext);
-
-	return true;
 }
 
 static void *nft_rbtree_deactivate(const struct net *net,
-- 
cgit v1.2.3-70-g09d2


From 9dad402b89e81a0516bad5e0ac009b7a0a80898f Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 18 Oct 2023 22:23:07 +0200
Subject: netfilter: nf_tables: expose opaque set element as struct
 nft_elem_priv

Add placeholder structure and place it at the beginning of each struct
nft_*_elem for each existing set backend, instead of exposing elements
as void type to the frontend which defeats compiler type checks. Use
this pointer to this new type to replace void *.

This patch updates the following set backend API to use this new struct
nft_elem_priv placeholder structure:

- update
- deactivate
- flush
- get

as well as the following helper functions:

- nft_set_elem_ext()
- nft_set_elem_init()
- nft_set_elem_destroy()
- nf_tables_set_elem_destroy()

This patch adds nft_elem_priv_cast() to cast struct nft_elem_priv to
native element representation from the corresponding set backend.
BUILD_BUG_ON() makes sure this .priv placeholder is always at the top
of the opaque set element representation.

Suggested-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 38 ++++++++++++-------
 net/netfilter/nf_tables_api.c     | 27 +++++++------
 net/netfilter/nft_dynset.c        | 23 +++++------
 net/netfilter/nft_set_bitmap.c    | 35 +++++++++--------
 net/netfilter/nft_set_hash.c      | 80 ++++++++++++++++++++++-----------------
 net/netfilter/nft_set_pipapo.c    | 41 ++++++++++++--------
 net/netfilter/nft_set_pipapo.h    |  4 +-
 net/netfilter/nft_set_rbtree.c    | 46 ++++++++++++----------
 8 files changed, 173 insertions(+), 121 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index d0f5c477c254..d287a778be65 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -274,6 +274,9 @@ struct nft_userdata {
 	unsigned char		data[];
 };
 
+/* placeholder structure for opaque set element backend representation. */
+struct nft_elem_priv { };
+
 /**
  *	struct nft_set_elem - generic representation of set elements
  *
@@ -294,9 +297,14 @@ struct nft_set_elem {
 		u32		buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)];
 		struct nft_data val;
 	} data;
-	void			*priv;
+	struct nft_elem_priv	*priv;
 };
 
+static inline void *nft_elem_priv_cast(const struct nft_elem_priv *priv)
+{
+	return (void *)priv;
+}
+
 struct nft_set;
 struct nft_set_iter {
 	u8		genmask;
@@ -430,7 +438,8 @@ struct nft_set_ops {
 						  const struct nft_set_ext **ext);
 	bool				(*update)(struct nft_set *set,
 						  const u32 *key,
-						  void *(*new)(struct nft_set *,
+						  struct nft_elem_priv *
+							(*new)(struct nft_set *,
 							       const struct nft_expr *,
 							       struct nft_regs *),
 						  const struct nft_expr *expr,
@@ -446,19 +455,19 @@ struct nft_set_ops {
 	void				(*activate)(const struct net *net,
 						    const struct nft_set *set,
 						    const struct nft_set_elem *elem);
-	void *				(*deactivate)(const struct net *net,
+	struct nft_elem_priv *		(*deactivate)(const struct net *net,
 						      const struct nft_set *set,
 						      const struct nft_set_elem *elem);
 	void				(*flush)(const struct net *net,
 						 const struct nft_set *set,
-						 void *priv);
+						 struct nft_elem_priv *priv);
 	void				(*remove)(const struct net *net,
 						  const struct nft_set *set,
 						  const struct nft_set_elem *elem);
 	void				(*walk)(const struct nft_ctx *ctx,
 						struct nft_set *set,
 						struct nft_set_iter *iter);
-	void *				(*get)(const struct net *net,
+	struct nft_elem_priv *		(*get)(const struct net *net,
 					       const struct nft_set *set,
 					       const struct nft_set_elem *elem,
 					       unsigned int flags);
@@ -796,9 +805,9 @@ static inline bool nft_set_elem_expired(const struct nft_set_ext *ext)
 }
 
 static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set,
-						   void *elem)
+						   const struct nft_elem_priv *elem_priv)
 {
-	return elem + set->ops->elemsize;
+	return (void *)elem_priv + set->ops->elemsize;
 }
 
 static inline struct nft_object **nft_set_ext_obj(const struct nft_set_ext *ext)
@@ -810,16 +819,19 @@ struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx,
 					 const struct nft_set *set,
 					 const struct nlattr *attr);
 
-void *nft_set_elem_init(const struct nft_set *set,
-			const struct nft_set_ext_tmpl *tmpl,
-			const u32 *key, const u32 *key_end, const u32 *data,
-			u64 timeout, u64 expiration, gfp_t gfp);
+struct nft_elem_priv *nft_set_elem_init(const struct nft_set *set,
+					const struct nft_set_ext_tmpl *tmpl,
+					const u32 *key, const u32 *key_end,
+					const u32 *data,
+					u64 timeout, u64 expiration, gfp_t gfp);
 int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set,
 			    struct nft_expr *expr_array[]);
-void nft_set_elem_destroy(const struct nft_set *set, void *elem,
+void nft_set_elem_destroy(const struct nft_set *set,
+			  const struct nft_elem_priv *elem_priv,
 			  bool destroy_expr);
 void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
-				const struct nft_set *set, void *elem);
+				const struct nft_set *set,
+				const struct nft_elem_priv *elem_priv);
 
 struct nft_expr_ops;
 /**
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 35db40857bc6..2ae81452113a 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -601,7 +601,7 @@ static int nft_mapelem_deactivate(const struct nft_ctx *ctx,
 struct nft_set_elem_catchall {
 	struct list_head	list;
 	struct rcu_head		rcu;
-	void			*elem;
+	struct nft_elem_priv	*elem;
 };
 
 static void nft_map_catchall_deactivate(const struct nft_ctx *ctx,
@@ -6218,10 +6218,11 @@ static int nft_set_ext_memcpy(const struct nft_set_ext_tmpl *tmpl, u8 id,
 	return 0;
 }
 
-void *nft_set_elem_init(const struct nft_set *set,
-			const struct nft_set_ext_tmpl *tmpl,
-			const u32 *key, const u32 *key_end,
-			const u32 *data, u64 timeout, u64 expiration, gfp_t gfp)
+struct nft_elem_priv *nft_set_elem_init(const struct nft_set *set,
+					const struct nft_set_ext_tmpl *tmpl,
+					const u32 *key, const u32 *key_end,
+					const u32 *data,
+					u64 timeout, u64 expiration, gfp_t gfp)
 {
 	struct nft_set_ext *ext;
 	void *elem;
@@ -6286,10 +6287,11 @@ static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx,
 }
 
 /* Drop references and destroy. Called from gc, dynset and abort path. */
-void nft_set_elem_destroy(const struct nft_set *set, void *elem,
+void nft_set_elem_destroy(const struct nft_set *set,
+			  const struct nft_elem_priv *elem_priv,
 			  bool destroy_expr)
 {
-	struct nft_set_ext *ext = nft_set_elem_ext(set, elem);
+	struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
 	struct nft_ctx ctx = {
 		.net	= read_pnet(&set->net),
 		.family	= set->table->family,
@@ -6300,10 +6302,10 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem,
 		nft_data_release(nft_set_ext_data(ext), set->dtype);
 	if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS))
 		nft_set_elem_expr_destroy(&ctx, nft_set_ext_expr(ext));
-
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
 		nft_use_dec(&(*nft_set_ext_obj(ext))->use);
-	kfree(elem);
+
+	kfree(elem_priv);
 }
 EXPORT_SYMBOL_GPL(nft_set_elem_destroy);
 
@@ -6311,14 +6313,15 @@ EXPORT_SYMBOL_GPL(nft_set_elem_destroy);
  * path via nft_setelem_data_deactivate().
  */
 void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
-				const struct nft_set *set, void *elem)
+				const struct nft_set *set,
+				const struct nft_elem_priv *elem_priv)
 {
-	struct nft_set_ext *ext = nft_set_elem_ext(set, elem);
+	struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
 
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS))
 		nft_set_elem_expr_destroy(ctx, nft_set_ext_expr(ext));
 
-	kfree(elem);
+	kfree(elem_priv);
 }
 
 int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set,
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 5c5cc01c73c5..b18a79039125 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -44,33 +44,34 @@ static int nft_dynset_expr_setup(const struct nft_dynset *priv,
 	return 0;
 }
 
-static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr,
-			    struct nft_regs *regs)
+static struct nft_elem_priv *nft_dynset_new(struct nft_set *set,
+					    const struct nft_expr *expr,
+					    struct nft_regs *regs)
 {
 	const struct nft_dynset *priv = nft_expr_priv(expr);
 	struct nft_set_ext *ext;
+	void *elem_priv;
 	u64 timeout;
-	void *elem;
 
 	if (!atomic_add_unless(&set->nelems, 1, set->size))
 		return NULL;
 
 	timeout = priv->timeout ? : set->timeout;
-	elem = nft_set_elem_init(set, &priv->tmpl,
-				 &regs->data[priv->sreg_key], NULL,
-				 &regs->data[priv->sreg_data],
-				 timeout, 0, GFP_ATOMIC);
-	if (IS_ERR(elem))
+	elem_priv = nft_set_elem_init(set, &priv->tmpl,
+				      &regs->data[priv->sreg_key], NULL,
+				      &regs->data[priv->sreg_data],
+				      timeout, 0, GFP_ATOMIC);
+	if (IS_ERR(elem_priv))
 		goto err1;
 
-	ext = nft_set_elem_ext(set, elem);
+	ext = nft_set_elem_ext(set, elem_priv);
 	if (priv->num_exprs && nft_dynset_expr_setup(priv, ext) < 0)
 		goto err2;
 
-	return elem;
+	return elem_priv;
 
 err2:
-	nft_set_elem_destroy(set, elem, false);
+	nft_set_elem_destroy(set, elem_priv, false);
 err1:
 	if (set->size)
 		atomic_dec(&set->nelems);
diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
index 2ee6e3672b41..a320e7614aaa 100644
--- a/net/netfilter/nft_set_bitmap.c
+++ b/net/netfilter/nft_set_bitmap.c
@@ -13,6 +13,7 @@
 #include <net/netfilter/nf_tables_core.h>
 
 struct nft_bitmap_elem {
+	struct nft_elem_priv	priv;
 	struct list_head	head;
 	struct nft_set_ext	ext;
 };
@@ -104,8 +105,9 @@ nft_bitmap_elem_find(const struct nft_set *set, struct nft_bitmap_elem *this,
 	return NULL;
 }
 
-static void *nft_bitmap_get(const struct net *net, const struct nft_set *set,
-			    const struct nft_set_elem *elem, unsigned int flags)
+static struct nft_elem_priv *
+nft_bitmap_get(const struct net *net, const struct nft_set *set,
+	       const struct nft_set_elem *elem, unsigned int flags)
 {
 	const struct nft_bitmap *priv = nft_set_priv(set);
 	u8 genmask = nft_genmask_cur(net);
@@ -116,7 +118,7 @@ static void *nft_bitmap_get(const struct net *net, const struct nft_set *set,
 		    !nft_set_elem_active(&be->ext, genmask))
 			continue;
 
-		return be;
+		return &be->priv;
 	}
 	return ERR_PTR(-ENOENT);
 }
@@ -125,8 +127,8 @@ static int nft_bitmap_insert(const struct net *net, const struct nft_set *set,
 			     const struct nft_set_elem *elem,
 			     struct nft_set_ext **ext)
 {
+	struct nft_bitmap_elem *new = nft_elem_priv_cast(elem->priv), *be;
 	struct nft_bitmap *priv = nft_set_priv(set);
-	struct nft_bitmap_elem *new = elem->priv, *be;
 	u8 genmask = nft_genmask_next(net);
 	u32 idx, off;
 
@@ -148,8 +150,8 @@ static void nft_bitmap_remove(const struct net *net,
 			      const struct nft_set *set,
 			      const struct nft_set_elem *elem)
 {
+	struct nft_bitmap_elem *be = nft_elem_priv_cast(elem->priv);
 	struct nft_bitmap *priv = nft_set_priv(set);
-	struct nft_bitmap_elem *be = elem->priv;
 	u8 genmask = nft_genmask_next(net);
 	u32 idx, off;
 
@@ -163,8 +165,8 @@ static void nft_bitmap_activate(const struct net *net,
 				const struct nft_set *set,
 				const struct nft_set_elem *elem)
 {
+	struct nft_bitmap_elem *be = nft_elem_priv_cast(elem->priv);
 	struct nft_bitmap *priv = nft_set_priv(set);
-	struct nft_bitmap_elem *be = elem->priv;
 	u8 genmask = nft_genmask_next(net);
 	u32 idx, off;
 
@@ -175,11 +177,12 @@ static void nft_bitmap_activate(const struct net *net,
 }
 
 static void nft_bitmap_flush(const struct net *net,
-			     const struct nft_set *set, void *_be)
+			     const struct nft_set *set,
+			     struct nft_elem_priv *elem_priv)
 {
+	struct nft_bitmap_elem *be = nft_elem_priv_cast(elem_priv);
 	struct nft_bitmap *priv = nft_set_priv(set);
 	u8 genmask = nft_genmask_next(net);
-	struct nft_bitmap_elem *be = _be;
 	u32 idx, off;
 
 	nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off);
@@ -188,12 +191,12 @@ static void nft_bitmap_flush(const struct net *net,
 	nft_set_elem_change_active(net, set, &be->ext);
 }
 
-static void *nft_bitmap_deactivate(const struct net *net,
-				   const struct nft_set *set,
-				   const struct nft_set_elem *elem)
+static struct nft_elem_priv *
+nft_bitmap_deactivate(const struct net *net, const struct nft_set *set,
+		      const struct nft_set_elem *elem)
 {
+	struct nft_bitmap_elem *this = nft_elem_priv_cast(elem->priv), *be;
 	struct nft_bitmap *priv = nft_set_priv(set);
-	struct nft_bitmap_elem *this = elem->priv, *be;
 	u8 genmask = nft_genmask_next(net);
 	u32 idx, off;
 
@@ -207,7 +210,7 @@ static void *nft_bitmap_deactivate(const struct net *net,
 	priv->bitmap[idx] &= ~(genmask << off);
 	nft_set_elem_change_active(net, set, &be->ext);
 
-	return be;
+	return &be->priv;
 }
 
 static void nft_bitmap_walk(const struct nft_ctx *ctx,
@@ -224,7 +227,7 @@ static void nft_bitmap_walk(const struct nft_ctx *ctx,
 		if (!nft_set_elem_active(&be->ext, iter->genmask))
 			goto cont;
 
-		elem.priv = be;
+		elem.priv = &be->priv;
 
 		iter->err = iter->fn(ctx, set, iter, &elem);
 
@@ -263,6 +266,8 @@ static int nft_bitmap_init(const struct nft_set *set,
 {
 	struct nft_bitmap *priv = nft_set_priv(set);
 
+	BUILD_BUG_ON(offsetof(struct nft_bitmap_elem, priv) != 0);
+
 	INIT_LIST_HEAD(&priv->list);
 	priv->bitmap_size = nft_bitmap_size(set->klen);
 
@@ -276,7 +281,7 @@ static void nft_bitmap_destroy(const struct nft_ctx *ctx,
 	struct nft_bitmap_elem *be, *n;
 
 	list_for_each_entry_safe(be, n, &priv->list, head)
-		nf_tables_set_elem_destroy(ctx, set, be);
+		nf_tables_set_elem_destroy(ctx, set, &be->priv);
 }
 
 static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features,
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index e758b887ad86..0691565caa81 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -27,6 +27,7 @@ struct nft_rhash {
 };
 
 struct nft_rhash_elem {
+	struct nft_elem_priv		priv;
 	struct rhash_head		node;
 	struct nft_set_ext		ext;
 };
@@ -95,8 +96,9 @@ bool nft_rhash_lookup(const struct net *net, const struct nft_set *set,
 	return !!he;
 }
 
-static void *nft_rhash_get(const struct net *net, const struct nft_set *set,
-			   const struct nft_set_elem *elem, unsigned int flags)
+static struct nft_elem_priv *
+nft_rhash_get(const struct net *net, const struct nft_set *set,
+	      const struct nft_set_elem *elem, unsigned int flags)
 {
 	struct nft_rhash *priv = nft_set_priv(set);
 	struct nft_rhash_elem *he;
@@ -108,13 +110,14 @@ static void *nft_rhash_get(const struct net *net, const struct nft_set *set,
 
 	he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params);
 	if (he != NULL)
-		return he;
+		return &he->priv;
 
 	return ERR_PTR(-ENOENT);
 }
 
 static bool nft_rhash_update(struct nft_set *set, const u32 *key,
-			     void *(*new)(struct nft_set *,
+			     struct nft_elem_priv *
+				   (*new)(struct nft_set *,
 					  const struct nft_expr *,
 					  struct nft_regs *regs),
 			     const struct nft_expr *expr,
@@ -123,6 +126,7 @@ static bool nft_rhash_update(struct nft_set *set, const u32 *key,
 {
 	struct nft_rhash *priv = nft_set_priv(set);
 	struct nft_rhash_elem *he, *prev;
+	struct nft_elem_priv *elem_priv;
 	struct nft_rhash_cmp_arg arg = {
 		.genmask = NFT_GENMASK_ANY,
 		.set	 = set,
@@ -133,10 +137,11 @@ static bool nft_rhash_update(struct nft_set *set, const u32 *key,
 	if (he != NULL)
 		goto out;
 
-	he = new(set, expr, regs);
-	if (he == NULL)
+	elem_priv = new(set, expr, regs);
+	if (!elem_priv)
 		goto err1;
 
+	he = nft_elem_priv_cast(elem_priv);
 	prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node,
 						nft_rhash_params);
 	if (IS_ERR(prev))
@@ -144,7 +149,7 @@ static bool nft_rhash_update(struct nft_set *set, const u32 *key,
 
 	/* Another cpu may race to insert the element with the same key */
 	if (prev) {
-		nft_set_elem_destroy(set, he, true);
+		nft_set_elem_destroy(set, &he->priv, true);
 		atomic_dec(&set->nelems);
 		he = prev;
 	}
@@ -154,7 +159,7 @@ out:
 	return true;
 
 err2:
-	nft_set_elem_destroy(set, he, true);
+	nft_set_elem_destroy(set, &he->priv, true);
 	atomic_dec(&set->nelems);
 err1:
 	return false;
@@ -164,8 +169,8 @@ static int nft_rhash_insert(const struct net *net, const struct nft_set *set,
 			    const struct nft_set_elem *elem,
 			    struct nft_set_ext **ext)
 {
+	struct nft_rhash_elem *he = nft_elem_priv_cast(elem->priv);
 	struct nft_rhash *priv = nft_set_priv(set);
-	struct nft_rhash_elem *he = elem->priv;
 	struct nft_rhash_cmp_arg arg = {
 		.genmask = nft_genmask_next(net),
 		.set	 = set,
@@ -187,22 +192,23 @@ static int nft_rhash_insert(const struct net *net, const struct nft_set *set,
 static void nft_rhash_activate(const struct net *net, const struct nft_set *set,
 			       const struct nft_set_elem *elem)
 {
-	struct nft_rhash_elem *he = elem->priv;
+	struct nft_rhash_elem *he = nft_elem_priv_cast(elem->priv);
 
 	nft_set_elem_change_active(net, set, &he->ext);
 }
 
 static void nft_rhash_flush(const struct net *net,
-			    const struct nft_set *set, void *priv)
+			    const struct nft_set *set,
+			    struct nft_elem_priv *elem_priv)
 {
-	struct nft_rhash_elem *he = priv;
+	struct nft_rhash_elem *he = nft_elem_priv_cast(elem_priv);
 
 	nft_set_elem_change_active(net, set, &he->ext);
 }
 
-static void *nft_rhash_deactivate(const struct net *net,
-				  const struct nft_set *set,
-				  const struct nft_set_elem *elem)
+static struct nft_elem_priv *
+nft_rhash_deactivate(const struct net *net, const struct nft_set *set,
+		     const struct nft_set_elem *elem)
 {
 	struct nft_rhash *priv = nft_set_priv(set);
 	struct nft_rhash_elem *he;
@@ -219,15 +225,15 @@ static void *nft_rhash_deactivate(const struct net *net,
 
 	rcu_read_unlock();
 
-	return he;
+	return &he->priv;
 }
 
 static void nft_rhash_remove(const struct net *net,
 			     const struct nft_set *set,
 			     const struct nft_set_elem *elem)
 {
+	struct nft_rhash_elem *he = nft_elem_priv_cast(elem->priv);
 	struct nft_rhash *priv = nft_set_priv(set);
-	struct nft_rhash_elem *he = elem->priv;
 
 	rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params);
 }
@@ -278,7 +284,7 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
 		if (!nft_set_elem_active(&he->ext, iter->genmask))
 			goto cont;
 
-		elem.priv = he;
+		elem.priv = &he->priv;
 
 		iter->err = iter->fn(ctx, set, iter, &elem);
 		if (iter->err < 0)
@@ -404,6 +410,8 @@ static int nft_rhash_init(const struct nft_set *set,
 	struct rhashtable_params params = nft_rhash_params;
 	int err;
 
+	BUILD_BUG_ON(offsetof(struct nft_rhash_elem, priv) != 0);
+
 	params.nelem_hint = desc->size ?: NFT_RHASH_ELEMENT_HINT;
 	params.key_len	  = set->klen;
 
@@ -426,8 +434,9 @@ struct nft_rhash_ctx {
 static void nft_rhash_elem_destroy(void *ptr, void *arg)
 {
 	struct nft_rhash_ctx *rhash_ctx = arg;
+	struct nft_rhash_elem *he = ptr;
 
-	nf_tables_set_elem_destroy(&rhash_ctx->ctx, rhash_ctx->set, ptr);
+	nf_tables_set_elem_destroy(&rhash_ctx->ctx, rhash_ctx->set, &he->priv);
 }
 
 static void nft_rhash_destroy(const struct nft_ctx *ctx,
@@ -474,6 +483,7 @@ struct nft_hash {
 };
 
 struct nft_hash_elem {
+	struct nft_elem_priv		priv;
 	struct hlist_node		node;
 	struct nft_set_ext		ext;
 };
@@ -499,8 +509,9 @@ bool nft_hash_lookup(const struct net *net, const struct nft_set *set,
 	return false;
 }
 
-static void *nft_hash_get(const struct net *net, const struct nft_set *set,
-			  const struct nft_set_elem *elem, unsigned int flags)
+static struct nft_elem_priv *
+nft_hash_get(const struct net *net, const struct nft_set *set,
+	     const struct nft_set_elem *elem, unsigned int flags)
 {
 	struct nft_hash *priv = nft_set_priv(set);
 	u8 genmask = nft_genmask_cur(net);
@@ -512,7 +523,7 @@ static void *nft_hash_get(const struct net *net, const struct nft_set *set,
 	hlist_for_each_entry_rcu(he, &priv->table[hash], node) {
 		if (!memcmp(nft_set_ext_key(&he->ext), elem->key.val.data, set->klen) &&
 		    nft_set_elem_active(&he->ext, genmask))
-			return he;
+			return &he->priv;
 	}
 	return ERR_PTR(-ENOENT);
 }
@@ -562,7 +573,7 @@ static int nft_hash_insert(const struct net *net, const struct nft_set *set,
 			   const struct nft_set_elem *elem,
 			   struct nft_set_ext **ext)
 {
-	struct nft_hash_elem *this = elem->priv, *he;
+	struct nft_hash_elem *this = nft_elem_priv_cast(elem->priv), *he;
 	struct nft_hash *priv = nft_set_priv(set);
 	u8 genmask = nft_genmask_next(net);
 	u32 hash;
@@ -583,25 +594,26 @@ static int nft_hash_insert(const struct net *net, const struct nft_set *set,
 static void nft_hash_activate(const struct net *net, const struct nft_set *set,
 			      const struct nft_set_elem *elem)
 {
-	struct nft_hash_elem *he = elem->priv;
+	struct nft_hash_elem *he = nft_elem_priv_cast(elem->priv);
 
 	nft_set_elem_change_active(net, set, &he->ext);
 }
 
 static void nft_hash_flush(const struct net *net,
-			   const struct nft_set *set, void *priv)
+			   const struct nft_set *set,
+			   struct nft_elem_priv *elem_priv)
 {
-	struct nft_hash_elem *he = priv;
+	struct nft_hash_elem *he = nft_elem_priv_cast(elem_priv);
 
 	nft_set_elem_change_active(net, set, &he->ext);
 }
 
-static void *nft_hash_deactivate(const struct net *net,
-				 const struct nft_set *set,
-				 const struct nft_set_elem *elem)
+static struct nft_elem_priv *
+nft_hash_deactivate(const struct net *net, const struct nft_set *set,
+		    const struct nft_set_elem *elem)
 {
+	struct nft_hash_elem *this = nft_elem_priv_cast(elem->priv), *he;
 	struct nft_hash *priv = nft_set_priv(set);
-	struct nft_hash_elem *this = elem->priv, *he;
 	u8 genmask = nft_genmask_next(net);
 	u32 hash;
 
@@ -611,7 +623,7 @@ static void *nft_hash_deactivate(const struct net *net,
 			    set->klen) &&
 		    nft_set_elem_active(&he->ext, genmask)) {
 			nft_set_elem_change_active(net, set, &he->ext);
-			return he;
+			return &he->priv;
 		}
 	}
 	return NULL;
@@ -621,7 +633,7 @@ static void nft_hash_remove(const struct net *net,
 			    const struct nft_set *set,
 			    const struct nft_set_elem *elem)
 {
-	struct nft_hash_elem *he = elem->priv;
+	struct nft_hash_elem *he = nft_elem_priv_cast(elem->priv);
 
 	hlist_del_rcu(&he->node);
 }
@@ -641,7 +653,7 @@ static void nft_hash_walk(const struct nft_ctx *ctx, struct nft_set *set,
 			if (!nft_set_elem_active(&he->ext, iter->genmask))
 				goto cont;
 
-			elem.priv = he;
+			elem.priv = &he->priv;
 
 			iter->err = iter->fn(ctx, set, iter, &elem);
 			if (iter->err < 0)
@@ -682,7 +694,7 @@ static void nft_hash_destroy(const struct nft_ctx *ctx,
 	for (i = 0; i < priv->buckets; i++) {
 		hlist_for_each_entry_safe(he, next, &priv->table[i], node) {
 			hlist_del_rcu(&he->node);
-			nf_tables_set_elem_destroy(ctx, set, he);
+			nf_tables_set_elem_destroy(ctx, set, &he->priv);
 		}
 	}
 }
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index dba073aa9ad6..0969d2cb637b 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -599,11 +599,18 @@ out:
  * @elem:	nftables API element representation containing key data
  * @flags:	Unused
  */
-static void *nft_pipapo_get(const struct net *net, const struct nft_set *set,
-			    const struct nft_set_elem *elem, unsigned int flags)
+static struct nft_elem_priv *
+nft_pipapo_get(const struct net *net, const struct nft_set *set,
+	       const struct nft_set_elem *elem, unsigned int flags)
 {
-	return pipapo_get(net, set, (const u8 *)elem->key.val.data,
-			 nft_genmask_cur(net));
+	static struct nft_pipapo_elem *e;
+
+	e = pipapo_get(net, set, (const u8 *)elem->key.val.data,
+		       nft_genmask_cur(net));
+	if (IS_ERR(e))
+		return ERR_CAST(e);
+
+	return &e->priv;
 }
 
 /**
@@ -1162,10 +1169,10 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
 	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
 	union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS];
 	const u8 *start = (const u8 *)elem->key.val.data, *end;
-	struct nft_pipapo_elem *e = elem->priv, *dup;
 	struct nft_pipapo *priv = nft_set_priv(set);
 	struct nft_pipapo_match *m = priv->clone;
 	u8 genmask = nft_genmask_next(net);
+	struct nft_pipapo_elem *e, *dup;
 	struct nft_pipapo_field *f;
 	const u8 *start_p, *end_p;
 	int i, bsize_max, err = 0;
@@ -1263,6 +1270,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
 		put_cpu_ptr(m->scratch);
 	}
 
+	e = nft_elem_priv_cast(elem->priv);
 	*ext2 = &e->ext;
 
 	pipapo_map(m, rulemap, e);
@@ -1541,7 +1549,7 @@ static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set,
 
 {
 	struct nft_set_elem elem = {
-		.priv	= e,
+		.priv	= &e->priv,
 	};
 
 	nft_setelem_data_deactivate(net, set, &elem);
@@ -1742,7 +1750,7 @@ static void nft_pipapo_activate(const struct net *net,
 				const struct nft_set *set,
 				const struct nft_set_elem *elem)
 {
-	struct nft_pipapo_elem *e = elem->priv;
+	struct nft_pipapo_elem *e = nft_elem_priv_cast(elem->priv);
 
 	nft_set_elem_change_active(net, set, &e->ext);
 }
@@ -1782,9 +1790,9 @@ static void *pipapo_deactivate(const struct net *net, const struct nft_set *set,
  *
  * Return: deactivated element if found, NULL otherwise.
  */
-static void *nft_pipapo_deactivate(const struct net *net,
-				   const struct nft_set *set,
-				   const struct nft_set_elem *elem)
+static struct nft_elem_priv *
+nft_pipapo_deactivate(const struct net *net, const struct nft_set *set,
+		      const struct nft_set_elem *elem)
 {
 	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
 
@@ -1810,9 +1818,9 @@ static void *nft_pipapo_deactivate(const struct net *net,
  * Return: true if element was found and deactivated.
  */
 static void nft_pipapo_flush(const struct net *net, const struct nft_set *set,
-			     void *elem)
+			     struct nft_elem_priv *elem_priv)
 {
-	struct nft_pipapo_elem *e = elem;
+	struct nft_pipapo_elem *e = nft_elem_priv_cast(elem_priv);
 
 	nft_set_elem_change_active(net, set, &e->ext);
 }
@@ -1949,10 +1957,11 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set,
 {
 	struct nft_pipapo *priv = nft_set_priv(set);
 	struct nft_pipapo_match *m = priv->clone;
-	struct nft_pipapo_elem *e = elem->priv;
 	int rules_f0, first_rule = 0;
+	struct nft_pipapo_elem *e;
 	const u8 *data;
 
+	e = nft_elem_priv_cast(elem->priv);
 	data = (const u8 *)nft_set_ext_key(&e->ext);
 
 	while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) {
@@ -2039,7 +2048,7 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set,
 
 		e = f->mt[r].e;
 
-		elem.priv = e;
+		elem.priv = &e->priv;
 
 		iter->err = iter->fn(ctx, set, iter, &elem);
 		if (iter->err < 0)
@@ -2113,6 +2122,8 @@ static int nft_pipapo_init(const struct nft_set *set,
 	struct nft_pipapo_field *f;
 	int err, i, field_count;
 
+	BUILD_BUG_ON(offsetof(struct nft_pipapo_elem, priv) != 0);
+
 	field_count = desc->field_count ? : 1;
 
 	if (field_count > NFT_PIPAPO_MAX_FIELDS)
@@ -2207,7 +2218,7 @@ static void nft_set_pipapo_match_destroy(const struct nft_ctx *ctx,
 
 		e = f->mt[r].e;
 
-		nf_tables_set_elem_destroy(ctx, set, e);
+		nf_tables_set_elem_destroy(ctx, set, &e->priv);
 	}
 }
 
diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h
index 2e164a319945..1040223da5fa 100644
--- a/net/netfilter/nft_set_pipapo.h
+++ b/net/netfilter/nft_set_pipapo.h
@@ -170,10 +170,12 @@ struct nft_pipapo_elem;
 
 /**
  * struct nft_pipapo_elem - API-facing representation of single set element
+ * @priv:	element placeholder
  * @ext:	nftables API extensions
  */
 struct nft_pipapo_elem {
-	struct nft_set_ext ext;
+	struct nft_elem_priv	priv;
+	struct nft_set_ext	ext;
 };
 
 int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst,
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 60ff591eb265..475f22568342 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -23,6 +23,7 @@ struct nft_rbtree {
 };
 
 struct nft_rbtree_elem {
+	struct nft_elem_priv	priv;
 	struct rb_node		node;
 	struct nft_set_ext	ext;
 };
@@ -196,8 +197,9 @@ static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
 	return false;
 }
 
-static void *nft_rbtree_get(const struct net *net, const struct nft_set *set,
-			    const struct nft_set_elem *elem, unsigned int flags)
+static struct nft_elem_priv *
+nft_rbtree_get(const struct net *net, const struct nft_set *set,
+	       const struct nft_set_elem *elem, unsigned int flags)
 {
 	struct nft_rbtree *priv = nft_set_priv(set);
 	unsigned int seq = read_seqcount_begin(&priv->count);
@@ -208,16 +210,17 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set,
 
 	ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
 	if (ret || !read_seqcount_retry(&priv->count, seq))
-		return rbe;
+		return &rbe->priv;
 
 	read_lock_bh(&priv->lock);
 	seq = read_seqcount_begin(&priv->count);
 	ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
-	if (!ret)
-		rbe = ERR_PTR(-ENOENT);
 	read_unlock_bh(&priv->lock);
 
-	return rbe;
+	if (!ret)
+		return ERR_PTR(-ENOENT);
+
+	return &rbe->priv;
 }
 
 static void nft_rbtree_gc_elem_remove(struct net *net, struct nft_set *set,
@@ -225,7 +228,7 @@ static void nft_rbtree_gc_elem_remove(struct net *net, struct nft_set *set,
 				      struct nft_rbtree_elem *rbe)
 {
 	struct nft_set_elem elem = {
-		.priv	= rbe,
+		.priv	= &rbe->priv,
 	};
 
 	lockdep_assert_held_write(&priv->lock);
@@ -487,8 +490,8 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
 			     const struct nft_set_elem *elem,
 			     struct nft_set_ext **ext)
 {
+	struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem->priv);
 	struct nft_rbtree *priv = nft_set_priv(set);
-	struct nft_rbtree_elem *rbe = elem->priv;
 	int err;
 
 	do {
@@ -520,8 +523,8 @@ static void nft_rbtree_remove(const struct net *net,
 			      const struct nft_set *set,
 			      const struct nft_set_elem *elem)
 {
+	struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem->priv);
 	struct nft_rbtree *priv = nft_set_priv(set);
-	struct nft_rbtree_elem *rbe = elem->priv;
 
 	nft_rbtree_erase(priv, rbe);
 }
@@ -530,26 +533,27 @@ static void nft_rbtree_activate(const struct net *net,
 				const struct nft_set *set,
 				const struct nft_set_elem *elem)
 {
-	struct nft_rbtree_elem *rbe = elem->priv;
+	struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem->priv);
 
 	nft_set_elem_change_active(net, set, &rbe->ext);
 }
 
 static void nft_rbtree_flush(const struct net *net,
-			     const struct nft_set *set, void *priv)
+			     const struct nft_set *set,
+			     struct nft_elem_priv *elem_priv)
 {
-	struct nft_rbtree_elem *rbe = priv;
+	struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv);
 
 	nft_set_elem_change_active(net, set, &rbe->ext);
 }
 
-static void *nft_rbtree_deactivate(const struct net *net,
-				   const struct nft_set *set,
-				   const struct nft_set_elem *elem)
+static struct nft_elem_priv *
+nft_rbtree_deactivate(const struct net *net, const struct nft_set *set,
+		      const struct nft_set_elem *elem)
 {
+	struct nft_rbtree_elem *rbe, *this = nft_elem_priv_cast(elem->priv);
 	const struct nft_rbtree *priv = nft_set_priv(set);
 	const struct rb_node *parent = priv->root.rb_node;
-	struct nft_rbtree_elem *rbe, *this = elem->priv;
 	u8 genmask = nft_genmask_next(net);
 	int d;
 
@@ -577,8 +581,8 @@ static void *nft_rbtree_deactivate(const struct net *net,
 				parent = parent->rb_left;
 				continue;
 			}
-			nft_rbtree_flush(net, set, rbe);
-			return rbe;
+			nft_rbtree_flush(net, set, &rbe->priv);
+			return &rbe->priv;
 		}
 	}
 	return NULL;
@@ -602,7 +606,7 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
 		if (!nft_set_elem_active(&rbe->ext, iter->genmask))
 			goto cont;
 
-		elem.priv = rbe;
+		elem.priv = &rbe->priv;
 
 		iter->err = iter->fn(ctx, set, iter, &elem);
 		if (iter->err < 0) {
@@ -702,6 +706,8 @@ static int nft_rbtree_init(const struct nft_set *set,
 {
 	struct nft_rbtree *priv = nft_set_priv(set);
 
+	BUILD_BUG_ON(offsetof(struct nft_rbtree_elem, priv) != 0);
+
 	rwlock_init(&priv->lock);
 	seqcount_rwlock_init(&priv->count, &priv->lock);
 	priv->root = RB_ROOT;
@@ -719,7 +725,7 @@ static void nft_rbtree_destroy(const struct nft_ctx *ctx,
 	while ((node = priv->root.rb_node) != NULL) {
 		rb_erase(node, &priv->root);
 		rbe = rb_entry(node, struct nft_rbtree_elem, node);
-		nf_tables_set_elem_destroy(ctx, set, rbe);
+		nf_tables_set_elem_destroy(ctx, set, &rbe->priv);
 	}
 }
 
-- 
cgit v1.2.3-70-g09d2


From 0e1ea651c9717ddcd8e0648d8468477a31867b0a Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 16 Oct 2023 14:29:27 +0200
Subject: netfilter: nf_tables: shrink memory consumption of set elements

Instead of copying struct nft_set_elem into struct nft_trans_elem, store
the pointer to the opaque set element object in the transaction. Adapt
set backend API (and set backend implementations) to take the pointer to
opaque set element representation whenever required.

This patch deconstifies .remove() and .activate() set backend API since
these modify the set element opaque object. And it also constify
nft_set_elem_ext() this provides access to the nft_set_ext struct
without updating the object.

According to pahole on x86_64, this patch shrinks struct nft_trans_elem
size from 216 to 24 bytes.

This patch also reduces stack memory consumption by removing the
template struct nft_set_elem object, using the opaque set element object
instead such as from the set iterator API, catchall elements and the get
element command.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  18 ++---
 net/netfilter/nf_tables_api.c     | 166 +++++++++++++++++---------------------
 net/netfilter/nft_set_bitmap.c    |  16 ++--
 net/netfilter/nft_set_hash.c      |  26 +++---
 net/netfilter/nft_set_pipapo.c    |  25 +++---
 net/netfilter/nft_set_rbtree.c    |  25 ++----
 6 files changed, 116 insertions(+), 160 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index d287a778be65..b63f35fb2a99 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -314,7 +314,7 @@ struct nft_set_iter {
 	int		(*fn)(const struct nft_ctx *ctx,
 			      struct nft_set *set,
 			      const struct nft_set_iter *iter,
-			      struct nft_set_elem *elem);
+			      struct nft_elem_priv *elem_priv);
 };
 
 /**
@@ -454,7 +454,7 @@ struct nft_set_ops {
 						  struct nft_set_ext **ext);
 	void				(*activate)(const struct net *net,
 						    const struct nft_set *set,
-						    const struct nft_set_elem *elem);
+						    struct nft_elem_priv *elem_priv);
 	struct nft_elem_priv *		(*deactivate)(const struct net *net,
 						      const struct nft_set *set,
 						      const struct nft_set_elem *elem);
@@ -463,7 +463,7 @@ struct nft_set_ops {
 						 struct nft_elem_priv *priv);
 	void				(*remove)(const struct net *net,
 						  const struct nft_set *set,
-						  const struct nft_set_elem *elem);
+						  struct nft_elem_priv *elem_priv);
 	void				(*walk)(const struct nft_ctx *ctx,
 						struct nft_set *set,
 						struct nft_set_iter *iter);
@@ -1073,7 +1073,7 @@ struct nft_chain {
 int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain);
 int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set,
 			 const struct nft_set_iter *iter,
-			 struct nft_set_elem *elem);
+			 struct nft_elem_priv *elem_priv);
 int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set);
 int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain);
 void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain);
@@ -1650,14 +1650,14 @@ struct nft_trans_table {
 
 struct nft_trans_elem {
 	struct nft_set			*set;
-	struct nft_set_elem		elem;
+	struct nft_elem_priv		*elem_priv;
 	bool				bound;
 };
 
 #define nft_trans_elem_set(trans)	\
 	(((struct nft_trans_elem *)trans->data)->set)
-#define nft_trans_elem(trans)	\
-	(((struct nft_trans_elem *)trans->data)->elem)
+#define nft_trans_elem_priv(trans)	\
+	(((struct nft_trans_elem *)trans->data)->elem_priv)
 #define nft_trans_elem_set_bound(trans)	\
 	(((struct nft_trans_elem *)trans->data)->bound)
 
@@ -1698,7 +1698,7 @@ struct nft_trans_gc {
 	struct nft_set		*set;
 	u32			seq;
 	u16			count;
-	void			*priv[NFT_TRANS_GC_BATCHCOUNT];
+	struct nft_elem_priv	*priv[NFT_TRANS_GC_BATCHCOUNT];
 	struct rcu_head		rcu;
 };
 
@@ -1721,7 +1721,7 @@ struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc);
 
 void nft_setelem_data_deactivate(const struct net *net,
 				 const struct nft_set *set,
-				 struct nft_set_elem *elem);
+				 struct nft_elem_priv *elem_priv);
 
 int __init nft_chain_filter_init(void);
 void nft_chain_filter_fini(void);
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 2ae81452113a..79b0ed92367a 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -591,9 +591,9 @@ static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type,
 static int nft_mapelem_deactivate(const struct nft_ctx *ctx,
 				  struct nft_set *set,
 				  const struct nft_set_iter *iter,
-				  struct nft_set_elem *elem)
+				  struct nft_elem_priv *elem_priv)
 {
-	nft_setelem_data_deactivate(ctx->net, set, elem);
+	nft_setelem_data_deactivate(ctx->net, set, elem_priv);
 
 	return 0;
 }
@@ -609,7 +609,6 @@ static void nft_map_catchall_deactivate(const struct nft_ctx *ctx,
 {
 	u8 genmask = nft_genmask_next(ctx->net);
 	struct nft_set_elem_catchall *catchall;
-	struct nft_set_elem elem;
 	struct nft_set_ext *ext;
 
 	list_for_each_entry(catchall, &set->catchall_list, list) {
@@ -617,8 +616,7 @@ static void nft_map_catchall_deactivate(const struct nft_ctx *ctx,
 		if (!nft_set_elem_active(ext, genmask))
 			continue;
 
-		elem.priv = catchall->elem;
-		nft_setelem_data_deactivate(ctx->net, set, &elem);
+		nft_setelem_data_deactivate(ctx->net, set, catchall->elem);
 		break;
 	}
 }
@@ -3807,9 +3805,9 @@ static int nft_table_validate(struct net *net, const struct nft_table *table)
 
 int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set,
 			 const struct nft_set_iter *iter,
-			 struct nft_set_elem *elem)
+			 struct nft_elem_priv *elem_priv)
 {
-	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
 	struct nft_ctx *pctx = (struct nft_ctx *)ctx;
 	const struct nft_data *data;
 	int err;
@@ -3839,7 +3837,6 @@ int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set)
 {
 	u8 genmask = nft_genmask_next(ctx->net);
 	struct nft_set_elem_catchall *catchall;
-	struct nft_set_elem elem;
 	struct nft_set_ext *ext;
 	int ret = 0;
 
@@ -3848,8 +3845,7 @@ int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set)
 		if (!nft_set_elem_active(ext, genmask))
 			continue;
 
-		elem.priv = catchall->elem;
-		ret = nft_setelem_validate(ctx, set, NULL, &elem);
+		ret = nft_setelem_validate(ctx, set, NULL, catchall->elem);
 		if (ret < 0)
 			return ret;
 	}
@@ -5301,9 +5297,9 @@ static int nft_validate_register_store(const struct nft_ctx *ctx,
 
 static int nft_setelem_data_validate(const struct nft_ctx *ctx,
 				     struct nft_set *set,
-				     struct nft_set_elem *elem)
+				     struct nft_elem_priv *elem_priv)
 {
-	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
 	enum nft_registers dreg;
 
 	dreg = nft_type_to_reg(set->dtype);
@@ -5316,9 +5312,9 @@ static int nft_setelem_data_validate(const struct nft_ctx *ctx,
 static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx,
 					struct nft_set *set,
 					const struct nft_set_iter *iter,
-					struct nft_set_elem *elem)
+					struct nft_elem_priv *elem_priv)
 {
-	return nft_setelem_data_validate(ctx, set, elem);
+	return nft_setelem_data_validate(ctx, set, elem_priv);
 }
 
 static int nft_set_catchall_bind_check(const struct nft_ctx *ctx,
@@ -5326,7 +5322,6 @@ static int nft_set_catchall_bind_check(const struct nft_ctx *ctx,
 {
 	u8 genmask = nft_genmask_next(ctx->net);
 	struct nft_set_elem_catchall *catchall;
-	struct nft_set_elem elem;
 	struct nft_set_ext *ext;
 	int ret = 0;
 
@@ -5335,8 +5330,7 @@ static int nft_set_catchall_bind_check(const struct nft_ctx *ctx,
 		if (!nft_set_elem_active(ext, genmask))
 			continue;
 
-		elem.priv = catchall->elem;
-		ret = nft_setelem_data_validate(ctx, set, &elem);
+		ret = nft_setelem_data_validate(ctx, set, catchall->elem);
 		if (ret < 0)
 			break;
 	}
@@ -5403,14 +5397,14 @@ static void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
 
 static void nft_setelem_data_activate(const struct net *net,
 				      const struct nft_set *set,
-				      struct nft_set_elem *elem);
+				      struct nft_elem_priv *elem_priv);
 
 static int nft_mapelem_activate(const struct nft_ctx *ctx,
 				struct nft_set *set,
 				const struct nft_set_iter *iter,
-				struct nft_set_elem *elem)
+				struct nft_elem_priv *elem_priv)
 {
-	nft_setelem_data_activate(ctx->net, set, elem);
+	nft_setelem_data_activate(ctx->net, set, elem_priv);
 
 	return 0;
 }
@@ -5420,7 +5414,6 @@ static void nft_map_catchall_activate(const struct nft_ctx *ctx,
 {
 	u8 genmask = nft_genmask_next(ctx->net);
 	struct nft_set_elem_catchall *catchall;
-	struct nft_set_elem elem;
 	struct nft_set_ext *ext;
 
 	list_for_each_entry(catchall, &set->catchall_list, list) {
@@ -5428,8 +5421,7 @@ static void nft_map_catchall_activate(const struct nft_ctx *ctx,
 		if (!nft_set_elem_active(ext, genmask))
 			continue;
 
-		elem.priv = catchall->elem;
-		nft_setelem_data_activate(ctx->net, set, &elem);
+		nft_setelem_data_activate(ctx->net, set, catchall->elem);
 		break;
 	}
 }
@@ -5608,10 +5600,10 @@ nla_put_failure:
 
 static int nf_tables_fill_setelem(struct sk_buff *skb,
 				  const struct nft_set *set,
-				  const struct nft_set_elem *elem,
+				  const struct nft_elem_priv *elem_priv,
 				  bool reset)
 {
-	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
 	unsigned char *b = skb_tail_pointer(skb);
 	struct nlattr *nest;
 
@@ -5697,16 +5689,16 @@ struct nft_set_dump_args {
 static int nf_tables_dump_setelem(const struct nft_ctx *ctx,
 				  struct nft_set *set,
 				  const struct nft_set_iter *iter,
-				  struct nft_set_elem *elem)
+				  struct nft_elem_priv *elem_priv)
 {
-	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
 	struct nft_set_dump_args *args;
 
 	if (nft_set_elem_expired(ext))
 		return 0;
 
 	args = container_of(iter, struct nft_set_dump_args, iter);
-	return nf_tables_fill_setelem(args->skb, set, elem, args->reset);
+	return nf_tables_fill_setelem(args->skb, set, elem_priv, args->reset);
 }
 
 static void audit_log_nft_set_reset(const struct nft_table *table,
@@ -5731,7 +5723,6 @@ static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb,
 {
 	struct nft_set_elem_catchall *catchall;
 	u8 genmask = nft_genmask_cur(net);
-	struct nft_set_elem elem;
 	struct nft_set_ext *ext;
 	int ret = 0;
 
@@ -5741,8 +5732,7 @@ static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb,
 		    nft_set_elem_expired(ext))
 			continue;
 
-		elem.priv = catchall->elem;
-		ret = nf_tables_fill_setelem(skb, set, &elem, reset);
+		ret = nf_tables_fill_setelem(skb, set, catchall->elem, reset);
 		if (reset && !ret)
 			audit_log_nft_set_reset(set->table, base_seq, 1);
 		break;
@@ -5867,7 +5857,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb,
 				       const struct nft_ctx *ctx, u32 seq,
 				       u32 portid, int event, u16 flags,
 				       const struct nft_set *set,
-				       const struct nft_set_elem *elem,
+				       const struct nft_elem_priv *elem_priv,
 				       bool reset)
 {
 	struct nlmsghdr *nlh;
@@ -5889,7 +5879,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb,
 	if (nest == NULL)
 		goto nla_put_failure;
 
-	err = nf_tables_fill_setelem(skb, set, elem, reset);
+	err = nf_tables_fill_setelem(skb, set, elem_priv, reset);
 	if (err < 0)
 		goto nla_put_failure;
 
@@ -6039,7 +6029,7 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 		return err;
 
 	err = nf_tables_fill_setelem_info(skb, ctx, ctx->seq, ctx->portid,
-					  NFT_MSG_NEWSETELEM, 0, set, &elem,
+					  NFT_MSG_NEWSETELEM, 0, set, elem.priv,
 					  reset);
 	if (err < 0)
 		goto err_fill_setelem;
@@ -6122,7 +6112,7 @@ static int nf_tables_getsetelem(struct sk_buff *skb,
 
 static void nf_tables_setelem_notify(const struct nft_ctx *ctx,
 				     const struct nft_set *set,
-				     const struct nft_set_elem *elem,
+				     const struct nft_elem_priv *elem_priv,
 				     int event)
 {
 	struct nftables_pernet *nft_net;
@@ -6143,7 +6133,7 @@ static void nf_tables_setelem_notify(const struct nft_ctx *ctx,
 		flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL);
 
 	err = nf_tables_fill_setelem_info(skb, ctx, 0, portid, event, flags,
-					  set, elem, false);
+					  set, elem_priv, false);
 	if (err < 0) {
 		kfree_skb(skb);
 		goto err;
@@ -6456,9 +6446,9 @@ static int nft_setelem_insert(const struct net *net,
 }
 
 static bool nft_setelem_is_catchall(const struct nft_set *set,
-				    const struct nft_set_elem *elem)
+				    const struct nft_elem_priv *elem_priv)
 {
-	struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+	struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
 
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
 	    *nft_set_ext_flags(ext) & NFT_SET_ELEM_CATCHALL)
@@ -6468,14 +6458,14 @@ static bool nft_setelem_is_catchall(const struct nft_set *set,
 }
 
 static void nft_setelem_activate(struct net *net, struct nft_set *set,
-				 struct nft_set_elem *elem)
+				 struct nft_elem_priv *elem_priv)
 {
-	struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+	struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
 
-	if (nft_setelem_is_catchall(set, elem)) {
+	if (nft_setelem_is_catchall(set, elem_priv)) {
 		nft_set_elem_change_active(net, set, ext);
 	} else {
-		set->ops->activate(net, set, elem);
+		set->ops->activate(net, set, elem_priv);
 	}
 }
 
@@ -6533,12 +6523,12 @@ static int nft_setelem_deactivate(const struct net *net,
 
 static void nft_setelem_catchall_remove(const struct net *net,
 					const struct nft_set *set,
-					const struct nft_set_elem *elem)
+					struct nft_elem_priv *elem_priv)
 {
 	struct nft_set_elem_catchall *catchall, *next;
 
 	list_for_each_entry_safe(catchall, next, &set->catchall_list, list) {
-		if (catchall->elem == elem->priv) {
+		if (catchall->elem == elem_priv) {
 			list_del_rcu(&catchall->list);
 			kfree_rcu(catchall, rcu);
 			break;
@@ -6548,12 +6538,12 @@ static void nft_setelem_catchall_remove(const struct net *net,
 
 static void nft_setelem_remove(const struct net *net,
 			       const struct nft_set *set,
-			       const struct nft_set_elem *elem)
+			       struct nft_elem_priv *elem_priv)
 {
-	if (nft_setelem_is_catchall(set, elem))
-		nft_setelem_catchall_remove(net, set, elem);
+	if (nft_setelem_is_catchall(set, elem_priv))
+		nft_setelem_catchall_remove(net, set, elem_priv);
 	else
-		set->ops->remove(net, set, elem);
+		set->ops->remove(net, set, elem_priv);
 }
 
 static bool nft_setelem_valid_key_end(const struct nft_set *set,
@@ -6921,12 +6911,12 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 		}
 	}
 
-	nft_trans_elem(trans) = elem;
+	nft_trans_elem_priv(trans) = elem.priv;
 	nft_trans_commit_list_add_tail(ctx->net, trans);
 	return 0;
 
 err_set_full:
-	nft_setelem_remove(ctx->net, set, &elem);
+	nft_setelem_remove(ctx->net, set, elem.priv);
 err_element_clash:
 	kfree(trans);
 err_elem_free:
@@ -7027,9 +7017,9 @@ void nft_data_hold(const struct nft_data *data, enum nft_data_types type)
 
 static void nft_setelem_data_activate(const struct net *net,
 				      const struct nft_set *set,
-				      struct nft_set_elem *elem)
+				      struct nft_elem_priv *elem_priv)
 {
-	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
 
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
 		nft_data_hold(nft_set_ext_data(ext), set->dtype);
@@ -7039,9 +7029,9 @@ static void nft_setelem_data_activate(const struct net *net,
 
 void nft_setelem_data_deactivate(const struct net *net,
 				 const struct nft_set *set,
-				 struct nft_set_elem *elem)
+				 struct nft_elem_priv *elem_priv)
 {
-	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
 
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
 		nft_data_release(nft_set_ext_data(ext), set->dtype);
@@ -7126,9 +7116,9 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
 	if (err < 0)
 		goto fail_ops;
 
-	nft_setelem_data_deactivate(ctx->net, set, &elem);
+	nft_setelem_data_deactivate(ctx->net, set, elem.priv);
 
-	nft_trans_elem(trans) = elem;
+	nft_trans_elem_priv(trans) = elem.priv;
 	nft_trans_commit_list_add_tail(ctx->net, trans);
 	return 0;
 
@@ -7146,7 +7136,7 @@ fail_elem:
 static int nft_setelem_flush(const struct nft_ctx *ctx,
 			     struct nft_set *set,
 			     const struct nft_set_iter *iter,
-			     struct nft_set_elem *elem)
+			     struct nft_elem_priv *elem_priv)
 {
 	struct nft_trans *trans;
 
@@ -7155,12 +7145,12 @@ static int nft_setelem_flush(const struct nft_ctx *ctx,
 	if (!trans)
 		return -ENOMEM;
 
-	set->ops->flush(ctx->net, set, elem->priv);
+	set->ops->flush(ctx->net, set, elem_priv);
 	set->ndeact++;
 
-	nft_setelem_data_deactivate(ctx->net, set, elem);
+	nft_setelem_data_deactivate(ctx->net, set, elem_priv);
 	nft_trans_elem_set(trans) = set;
-	nft_trans_elem(trans) = *elem;
+	nft_trans_elem_priv(trans) = elem_priv;
 	nft_trans_commit_list_add_tail(ctx->net, trans);
 
 	return 0;
@@ -7168,7 +7158,7 @@ static int nft_setelem_flush(const struct nft_ctx *ctx,
 
 static int __nft_set_catchall_flush(const struct nft_ctx *ctx,
 				    struct nft_set *set,
-				    struct nft_set_elem *elem)
+				    struct nft_elem_priv *elem_priv)
 {
 	struct nft_trans *trans;
 
@@ -7177,9 +7167,9 @@ static int __nft_set_catchall_flush(const struct nft_ctx *ctx,
 	if (!trans)
 		return -ENOMEM;
 
-	nft_setelem_data_deactivate(ctx->net, set, elem);
+	nft_setelem_data_deactivate(ctx->net, set, elem_priv);
 	nft_trans_elem_set(trans) = set;
-	nft_trans_elem(trans) = *elem;
+	nft_trans_elem_priv(trans) = elem_priv;
 	nft_trans_commit_list_add_tail(ctx->net, trans);
 
 	return 0;
@@ -7190,7 +7180,6 @@ static int nft_set_catchall_flush(const struct nft_ctx *ctx,
 {
 	u8 genmask = nft_genmask_next(ctx->net);
 	struct nft_set_elem_catchall *catchall;
-	struct nft_set_elem elem;
 	struct nft_set_ext *ext;
 	int ret = 0;
 
@@ -7199,8 +7188,7 @@ static int nft_set_catchall_flush(const struct nft_ctx *ctx,
 		if (!nft_set_elem_active(ext, genmask))
 			continue;
 
-		elem.priv = catchall->elem;
-		ret = __nft_set_catchall_flush(ctx, set, &elem);
+		ret = __nft_set_catchall_flush(ctx, set, catchall->elem);
 		if (ret < 0)
 			break;
 		nft_set_elem_change_active(ctx->net, set, ext);
@@ -9278,7 +9266,7 @@ static void nft_commit_release(struct nft_trans *trans)
 	case NFT_MSG_DESTROYSETELEM:
 		nf_tables_set_elem_destroy(&trans->ctx,
 					   nft_trans_elem_set(trans),
-					   nft_trans_elem(trans).priv);
+					   nft_trans_elem_priv(trans));
 		break;
 	case NFT_MSG_DELOBJ:
 	case NFT_MSG_DESTROYOBJ:
@@ -9507,16 +9495,12 @@ void nft_chain_del(struct nft_chain *chain)
 static void nft_trans_gc_setelem_remove(struct nft_ctx *ctx,
 					struct nft_trans_gc *trans)
 {
-	void **priv = trans->priv;
+	struct nft_elem_priv **priv = trans->priv;
 	unsigned int i;
 
 	for (i = 0; i < trans->count; i++) {
-		struct nft_set_elem elem = {
-			.priv = priv[i],
-		};
-
-		nft_setelem_data_deactivate(ctx->net, trans->set, &elem);
-		nft_setelem_remove(ctx->net, trans->set, &elem);
+		nft_setelem_data_deactivate(ctx->net, trans->set, priv[i]);
+		nft_setelem_remove(ctx->net, trans->set, priv[i]);
 	}
 }
 
@@ -9529,7 +9513,7 @@ void nft_trans_gc_destroy(struct nft_trans_gc *trans)
 
 static void nft_trans_gc_trans_free(struct rcu_head *rcu)
 {
-	struct nft_set_elem elem = {};
+	struct nft_elem_priv *elem_priv;
 	struct nft_trans_gc *trans;
 	struct nft_ctx ctx = {};
 	unsigned int i;
@@ -9538,11 +9522,11 @@ static void nft_trans_gc_trans_free(struct rcu_head *rcu)
 	ctx.net	= read_pnet(&trans->set->net);
 
 	for (i = 0; i < trans->count; i++) {
-		elem.priv = trans->priv[i];
-		if (!nft_setelem_is_catchall(trans->set, &elem))
+		elem_priv = trans->priv[i];
+		if (!nft_setelem_is_catchall(trans->set, elem_priv))
 			atomic_dec(&trans->set->nelems);
 
-		nf_tables_set_elem_destroy(&ctx, trans->set, elem.priv);
+		nf_tables_set_elem_destroy(&ctx, trans->set, elem_priv);
 	}
 
 	nft_trans_gc_destroy(trans);
@@ -10110,9 +10094,9 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 		case NFT_MSG_NEWSETELEM:
 			te = (struct nft_trans_elem *)trans->data;
 
-			nft_setelem_activate(net, te->set, &te->elem);
+			nft_setelem_activate(net, te->set, te->elem_priv);
 			nf_tables_setelem_notify(&trans->ctx, te->set,
-						 &te->elem,
+						 te->elem_priv,
 						 NFT_MSG_NEWSETELEM);
 			if (te->set->ops->commit &&
 			    list_empty(&te->set->pending_update)) {
@@ -10126,10 +10110,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 			te = (struct nft_trans_elem *)trans->data;
 
 			nf_tables_setelem_notify(&trans->ctx, te->set,
-						 &te->elem,
+						 te->elem_priv,
 						 trans->msg_type);
-			nft_setelem_remove(net, te->set, &te->elem);
-			if (!nft_setelem_is_catchall(te->set, &te->elem)) {
+			nft_setelem_remove(net, te->set, te->elem_priv);
+			if (!nft_setelem_is_catchall(te->set, te->elem_priv)) {
 				atomic_dec(&te->set->nelems);
 				te->set->ndeact--;
 			}
@@ -10249,7 +10233,7 @@ static void nf_tables_abort_release(struct nft_trans *trans)
 		break;
 	case NFT_MSG_NEWSETELEM:
 		nft_set_elem_destroy(nft_trans_elem_set(trans),
-				     nft_trans_elem(trans).priv, true);
+				     nft_trans_elem_priv(trans), true);
 		break;
 	case NFT_MSG_NEWOBJ:
 		nft_obj_destroy(&trans->ctx, nft_trans_obj(trans));
@@ -10396,8 +10380,8 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action)
 				break;
 			}
 			te = (struct nft_trans_elem *)trans->data;
-			nft_setelem_remove(net, te->set, &te->elem);
-			if (!nft_setelem_is_catchall(te->set, &te->elem))
+			nft_setelem_remove(net, te->set, te->elem_priv);
+			if (!nft_setelem_is_catchall(te->set, te->elem_priv))
 				atomic_dec(&te->set->nelems);
 
 			if (te->set->ops->abort &&
@@ -10410,9 +10394,9 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action)
 		case NFT_MSG_DESTROYSETELEM:
 			te = (struct nft_trans_elem *)trans->data;
 
-			nft_setelem_data_activate(net, te->set, &te->elem);
-			nft_setelem_activate(net, te->set, &te->elem);
-			if (!nft_setelem_is_catchall(te->set, &te->elem))
+			nft_setelem_data_activate(net, te->set, te->elem_priv);
+			nft_setelem_activate(net, te->set, te->elem_priv);
+			if (!nft_setelem_is_catchall(te->set, te->elem_priv))
 				te->set->ndeact--;
 
 			if (te->set->ops->abort &&
@@ -10588,9 +10572,9 @@ static int nft_check_loops(const struct nft_ctx *ctx,
 static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx,
 					struct nft_set *set,
 					const struct nft_set_iter *iter,
-					struct nft_set_elem *elem)
+					struct nft_elem_priv *elem_priv)
 {
-	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
 
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
 	    *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END)
diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
index a320e7614aaa..963edb514641 100644
--- a/net/netfilter/nft_set_bitmap.c
+++ b/net/netfilter/nft_set_bitmap.c
@@ -146,11 +146,10 @@ static int nft_bitmap_insert(const struct net *net, const struct nft_set *set,
 	return 0;
 }
 
-static void nft_bitmap_remove(const struct net *net,
-			      const struct nft_set *set,
-			      const struct nft_set_elem *elem)
+static void nft_bitmap_remove(const struct net *net, const struct nft_set *set,
+			      struct nft_elem_priv *elem_priv)
 {
-	struct nft_bitmap_elem *be = nft_elem_priv_cast(elem->priv);
+	struct nft_bitmap_elem *be = nft_elem_priv_cast(elem_priv);
 	struct nft_bitmap *priv = nft_set_priv(set);
 	u8 genmask = nft_genmask_next(net);
 	u32 idx, off;
@@ -163,9 +162,9 @@ static void nft_bitmap_remove(const struct net *net,
 
 static void nft_bitmap_activate(const struct net *net,
 				const struct nft_set *set,
-				const struct nft_set_elem *elem)
+				struct nft_elem_priv *elem_priv)
 {
-	struct nft_bitmap_elem *be = nft_elem_priv_cast(elem->priv);
+	struct nft_bitmap_elem *be = nft_elem_priv_cast(elem_priv);
 	struct nft_bitmap *priv = nft_set_priv(set);
 	u8 genmask = nft_genmask_next(net);
 	u32 idx, off;
@@ -219,7 +218,6 @@ static void nft_bitmap_walk(const struct nft_ctx *ctx,
 {
 	const struct nft_bitmap *priv = nft_set_priv(set);
 	struct nft_bitmap_elem *be;
-	struct nft_set_elem elem;
 
 	list_for_each_entry_rcu(be, &priv->list, head) {
 		if (iter->count < iter->skip)
@@ -227,9 +225,7 @@ static void nft_bitmap_walk(const struct nft_ctx *ctx,
 		if (!nft_set_elem_active(&be->ext, iter->genmask))
 			goto cont;
 
-		elem.priv = &be->priv;
-
-		iter->err = iter->fn(ctx, set, iter, &elem);
+		iter->err = iter->fn(ctx, set, iter, &be->priv);
 
 		if (iter->err < 0)
 			return;
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index 0691565caa81..e6c00891e334 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -190,9 +190,9 @@ static int nft_rhash_insert(const struct net *net, const struct nft_set *set,
 }
 
 static void nft_rhash_activate(const struct net *net, const struct nft_set *set,
-			       const struct nft_set_elem *elem)
+			       struct nft_elem_priv *elem_priv)
 {
-	struct nft_rhash_elem *he = nft_elem_priv_cast(elem->priv);
+	struct nft_rhash_elem *he = nft_elem_priv_cast(elem_priv);
 
 	nft_set_elem_change_active(net, set, &he->ext);
 }
@@ -230,9 +230,9 @@ nft_rhash_deactivate(const struct net *net, const struct nft_set *set,
 
 static void nft_rhash_remove(const struct net *net,
 			     const struct nft_set *set,
-			     const struct nft_set_elem *elem)
+			     struct nft_elem_priv *elem_priv)
 {
-	struct nft_rhash_elem *he = nft_elem_priv_cast(elem->priv);
+	struct nft_rhash_elem *he = nft_elem_priv_cast(elem_priv);
 	struct nft_rhash *priv = nft_set_priv(set);
 
 	rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params);
@@ -264,7 +264,6 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
 	struct nft_rhash *priv = nft_set_priv(set);
 	struct nft_rhash_elem *he;
 	struct rhashtable_iter hti;
-	struct nft_set_elem elem;
 
 	rhashtable_walk_enter(&priv->ht, &hti);
 	rhashtable_walk_start(&hti);
@@ -284,9 +283,7 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
 		if (!nft_set_elem_active(&he->ext, iter->genmask))
 			goto cont;
 
-		elem.priv = &he->priv;
-
-		iter->err = iter->fn(ctx, set, iter, &elem);
+		iter->err = iter->fn(ctx, set, iter, &he->priv);
 		if (iter->err < 0)
 			break;
 
@@ -592,9 +589,9 @@ static int nft_hash_insert(const struct net *net, const struct nft_set *set,
 }
 
 static void nft_hash_activate(const struct net *net, const struct nft_set *set,
-			      const struct nft_set_elem *elem)
+			      struct nft_elem_priv *elem_priv)
 {
-	struct nft_hash_elem *he = nft_elem_priv_cast(elem->priv);
+	struct nft_hash_elem *he = nft_elem_priv_cast(elem_priv);
 
 	nft_set_elem_change_active(net, set, &he->ext);
 }
@@ -631,9 +628,9 @@ nft_hash_deactivate(const struct net *net, const struct nft_set *set,
 
 static void nft_hash_remove(const struct net *net,
 			    const struct nft_set *set,
-			    const struct nft_set_elem *elem)
+			    struct nft_elem_priv *elem_priv)
 {
-	struct nft_hash_elem *he = nft_elem_priv_cast(elem->priv);
+	struct nft_hash_elem *he = nft_elem_priv_cast(elem_priv);
 
 	hlist_del_rcu(&he->node);
 }
@@ -643,7 +640,6 @@ static void nft_hash_walk(const struct nft_ctx *ctx, struct nft_set *set,
 {
 	struct nft_hash *priv = nft_set_priv(set);
 	struct nft_hash_elem *he;
-	struct nft_set_elem elem;
 	int i;
 
 	for (i = 0; i < priv->buckets; i++) {
@@ -653,9 +649,7 @@ static void nft_hash_walk(const struct nft_ctx *ctx, struct nft_set *set,
 			if (!nft_set_elem_active(&he->ext, iter->genmask))
 				goto cont;
 
-			elem.priv = &he->priv;
-
-			iter->err = iter->fn(ctx, set, iter, &elem);
+			iter->err = iter->fn(ctx, set, iter, &he->priv);
 			if (iter->err < 0)
 				return;
 cont:
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index 0969d2cb637b..f540c2be0caa 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -1548,11 +1548,7 @@ static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set,
 				     struct nft_pipapo_elem *e)
 
 {
-	struct nft_set_elem elem = {
-		.priv	= &e->priv,
-	};
-
-	nft_setelem_data_deactivate(net, set, &elem);
+	nft_setelem_data_deactivate(net, set, &e->priv);
 }
 
 /**
@@ -1739,7 +1735,7 @@ static void nft_pipapo_abort(const struct nft_set *set)
  * nft_pipapo_activate() - Mark element reference as active given key, commit
  * @net:	Network namespace
  * @set:	nftables API set representation
- * @elem:	nftables API element representation containing key data
+ * @elem_priv:	nftables API element representation containing key data
  *
  * On insertion, elements are added to a copy of the matching data currently
  * in use for lookups, and not directly inserted into current lookup data. Both
@@ -1748,9 +1744,9 @@ static void nft_pipapo_abort(const struct nft_set *set)
  */
 static void nft_pipapo_activate(const struct net *net,
 				const struct nft_set *set,
-				const struct nft_set_elem *elem)
+				struct nft_elem_priv *elem_priv)
 {
-	struct nft_pipapo_elem *e = nft_elem_priv_cast(elem->priv);
+	struct nft_pipapo_elem *e = nft_elem_priv_cast(elem_priv);
 
 	nft_set_elem_change_active(net, set, &e->ext);
 }
@@ -1803,7 +1799,7 @@ nft_pipapo_deactivate(const struct net *net, const struct nft_set *set,
  * nft_pipapo_flush() - Call pipapo_deactivate() to make element inactive
  * @net:	Network namespace
  * @set:	nftables API set representation
- * @elem:	nftables API element representation containing key data
+ * @elem_priv:	nftables API element representation containing key data
  *
  * This is functionally the same as nft_pipapo_deactivate(), with a slightly
  * different interface, and it's also called once for each element in a set
@@ -1945,7 +1941,7 @@ static bool pipapo_match_field(struct nft_pipapo_field *f,
  * nft_pipapo_remove() - Remove element given key, commit
  * @net:	Network namespace
  * @set:	nftables API set representation
- * @elem:	nftables API element representation containing key data
+ * @elem_priv:	nftables API element representation containing key data
  *
  * Similarly to nft_pipapo_activate(), this is used as commit operation by the
  * API, but it's called once per element in the pending transaction, so we can't
@@ -1953,7 +1949,7 @@ static bool pipapo_match_field(struct nft_pipapo_field *f,
  * the matched element here, if any, and commit the updated matching data.
  */
 static void nft_pipapo_remove(const struct net *net, const struct nft_set *set,
-			      const struct nft_set_elem *elem)
+			      struct nft_elem_priv *elem_priv)
 {
 	struct nft_pipapo *priv = nft_set_priv(set);
 	struct nft_pipapo_match *m = priv->clone;
@@ -1961,7 +1957,7 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set,
 	struct nft_pipapo_elem *e;
 	const u8 *data;
 
-	e = nft_elem_priv_cast(elem->priv);
+	e = nft_elem_priv_cast(elem_priv);
 	data = (const u8 *)nft_set_ext_key(&e->ext);
 
 	while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) {
@@ -2038,7 +2034,6 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set,
 
 	for (r = 0; r < f->rules; r++) {
 		struct nft_pipapo_elem *e;
-		struct nft_set_elem elem;
 
 		if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e)
 			continue;
@@ -2048,9 +2043,7 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set,
 
 		e = f->mt[r].e;
 
-		elem.priv = &e->priv;
-
-		iter->err = iter->fn(ctx, set, iter, &elem);
+		iter->err = iter->fn(ctx, set, iter, &e->priv);
 		if (iter->err < 0)
 			goto out;
 
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 475f22568342..25baa9cdb77d 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -227,12 +227,8 @@ static void nft_rbtree_gc_elem_remove(struct net *net, struct nft_set *set,
 				      struct nft_rbtree *priv,
 				      struct nft_rbtree_elem *rbe)
 {
-	struct nft_set_elem elem = {
-		.priv	= &rbe->priv,
-	};
-
 	lockdep_assert_held_write(&priv->lock);
-	nft_setelem_data_deactivate(net, set, &elem);
+	nft_setelem_data_deactivate(net, set, &rbe->priv);
 	rb_erase(&rbe->node, &priv->root);
 }
 
@@ -521,9 +517,9 @@ static void nft_rbtree_erase(struct nft_rbtree *priv, struct nft_rbtree_elem *rb
 
 static void nft_rbtree_remove(const struct net *net,
 			      const struct nft_set *set,
-			      const struct nft_set_elem *elem)
+			      struct nft_elem_priv *elem_priv)
 {
-	struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem->priv);
+	struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv);
 	struct nft_rbtree *priv = nft_set_priv(set);
 
 	nft_rbtree_erase(priv, rbe);
@@ -531,9 +527,9 @@ static void nft_rbtree_remove(const struct net *net,
 
 static void nft_rbtree_activate(const struct net *net,
 				const struct nft_set *set,
-				const struct nft_set_elem *elem)
+				struct nft_elem_priv *elem_priv)
 {
-	struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem->priv);
+	struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv);
 
 	nft_set_elem_change_active(net, set, &rbe->ext);
 }
@@ -594,7 +590,6 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
 {
 	struct nft_rbtree *priv = nft_set_priv(set);
 	struct nft_rbtree_elem *rbe;
-	struct nft_set_elem elem;
 	struct rb_node *node;
 
 	read_lock_bh(&priv->lock);
@@ -606,9 +601,7 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
 		if (!nft_set_elem_active(&rbe->ext, iter->genmask))
 			goto cont;
 
-		elem.priv = &rbe->priv;
-
-		iter->err = iter->fn(ctx, set, iter, &elem);
+		iter->err = iter->fn(ctx, set, iter, &rbe->priv);
 		if (iter->err < 0) {
 			read_unlock_bh(&priv->lock);
 			return;
@@ -623,11 +616,7 @@ static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set,
 				 struct nft_rbtree *priv,
 				 struct nft_rbtree_elem *rbe)
 {
-	struct nft_set_elem elem = {
-		.priv	= rbe,
-	};
-
-	nft_setelem_data_deactivate(net, set, &elem);
+	nft_setelem_data_deactivate(net, set, &rbe->priv);
 	nft_rbtree_erase(priv, rbe);
 }
 
-- 
cgit v1.2.3-70-g09d2


From 078996fcd657e6e0c3a72b7d5806d04c32e74250 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 18 Oct 2023 22:23:35 +0200
Subject: netfilter: nf_tables: set->ops->insert returns opaque set element in
 case of EEXIST

Return struct nft_elem_priv instead of struct nft_set_ext for
consistency with ("netfilter: nf_tables: expose opaque set element as
struct nft_elem_priv") and to prepare the introduction of element
timeout updates from control path.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  2 +-
 net/netfilter/nf_tables_api.c     | 17 ++++++++++-------
 net/netfilter/nft_set_bitmap.c    |  4 ++--
 net/netfilter/nft_set_hash.c      |  8 ++++----
 net/netfilter/nft_set_pipapo.c    | 10 +++++-----
 net/netfilter/nft_set_rbtree.c    | 10 +++++-----
 6 files changed, 27 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index b63f35fb2a99..3bbd13ab1ecf 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -451,7 +451,7 @@ struct nft_set_ops {
 	int				(*insert)(const struct net *net,
 						  const struct nft_set *set,
 						  const struct nft_set_elem *elem,
-						  struct nft_set_ext **ext);
+						  struct nft_elem_priv **priv);
 	void				(*activate)(const struct net *net,
 						    const struct nft_set *set,
 						    struct nft_elem_priv *elem_priv);
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 79b0ed92367a..ed3329fcbe7f 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -6406,7 +6406,7 @@ EXPORT_SYMBOL_GPL(nft_set_catchall_lookup);
 static int nft_setelem_catchall_insert(const struct net *net,
 				       struct nft_set *set,
 				       const struct nft_set_elem *elem,
-				       struct nft_set_ext **pext)
+				       struct nft_elem_priv **priv)
 {
 	struct nft_set_elem_catchall *catchall;
 	u8 genmask = nft_genmask_next(net);
@@ -6415,7 +6415,7 @@ static int nft_setelem_catchall_insert(const struct net *net,
 	list_for_each_entry(catchall, &set->catchall_list, list) {
 		ext = nft_set_elem_ext(set, catchall->elem);
 		if (nft_set_elem_active(ext, genmask)) {
-			*pext = ext;
+			*priv = catchall->elem;
 			return -EEXIST;
 		}
 	}
@@ -6433,14 +6433,15 @@ static int nft_setelem_catchall_insert(const struct net *net,
 static int nft_setelem_insert(const struct net *net,
 			      struct nft_set *set,
 			      const struct nft_set_elem *elem,
-			      struct nft_set_ext **ext, unsigned int flags)
+			      struct nft_elem_priv **elem_priv,
+			      unsigned int flags)
 {
 	int ret;
 
 	if (flags & NFT_SET_ELEM_CATCHALL)
-		ret = nft_setelem_catchall_insert(net, set, elem, ext);
+		ret = nft_setelem_catchall_insert(net, set, elem, elem_priv);
 	else
-		ret = set->ops->insert(net, set, elem, ext);
+		ret = set->ops->insert(net, set, elem, elem_priv);
 
 	return ret;
 }
@@ -6576,13 +6577,14 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 	struct nft_set_ext *ext, *ext2;
 	struct nft_set_elem elem;
 	struct nft_set_binding *binding;
+	struct nft_elem_priv *elem_priv;
 	struct nft_object *obj = NULL;
 	struct nft_userdata *udata;
 	struct nft_data_desc desc;
 	enum nft_registers dreg;
 	struct nft_trans *trans;
-	u64 timeout;
 	u64 expiration;
+	u64 timeout;
 	int err, i;
 	u8 ulen;
 
@@ -6875,9 +6877,10 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 
 	ext->genmask = nft_genmask_cur(ctx->net);
 
-	err = nft_setelem_insert(ctx->net, set, &elem, &ext2, flags);
+	err = nft_setelem_insert(ctx->net, set, &elem, &elem_priv, flags);
 	if (err) {
 		if (err == -EEXIST) {
+			ext2 = nft_set_elem_ext(set, elem_priv);
 			if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) ^
 			    nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) ||
 			    nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) ^
diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
index 963edb514641..32df7a16835d 100644
--- a/net/netfilter/nft_set_bitmap.c
+++ b/net/netfilter/nft_set_bitmap.c
@@ -125,7 +125,7 @@ nft_bitmap_get(const struct net *net, const struct nft_set *set,
 
 static int nft_bitmap_insert(const struct net *net, const struct nft_set *set,
 			     const struct nft_set_elem *elem,
-			     struct nft_set_ext **ext)
+			     struct nft_elem_priv **elem_priv)
 {
 	struct nft_bitmap_elem *new = nft_elem_priv_cast(elem->priv), *be;
 	struct nft_bitmap *priv = nft_set_priv(set);
@@ -134,7 +134,7 @@ static int nft_bitmap_insert(const struct net *net, const struct nft_set *set,
 
 	be = nft_bitmap_elem_find(set, new, genmask);
 	if (be) {
-		*ext = &be->ext;
+		*elem_priv = &be->priv;
 		return -EEXIST;
 	}
 
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index e6c00891e334..6c2061bfdae6 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -167,7 +167,7 @@ err1:
 
 static int nft_rhash_insert(const struct net *net, const struct nft_set *set,
 			    const struct nft_set_elem *elem,
-			    struct nft_set_ext **ext)
+			    struct nft_elem_priv **elem_priv)
 {
 	struct nft_rhash_elem *he = nft_elem_priv_cast(elem->priv);
 	struct nft_rhash *priv = nft_set_priv(set);
@@ -183,7 +183,7 @@ static int nft_rhash_insert(const struct net *net, const struct nft_set *set,
 	if (IS_ERR(prev))
 		return PTR_ERR(prev);
 	if (prev) {
-		*ext = &prev->ext;
+		*elem_priv = &prev->priv;
 		return -EEXIST;
 	}
 	return 0;
@@ -568,7 +568,7 @@ static u32 nft_jhash(const struct nft_set *set, const struct nft_hash *priv,
 
 static int nft_hash_insert(const struct net *net, const struct nft_set *set,
 			   const struct nft_set_elem *elem,
-			   struct nft_set_ext **ext)
+			   struct nft_elem_priv **elem_priv)
 {
 	struct nft_hash_elem *this = nft_elem_priv_cast(elem->priv), *he;
 	struct nft_hash *priv = nft_set_priv(set);
@@ -580,7 +580,7 @@ static int nft_hash_insert(const struct net *net, const struct nft_set *set,
 		if (!memcmp(nft_set_ext_key(&this->ext),
 			    nft_set_ext_key(&he->ext), set->klen) &&
 		    nft_set_elem_active(&he->ext, genmask)) {
-			*ext = &he->ext;
+			*elem_priv = &he->priv;
 			return -EEXIST;
 		}
 	}
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index f540c2be0caa..701977af3ee8 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -1158,13 +1158,13 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone,
  * @net:	Network namespace
  * @set:	nftables API set representation
  * @elem:	nftables API element representation containing key data
- * @ext2:	Filled with pointer to &struct nft_set_ext in inserted element
+ * @elem_priv:	Filled with pointer to &struct nft_set_ext in inserted element
  *
  * Return: 0 on success, error pointer on failure.
  */
 static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
 			     const struct nft_set_elem *elem,
-			     struct nft_set_ext **ext2)
+			     struct nft_elem_priv **elem_priv)
 {
 	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
 	union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS];
@@ -1195,7 +1195,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
 
 		if (!memcmp(start, dup_key->data, sizeof(*dup_key->data)) &&
 		    !memcmp(end, dup_end->data, sizeof(*dup_end->data))) {
-			*ext2 = &dup->ext;
+			*elem_priv = &dup->priv;
 			return -EEXIST;
 		}
 
@@ -1210,7 +1210,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
 	if (PTR_ERR(dup) != -ENOENT) {
 		if (IS_ERR(dup))
 			return PTR_ERR(dup);
-		*ext2 = &dup->ext;
+		*elem_priv = &dup->priv;
 		return -ENOTEMPTY;
 	}
 
@@ -1271,7 +1271,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
 	}
 
 	e = nft_elem_priv_cast(elem->priv);
-	*ext2 = &e->ext;
+	*elem_priv = &e->priv;
 
 	pipapo_map(m, rulemap, e);
 
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 25baa9cdb77d..6f1186abd47b 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -306,7 +306,7 @@ static bool nft_rbtree_update_first(const struct nft_set *set,
 
 static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
 			       struct nft_rbtree_elem *new,
-			       struct nft_set_ext **ext)
+			       struct nft_elem_priv **elem_priv)
 {
 	struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL;
 	struct rb_node *node, *next, *parent, **p, *first = NULL;
@@ -423,7 +423,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
 	 */
 	if (rbe_ge && !nft_rbtree_cmp(set, new, rbe_ge) &&
 	    nft_rbtree_interval_start(rbe_ge) == nft_rbtree_interval_start(new)) {
-		*ext = &rbe_ge->ext;
+		*elem_priv = &rbe_ge->priv;
 		return -EEXIST;
 	}
 
@@ -432,7 +432,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
 	 */
 	if (rbe_le && !nft_rbtree_cmp(set, new, rbe_le) &&
 	    nft_rbtree_interval_end(rbe_le) == nft_rbtree_interval_end(new)) {
-		*ext = &rbe_le->ext;
+		*elem_priv = &rbe_le->priv;
 		return -EEXIST;
 	}
 
@@ -484,7 +484,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
 
 static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
 			     const struct nft_set_elem *elem,
-			     struct nft_set_ext **ext)
+			     struct nft_elem_priv **elem_priv)
 {
 	struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem->priv);
 	struct nft_rbtree *priv = nft_set_priv(set);
@@ -498,7 +498,7 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
 
 		write_lock_bh(&priv->lock);
 		write_seqcount_begin(&priv->count);
-		err = __nft_rbtree_insert(net, set, rbe, ext);
+		err = __nft_rbtree_insert(net, set, rbe, elem_priv);
 		write_seqcount_end(&priv->count);
 		write_unlock_bh(&priv->lock);
 	} while (err == -EAGAIN);
-- 
cgit v1.2.3-70-g09d2


From 1d0507f46843b14b0cb051fe50ebc7e6432111ab Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Mon, 23 Oct 2023 11:17:07 -0700
Subject: net: mptcp: convert netlink from small_ops to ops

in the current MPTCP control plane, all operations use a netlink
attribute of the same type "MPTCP_PM_ATTR". However, add/del/get/flush
operations only parse the first element in the message _ the one that
describes MPTCP endpoints (that was named MPTCP_PM_ATTR_ADDR and
mostly used in ADD_ADDR operations _ probably the similarity of "attr",
"addr" and "add" might cause some confusion to human readers).
Convert MPTCP from 'small_ops' to 'ops', thus allowing different attributes
for each single operation, hopefully makes all this clearer to human
readers.

- use a separate attribute set for add/del/get/flush address operation,
  binary compatible with the existing one, to store the endpoint address.
  MPTCP_PM_ENDPOINT_ADDR is added to the uAPI (with the same value as
  MPTCP_PM_ATTR_ADDR) for these operations.
- convert mptcp_pm_ops[] and add policy files accordingly.

this prepares MPTCP control plane to be described as YAML spec.

Link: https://github.com/multipath-tcp/mptcp_net-next/issues/340
Acked-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: Mat Martineau <martineau@kernel.org>
Link: https://lore.kernel.org/r/20231023-send-net-next-20231023-1-v2-3-16b1f701f900@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/mptcp.h |   8 ++
 net/mptcp/pm_netlink.c     | 191 ++++++++++++++++++++++++++++++---------------
 2 files changed, 135 insertions(+), 64 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h
index ee9c49f949a2..0e62937ab17c 100644
--- a/include/uapi/linux/mptcp.h
+++ b/include/uapi/linux/mptcp.h
@@ -65,6 +65,14 @@ enum {
 
 #define MPTCP_PM_ATTR_MAX (__MPTCP_PM_ATTR_MAX - 1)
 
+enum {
+	MPTCP_PM_ENDPOINT_ADDR = 1,
+
+	__MPTCP_PM_ENDPOINT_MAX
+};
+
+#define MPTCP_PM_ENDPOINT_MAX (__MPTCP_PM_ENDPOINT_MAX - 1)
+
 enum {
 	MPTCP_PM_ADDR_ATTR_UNSPEC,
 
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index 9661f3812682..fd4e843505e5 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -48,6 +48,60 @@ struct pm_nl_pernet {
 #define MPTCP_PM_ADDR_MAX	8
 #define ADD_ADDR_RETRANS_MAX	3
 
+static
+const struct nla_policy mptcp_pm_address_nl_policy[MPTCP_PM_ADDR_ATTR_IF_IDX + 1] = {
+	[MPTCP_PM_ADDR_ATTR_FAMILY] = { .type = NLA_U16, },
+	[MPTCP_PM_ADDR_ATTR_ID] = { .type = NLA_U8, },
+	[MPTCP_PM_ADDR_ATTR_ADDR4] = { .type = NLA_U32, },
+	[MPTCP_PM_ADDR_ATTR_ADDR6] = NLA_POLICY_EXACT_LEN(16),
+	[MPTCP_PM_ADDR_ATTR_PORT] = { .type = NLA_U16, },
+	[MPTCP_PM_ADDR_ATTR_FLAGS] = { .type = NLA_U32, },
+	[MPTCP_PM_ADDR_ATTR_IF_IDX] = { .type = NLA_S32, },
+};
+
+/* MPTCP_PM_CMD_ADD_ADDR / DEL / GET / FLUSH - do */
+static
+const struct nla_policy mptcp_pm_endpoint_nl_policy[MPTCP_PM_ENDPOINT_ADDR + 1] = {
+	[MPTCP_PM_ENDPOINT_ADDR] = NLA_POLICY_NESTED(mptcp_pm_address_nl_policy),
+};
+
+/* MPTCP_PM_CMD_SET_LIMITS - do */
+static
+const struct nla_policy mptcp_pm_set_limits_nl_policy[MPTCP_PM_ATTR_SUBFLOWS + 1] = {
+	[MPTCP_PM_ATTR_RCV_ADD_ADDRS] = { .type = NLA_U32, },
+	[MPTCP_PM_ATTR_SUBFLOWS] = { .type = NLA_U32, },
+};
+
+/* MPTCP_PM_CMD_SET_FLAGS - do */
+static
+const struct nla_policy mptcp_pm_set_flags_nl_policy[MPTCP_PM_ATTR_ADDR_REMOTE + 1] = {
+	[MPTCP_PM_ATTR_ADDR] = NLA_POLICY_NESTED(mptcp_pm_address_nl_policy),
+	[MPTCP_PM_ATTR_TOKEN] = { .type = NLA_U32, },
+	[MPTCP_PM_ATTR_ADDR_REMOTE] = NLA_POLICY_NESTED(mptcp_pm_address_nl_policy),
+};
+
+/* MPTCP_PM_CMD_ANNOUNCE - do */
+static
+const struct nla_policy mptcp_pm_announce_nl_policy[MPTCP_PM_ATTR_TOKEN + 1] = {
+	[MPTCP_PM_ATTR_ADDR] = NLA_POLICY_NESTED(mptcp_pm_address_nl_policy),
+	[MPTCP_PM_ATTR_TOKEN] = { .type = NLA_U32, },
+};
+
+/* MPTCP_PM_CMD_REMOVE - do */
+static
+const struct nla_policy mptcp_pm_remove_nl_policy[MPTCP_PM_ATTR_LOC_ID + 1] = {
+	[MPTCP_PM_ATTR_TOKEN] = { .type = NLA_U32, },
+	[MPTCP_PM_ATTR_LOC_ID] = { .type = NLA_U8, },
+};
+
+/* MPTCP_PM_CMD_SUBFLOW_CREATE / DESTROY - do */
+static
+const struct nla_policy mptcp_pm_subflow_create_nl_policy[MPTCP_PM_ATTR_ADDR_REMOTE + 1] = {
+	[MPTCP_PM_ATTR_ADDR] = NLA_POLICY_NESTED(mptcp_pm_address_nl_policy),
+	[MPTCP_PM_ATTR_TOKEN] = { .type = NLA_U32, },
+	[MPTCP_PM_ATTR_ADDR_REMOTE] = NLA_POLICY_NESTED(mptcp_pm_address_nl_policy),
+};
+
 static struct pm_nl_pernet *pm_nl_get_pernet(const struct net *net)
 {
 	return net_generic(net, pm_nl_pernet_id);
@@ -1104,29 +1158,6 @@ static const struct genl_multicast_group mptcp_pm_mcgrps[] = {
 					  },
 };
 
-static const struct nla_policy
-mptcp_pm_addr_policy[MPTCP_PM_ADDR_ATTR_MAX + 1] = {
-	[MPTCP_PM_ADDR_ATTR_FAMILY]	= { .type	= NLA_U16,	},
-	[MPTCP_PM_ADDR_ATTR_ID]		= { .type	= NLA_U8,	},
-	[MPTCP_PM_ADDR_ATTR_ADDR4]	= { .type	= NLA_U32,	},
-	[MPTCP_PM_ADDR_ATTR_ADDR6]	=
-		NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
-	[MPTCP_PM_ADDR_ATTR_PORT]	= { .type	= NLA_U16	},
-	[MPTCP_PM_ADDR_ATTR_FLAGS]	= { .type	= NLA_U32	},
-	[MPTCP_PM_ADDR_ATTR_IF_IDX]     = { .type	= NLA_S32	},
-};
-
-static const struct nla_policy mptcp_pm_policy[MPTCP_PM_ATTR_MAX + 1] = {
-	[MPTCP_PM_ATTR_ADDR]		=
-					NLA_POLICY_NESTED(mptcp_pm_addr_policy),
-	[MPTCP_PM_ATTR_RCV_ADD_ADDRS]	= { .type	= NLA_U32,	},
-	[MPTCP_PM_ATTR_SUBFLOWS]	= { .type	= NLA_U32,	},
-	[MPTCP_PM_ATTR_TOKEN]		= { .type	= NLA_U32,	},
-	[MPTCP_PM_ATTR_LOC_ID]		= { .type	= NLA_U8,	},
-	[MPTCP_PM_ATTR_ADDR_REMOTE]	=
-					NLA_POLICY_NESTED(mptcp_pm_addr_policy),
-};
-
 void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk)
 {
 	struct mptcp_subflow_context *iter, *subflow = mptcp_subflow_ctx(ssk);
@@ -1188,7 +1219,7 @@ static int mptcp_pm_parse_pm_addr_attr(struct nlattr *tb[],
 
 	/* no validation needed - was already done via nested policy */
 	err = nla_parse_nested_deprecated(tb, MPTCP_PM_ADDR_ATTR_MAX, attr,
-					  mptcp_pm_addr_policy, info->extack);
+					  mptcp_pm_address_nl_policy, info->extack);
 	if (err)
 		return err;
 
@@ -1305,7 +1336,7 @@ next:
 
 static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info)
 {
-	struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
+	struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR];
 	struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
 	struct mptcp_pm_addr_entry addr, *entry;
 	int ret;
@@ -1486,7 +1517,7 @@ next:
 
 static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info)
 {
-	struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
+	struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR];
 	struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
 	struct mptcp_pm_addr_entry addr, *entry;
 	unsigned int addr_max;
@@ -1677,7 +1708,7 @@ nla_put_failure:
 
 static int mptcp_nl_cmd_get_addr(struct sk_buff *skb, struct genl_info *info)
 {
-	struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
+	struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR];
 	struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
 	struct mptcp_pm_addr_entry addr, *entry;
 	struct sk_buff *msg;
@@ -2283,72 +2314,104 @@ nla_put_failure:
 	nlmsg_free(skb);
 }
 
-static const struct genl_small_ops mptcp_pm_ops[] = {
+static const struct genl_ops mptcp_pm_ops[] = {
 	{
-		.cmd    = MPTCP_PM_CMD_ADD_ADDR,
-		.doit   = mptcp_nl_cmd_add_addr,
-		.flags  = GENL_UNS_ADMIN_PERM,
+		.cmd		= MPTCP_PM_CMD_ADD_ADDR,
+		.validate	= GENL_DONT_VALIDATE_STRICT,
+		.doit		= mptcp_nl_cmd_add_addr,
+		.policy		= mptcp_pm_endpoint_nl_policy,
+		.maxattr	= MPTCP_PM_ENDPOINT_ADDR,
+		.flags		= GENL_UNS_ADMIN_PERM,
 	},
 	{
-		.cmd    = MPTCP_PM_CMD_DEL_ADDR,
-		.doit   = mptcp_nl_cmd_del_addr,
-		.flags  = GENL_UNS_ADMIN_PERM,
+		.cmd		= MPTCP_PM_CMD_DEL_ADDR,
+		.validate	= GENL_DONT_VALIDATE_STRICT,
+		.doit		= mptcp_nl_cmd_del_addr,
+		.policy		= mptcp_pm_endpoint_nl_policy,
+		.maxattr	= MPTCP_PM_ENDPOINT_ADDR,
+		.flags		= GENL_UNS_ADMIN_PERM,
 	},
 	{
-		.cmd    = MPTCP_PM_CMD_FLUSH_ADDRS,
-		.doit   = mptcp_nl_cmd_flush_addrs,
-		.flags  = GENL_UNS_ADMIN_PERM,
+		.cmd		= MPTCP_PM_CMD_GET_ADDR,
+		.validate	= GENL_DONT_VALIDATE_STRICT,
+		.doit		= mptcp_nl_cmd_get_addr,
+		.dumpit		= mptcp_nl_cmd_dump_addrs,
+		.policy		= mptcp_pm_endpoint_nl_policy,
+		.maxattr	= MPTCP_PM_ENDPOINT_ADDR,
+		.flags		= GENL_UNS_ADMIN_PERM,
 	},
 	{
-		.cmd    = MPTCP_PM_CMD_GET_ADDR,
-		.doit   = mptcp_nl_cmd_get_addr,
-		.dumpit   = mptcp_nl_cmd_dump_addrs,
+		.cmd		= MPTCP_PM_CMD_FLUSH_ADDRS,
+		.validate	= GENL_DONT_VALIDATE_STRICT,
+		.doit		= mptcp_nl_cmd_flush_addrs,
+		.policy		= mptcp_pm_endpoint_nl_policy,
+		.maxattr	= MPTCP_PM_ENDPOINT_ADDR,
+		.flags		= GENL_UNS_ADMIN_PERM,
 	},
 	{
-		.cmd    = MPTCP_PM_CMD_SET_LIMITS,
-		.doit   = mptcp_nl_cmd_set_limits,
-		.flags  = GENL_UNS_ADMIN_PERM,
+		.cmd		= MPTCP_PM_CMD_SET_LIMITS,
+		.validate	= GENL_DONT_VALIDATE_STRICT,
+		.doit		= mptcp_nl_cmd_set_limits,
+		.policy		= mptcp_pm_set_limits_nl_policy,
+		.maxattr	= MPTCP_PM_ATTR_SUBFLOWS,
+		.flags		= GENL_UNS_ADMIN_PERM,
 	},
 	{
-		.cmd    = MPTCP_PM_CMD_GET_LIMITS,
-		.doit   = mptcp_nl_cmd_get_limits,
+		.cmd		= MPTCP_PM_CMD_GET_LIMITS,
+		.validate	= GENL_DONT_VALIDATE_STRICT,
+		.doit		= mptcp_nl_cmd_get_limits,
+		.policy		= mptcp_pm_set_limits_nl_policy,
+		.maxattr	= MPTCP_PM_ATTR_SUBFLOWS,
 	},
 	{
-		.cmd    = MPTCP_PM_CMD_SET_FLAGS,
-		.doit   = mptcp_nl_cmd_set_flags,
-		.flags  = GENL_UNS_ADMIN_PERM,
+		.cmd		= MPTCP_PM_CMD_SET_FLAGS,
+		.validate	= GENL_DONT_VALIDATE_STRICT,
+		.doit		= mptcp_nl_cmd_set_flags,
+		.policy		= mptcp_pm_set_flags_nl_policy,
+		.maxattr	= MPTCP_PM_ATTR_ADDR_REMOTE,
+		.flags		= GENL_UNS_ADMIN_PERM,
 	},
 	{
-		.cmd    = MPTCP_PM_CMD_ANNOUNCE,
-		.doit   = mptcp_nl_cmd_announce,
-		.flags  = GENL_UNS_ADMIN_PERM,
+		.cmd		= MPTCP_PM_CMD_ANNOUNCE,
+		.validate	= GENL_DONT_VALIDATE_STRICT,
+		.doit		= mptcp_nl_cmd_announce,
+		.policy		= mptcp_pm_announce_nl_policy,
+		.maxattr	= MPTCP_PM_ATTR_TOKEN,
+		.flags		= GENL_UNS_ADMIN_PERM,
 	},
 	{
-		.cmd    = MPTCP_PM_CMD_REMOVE,
-		.doit   = mptcp_nl_cmd_remove,
-		.flags  = GENL_UNS_ADMIN_PERM,
+		.cmd		= MPTCP_PM_CMD_REMOVE,
+		.validate	= GENL_DONT_VALIDATE_STRICT,
+		.doit		= mptcp_nl_cmd_remove,
+		.policy		= mptcp_pm_remove_nl_policy,
+		.maxattr	= MPTCP_PM_ATTR_LOC_ID,
+		.flags		= GENL_UNS_ADMIN_PERM,
 	},
 	{
-		.cmd    = MPTCP_PM_CMD_SUBFLOW_CREATE,
-		.doit   = mptcp_nl_cmd_sf_create,
-		.flags  = GENL_UNS_ADMIN_PERM,
+		.cmd		= MPTCP_PM_CMD_SUBFLOW_CREATE,
+		.validate	= GENL_DONT_VALIDATE_STRICT,
+		.doit		= mptcp_nl_cmd_sf_create,
+		.policy		= mptcp_pm_subflow_create_nl_policy,
+		.maxattr	= MPTCP_PM_ATTR_ADDR_REMOTE,
+		.flags		= GENL_UNS_ADMIN_PERM,
 	},
 	{
-		.cmd    = MPTCP_PM_CMD_SUBFLOW_DESTROY,
-		.doit   = mptcp_nl_cmd_sf_destroy,
-		.flags  = GENL_UNS_ADMIN_PERM,
+		.cmd		= MPTCP_PM_CMD_SUBFLOW_DESTROY,
+		.validate	= GENL_DONT_VALIDATE_STRICT,
+		.doit		= mptcp_nl_cmd_sf_destroy,
+		.policy		= mptcp_pm_subflow_create_nl_policy,
+		.maxattr	= MPTCP_PM_ATTR_ADDR_REMOTE,
+		.flags		= GENL_UNS_ADMIN_PERM,
 	},
 };
 
 static struct genl_family mptcp_genl_family __ro_after_init = {
 	.name		= MPTCP_PM_NAME,
 	.version	= MPTCP_PM_VER,
-	.maxattr	= MPTCP_PM_ATTR_MAX,
-	.policy		= mptcp_pm_policy,
 	.netnsok	= true,
 	.module		= THIS_MODULE,
-	.small_ops	= mptcp_pm_ops,
-	.n_small_ops	= ARRAY_SIZE(mptcp_pm_ops),
+	.ops		= mptcp_pm_ops,
+	.n_ops		= ARRAY_SIZE(mptcp_pm_ops),
 	.resv_start_op	= MPTCP_PM_CMD_SUBFLOW_DESTROY + 1,
 	.mcgrps		= mptcp_pm_mcgrps,
 	.n_mcgrps	= ARRAY_SIZE(mptcp_pm_mcgrps),
-- 
cgit v1.2.3-70-g09d2


From 9d1ed17f93ce873cda16698267bdee096a768839 Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Mon, 23 Oct 2023 11:17:09 -0700
Subject: uapi: mptcp: use header file generated from YAML spec

generated with:

 $ ./tools/net/ynl/ynl-gen-c.py --mode uapi \
 > --spec Documentation/netlink/specs/mptcp.yaml \
 > --header -o include/uapi/linux/mptcp_pm.h

Link: https://github.com/multipath-tcp/mptcp_net-next/issues/340
Acked-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: Mat Martineau <martineau@kernel.org>
Link: https://lore.kernel.org/r/20231023-send-net-next-20231023-1-v2-5-16b1f701f900@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 MAINTAINERS                   |   2 +-
 include/uapi/linux/mptcp.h    | 182 +++---------------------------------------
 include/uapi/linux/mptcp_pm.h | 150 ++++++++++++++++++++++++++++++++++
 3 files changed, 161 insertions(+), 173 deletions(-)
 create mode 100644 include/uapi/linux/mptcp_pm.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 977de4624fe0..b2f53d5cae06 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14964,7 +14964,7 @@ F:	Documentation/netlink/specs/mptcp.yaml
 F:	Documentation/networking/mptcp-sysctl.rst
 F:	include/net/mptcp.h
 F:	include/trace/events/mptcp.h
-F:	include/uapi/linux/mptcp.h
+F:	include/uapi/linux/mptcp*.h
 F:	net/mptcp/
 F:	tools/testing/selftests/bpf/*/*mptcp*.c
 F:	tools/testing/selftests/net/mptcp/
diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h
index 0e62937ab17c..64ecc8a3f9f2 100644
--- a/include/uapi/linux/mptcp.h
+++ b/include/uapi/linux/mptcp.h
@@ -23,99 +23,24 @@
 #define MPTCP_SUBFLOW_FLAG_CONNECTED		_BITUL(7)
 #define MPTCP_SUBFLOW_FLAG_MAPVALID		_BITUL(8)
 
-enum {
-	MPTCP_SUBFLOW_ATTR_UNSPEC,
-	MPTCP_SUBFLOW_ATTR_TOKEN_REM,
-	MPTCP_SUBFLOW_ATTR_TOKEN_LOC,
-	MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ,
-	MPTCP_SUBFLOW_ATTR_MAP_SEQ,
-	MPTCP_SUBFLOW_ATTR_MAP_SFSEQ,
-	MPTCP_SUBFLOW_ATTR_SSN_OFFSET,
-	MPTCP_SUBFLOW_ATTR_MAP_DATALEN,
-	MPTCP_SUBFLOW_ATTR_FLAGS,
-	MPTCP_SUBFLOW_ATTR_ID_REM,
-	MPTCP_SUBFLOW_ATTR_ID_LOC,
-	MPTCP_SUBFLOW_ATTR_PAD,
-	__MPTCP_SUBFLOW_ATTR_MAX
-};
-
-#define MPTCP_SUBFLOW_ATTR_MAX (__MPTCP_SUBFLOW_ATTR_MAX - 1)
-
-/* netlink interface */
-#define MPTCP_PM_NAME		"mptcp_pm"
 #define MPTCP_PM_CMD_GRP_NAME	"mptcp_pm_cmds"
 #define MPTCP_PM_EV_GRP_NAME	"mptcp_pm_events"
-#define MPTCP_PM_VER		0x1
-
-/*
- * ATTR types defined for MPTCP
- */
-enum {
-	MPTCP_PM_ATTR_UNSPEC,
-
-	MPTCP_PM_ATTR_ADDR,				/* nested address */
-	MPTCP_PM_ATTR_RCV_ADD_ADDRS,			/* u32 */
-	MPTCP_PM_ATTR_SUBFLOWS,				/* u32 */
-	MPTCP_PM_ATTR_TOKEN,				/* u32 */
-	MPTCP_PM_ATTR_LOC_ID,				/* u8 */
-	MPTCP_PM_ATTR_ADDR_REMOTE,			/* nested address */
-
-	__MPTCP_PM_ATTR_MAX
-};
-
-#define MPTCP_PM_ATTR_MAX (__MPTCP_PM_ATTR_MAX - 1)
-
-enum {
-	MPTCP_PM_ENDPOINT_ADDR = 1,
-
-	__MPTCP_PM_ENDPOINT_MAX
-};
-
-#define MPTCP_PM_ENDPOINT_MAX (__MPTCP_PM_ENDPOINT_MAX - 1)
-
-enum {
-	MPTCP_PM_ADDR_ATTR_UNSPEC,
-
-	MPTCP_PM_ADDR_ATTR_FAMILY,			/* u16 */
-	MPTCP_PM_ADDR_ATTR_ID,				/* u8 */
-	MPTCP_PM_ADDR_ATTR_ADDR4,			/* struct in_addr */
-	MPTCP_PM_ADDR_ATTR_ADDR6,			/* struct in6_addr */
-	MPTCP_PM_ADDR_ATTR_PORT,			/* u16 */
-	MPTCP_PM_ADDR_ATTR_FLAGS,			/* u32 */
-	MPTCP_PM_ADDR_ATTR_IF_IDX,			/* s32 */
-
-	__MPTCP_PM_ADDR_ATTR_MAX
-};
 
-#define MPTCP_PM_ADDR_ATTR_MAX (__MPTCP_PM_ADDR_ATTR_MAX - 1)
+#include <linux/mptcp_pm.h>
 
-#define MPTCP_PM_ADDR_FLAG_SIGNAL			(1 << 0)
-#define MPTCP_PM_ADDR_FLAG_SUBFLOW			(1 << 1)
-#define MPTCP_PM_ADDR_FLAG_BACKUP			(1 << 2)
-#define MPTCP_PM_ADDR_FLAG_FULLMESH			(1 << 3)
-#define MPTCP_PM_ADDR_FLAG_IMPLICIT			(1 << 4)
-
-enum {
-	MPTCP_PM_CMD_UNSPEC,
-
-	MPTCP_PM_CMD_ADD_ADDR,
-	MPTCP_PM_CMD_DEL_ADDR,
-	MPTCP_PM_CMD_GET_ADDR,
-	MPTCP_PM_CMD_FLUSH_ADDRS,
-	MPTCP_PM_CMD_SET_LIMITS,
-	MPTCP_PM_CMD_GET_LIMITS,
-	MPTCP_PM_CMD_SET_FLAGS,
-	MPTCP_PM_CMD_ANNOUNCE,
-	MPTCP_PM_CMD_REMOVE,
-	MPTCP_PM_CMD_SUBFLOW_CREATE,
-	MPTCP_PM_CMD_SUBFLOW_DESTROY,
-
-	__MPTCP_PM_CMD_AFTER_LAST
-};
+/* for backward compatibility */
+#define	__MPTCP_PM_CMD_AFTER_LAST	__MPTCP_PM_CMD_MAX
+#define	__MPTCP_ATTR_AFTER_LAST		__MPTCP_ATTR_MAX
 
 #define MPTCP_INFO_FLAG_FALLBACK		_BITUL(0)
 #define MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED	_BITUL(1)
 
+#define MPTCP_PM_ADDR_FLAG_SIGNAL                      (1 << 0)
+#define MPTCP_PM_ADDR_FLAG_SUBFLOW                     (1 << 1)
+#define MPTCP_PM_ADDR_FLAG_BACKUP                      (1 << 2)
+#define MPTCP_PM_ADDR_FLAG_FULLMESH                    (1 << 3)
+#define MPTCP_PM_ADDR_FLAG_IMPLICIT                    (1 << 4)
+
 struct mptcp_info {
 	__u8	mptcpi_subflows;
 	__u8	mptcpi_add_addr_signal;
@@ -138,93 +63,6 @@ struct mptcp_info {
 	__u64	mptcpi_bytes_acked;
 };
 
-/*
- * MPTCP_EVENT_CREATED: token, family, saddr4 | saddr6, daddr4 | daddr6,
- *                      sport, dport
- * A new MPTCP connection has been created. It is the good time to allocate
- * memory and send ADD_ADDR if needed. Depending on the traffic-patterns
- * it can take a long time until the MPTCP_EVENT_ESTABLISHED is sent.
- *
- * MPTCP_EVENT_ESTABLISHED: token, family, saddr4 | saddr6, daddr4 | daddr6,
- *			    sport, dport
- * A MPTCP connection is established (can start new subflows).
- *
- * MPTCP_EVENT_CLOSED: token
- * A MPTCP connection has stopped.
- *
- * MPTCP_EVENT_ANNOUNCED: token, rem_id, family, daddr4 | daddr6 [, dport]
- * A new address has been announced by the peer.
- *
- * MPTCP_EVENT_REMOVED: token, rem_id
- * An address has been lost by the peer.
- *
- * MPTCP_EVENT_SUB_ESTABLISHED: token, family, loc_id, rem_id,
- *                              saddr4 | saddr6, daddr4 | daddr6, sport,
- *                              dport, backup, if_idx [, error]
- * A new subflow has been established. 'error' should not be set.
- *
- * MPTCP_EVENT_SUB_CLOSED: token, family, loc_id, rem_id, saddr4 | saddr6,
- *                         daddr4 | daddr6, sport, dport, backup, if_idx
- *                         [, error]
- * A subflow has been closed. An error (copy of sk_err) could be set if an
- * error has been detected for this subflow.
- *
- * MPTCP_EVENT_SUB_PRIORITY: token, family, loc_id, rem_id, saddr4 | saddr6,
- *                           daddr4 | daddr6, sport, dport, backup, if_idx
- *                           [, error]
- * The priority of a subflow has changed. 'error' should not be set.
- *
- * MPTCP_EVENT_LISTENER_CREATED: family, sport, saddr4 | saddr6
- * A new PM listener is created.
- *
- * MPTCP_EVENT_LISTENER_CLOSED: family, sport, saddr4 | saddr6
- * A PM listener is closed.
- */
-enum mptcp_event_type {
-	MPTCP_EVENT_UNSPEC = 0,
-	MPTCP_EVENT_CREATED = 1,
-	MPTCP_EVENT_ESTABLISHED = 2,
-	MPTCP_EVENT_CLOSED = 3,
-
-	MPTCP_EVENT_ANNOUNCED = 6,
-	MPTCP_EVENT_REMOVED = 7,
-
-	MPTCP_EVENT_SUB_ESTABLISHED = 10,
-	MPTCP_EVENT_SUB_CLOSED = 11,
-
-	MPTCP_EVENT_SUB_PRIORITY = 13,
-
-	MPTCP_EVENT_LISTENER_CREATED = 15,
-	MPTCP_EVENT_LISTENER_CLOSED = 16,
-};
-
-enum mptcp_event_attr {
-	MPTCP_ATTR_UNSPEC = 0,
-
-	MPTCP_ATTR_TOKEN,	/* u32 */
-	MPTCP_ATTR_FAMILY,	/* u16 */
-	MPTCP_ATTR_LOC_ID,	/* u8 */
-	MPTCP_ATTR_REM_ID,	/* u8 */
-	MPTCP_ATTR_SADDR4,	/* be32 */
-	MPTCP_ATTR_SADDR6,	/* struct in6_addr */
-	MPTCP_ATTR_DADDR4,	/* be32 */
-	MPTCP_ATTR_DADDR6,	/* struct in6_addr */
-	MPTCP_ATTR_SPORT,	/* be16 */
-	MPTCP_ATTR_DPORT,	/* be16 */
-	MPTCP_ATTR_BACKUP,	/* u8 */
-	MPTCP_ATTR_ERROR,	/* u8 */
-	MPTCP_ATTR_FLAGS,	/* u16 */
-	MPTCP_ATTR_TIMEOUT,	/* u32 */
-	MPTCP_ATTR_IF_IDX,	/* s32 */
-	MPTCP_ATTR_RESET_REASON,/* u32 */
-	MPTCP_ATTR_RESET_FLAGS, /* u32 */
-	MPTCP_ATTR_SERVER_SIDE,	/* u8 */
-
-	__MPTCP_ATTR_AFTER_LAST
-};
-
-#define MPTCP_ATTR_MAX (__MPTCP_ATTR_AFTER_LAST - 1)
-
 /* MPTCP Reset reason codes, rfc8684 */
 #define MPTCP_RST_EUNSPEC	0
 #define MPTCP_RST_EMPTCP	1
diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h
new file mode 100644
index 000000000000..0ad598fe940b
--- /dev/null
+++ b/include/uapi/linux/mptcp_pm.h
@@ -0,0 +1,150 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/*	Documentation/netlink/specs/mptcp.yaml */
+/* YNL-GEN uapi header */
+
+#ifndef _UAPI_LINUX_MPTCP_PM_H
+#define _UAPI_LINUX_MPTCP_PM_H
+
+#define MPTCP_PM_NAME	"mptcp_pm"
+#define MPTCP_PM_VER	1
+
+/**
+ * enum mptcp_event_type
+ * @MPTCP_EVENT_UNSPEC: unused event
+ * @MPTCP_EVENT_CREATED: token, family, saddr4 | saddr6, daddr4 | daddr6,
+ *   sport, dport A new MPTCP connection has been created. It is the good time
+ *   to allocate memory and send ADD_ADDR if needed. Depending on the
+ *   traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED
+ *   is sent.
+ * @MPTCP_EVENT_ESTABLISHED: token, family, saddr4 | saddr6, daddr4 | daddr6,
+ *   sport, dport A MPTCP connection is established (can start new subflows).
+ * @MPTCP_EVENT_CLOSED: token A MPTCP connection has stopped.
+ * @MPTCP_EVENT_ANNOUNCED: token, rem_id, family, daddr4 | daddr6 [, dport] A
+ *   new address has been announced by the peer.
+ * @MPTCP_EVENT_REMOVED: token, rem_id An address has been lost by the peer.
+ * @MPTCP_EVENT_SUB_ESTABLISHED: token, family, loc_id, rem_id, saddr4 |
+ *   saddr6, daddr4 | daddr6, sport, dport, backup, if_idx [, error] A new
+ *   subflow has been established. 'error' should not be set.
+ * @MPTCP_EVENT_SUB_CLOSED: token, family, loc_id, rem_id, saddr4 | saddr6,
+ *   daddr4 | daddr6, sport, dport, backup, if_idx [, error] A subflow has been
+ *   closed. An error (copy of sk_err) could be set if an error has been
+ *   detected for this subflow.
+ * @MPTCP_EVENT_SUB_PRIORITY: token, family, loc_id, rem_id, saddr4 | saddr6,
+ *   daddr4 | daddr6, sport, dport, backup, if_idx [, error] The priority of a
+ *   subflow has changed. 'error' should not be set.
+ * @MPTCP_EVENT_LISTENER_CREATED: family, sport, saddr4 | saddr6 A new PM
+ *   listener is created.
+ * @MPTCP_EVENT_LISTENER_CLOSED: family, sport, saddr4 | saddr6 A PM listener
+ *   is closed.
+ */
+enum mptcp_event_type {
+	MPTCP_EVENT_UNSPEC,
+	MPTCP_EVENT_CREATED,
+	MPTCP_EVENT_ESTABLISHED,
+	MPTCP_EVENT_CLOSED,
+	MPTCP_EVENT_ANNOUNCED = 6,
+	MPTCP_EVENT_REMOVED,
+	MPTCP_EVENT_SUB_ESTABLISHED = 10,
+	MPTCP_EVENT_SUB_CLOSED,
+	MPTCP_EVENT_SUB_PRIORITY = 13,
+	MPTCP_EVENT_LISTENER_CREATED = 15,
+	MPTCP_EVENT_LISTENER_CLOSED,
+};
+
+enum {
+	MPTCP_PM_ADDR_ATTR_UNSPEC,
+	MPTCP_PM_ADDR_ATTR_FAMILY,
+	MPTCP_PM_ADDR_ATTR_ID,
+	MPTCP_PM_ADDR_ATTR_ADDR4,
+	MPTCP_PM_ADDR_ATTR_ADDR6,
+	MPTCP_PM_ADDR_ATTR_PORT,
+	MPTCP_PM_ADDR_ATTR_FLAGS,
+	MPTCP_PM_ADDR_ATTR_IF_IDX,
+
+	__MPTCP_PM_ADDR_ATTR_MAX
+};
+#define MPTCP_PM_ADDR_ATTR_MAX (__MPTCP_PM_ADDR_ATTR_MAX - 1)
+
+enum {
+	MPTCP_SUBFLOW_ATTR_UNSPEC,
+	MPTCP_SUBFLOW_ATTR_TOKEN_REM,
+	MPTCP_SUBFLOW_ATTR_TOKEN_LOC,
+	MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ,
+	MPTCP_SUBFLOW_ATTR_MAP_SEQ,
+	MPTCP_SUBFLOW_ATTR_MAP_SFSEQ,
+	MPTCP_SUBFLOW_ATTR_SSN_OFFSET,
+	MPTCP_SUBFLOW_ATTR_MAP_DATALEN,
+	MPTCP_SUBFLOW_ATTR_FLAGS,
+	MPTCP_SUBFLOW_ATTR_ID_REM,
+	MPTCP_SUBFLOW_ATTR_ID_LOC,
+	MPTCP_SUBFLOW_ATTR_PAD,
+
+	__MPTCP_SUBFLOW_ATTR_MAX
+};
+#define MPTCP_SUBFLOW_ATTR_MAX (__MPTCP_SUBFLOW_ATTR_MAX - 1)
+
+enum {
+	MPTCP_PM_ENDPOINT_ADDR = 1,
+
+	__MPTCP_PM_ENDPOINT_MAX
+};
+#define MPTCP_PM_ENDPOINT_MAX (__MPTCP_PM_ENDPOINT_MAX - 1)
+
+enum {
+	MPTCP_PM_ATTR_UNSPEC,
+	MPTCP_PM_ATTR_ADDR,
+	MPTCP_PM_ATTR_RCV_ADD_ADDRS,
+	MPTCP_PM_ATTR_SUBFLOWS,
+	MPTCP_PM_ATTR_TOKEN,
+	MPTCP_PM_ATTR_LOC_ID,
+	MPTCP_PM_ATTR_ADDR_REMOTE,
+
+	__MPTCP_PM_ATTR_MAX
+};
+#define MPTCP_PM_ATTR_MAX (__MPTCP_PM_ATTR_MAX - 1)
+
+enum mptcp_event_attr {
+	MPTCP_ATTR_UNSPEC,
+	MPTCP_ATTR_TOKEN,
+	MPTCP_ATTR_FAMILY,
+	MPTCP_ATTR_LOC_ID,
+	MPTCP_ATTR_REM_ID,
+	MPTCP_ATTR_SADDR4,
+	MPTCP_ATTR_SADDR6,
+	MPTCP_ATTR_DADDR4,
+	MPTCP_ATTR_DADDR6,
+	MPTCP_ATTR_SPORT,
+	MPTCP_ATTR_DPORT,
+	MPTCP_ATTR_BACKUP,
+	MPTCP_ATTR_ERROR,
+	MPTCP_ATTR_FLAGS,
+	MPTCP_ATTR_TIMEOUT,
+	MPTCP_ATTR_IF_IDX,
+	MPTCP_ATTR_RESET_REASON,
+	MPTCP_ATTR_RESET_FLAGS,
+	MPTCP_ATTR_SERVER_SIDE,
+
+	__MPTCP_ATTR_MAX
+};
+#define MPTCP_ATTR_MAX (__MPTCP_ATTR_MAX - 1)
+
+enum {
+	MPTCP_PM_CMD_UNSPEC,
+	MPTCP_PM_CMD_ADD_ADDR,
+	MPTCP_PM_CMD_DEL_ADDR,
+	MPTCP_PM_CMD_GET_ADDR,
+	MPTCP_PM_CMD_FLUSH_ADDRS,
+	MPTCP_PM_CMD_SET_LIMITS,
+	MPTCP_PM_CMD_GET_LIMITS,
+	MPTCP_PM_CMD_SET_FLAGS,
+	MPTCP_PM_CMD_ANNOUNCE,
+	MPTCP_PM_CMD_REMOVE,
+	MPTCP_PM_CMD_SUBFLOW_CREATE,
+	MPTCP_PM_CMD_SUBFLOW_DESTROY,
+
+	__MPTCP_PM_CMD_MAX
+};
+#define MPTCP_PM_CMD_MAX (__MPTCP_PM_CMD_MAX - 1)
+
+#endif /* _UAPI_LINUX_MPTCP_PM_H */
-- 
cgit v1.2.3-70-g09d2


From 6ca80638b90cec66547011ee1ef79e534589989a Mon Sep 17 00:00:00 2001
From: Florian Fainelli <florian.fainelli@broadcom.com>
Date: Mon, 23 Oct 2023 11:17:28 -0700
Subject: net: dsa: Use conduit and user terms

Use more inclusive terms throughout the DSA subsystem by moving away
from "master" which is replaced by "conduit" and "slave" which is
replaced by "user". No functional changes.

Acked-by: Rob Herring <robh@kernel.org>
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://lore.kernel.org/r/20231023181729.1191071-2-florian.fainelli@broadcom.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../bindings/net/dsa/mediatek,mt7530.yaml          |    2 +-
 Documentation/networking/dsa/b53.rst               |   14 +-
 Documentation/networking/dsa/bcm_sf2.rst           |    2 +-
 Documentation/networking/dsa/configuration.rst     |  100 +-
 Documentation/networking/dsa/dsa.rst               |  162 +-
 Documentation/networking/dsa/lan9303.rst           |    2 +-
 Documentation/networking/dsa/sja1105.rst           |    6 +-
 .../boot/dts/marvell/armada-3720-espressobin.dtsi  |    2 +-
 drivers/net/dsa/b53/b53_common.c                   |    4 +-
 drivers/net/dsa/b53/b53_mdio.c                     |    2 +-
 drivers/net/dsa/bcm_sf2.c                          |   41 +-
 drivers/net/dsa/bcm_sf2.h                          |    2 +-
 drivers/net/dsa/bcm_sf2_cfp.c                      |    4 +-
 drivers/net/dsa/lan9303-core.c                     |    4 +-
 drivers/net/dsa/lantiq_gswip.c                     |   34 +-
 drivers/net/dsa/microchip/ksz9477.c                |    6 +-
 drivers/net/dsa/microchip/ksz_common.c             |   20 +-
 drivers/net/dsa/microchip/ksz_ptp.c                |    2 +-
 drivers/net/dsa/mt7530.c                           |   18 +-
 drivers/net/dsa/mv88e6xxx/chip.c                   |    4 +-
 drivers/net/dsa/ocelot/felix.c                     |   68 +-
 drivers/net/dsa/ocelot/felix.h                     |    6 +-
 drivers/net/dsa/qca/qca8k-8xxx.c                   |   50 +-
 drivers/net/dsa/qca/qca8k-common.c                 |    4 +-
 drivers/net/dsa/qca/qca8k-leds.c                   |    6 +-
 drivers/net/dsa/qca/qca8k.h                        |    2 +-
 drivers/net/dsa/realtek/realtek-smi.c              |   28 +-
 drivers/net/dsa/realtek/realtek.h                  |    2 +-
 drivers/net/dsa/realtek/rtl8365mb.c                |    2 +-
 drivers/net/dsa/sja1105/sja1105_main.c             |    4 +-
 drivers/net/dsa/xrs700x/xrs700x.c                  |   12 +-
 drivers/net/ethernet/broadcom/bcmsysport.c         |    2 +-
 drivers/net/ethernet/mediatek/mtk_eth_soc.c        |    2 +-
 drivers/net/ethernet/mediatek/mtk_ppe_offload.c    |    2 +-
 include/linux/dsa/sja1105.h                        |    2 +-
 include/net/dsa.h                                  |   56 +-
 include/net/dsa_stubs.h                            |   22 +-
 net/core/dev_ioctl.c                               |    2 +-
 net/dsa/Makefile                                   |    6 +-
 net/dsa/conduit.c                                  |  475 +++
 net/dsa/conduit.h                                  |   22 +
 net/dsa/dsa.c                                      |  224 +-
 net/dsa/dsa.h                                      |   12 +-
 net/dsa/master.c                                   |  475 ---
 net/dsa/master.h                                   |   22 -
 net/dsa/netlink.c                                  |   14 +-
 net/dsa/port.c                                     |  124 +-
 net/dsa/port.h                                     |    4 +-
 net/dsa/slave.c                                    | 3727 --------------------
 net/dsa/slave.h                                    |   69 -
 net/dsa/switch.c                                   |   20 +-
 net/dsa/switch.h                                   |    8 +-
 net/dsa/tag.c                                      |   10 +-
 net/dsa/tag.h                                      |   26 +-
 net/dsa/tag_8021q.c                                |   22 +-
 net/dsa/tag_8021q.h                                |    2 +-
 net/dsa/tag_ar9331.c                               |    4 +-
 net/dsa/tag_brcm.c                                 |   14 +-
 net/dsa/tag_dsa.c                                  |    6 +-
 net/dsa/tag_gswip.c                                |    4 +-
 net/dsa/tag_hellcreek.c                            |    4 +-
 net/dsa/tag_ksz.c                                  |   12 +-
 net/dsa/tag_lan9303.c                              |    4 +-
 net/dsa/tag_mtk.c                                  |    4 +-
 net/dsa/tag_none.c                                 |    6 +-
 net/dsa/tag_ocelot.c                               |   22 +-
 net/dsa/tag_ocelot_8021q.c                         |   12 +-
 net/dsa/tag_qca.c                                  |    6 +-
 net/dsa/tag_rtl4_a.c                               |    6 +-
 net/dsa/tag_rtl8_4.c                               |    6 +-
 net/dsa/tag_rzn1_a5psw.c                           |    4 +-
 net/dsa/tag_sja1105.c                              |   30 +-
 net/dsa/tag_trailer.c                              |    4 +-
 net/dsa/tag_xrs700x.c                              |    4 +-
 net/dsa/user.c                                     | 3727 ++++++++++++++++++++
 net/dsa/user.h                                     |   69 +
 76 files changed, 4958 insertions(+), 4955 deletions(-)
 create mode 100644 net/dsa/conduit.c
 create mode 100644 net/dsa/conduit.h
 delete mode 100644 net/dsa/master.c
 delete mode 100644 net/dsa/master.h
 delete mode 100644 net/dsa/slave.c
 delete mode 100644 net/dsa/slave.h
 create mode 100644 net/dsa/user.c
 create mode 100644 net/dsa/user.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/net/dsa/mediatek,mt7530.yaml b/Documentation/devicetree/bindings/net/dsa/mediatek,mt7530.yaml
index 5038818e9f2e..1c2444121e60 100644
--- a/Documentation/devicetree/bindings/net/dsa/mediatek,mt7530.yaml
+++ b/Documentation/devicetree/bindings/net/dsa/mediatek,mt7530.yaml
@@ -60,7 +60,7 @@ description: |
 
       Check out example 6.
 
-    - Port 5 can be wired to an external phy. Port 5 becomes a DSA slave.
+    - Port 5 can be wired to an external phy. Port 5 becomes a DSA user port.
 
       For the multi-chip module MT7530, the external phy must be wired TX to TX
       to gmac1 of the SoC for this to work. Ubiquiti EdgeRouter X SFP is wired
diff --git a/Documentation/networking/dsa/b53.rst b/Documentation/networking/dsa/b53.rst
index b41637cdb82b..1cb3ff648f88 100644
--- a/Documentation/networking/dsa/b53.rst
+++ b/Documentation/networking/dsa/b53.rst
@@ -52,7 +52,7 @@ VLAN programming would basically change the CPU port's default PVID and make
 it untagged, undesirable.
 
 In difference to the configuration described in :ref:`dsa-vlan-configuration`
-the default VLAN 1 has to be removed from the slave interface configuration in
+the default VLAN 1 has to be removed from the user interface configuration in
 single port and gateway configuration, while there is no need to add an extra
 VLAN configuration in the bridge showcase.
 
@@ -68,13 +68,13 @@ By default packages are tagged with vid 1:
   ip link add link eth0 name eth0.2 type vlan id 2
   ip link add link eth0 name eth0.3 type vlan id 3
 
-  # The master interface needs to be brought up before the slave ports.
+  # The conduit interface needs to be brought up before the user ports.
   ip link set eth0 up
   ip link set eth0.1 up
   ip link set eth0.2 up
   ip link set eth0.3 up
 
-  # bring up the slave interfaces
+  # bring up the user interfaces
   ip link set wan up
   ip link set lan1 up
   ip link set lan2 up
@@ -113,11 +113,11 @@ bridge
   # tag traffic on CPU port
   ip link add link eth0 name eth0.1 type vlan id 1
 
-  # The master interface needs to be brought up before the slave ports.
+  # The conduit interface needs to be brought up before the user ports.
   ip link set eth0 up
   ip link set eth0.1 up
 
-  # bring up the slave interfaces
+  # bring up the user interfaces
   ip link set wan up
   ip link set lan1 up
   ip link set lan2 up
@@ -149,12 +149,12 @@ gateway
   ip link add link eth0 name eth0.1 type vlan id 1
   ip link add link eth0 name eth0.2 type vlan id 2
 
-  # The master interface needs to be brought up before the slave ports.
+  # The conduit interface needs to be brought up before the user ports.
   ip link set eth0 up
   ip link set eth0.1 up
   ip link set eth0.2 up
 
-  # bring up the slave interfaces
+  # bring up the user interfaces
   ip link set wan up
   ip link set lan1 up
   ip link set lan2 up
diff --git a/Documentation/networking/dsa/bcm_sf2.rst b/Documentation/networking/dsa/bcm_sf2.rst
index dee234039e1e..d2571435696f 100644
--- a/Documentation/networking/dsa/bcm_sf2.rst
+++ b/Documentation/networking/dsa/bcm_sf2.rst
@@ -67,7 +67,7 @@ MDIO indirect accesses
 ----------------------
 
 Due to a limitation in how Broadcom switches have been designed, external
-Broadcom switches connected to a SF2 require the use of the DSA slave MDIO bus
+Broadcom switches connected to a SF2 require the use of the DSA user MDIO bus
 in order to properly configure them. By default, the SF2 pseudo-PHY address, and
 an external switch pseudo-PHY address will both be snooping for incoming MDIO
 transactions, since they are at the same address (30), resulting in some kind of
diff --git a/Documentation/networking/dsa/configuration.rst b/Documentation/networking/dsa/configuration.rst
index d2934c40f0f1..e6c9719874b0 100644
--- a/Documentation/networking/dsa/configuration.rst
+++ b/Documentation/networking/dsa/configuration.rst
@@ -31,38 +31,38 @@ at https://www.kernel.org/pub/linux/utils/net/iproute2/
 
 Through DSA every port of a switch is handled like a normal linux Ethernet
 interface. The CPU port is the switch port connected to an Ethernet MAC chip.
-The corresponding linux Ethernet interface is called the master interface.
-All other corresponding linux interfaces are called slave interfaces.
+The corresponding linux Ethernet interface is called the conduit interface.
+All other corresponding linux interfaces are called user interfaces.
 
-The slave interfaces depend on the master interface being up in order for them
-to send or receive traffic. Prior to kernel v5.12, the state of the master
+The user interfaces depend on the conduit interface being up in order for them
+to send or receive traffic. Prior to kernel v5.12, the state of the conduit
 interface had to be managed explicitly by the user. Starting with kernel v5.12,
 the behavior is as follows:
 
-- when a DSA slave interface is brought up, the master interface is
+- when a DSA user interface is brought up, the conduit interface is
   automatically brought up.
-- when the master interface is brought down, all DSA slave interfaces are
+- when the conduit interface is brought down, all DSA user interfaces are
   automatically brought down.
 
 In this documentation the following Ethernet interfaces are used:
 
 *eth0*
-  the master interface
+  the conduit interface
 
 *eth1*
-  another master interface
+  another conduit interface
 
 *lan1*
-  a slave interface
+  a user interface
 
 *lan2*
-  another slave interface
+  another user interface
 
 *lan3*
-  a third slave interface
+  a third user interface
 
 *wan*
-  A slave interface dedicated for upstream traffic
+  A user interface dedicated for upstream traffic
 
 Further Ethernet interfaces can be configured similar.
 The configured IPs and networks are:
@@ -96,11 +96,11 @@ without using a VLAN based configuration.
     ip addr add 192.0.2.5/30 dev lan2
     ip addr add 192.0.2.9/30 dev lan3
 
-    # For kernels earlier than v5.12, the master interface needs to be
-    # brought up manually before the slave ports.
+    # For kernels earlier than v5.12, the conduit interface needs to be
+    # brought up manually before the user ports.
     ip link set eth0 up
 
-    # bring up the slave interfaces
+    # bring up the user interfaces
     ip link set lan1 up
     ip link set lan2 up
     ip link set lan3 up
@@ -108,11 +108,11 @@ without using a VLAN based configuration.
 *bridge*
   .. code-block:: sh
 
-    # For kernels earlier than v5.12, the master interface needs to be
-    # brought up manually before the slave ports.
+    # For kernels earlier than v5.12, the conduit interface needs to be
+    # brought up manually before the user ports.
     ip link set eth0 up
 
-    # bring up the slave interfaces
+    # bring up the user interfaces
     ip link set lan1 up
     ip link set lan2 up
     ip link set lan3 up
@@ -134,11 +134,11 @@ without using a VLAN based configuration.
 *gateway*
   .. code-block:: sh
 
-    # For kernels earlier than v5.12, the master interface needs to be
-    # brought up manually before the slave ports.
+    # For kernels earlier than v5.12, the conduit interface needs to be
+    # brought up manually before the user ports.
     ip link set eth0 up
 
-    # bring up the slave interfaces
+    # bring up the user interfaces
     ip link set wan up
     ip link set lan1 up
     ip link set lan2 up
@@ -178,14 +178,14 @@ configuration.
     ip link add link eth0 name eth0.2 type vlan id 2
     ip link add link eth0 name eth0.3 type vlan id 3
 
-    # For kernels earlier than v5.12, the master interface needs to be
-    # brought up manually before the slave ports.
+    # For kernels earlier than v5.12, the conduit interface needs to be
+    # brought up manually before the user ports.
     ip link set eth0 up
     ip link set eth0.1 up
     ip link set eth0.2 up
     ip link set eth0.3 up
 
-    # bring up the slave interfaces
+    # bring up the user interfaces
     ip link set lan1 up
     ip link set lan2 up
     ip link set lan3 up
@@ -221,12 +221,12 @@ configuration.
     # tag traffic on CPU port
     ip link add link eth0 name eth0.1 type vlan id 1
 
-    # For kernels earlier than v5.12, the master interface needs to be
-    # brought up manually before the slave ports.
+    # For kernels earlier than v5.12, the conduit interface needs to be
+    # brought up manually before the user ports.
     ip link set eth0 up
     ip link set eth0.1 up
 
-    # bring up the slave interfaces
+    # bring up the user interfaces
     ip link set lan1 up
     ip link set lan2 up
     ip link set lan3 up
@@ -261,13 +261,13 @@ configuration.
     ip link add link eth0 name eth0.1 type vlan id 1
     ip link add link eth0 name eth0.2 type vlan id 2
 
-    # For kernels earlier than v5.12, the master interface needs to be
-    # brought up manually before the slave ports.
+    # For kernels earlier than v5.12, the conduit interface needs to be
+    # brought up manually before the user ports.
     ip link set eth0 up
     ip link set eth0.1 up
     ip link set eth0.2 up
 
-    # bring up the slave interfaces
+    # bring up the user interfaces
     ip link set wan up
     ip link set lan1 up
     ip link set lan2 up
@@ -380,22 +380,22 @@ affinities according to the available CPU ports.
 
 Secondly, it is possible to perform load balancing between CPU ports on a per
 packet basis, rather than statically assigning user ports to CPU ports.
-This can be achieved by placing the DSA masters under a LAG interface (bonding
+This can be achieved by placing the DSA conduits under a LAG interface (bonding
 or team). DSA monitors this operation and creates a mirror of this software LAG
-on the CPU ports facing the physical DSA masters that constitute the LAG slave
+on the CPU ports facing the physical DSA conduits that constitute the LAG slave
 devices.
 
 To make use of multiple CPU ports, the firmware (device tree) description of
-the switch must mark all the links between CPU ports and their DSA masters
+the switch must mark all the links between CPU ports and their DSA conduits
 using the ``ethernet`` reference/phandle. At startup, only a single CPU port
-and DSA master will be used - the numerically first port from the firmware
+and DSA conduit will be used - the numerically first port from the firmware
 description which has an ``ethernet`` property. It is up to the user to
-configure the system for the switch to use other masters.
+configure the system for the switch to use other conduits.
 
 DSA uses the ``rtnl_link_ops`` mechanism (with a "dsa" ``kind``) to allow
-changing the DSA master of a user port. The ``IFLA_DSA_MASTER`` u32 netlink
-attribute contains the ifindex of the master device that handles each slave
-device. The DSA master must be a valid candidate based on firmware node
+changing the DSA conduit of a user port. The ``IFLA_DSA_MASTER`` u32 netlink
+attribute contains the ifindex of the conduit device that handles each user
+device. The DSA conduit must be a valid candidate based on firmware node
 information, or a LAG interface which contains only slaves which are valid
 candidates.
 
@@ -403,7 +403,7 @@ Using iproute2, the following manipulations are possible:
 
   .. code-block:: sh
 
-    # See the DSA master in current use
+    # See the DSA conduit in current use
     ip -d link show dev swp0
         (...)
         dsa master eth0
@@ -414,7 +414,7 @@ Using iproute2, the following manipulations are possible:
     ip link set swp2 type dsa master eth1
     ip link set swp3 type dsa master eth0
 
-    # CPU ports in LAG, using explicit assignment of the DSA master
+    # CPU ports in LAG, using explicit assignment of the DSA conduit
     ip link add bond0 type bond mode balance-xor && ip link set bond0 up
     ip link set eth1 down && ip link set eth1 master bond0
     ip link set swp0 type dsa master bond0
@@ -426,7 +426,7 @@ Using iproute2, the following manipulations are possible:
         (...)
         dsa master bond0
 
-    # CPU ports in LAG, relying on implicit migration of the DSA master
+    # CPU ports in LAG, relying on implicit migration of the DSA conduit
     ip link add bond0 type bond mode balance-xor && ip link set bond0 up
     ip link set eth0 down && ip link set eth0 master bond0
     ip link set eth1 down && ip link set eth1 master bond0
@@ -436,23 +436,23 @@ Using iproute2, the following manipulations are possible:
 
 Notice that in the case of CPU ports under a LAG, the use of the
 ``IFLA_DSA_MASTER`` netlink attribute is not strictly needed, but rather, DSA
-reacts to the ``IFLA_MASTER`` attribute change of its present master (``eth0``)
+reacts to the ``IFLA_MASTER`` attribute change of its present conduit (``eth0``)
 and migrates all user ports to the new upper of ``eth0``, ``bond0``. Similarly,
 when ``bond0`` is destroyed using ``RTM_DELLINK``, DSA migrates the user ports
-that were assigned to this interface to the first physical DSA master which is
+that were assigned to this interface to the first physical DSA conduit which is
 eligible, based on the firmware description (it effectively reverts to the
 startup configuration).
 
 In a setup with more than 2 physical CPU ports, it is therefore possible to mix
-static user to CPU port assignment with LAG between DSA masters. It is not
-possible to statically assign a user port towards a DSA master that has any
-upper interfaces (this includes LAG devices - the master must always be the LAG
+static user to CPU port assignment with LAG between DSA conduits. It is not
+possible to statically assign a user port towards a DSA conduit that has any
+upper interfaces (this includes LAG devices - the conduit must always be the LAG
 in this case).
 
-Live changing of the DSA master (and thus CPU port) affinity of a user port is
+Live changing of the DSA conduit (and thus CPU port) affinity of a user port is
 permitted, in order to allow dynamic redistribution in response to traffic.
 
-Physical DSA masters are allowed to join and leave at any time a LAG interface
-used as a DSA master; however, DSA will reject a LAG interface as a valid
-candidate for being a DSA master unless it has at least one physical DSA master
+Physical DSA conduits are allowed to join and leave at any time a LAG interface
+used as a DSA conduit; however, DSA will reject a LAG interface as a valid
+candidate for being a DSA conduit unless it has at least one physical DSA conduit
 as a slave device.
diff --git a/Documentation/networking/dsa/dsa.rst b/Documentation/networking/dsa/dsa.rst
index a94ddf83348a..7b2e69cd7ef0 100644
--- a/Documentation/networking/dsa/dsa.rst
+++ b/Documentation/networking/dsa/dsa.rst
@@ -25,7 +25,7 @@ presence of a management port connected to an Ethernet controller capable of
 receiving Ethernet frames from the switch. This is a very common setup for all
 kinds of Ethernet switches found in Small Home and Office products: routers,
 gateways, or even top-of-rack switches. This host Ethernet controller will
-be later referred to as "master" and "cpu" in DSA terminology and code.
+be later referred to as "conduit" and "cpu" in DSA terminology and code.
 
 The D in DSA stands for Distributed, because the subsystem has been designed
 with the ability to configure and manage cascaded switches on top of each other
@@ -35,7 +35,7 @@ of multiple switches connected to each other is called a "switch tree".
 
 For each front-panel port, DSA creates specialized network devices which are
 used as controlling and data-flowing endpoints for use by the Linux networking
-stack. These specialized network interfaces are referred to as "slave" network
+stack. These specialized network interfaces are referred to as "user" network
 interfaces in DSA terminology and code.
 
 The ideal case for using DSA is when an Ethernet switch supports a "switch tag"
@@ -56,12 +56,16 @@ Note that DSA does not currently create network interfaces for the "cpu" and
 
 - the "cpu" port is the Ethernet switch facing side of the management
   controller, and as such, would create a duplication of feature, since you
-  would get two interfaces for the same conduit: master netdev, and "cpu" netdev
+  would get two interfaces for the same conduit: conduit netdev, and "cpu" netdev
 
 - the "dsa" port(s) are just conduits between two or more switches, and as such
   cannot really be used as proper network interfaces either, only the
   downstream, or the top-most upstream interface makes sense with that model
 
+NB: for the past 15 years, the DSA subsystem had been making use of the terms
+"master" (rather than "conduit") and "slave" (rather than "user"). These terms
+have been removed from the DSA codebase and phased out of the uAPI.
+
 Switch tagging protocols
 ------------------------
 
@@ -80,14 +84,14 @@ methods of the ``struct dsa_device_ops`` structure, which are detailed below.
 Tagging protocols generally fall in one of three categories:
 
 1. The switch-specific frame header is located before the Ethernet header,
-   shifting to the right (from the perspective of the DSA master's frame
+   shifting to the right (from the perspective of the DSA conduit's frame
    parser) the MAC DA, MAC SA, EtherType and the entire L2 payload.
 2. The switch-specific frame header is located before the EtherType, keeping
-   the MAC DA and MAC SA in place from the DSA master's perspective, but
+   the MAC DA and MAC SA in place from the DSA conduit's perspective, but
    shifting the 'real' EtherType and L2 payload to the right.
 3. The switch-specific frame header is located at the tail of the packet,
    keeping all frame headers in place and not altering the view of the packet
-   that the DSA master's frame parser has.
+   that the DSA conduit's frame parser has.
 
 A tagging protocol may tag all packets with switch tags of the same length, or
 the tag length might vary (for example packets with PTP timestamps might
@@ -95,7 +99,7 @@ require an extended switch tag, or there might be one tag length on TX and a
 different one on RX). Either way, the tagging protocol driver must populate the
 ``struct dsa_device_ops::needed_headroom`` and/or ``struct dsa_device_ops::needed_tailroom``
 with the length in octets of the longest switch frame header/trailer. The DSA
-framework will automatically adjust the MTU of the master interface to
+framework will automatically adjust the MTU of the conduit interface to
 accommodate for this extra size in order for DSA user ports to support the
 standard MTU (L2 payload length) of 1500 octets. The ``needed_headroom`` and
 ``needed_tailroom`` properties are also used to request from the network stack,
@@ -140,18 +144,18 @@ adding or removing the ``ETH_P_EDSA`` EtherType and some padding octets).
 It is possible to construct cascaded setups of DSA switches even if their
 tagging protocols are not compatible with one another. In this case, there are
 no DSA links in this fabric, and each switch constitutes a disjoint DSA switch
-tree. The DSA links are viewed as simply a pair of a DSA master (the out-facing
+tree. The DSA links are viewed as simply a pair of a DSA conduit (the out-facing
 port of the upstream DSA switch) and a CPU port (the in-facing port of the
 downstream DSA switch).
 
 The tagging protocol of the attached DSA switch tree can be viewed through the
-``dsa/tagging`` sysfs attribute of the DSA master::
+``dsa/tagging`` sysfs attribute of the DSA conduit::
 
     cat /sys/class/net/eth0/dsa/tagging
 
 If the hardware and driver are capable, the tagging protocol of the DSA switch
 tree can be changed at runtime. This is done by writing the new tagging
-protocol name to the same sysfs device attribute as above (the DSA master and
+protocol name to the same sysfs device attribute as above (the DSA conduit and
 all attached switch ports must be down while doing this).
 
 It is desirable that all tagging protocols are testable with the ``dsa_loop``
@@ -159,7 +163,7 @@ mockup driver, which can be attached to any network interface. The goal is that
 any network interface should be capable of transmitting the same packet in the
 same way, and the tagger should decode the same received packet in the same way
 regardless of the driver used for the switch control path, and the driver used
-for the DSA master.
+for the DSA conduit.
 
 The transmission of a packet goes through the tagger's ``xmit`` function.
 The passed ``struct sk_buff *skb`` has ``skb->data`` pointing at
@@ -183,44 +187,44 @@ virtual DSA user network interface corresponding to the physical front-facing
 switch port that the packet was received on.
 
 Since tagging protocols in category 1 and 2 break software (and most often also
-hardware) packet dissection on the DSA master, features such as RPS (Receive
-Packet Steering) on the DSA master would be broken. The DSA framework deals
+hardware) packet dissection on the DSA conduit, features such as RPS (Receive
+Packet Steering) on the DSA conduit would be broken. The DSA framework deals
 with this by hooking into the flow dissector and shifting the offset at which
-the IP header is to be found in the tagged frame as seen by the DSA master.
+the IP header is to be found in the tagged frame as seen by the DSA conduit.
 This behavior is automatic based on the ``overhead`` value of the tagging
 protocol. If not all packets are of equal size, the tagger can implement the
 ``flow_dissect`` method of the ``struct dsa_device_ops`` and override this
 default behavior by specifying the correct offset incurred by each individual
 RX packet. Tail taggers do not cause issues to the flow dissector.
 
-Checksum offload should work with category 1 and 2 taggers when the DSA master
+Checksum offload should work with category 1 and 2 taggers when the DSA conduit
 driver declares NETIF_F_HW_CSUM in vlan_features and looks at csum_start and
 csum_offset. For those cases, DSA will shift the checksum start and offset by
-the tag size. If the DSA master driver still uses the legacy NETIF_F_IP_CSUM
+the tag size. If the DSA conduit driver still uses the legacy NETIF_F_IP_CSUM
 or NETIF_F_IPV6_CSUM in vlan_features, the offload might only work if the
 offload hardware already expects that specific tag (perhaps due to matching
-vendors). DSA slaves inherit those flags from the master port, and it is up to
+vendors). DSA user ports inherit those flags from the conduit, and it is up to
 the driver to correctly fall back to software checksum when the IP header is not
 where the hardware expects. If that check is ineffective, the packets might go
 to the network without a proper checksum (the checksum field will have the
 pseudo IP header sum). For category 3, when the offload hardware does not
 already expect the switch tag in use, the checksum must be calculated before any
-tag is inserted (i.e. inside the tagger). Otherwise, the DSA master would
+tag is inserted (i.e. inside the tagger). Otherwise, the DSA conduit would
 include the tail tag in the (software or hardware) checksum calculation. Then,
 when the tag gets stripped by the switch during transmission, it will leave an
 incorrect IP checksum in place.
 
 Due to various reasons (most common being category 1 taggers being associated
-with DSA-unaware masters, mangling what the master perceives as MAC DA), the
-tagging protocol may require the DSA master to operate in promiscuous mode, to
+with DSA-unaware conduits, mangling what the conduit perceives as MAC DA), the
+tagging protocol may require the DSA conduit to operate in promiscuous mode, to
 receive all frames regardless of the value of the MAC DA. This can be done by
-setting the ``promisc_on_master`` property of the ``struct dsa_device_ops``.
-Note that this assumes a DSA-unaware master driver, which is the norm.
+setting the ``promisc_on_conduit`` property of the ``struct dsa_device_ops``.
+Note that this assumes a DSA-unaware conduit driver, which is the norm.
 
-Master network devices
-----------------------
+Conduit network devices
+-----------------------
 
-Master network devices are regular, unmodified Linux network device drivers for
+Conduit network devices are regular, unmodified Linux network device drivers for
 the CPU/management Ethernet interface. Such a driver might occasionally need to
 know whether DSA is enabled (e.g.: to enable/disable specific offload features),
 but the DSA subsystem has been proven to work with industry standard drivers:
@@ -232,14 +236,14 @@ Ethernet switch.
 Networking stack hooks
 ----------------------
 
-When a master netdev is used with DSA, a small hook is placed in the
+When a conduit netdev is used with DSA, a small hook is placed in the
 networking stack is in order to have the DSA subsystem process the Ethernet
 switch specific tagging protocol. DSA accomplishes this by registering a
 specific (and fake) Ethernet type (later becoming ``skb->protocol``) with the
 networking stack, this is also known as a ``ptype`` or ``packet_type``. A typical
 Ethernet Frame receive sequence looks like this:
 
-Master network device (e.g.: e1000e):
+Conduit network device (e.g.: e1000e):
 
 1. Receive interrupt fires:
 
@@ -269,16 +273,16 @@ Master network device (e.g.: e1000e):
 
         - inspect and strip switch tag protocol to determine originating port
         - locate per-port network device
-        - invoke ``eth_type_trans()`` with the DSA slave network device
+        - invoke ``eth_type_trans()`` with the DSA user network device
         - invoked ``netif_receive_skb()``
 
-Past this point, the DSA slave network devices get delivered regular Ethernet
+Past this point, the DSA user network devices get delivered regular Ethernet
 frames that can be processed by the networking stack.
 
-Slave network devices
----------------------
+User network devices
+--------------------
 
-Slave network devices created by DSA are stacked on top of their master network
+User network devices created by DSA are stacked on top of their conduit network
 device, each of these network interfaces will be responsible for being a
 controlling and data-flowing end-point for each front-panel port of the switch.
 These interfaces are specialized in order to:
@@ -289,31 +293,31 @@ These interfaces are specialized in order to:
   Wake-on-LAN, register dumps...
 - manage external/internal PHY: link, auto-negotiation, etc.
 
-These slave network devices have custom net_device_ops and ethtool_ops function
+These user network devices have custom net_device_ops and ethtool_ops function
 pointers which allow DSA to introduce a level of layering between the networking
 stack/ethtool and the switch driver implementation.
 
-Upon frame transmission from these slave network devices, DSA will look up which
+Upon frame transmission from these user network devices, DSA will look up which
 switch tagging protocol is currently registered with these network devices and
 invoke a specific transmit routine which takes care of adding the relevant
 switch tag in the Ethernet frames.
 
-These frames are then queued for transmission using the master network device
+These frames are then queued for transmission using the conduit network device
 ``ndo_start_xmit()`` function. Since they contain the appropriate switch tag, the
 Ethernet switch will be able to process these incoming frames from the
 management interface and deliver them to the physical switch port.
 
 When using multiple CPU ports, it is possible to stack a LAG (bonding/team)
-device between the DSA slave devices and the physical DSA masters. The LAG
-device is thus also a DSA master, but the LAG slave devices continue to be DSA
-masters as well (just with no user port assigned to them; this is needed for
-recovery in case the LAG DSA master disappears). Thus, the data path of the LAG
-DSA master is used asymmetrically. On RX, the ``ETH_P_XDSA`` handler, which
-calls ``dsa_switch_rcv()``, is invoked early (on the physical DSA master;
-LAG slave). Therefore, the RX data path of the LAG DSA master is not used.
-On the other hand, TX takes place linearly: ``dsa_slave_xmit`` calls
-``dsa_enqueue_skb``, which calls ``dev_queue_xmit`` towards the LAG DSA master.
-The latter calls ``dev_queue_xmit`` towards one physical DSA master or the
+device between the DSA user devices and the physical DSA conduits. The LAG
+device is thus also a DSA conduit, but the LAG slave devices continue to be DSA
+conduits as well (just with no user port assigned to them; this is needed for
+recovery in case the LAG DSA conduit disappears). Thus, the data path of the LAG
+DSA conduit is used asymmetrically. On RX, the ``ETH_P_XDSA`` handler, which
+calls ``dsa_switch_rcv()``, is invoked early (on the physical DSA conduit;
+LAG slave). Therefore, the RX data path of the LAG DSA conduit is not used.
+On the other hand, TX takes place linearly: ``dsa_user_xmit`` calls
+``dsa_enqueue_skb``, which calls ``dev_queue_xmit`` towards the LAG DSA conduit.
+The latter calls ``dev_queue_xmit`` towards one physical DSA conduit or the
 other, and in both cases, the packet exits the system through a hardware path
 towards the switch.
 
@@ -352,11 +356,11 @@ perspective::
            || swp0 | | swp1 | | swp2 | | swp3 ||
            ++------+-+------+-+------+-+------++
 
-Slave MDIO bus
---------------
+User MDIO bus
+-------------
 
-In order to be able to read to/from a switch PHY built into it, DSA creates a
-slave MDIO bus which allows a specific switch driver to divert and intercept
+In order to be able to read to/from a switch PHY built into it, DSA creates an
+user MDIO bus which allows a specific switch driver to divert and intercept
 MDIO reads/writes towards specific PHY addresses. In most MDIO-connected
 switches, these functions would utilize direct or indirect PHY addressing mode
 to return standard MII registers from the switch builtin PHYs, allowing the PHY
@@ -364,7 +368,7 @@ library and/or to return link status, link partner pages, auto-negotiation
 results, etc.
 
 For Ethernet switches which have both external and internal MDIO buses, the
-slave MII bus can be utilized to mux/demux MDIO reads and writes towards either
+user MII bus can be utilized to mux/demux MDIO reads and writes towards either
 internal or external MDIO devices this switch might be connected to: internal
 PHYs, external PHYs, or even external switches.
 
@@ -381,10 +385,10 @@ DSA data structures are defined in ``include/net/dsa.h`` as well as
 
 - ``dsa_platform_data``: platform device configuration data which can reference
   a collection of dsa_chip_data structures if multiple switches are cascaded,
-  the master network device this switch tree is attached to needs to be
+  the conduit network device this switch tree is attached to needs to be
   referenced
 
-- ``dsa_switch_tree``: structure assigned to the master network device under
+- ``dsa_switch_tree``: structure assigned to the conduit network device under
   ``dsa_ptr``, this structure references a dsa_platform_data structure as well as
   the tagging protocol supported by the switch tree, and which receive/transmit
   function hooks should be invoked, information about the directly attached
@@ -392,7 +396,7 @@ DSA data structures are defined in ``include/net/dsa.h`` as well as
   referenced to address individual switches in the tree.
 
 - ``dsa_switch``: structure describing a switch device in the tree, referencing
-  a ``dsa_switch_tree`` as a backpointer, slave network devices, master network
+  a ``dsa_switch_tree`` as a backpointer, user network devices, conduit network
   device, and a reference to the backing``dsa_switch_ops``
 
 - ``dsa_switch_ops``: structure referencing function pointers, see below for a
@@ -404,7 +408,7 @@ Design limitations
 Lack of CPU/DSA network devices
 -------------------------------
 
-DSA does not currently create slave network devices for the CPU or DSA ports, as
+DSA does not currently create user network devices for the CPU or DSA ports, as
 described before. This might be an issue in the following cases:
 
 - inability to fetch switch CPU port statistics counters using ethtool, which
@@ -419,7 +423,7 @@ described before. This might be an issue in the following cases:
 Common pitfalls using DSA setups
 --------------------------------
 
-Once a master network device is configured to use DSA (dev->dsa_ptr becomes
+Once a conduit network device is configured to use DSA (dev->dsa_ptr becomes
 non-NULL), and the switch behind it expects a tagging protocol, this network
 interface can only exclusively be used as a conduit interface. Sending packets
 directly through this interface (e.g.: opening a socket using this interface)
@@ -440,7 +444,7 @@ DSA currently leverages the following subsystems:
 MDIO/PHY library
 ----------------
 
-Slave network devices exposed by DSA may or may not be interfacing with PHY
+User network devices exposed by DSA may or may not be interfacing with PHY
 devices (``struct phy_device`` as defined in ``include/linux/phy.h)``, but the DSA
 subsystem deals with all possible combinations:
 
@@ -450,7 +454,7 @@ subsystem deals with all possible combinations:
 - special, non-autonegotiated or non MDIO-managed PHY devices: SFPs, MoCA; a.k.a
   fixed PHYs
 
-The PHY configuration is done by the ``dsa_slave_phy_setup()`` function and the
+The PHY configuration is done by the ``dsa_user_phy_setup()`` function and the
 logic basically looks like this:
 
 - if Device Tree is used, the PHY device is looked up using the standard
@@ -463,7 +467,7 @@ logic basically looks like this:
   and connected transparently using the special fixed MDIO bus driver
 
 - finally, if the PHY is built into the switch, as is very common with
-  standalone switch packages, the PHY is probed using the slave MII bus created
+  standalone switch packages, the PHY is probed using the user MII bus created
   by DSA
 
 
@@ -472,7 +476,7 @@ SWITCHDEV
 
 DSA directly utilizes SWITCHDEV when interfacing with the bridge layer, and
 more specifically with its VLAN filtering portion when configuring VLANs on top
-of per-port slave network devices. As of today, the only SWITCHDEV objects
+of per-port user network devices. As of today, the only SWITCHDEV objects
 supported by DSA are the FDB and VLAN objects.
 
 Devlink
@@ -589,8 +593,8 @@ is torn down when the first switch unregisters.
 It is mandatory for DSA switch drivers to implement the ``shutdown()`` callback
 of their respective bus, and call ``dsa_switch_shutdown()`` from it (a minimal
 version of the full teardown performed by ``dsa_unregister_switch()``).
-The reason is that DSA keeps a reference on the master net device, and if the
-driver for the master device decides to unbind on shutdown, DSA's reference
+The reason is that DSA keeps a reference on the conduit net device, and if the
+driver for the conduit device decides to unbind on shutdown, DSA's reference
 will block that operation from finalizing.
 
 Either ``dsa_switch_shutdown()`` or ``dsa_unregister_switch()`` must be called,
@@ -615,7 +619,7 @@ Switch configuration
   tag formats.
 
 - ``change_tag_protocol``: when the default tagging protocol has compatibility
-  problems with the master or other issues, the driver may support changing it
+  problems with the conduit or other issues, the driver may support changing it
   at runtime, either through a device tree property or through sysfs. In that
   case, further calls to ``get_tag_protocol`` should report the protocol in
   current use.
@@ -643,22 +647,22 @@ Switch configuration
   PHY cannot be found. In this case, probing of the DSA switch continues
   without that particular port.
 
-- ``port_change_master``: method through which the affinity (association used
+- ``port_change_conduit``: method through which the affinity (association used
   for traffic termination purposes) between a user port and a CPU port can be
   changed. By default all user ports from a tree are assigned to the first
   available CPU port that makes sense for them (most of the times this means
   the user ports of a tree are all assigned to the same CPU port, except for H
   topologies as described in commit 2c0b03258b8b). The ``port`` argument
-  represents the index of the user port, and the ``master`` argument represents
-  the new DSA master ``net_device``. The CPU port associated with the new
-  master can be retrieved by looking at ``struct dsa_port *cpu_dp =
-  master->dsa_ptr``. Additionally, the master can also be a LAG device where
-  all the slave devices are physical DSA masters. LAG DSA masters also have a
-  valid ``master->dsa_ptr`` pointer, however this is not unique, but rather a
-  duplicate of the first physical DSA master's (LAG slave) ``dsa_ptr``. In case
-  of a LAG DSA master, a further call to ``port_lag_join`` will be emitted
+  represents the index of the user port, and the ``conduit`` argument represents
+  the new DSA conduit ``net_device``. The CPU port associated with the new
+  conduit can be retrieved by looking at ``struct dsa_port *cpu_dp =
+  conduit->dsa_ptr``. Additionally, the conduit can also be a LAG device where
+  all the slave devices are physical DSA conduits. LAG DSA  also have a
+  valid ``conduit->dsa_ptr`` pointer, however this is not unique, but rather a
+  duplicate of the first physical DSA conduit's (LAG slave) ``dsa_ptr``. In case
+  of a LAG DSA conduit, a further call to ``port_lag_join`` will be emitted
   separately for the physical CPU ports associated with the physical DSA
-  masters, requesting them to create a hardware LAG associated with the LAG
+  conduits, requesting them to create a hardware LAG associated with the LAG
   interface.
 
 PHY devices and link management
@@ -670,16 +674,16 @@ PHY devices and link management
   should return a 32-bit bitmask of "flags" that is private between the switch
   driver and the Ethernet PHY driver in ``drivers/net/phy/\*``.
 
-- ``phy_read``: Function invoked by the DSA slave MDIO bus when attempting to read
+- ``phy_read``: Function invoked by the DSA user MDIO bus when attempting to read
   the switch port MDIO registers. If unavailable, return 0xffff for each read.
   For builtin switch Ethernet PHYs, this function should allow reading the link
   status, auto-negotiation results, link partner pages, etc.
 
-- ``phy_write``: Function invoked by the DSA slave MDIO bus when attempting to write
+- ``phy_write``: Function invoked by the DSA user MDIO bus when attempting to write
   to the switch port MDIO registers. If unavailable return a negative error
   code.
 
-- ``adjust_link``: Function invoked by the PHY library when a slave network device
+- ``adjust_link``: Function invoked by the PHY library when a user network device
   is attached to a PHY device. This function is responsible for appropriately
   configuring the switch port link parameters: speed, duplex, pause based on
   what the ``phy_device`` is providing.
@@ -698,14 +702,14 @@ Ethtool operations
   typically return statistics strings, private flags strings, etc.
 
 - ``get_ethtool_stats``: ethtool function used to query per-port statistics and
-  return their values. DSA overlays slave network devices general statistics:
+  return their values. DSA overlays user network devices general statistics:
   RX/TX counters from the network device, with switch driver specific statistics
   per port
 
 - ``get_sset_count``: ethtool function used to query the number of statistics items
 
 - ``get_wol``: ethtool function used to obtain Wake-on-LAN settings per-port, this
-  function may for certain implementations also query the master network device
+  function may for certain implementations also query the conduit network device
   Wake-on-LAN settings if this interface needs to participate in Wake-on-LAN
 
 - ``set_wol``: ethtool function used to configure Wake-on-LAN settings per-port,
@@ -747,13 +751,13 @@ Power management
   should resume all Ethernet switch activities and re-configure the switch to be
   in a fully active state
 
-- ``port_enable``: function invoked by the DSA slave network device ndo_open
+- ``port_enable``: function invoked by the DSA user network device ndo_open
   function when a port is administratively brought up, this function should
   fully enable a given switch port. DSA takes care of marking the port with
   ``BR_STATE_BLOCKING`` if the port is a bridge member, or ``BR_STATE_FORWARDING`` if it
   was not, and propagating these changes down to the hardware
 
-- ``port_disable``: function invoked by the DSA slave network device ndo_close
+- ``port_disable``: function invoked by the DSA user network device ndo_close
   function when a port is administratively brought down, this function should
   fully disable a given switch port. DSA takes care of marking the port with
   ``BR_STATE_DISABLED`` and propagating changes to the hardware if this port is
diff --git a/Documentation/networking/dsa/lan9303.rst b/Documentation/networking/dsa/lan9303.rst
index e3c820db28ad..ab81b4e0139e 100644
--- a/Documentation/networking/dsa/lan9303.rst
+++ b/Documentation/networking/dsa/lan9303.rst
@@ -4,7 +4,7 @@ LAN9303 Ethernet switch driver
 
 The LAN9303 is a three port 10/100 Mbps ethernet switch with integrated phys for
 the two external ethernet ports. The third port is an RMII/MII interface to a
-host master network interface (e.g. fixed link).
+host conduit network interface (e.g. fixed link).
 
 
 Driver details
diff --git a/Documentation/networking/dsa/sja1105.rst b/Documentation/networking/dsa/sja1105.rst
index e0219c1452ab..8ab60eef07d4 100644
--- a/Documentation/networking/dsa/sja1105.rst
+++ b/Documentation/networking/dsa/sja1105.rst
@@ -79,7 +79,7 @@ The hardware tags all traffic internally with a port-based VLAN (pvid), or it
 decodes the VLAN information from the 802.1Q tag. Advanced VLAN classification
 is not possible. Once attributed a VLAN tag, frames are checked against the
 port's membership rules and dropped at ingress if they don't match any VLAN.
-This behavior is available when switch ports are enslaved to a bridge with
+This behavior is available when switch ports join a bridge with
 ``vlan_filtering 1``.
 
 Normally the hardware is not configurable with respect to VLAN awareness, but
@@ -122,7 +122,7 @@ on egress. Using ``vlan_filtering=1``, the behavior is the other way around:
 offloaded flows can be steered to TX queues based on the VLAN PCP, but the DSA
 net devices are no longer able to do that. To inject frames into a hardware TX
 queue with VLAN awareness active, it is necessary to create a VLAN
-sub-interface on the DSA master port, and send normal (0x8100) VLAN-tagged
+sub-interface on the DSA conduit port, and send normal (0x8100) VLAN-tagged
 towards the switch, with the VLAN PCP bits set appropriately.
 
 Management traffic (having DMAC 01-80-C2-xx-xx-xx or 01-19-1B-xx-xx-xx) is the
@@ -389,7 +389,7 @@ MDIO bus and PHY management
 The SJA1105 does not have an MDIO bus and does not perform in-band AN either.
 Therefore there is no link state notification coming from the switch device.
 A board would need to hook up the PHYs connected to the switch to any other
-MDIO bus available to Linux within the system (e.g. to the DSA master's MDIO
+MDIO bus available to Linux within the system (e.g. to the DSA conduit's MDIO
 bus). Link state management then works by the driver manually keeping in sync
 (over SPI commands) the MAC link speed with the settings negotiated by the PHY.
 
diff --git a/arch/arm64/boot/dts/marvell/armada-3720-espressobin.dtsi b/arch/arm64/boot/dts/marvell/armada-3720-espressobin.dtsi
index 5fc613d24151..49cbdb55b4b3 100644
--- a/arch/arm64/boot/dts/marvell/armada-3720-espressobin.dtsi
+++ b/arch/arm64/boot/dts/marvell/armada-3720-espressobin.dtsi
@@ -13,7 +13,7 @@
 / {
 	aliases {
 		ethernet0 = &eth0;
-		/* for dsa slave device */
+		/* for DSA user port device */
 		ethernet1 = &switch0port1;
 		ethernet2 = &switch0port2;
 		ethernet3 = &switch0port3;
diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 4e27dc913cf7..0d628b35fd5c 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -757,7 +757,7 @@ int b53_configure_vlan(struct dsa_switch *ds)
 
 	/* Create an untagged VLAN entry for the default PVID in case
 	 * CONFIG_VLAN_8021Q is disabled and there are no calls to
-	 * dsa_slave_vlan_rx_add_vid() to create the default VLAN
+	 * dsa_user_vlan_rx_add_vid() to create the default VLAN
 	 * entry. Do this only when the tagging protocol is not
 	 * DSA_TAG_PROTO_NONE
 	 */
@@ -958,7 +958,7 @@ static struct phy_device *b53_get_phy_device(struct dsa_switch *ds, int port)
 		return NULL;
 	}
 
-	return mdiobus_get_phy(ds->slave_mii_bus, port);
+	return mdiobus_get_phy(ds->user_mii_bus, port);
 }
 
 void b53_get_strings(struct dsa_switch *ds, int port, u32 stringset,
diff --git a/drivers/net/dsa/b53/b53_mdio.c b/drivers/net/dsa/b53/b53_mdio.c
index 4d55d8d18376..897e5e8b3d69 100644
--- a/drivers/net/dsa/b53/b53_mdio.c
+++ b/drivers/net/dsa/b53/b53_mdio.c
@@ -329,7 +329,7 @@ static int b53_mdio_probe(struct mdio_device *mdiodev)
 	 * layer setup
 	 */
 	if (of_machine_is_compatible("brcm,bcm7445d0") &&
-	    strcmp(mdiodev->bus->name, "sf2 slave mii"))
+	    strcmp(mdiodev->bus->name, "sf2 user mii"))
 		return -EPROBE_DEFER;
 
 	dev = b53_switch_alloc(&mdiodev->dev, &b53_mdio_ops, mdiodev->bus);
diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 326937e91f52..cadee5505c29 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -623,19 +623,19 @@ static int bcm_sf2_mdio_register(struct dsa_switch *ds)
 
 	priv->master_mii_dn = dn;
 
-	priv->slave_mii_bus = mdiobus_alloc();
-	if (!priv->slave_mii_bus) {
+	priv->user_mii_bus = mdiobus_alloc();
+	if (!priv->user_mii_bus) {
 		err = -ENOMEM;
 		goto err_put_master_mii_bus_dev;
 	}
 
-	priv->slave_mii_bus->priv = priv;
-	priv->slave_mii_bus->name = "sf2 slave mii";
-	priv->slave_mii_bus->read = bcm_sf2_sw_mdio_read;
-	priv->slave_mii_bus->write = bcm_sf2_sw_mdio_write;
-	snprintf(priv->slave_mii_bus->id, MII_BUS_ID_SIZE, "sf2-%d",
+	priv->user_mii_bus->priv = priv;
+	priv->user_mii_bus->name = "sf2 user mii";
+	priv->user_mii_bus->read = bcm_sf2_sw_mdio_read;
+	priv->user_mii_bus->write = bcm_sf2_sw_mdio_write;
+	snprintf(priv->user_mii_bus->id, MII_BUS_ID_SIZE, "sf2-%d",
 		 index++);
-	priv->slave_mii_bus->dev.of_node = dn;
+	priv->user_mii_bus->dev.of_node = dn;
 
 	/* Include the pseudo-PHY address to divert reads towards our
 	 * workaround. This is only required for 7445D0, since 7445E0
@@ -653,9 +653,9 @@ static int bcm_sf2_mdio_register(struct dsa_switch *ds)
 		priv->indir_phy_mask = 0;
 
 	ds->phys_mii_mask = priv->indir_phy_mask;
-	ds->slave_mii_bus = priv->slave_mii_bus;
-	priv->slave_mii_bus->parent = ds->dev->parent;
-	priv->slave_mii_bus->phy_mask = ~priv->indir_phy_mask;
+	ds->user_mii_bus = priv->user_mii_bus;
+	priv->user_mii_bus->parent = ds->dev->parent;
+	priv->user_mii_bus->phy_mask = ~priv->indir_phy_mask;
 
 	/* We need to make sure that of_phy_connect() will not work by
 	 * removing the 'phandle' and 'linux,phandle' properties and
@@ -682,14 +682,14 @@ static int bcm_sf2_mdio_register(struct dsa_switch *ds)
 			phy_device_remove(phydev);
 	}
 
-	err = mdiobus_register(priv->slave_mii_bus);
+	err = mdiobus_register(priv->user_mii_bus);
 	if (err && dn)
-		goto err_free_slave_mii_bus;
+		goto err_free_user_mii_bus;
 
 	return 0;
 
-err_free_slave_mii_bus:
-	mdiobus_free(priv->slave_mii_bus);
+err_free_user_mii_bus:
+	mdiobus_free(priv->user_mii_bus);
 err_put_master_mii_bus_dev:
 	put_device(&priv->master_mii_bus->dev);
 err_of_node_put:
@@ -699,10 +699,9 @@ err_of_node_put:
 
 static void bcm_sf2_mdio_unregister(struct bcm_sf2_priv *priv)
 {
-	mdiobus_unregister(priv->slave_mii_bus);
-	mdiobus_free(priv->slave_mii_bus);
+	mdiobus_unregister(priv->user_mii_bus);
+	mdiobus_free(priv->user_mii_bus);
 	put_device(&priv->master_mii_bus->dev);
-	of_node_put(priv->master_mii_dn);
 }
 
 static u32 bcm_sf2_sw_get_phy_flags(struct dsa_switch *ds, int port)
@@ -915,7 +914,7 @@ static void bcm_sf2_sw_fixed_state(struct dsa_switch *ds, int port,
 		 * state machine and make it go in PHY_FORCING state instead.
 		 */
 		if (!status->link)
-			netif_carrier_off(dsa_to_port(ds, port)->slave);
+			netif_carrier_off(dsa_to_port(ds, port)->user);
 		status->duplex = DUPLEX_FULL;
 	} else {
 		status->link = true;
@@ -989,7 +988,7 @@ static int bcm_sf2_sw_resume(struct dsa_switch *ds)
 static void bcm_sf2_sw_get_wol(struct dsa_switch *ds, int port,
 			       struct ethtool_wolinfo *wol)
 {
-	struct net_device *p = dsa_port_to_master(dsa_to_port(ds, port));
+	struct net_device *p = dsa_port_to_conduit(dsa_to_port(ds, port));
 	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
 	struct ethtool_wolinfo pwol = { };
 
@@ -1013,7 +1012,7 @@ static void bcm_sf2_sw_get_wol(struct dsa_switch *ds, int port,
 static int bcm_sf2_sw_set_wol(struct dsa_switch *ds, int port,
 			      struct ethtool_wolinfo *wol)
 {
-	struct net_device *p = dsa_port_to_master(dsa_to_port(ds, port));
+	struct net_device *p = dsa_port_to_conduit(dsa_to_port(ds, port));
 	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
 	s8 cpu_port = dsa_to_port(ds, port)->cpu_dp->index;
 	struct ethtool_wolinfo pwol =  { };
diff --git a/drivers/net/dsa/bcm_sf2.h b/drivers/net/dsa/bcm_sf2.h
index 00afc94ce522..424f896b5a6f 100644
--- a/drivers/net/dsa/bcm_sf2.h
+++ b/drivers/net/dsa/bcm_sf2.h
@@ -108,7 +108,7 @@ struct bcm_sf2_priv {
 	/* Master and slave MDIO bus controller */
 	unsigned int			indir_phy_mask;
 	struct device_node		*master_mii_dn;
-	struct mii_bus			*slave_mii_bus;
+	struct mii_bus			*user_mii_bus;
 	struct mii_bus			*master_mii_bus;
 
 	/* Bitmask of ports needing BRCM tags */
diff --git a/drivers/net/dsa/bcm_sf2_cfp.c b/drivers/net/dsa/bcm_sf2_cfp.c
index c4010b7bf089..c88ee3dd4299 100644
--- a/drivers/net/dsa/bcm_sf2_cfp.c
+++ b/drivers/net/dsa/bcm_sf2_cfp.c
@@ -1102,7 +1102,7 @@ static int bcm_sf2_cfp_rule_get_all(struct bcm_sf2_priv *priv,
 int bcm_sf2_get_rxnfc(struct dsa_switch *ds, int port,
 		      struct ethtool_rxnfc *nfc, u32 *rule_locs)
 {
-	struct net_device *p = dsa_port_to_master(dsa_to_port(ds, port));
+	struct net_device *p = dsa_port_to_conduit(dsa_to_port(ds, port));
 	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
 	int ret = 0;
 
@@ -1145,7 +1145,7 @@ int bcm_sf2_get_rxnfc(struct dsa_switch *ds, int port,
 int bcm_sf2_set_rxnfc(struct dsa_switch *ds, int port,
 		      struct ethtool_rxnfc *nfc)
 {
-	struct net_device *p = dsa_port_to_master(dsa_to_port(ds, port));
+	struct net_device *p = dsa_port_to_conduit(dsa_to_port(ds, port));
 	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
 	int ret = 0;
 
diff --git a/drivers/net/dsa/lan9303-core.c b/drivers/net/dsa/lan9303-core.c
index ee67adeb2cdb..fcb20eac332a 100644
--- a/drivers/net/dsa/lan9303-core.c
+++ b/drivers/net/dsa/lan9303-core.c
@@ -1084,7 +1084,7 @@ static int lan9303_port_enable(struct dsa_switch *ds, int port,
 	if (!dsa_port_is_user(dp))
 		return 0;
 
-	vlan_vid_add(dsa_port_to_master(dp), htons(ETH_P_8021Q), port);
+	vlan_vid_add(dsa_port_to_conduit(dp), htons(ETH_P_8021Q), port);
 
 	return lan9303_enable_processing_port(chip, port);
 }
@@ -1097,7 +1097,7 @@ static void lan9303_port_disable(struct dsa_switch *ds, int port)
 	if (!dsa_port_is_user(dp))
 		return;
 
-	vlan_vid_del(dsa_port_to_master(dp), htons(ETH_P_8021Q), port);
+	vlan_vid_del(dsa_port_to_conduit(dp), htons(ETH_P_8021Q), port);
 
 	lan9303_disable_processing_port(chip, port);
 	lan9303_phy_write(ds, chip->phy_addr_base + port, MII_BMCR, BMCR_PDOWN);
diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c
index 1a2d5797bf98..9c185c9f0963 100644
--- a/drivers/net/dsa/lantiq_gswip.c
+++ b/drivers/net/dsa/lantiq_gswip.c
@@ -510,22 +510,22 @@ static int gswip_mdio(struct gswip_priv *priv, struct device_node *mdio_np)
 	struct dsa_switch *ds = priv->ds;
 	int err;
 
-	ds->slave_mii_bus = mdiobus_alloc();
-	if (!ds->slave_mii_bus)
+	ds->user_mii_bus = mdiobus_alloc();
+	if (!ds->user_mii_bus)
 		return -ENOMEM;
 
-	ds->slave_mii_bus->priv = priv;
-	ds->slave_mii_bus->read = gswip_mdio_rd;
-	ds->slave_mii_bus->write = gswip_mdio_wr;
-	ds->slave_mii_bus->name = "lantiq,xrx200-mdio";
-	snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "%s-mii",
+	ds->user_mii_bus->priv = priv;
+	ds->user_mii_bus->read = gswip_mdio_rd;
+	ds->user_mii_bus->write = gswip_mdio_wr;
+	ds->user_mii_bus->name = "lantiq,xrx200-mdio";
+	snprintf(ds->user_mii_bus->id, MII_BUS_ID_SIZE, "%s-mii",
 		 dev_name(priv->dev));
-	ds->slave_mii_bus->parent = priv->dev;
-	ds->slave_mii_bus->phy_mask = ~ds->phys_mii_mask;
+	ds->user_mii_bus->parent = priv->dev;
+	ds->user_mii_bus->phy_mask = ~ds->phys_mii_mask;
 
-	err = of_mdiobus_register(ds->slave_mii_bus, mdio_np);
+	err = of_mdiobus_register(ds->user_mii_bus, mdio_np);
 	if (err)
-		mdiobus_free(ds->slave_mii_bus);
+		mdiobus_free(ds->user_mii_bus);
 
 	return err;
 }
@@ -2196,8 +2196,8 @@ disable_switch:
 	dsa_unregister_switch(priv->ds);
 mdio_bus:
 	if (mdio_np) {
-		mdiobus_unregister(priv->ds->slave_mii_bus);
-		mdiobus_free(priv->ds->slave_mii_bus);
+		mdiobus_unregister(priv->ds->user_mii_bus);
+		mdiobus_free(priv->ds->user_mii_bus);
 	}
 put_mdio_node:
 	of_node_put(mdio_np);
@@ -2219,10 +2219,10 @@ static void gswip_remove(struct platform_device *pdev)
 
 	dsa_unregister_switch(priv->ds);
 
-	if (priv->ds->slave_mii_bus) {
-		mdiobus_unregister(priv->ds->slave_mii_bus);
-		of_node_put(priv->ds->slave_mii_bus->dev.of_node);
-		mdiobus_free(priv->ds->slave_mii_bus);
+	if (priv->ds->user_mii_bus) {
+		mdiobus_unregister(priv->ds->user_mii_bus);
+		of_node_put(priv->ds->user_mii_bus->dev.of_node);
+		mdiobus_free(priv->ds->user_mii_bus);
 	}
 
 	for (i = 0; i < priv->num_gphy_fw; i++)
diff --git a/drivers/net/dsa/microchip/ksz9477.c b/drivers/net/dsa/microchip/ksz9477.c
index cde8ef33d029..a8b0e528b804 100644
--- a/drivers/net/dsa/microchip/ksz9477.c
+++ b/drivers/net/dsa/microchip/ksz9477.c
@@ -1170,7 +1170,7 @@ int ksz9477_tc_cbs_set_cinc(struct ksz_device *dev, int port, u32 val)
 void ksz9477_hsr_join(struct dsa_switch *ds, int port, struct net_device *hsr)
 {
 	struct ksz_device *dev = ds->priv;
-	struct net_device *slave;
+	struct net_device *user;
 	struct dsa_port *hsr_dp;
 	u8 data, hsr_ports = 0;
 
@@ -1202,8 +1202,8 @@ void ksz9477_hsr_join(struct dsa_switch *ds, int port, struct net_device *hsr)
 	ksz_port_cfg(dev, port, REG_PORT_LUE_CTRL, PORT_SRC_ADDR_FILTER, true);
 
 	/* Setup HW supported features for lan HSR ports */
-	slave = dsa_to_port(ds, port)->slave;
-	slave->features |= KSZ9477_SUPPORTED_HSR_FEATURES;
+	user = dsa_to_port(ds, port)->user;
+	user->features |= KSZ9477_SUPPORTED_HSR_FEATURES;
 }
 
 void ksz9477_hsr_leave(struct dsa_switch *ds, int port, struct net_device *hsr)
diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c
index b800ace40ce1..a8025ff988ec 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -1945,14 +1945,14 @@ static int ksz_irq_phy_setup(struct ksz_device *dev)
 				ret = irq;
 				goto out;
 			}
-			ds->slave_mii_bus->irq[phy] = irq;
+			ds->user_mii_bus->irq[phy] = irq;
 		}
 	}
 	return 0;
 out:
 	while (phy--)
 		if (BIT(phy) & ds->phys_mii_mask)
-			irq_dispose_mapping(ds->slave_mii_bus->irq[phy]);
+			irq_dispose_mapping(ds->user_mii_bus->irq[phy]);
 
 	return ret;
 }
@@ -1964,7 +1964,7 @@ static void ksz_irq_phy_free(struct ksz_device *dev)
 
 	for (phy = 0; phy < KSZ_MAX_NUM_PORTS; phy++)
 		if (BIT(phy) & ds->phys_mii_mask)
-			irq_dispose_mapping(ds->slave_mii_bus->irq[phy]);
+			irq_dispose_mapping(ds->user_mii_bus->irq[phy]);
 }
 
 static int ksz_mdio_register(struct ksz_device *dev)
@@ -1987,12 +1987,12 @@ static int ksz_mdio_register(struct ksz_device *dev)
 	bus->priv = dev;
 	bus->read = ksz_sw_mdio_read;
 	bus->write = ksz_sw_mdio_write;
-	bus->name = "ksz slave smi";
+	bus->name = "ksz user smi";
 	snprintf(bus->id, MII_BUS_ID_SIZE, "SMI-%d", ds->index);
 	bus->parent = ds->dev;
 	bus->phy_mask = ~ds->phys_mii_mask;
 
-	ds->slave_mii_bus = bus;
+	ds->user_mii_bus = bus;
 
 	if (dev->irq > 0) {
 		ret = ksz_irq_phy_setup(dev);
@@ -2344,7 +2344,7 @@ static void ksz_mib_read_work(struct work_struct *work)
 		if (!p->read) {
 			const struct dsa_port *dp = dsa_to_port(dev->ds, i);
 
-			if (!netif_carrier_ok(dp->slave))
+			if (!netif_carrier_ok(dp->user))
 				mib->cnt_ptr = dev->info->reg_mib_cnt;
 		}
 		port_r_cnt(dev, i);
@@ -2464,7 +2464,7 @@ static void ksz_get_ethtool_stats(struct dsa_switch *ds, int port,
 	mutex_lock(&mib->cnt_mutex);
 
 	/* Only read dropped counters if no link. */
-	if (!netif_carrier_ok(dp->slave))
+	if (!netif_carrier_ok(dp->user))
 		mib->cnt_ptr = dev->info->reg_mib_cnt;
 	port_r_cnt(dev, port);
 	memcpy(buf, mib->counters, dev->info->mib_cnt * sizeof(u64));
@@ -2574,7 +2574,7 @@ static int ksz_port_setup(struct dsa_switch *ds, int port)
 	if (!dsa_is_user_port(ds, port))
 		return 0;
 
-	/* setup slave port */
+	/* setup user port */
 	dev->dev_ops->port_setup(dev, port, false);
 
 	/* port_stp_state_set() will be called after to enable the port so
@@ -3567,8 +3567,8 @@ static int ksz_port_set_mac_address(struct dsa_switch *ds, int port,
 static int ksz_switch_macaddr_get(struct dsa_switch *ds, int port,
 				  struct netlink_ext_ack *extack)
 {
-	struct net_device *slave = dsa_to_port(ds, port)->slave;
-	const unsigned char *addr = slave->dev_addr;
+	struct net_device *user = dsa_to_port(ds, port)->user;
+	const unsigned char *addr = user->dev_addr;
 	struct ksz_switch_macaddr *switch_macaddr;
 	struct ksz_device *dev = ds->priv;
 	const u16 *regs = dev->info->regs;
diff --git a/drivers/net/dsa/microchip/ksz_ptp.c b/drivers/net/dsa/microchip/ksz_ptp.c
index 4e22a695a64c..1fe105913c75 100644
--- a/drivers/net/dsa/microchip/ksz_ptp.c
+++ b/drivers/net/dsa/microchip/ksz_ptp.c
@@ -557,7 +557,7 @@ static void ksz_ptp_txtstamp_skb(struct ksz_device *dev,
 	struct skb_shared_hwtstamps hwtstamps = {};
 	int ret;
 
-	/* timeout must include DSA master to transmit data, tstamp latency,
+	/* timeout must include DSA conduit to transmit data, tstamp latency,
 	 * IRQ latency and time for reading the time stamp.
 	 */
 	ret = wait_for_completion_timeout(&prt->tstamp_msg_comp,
diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c
index ecf5d3deb36e..d27c6b70a2f6 100644
--- a/drivers/net/dsa/mt7530.c
+++ b/drivers/net/dsa/mt7530.c
@@ -1113,7 +1113,7 @@ mt7530_port_change_mtu(struct dsa_switch *ds, int port, int new_mtu)
 	u32 val;
 
 	/* When a new MTU is set, DSA always set the CPU port's MTU to the
-	 * largest MTU of the slave ports. Because the switch only has a global
+	 * largest MTU of the user ports. Because the switch only has a global
 	 * RX length register, only allowing CPU port here is enough.
 	 */
 	if (!dsa_is_cpu_port(ds, port))
@@ -2069,7 +2069,7 @@ mt7530_setup_mdio_irq(struct mt7530_priv *priv)
 			unsigned int irq;
 
 			irq = irq_create_mapping(priv->irq_domain, p);
-			ds->slave_mii_bus->irq[p] = irq;
+			ds->user_mii_bus->irq[p] = irq;
 		}
 	}
 }
@@ -2163,7 +2163,7 @@ mt7530_setup_mdio(struct mt7530_priv *priv)
 	if (!bus)
 		return -ENOMEM;
 
-	ds->slave_mii_bus = bus;
+	ds->user_mii_bus = bus;
 	bus->priv = priv;
 	bus->name = KBUILD_MODNAME "-mii";
 	snprintf(bus->id, MII_BUS_ID_SIZE, KBUILD_MODNAME "-%d", idx++);
@@ -2200,20 +2200,20 @@ mt7530_setup(struct dsa_switch *ds)
 	u32 id, val;
 	int ret, i;
 
-	/* The parent node of master netdev which holds the common system
+	/* The parent node of conduit netdev which holds the common system
 	 * controller also is the container for two GMACs nodes representing
 	 * as two netdev instances.
 	 */
 	dsa_switch_for_each_cpu_port(cpu_dp, ds) {
-		dn = cpu_dp->master->dev.of_node->parent;
+		dn = cpu_dp->conduit->dev.of_node->parent;
 		/* It doesn't matter which CPU port is found first,
-		 * their masters should share the same parent OF node
+		 * their conduits should share the same parent OF node
 		 */
 		break;
 	}
 
 	if (!dn) {
-		dev_err(ds->dev, "parent OF node of DSA master not found");
+		dev_err(ds->dev, "parent OF node of DSA conduit not found");
 		return -EINVAL;
 	}
 
@@ -2488,7 +2488,7 @@ mt7531_setup(struct dsa_switch *ds)
 	if (mt7531_dual_sgmii_supported(priv)) {
 		priv->p5_intf_sel = P5_INTF_SEL_GMAC5_SGMII;
 
-		/* Let ds->slave_mii_bus be able to access external phy. */
+		/* Let ds->user_mii_bus be able to access external phy. */
 		mt7530_rmw(priv, MT7531_GPIO_MODE1, MT7531_GPIO11_RG_RXD2_MASK,
 			   MT7531_EXT_P_MDC_11);
 		mt7530_rmw(priv, MT7531_GPIO_MODE1, MT7531_GPIO12_RG_RXD3_MASK,
@@ -2717,7 +2717,7 @@ mt7531_mac_config(struct dsa_switch *ds, int port, unsigned int mode,
 	case PHY_INTERFACE_MODE_RGMII_RXID:
 	case PHY_INTERFACE_MODE_RGMII_TXID:
 		dp = dsa_to_port(ds, port);
-		phydev = dp->slave->phydev;
+		phydev = dp->user->phydev;
 		return mt7531_rgmii_setup(priv, port, interface, phydev);
 	case PHY_INTERFACE_MODE_SGMII:
 	case PHY_INTERFACE_MODE_NA:
diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index ab434a77b059..42b1acaca33a 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -2486,7 +2486,7 @@ static int mv88e6xxx_port_vlan_add(struct dsa_switch *ds, int port,
 	else
 		member = MV88E6XXX_G1_VTU_DATA_MEMBER_TAG_TAGGED;
 
-	/* net/dsa/slave.c will call dsa_port_vlan_add() for the affected port
+	/* net/dsa/user.c will call dsa_port_vlan_add() for the affected port
 	 * and then the CPU port. Do not warn for duplicates for the CPU port.
 	 */
 	warn = !dsa_is_cpu_port(ds, port) && !dsa_is_dsa_port(ds, port);
@@ -3719,7 +3719,7 @@ static int mv88e6xxx_setup(struct dsa_switch *ds)
 		return err;
 
 	chip->ds = ds;
-	ds->slave_mii_bus = mv88e6xxx_default_mdio_bus(chip);
+	ds->user_mii_bus = mv88e6xxx_default_mdio_bus(chip);
 
 	/* Since virtual bridges are mapped in the PVT, the number we support
 	 * depends on the physical switch topology. We need to let DSA figure
diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c
index 9a3e5ec16972..61e95487732d 100644
--- a/drivers/net/dsa/ocelot/felix.c
+++ b/drivers/net/dsa/ocelot/felix.c
@@ -42,22 +42,22 @@ static struct net_device *felix_classify_db(struct dsa_db db)
 	}
 }
 
-static int felix_cpu_port_for_master(struct dsa_switch *ds,
-				     struct net_device *master)
+static int felix_cpu_port_for_conduit(struct dsa_switch *ds,
+				      struct net_device *conduit)
 {
 	struct ocelot *ocelot = ds->priv;
 	struct dsa_port *cpu_dp;
 	int lag;
 
-	if (netif_is_lag_master(master)) {
+	if (netif_is_lag_master(conduit)) {
 		mutex_lock(&ocelot->fwd_domain_lock);
-		lag = ocelot_bond_get_id(ocelot, master);
+		lag = ocelot_bond_get_id(ocelot, conduit);
 		mutex_unlock(&ocelot->fwd_domain_lock);
 
 		return lag;
 	}
 
-	cpu_dp = master->dsa_ptr;
+	cpu_dp = conduit->dsa_ptr;
 	return cpu_dp->index;
 }
 
@@ -366,7 +366,7 @@ static int felix_update_trapping_destinations(struct dsa_switch *ds,
  * is the mode through which frames can be injected from and extracted to an
  * external CPU, over Ethernet. In NXP SoCs, the "external CPU" is the ARM CPU
  * running Linux, and this forms a DSA setup together with the enetc or fman
- * DSA master.
+ * DSA conduit.
  */
 static void felix_npi_port_init(struct ocelot *ocelot, int port)
 {
@@ -441,16 +441,16 @@ static unsigned long felix_tag_npi_get_host_fwd_mask(struct dsa_switch *ds)
 	return BIT(ocelot->num_phys_ports);
 }
 
-static int felix_tag_npi_change_master(struct dsa_switch *ds, int port,
-				       struct net_device *master,
-				       struct netlink_ext_ack *extack)
+static int felix_tag_npi_change_conduit(struct dsa_switch *ds, int port,
+					struct net_device *conduit,
+					struct netlink_ext_ack *extack)
 {
 	struct dsa_port *dp = dsa_to_port(ds, port), *other_dp;
 	struct ocelot *ocelot = ds->priv;
 
-	if (netif_is_lag_master(master)) {
+	if (netif_is_lag_master(conduit)) {
 		NL_SET_ERR_MSG_MOD(extack,
-				   "LAG DSA master only supported using ocelot-8021q");
+				   "LAG DSA conduit only supported using ocelot-8021q");
 		return -EOPNOTSUPP;
 	}
 
@@ -459,24 +459,24 @@ static int felix_tag_npi_change_master(struct dsa_switch *ds, int port,
 	 * come back up until they're all changed to the new one.
 	 */
 	dsa_switch_for_each_user_port(other_dp, ds) {
-		struct net_device *slave = other_dp->slave;
+		struct net_device *user = other_dp->user;
 
-		if (other_dp != dp && (slave->flags & IFF_UP) &&
-		    dsa_port_to_master(other_dp) != master) {
+		if (other_dp != dp && (user->flags & IFF_UP) &&
+		    dsa_port_to_conduit(other_dp) != conduit) {
 			NL_SET_ERR_MSG_MOD(extack,
-					   "Cannot change while old master still has users");
+					   "Cannot change while old conduit still has users");
 			return -EOPNOTSUPP;
 		}
 	}
 
 	felix_npi_port_deinit(ocelot, ocelot->npi);
-	felix_npi_port_init(ocelot, felix_cpu_port_for_master(ds, master));
+	felix_npi_port_init(ocelot, felix_cpu_port_for_conduit(ds, conduit));
 
 	return 0;
 }
 
 /* Alternatively to using the NPI functionality, that same hardware MAC
- * connected internally to the enetc or fman DSA master can be configured to
+ * connected internally to the enetc or fman DSA conduit can be configured to
  * use the software-defined tag_8021q frame format. As far as the hardware is
  * concerned, it thinks it is a "dumb switch" - the queues of the CPU port
  * module are now disconnected from it, but can still be accessed through
@@ -486,7 +486,7 @@ static const struct felix_tag_proto_ops felix_tag_npi_proto_ops = {
 	.setup			= felix_tag_npi_setup,
 	.teardown		= felix_tag_npi_teardown,
 	.get_host_fwd_mask	= felix_tag_npi_get_host_fwd_mask,
-	.change_master		= felix_tag_npi_change_master,
+	.change_conduit		= felix_tag_npi_change_conduit,
 };
 
 static int felix_tag_8021q_setup(struct dsa_switch *ds)
@@ -561,11 +561,11 @@ static unsigned long felix_tag_8021q_get_host_fwd_mask(struct dsa_switch *ds)
 	return dsa_cpu_ports(ds);
 }
 
-static int felix_tag_8021q_change_master(struct dsa_switch *ds, int port,
-					 struct net_device *master,
-					 struct netlink_ext_ack *extack)
+static int felix_tag_8021q_change_conduit(struct dsa_switch *ds, int port,
+					  struct net_device *conduit,
+					  struct netlink_ext_ack *extack)
 {
-	int cpu = felix_cpu_port_for_master(ds, master);
+	int cpu = felix_cpu_port_for_conduit(ds, conduit);
 	struct ocelot *ocelot = ds->priv;
 
 	ocelot_port_unassign_dsa_8021q_cpu(ocelot, port);
@@ -578,7 +578,7 @@ static const struct felix_tag_proto_ops felix_tag_8021q_proto_ops = {
 	.setup			= felix_tag_8021q_setup,
 	.teardown		= felix_tag_8021q_teardown,
 	.get_host_fwd_mask	= felix_tag_8021q_get_host_fwd_mask,
-	.change_master		= felix_tag_8021q_change_master,
+	.change_conduit		= felix_tag_8021q_change_conduit,
 };
 
 static void felix_set_host_flood(struct dsa_switch *ds, unsigned long mask,
@@ -741,14 +741,14 @@ static void felix_port_set_host_flood(struct dsa_switch *ds, int port,
 			     !!felix->host_flood_mc_mask, true);
 }
 
-static int felix_port_change_master(struct dsa_switch *ds, int port,
-				    struct net_device *master,
-				    struct netlink_ext_ack *extack)
+static int felix_port_change_conduit(struct dsa_switch *ds, int port,
+				     struct net_device *conduit,
+				     struct netlink_ext_ack *extack)
 {
 	struct ocelot *ocelot = ds->priv;
 	struct felix *felix = ocelot_to_felix(ocelot);
 
-	return felix->tag_proto_ops->change_master(ds, port, master, extack);
+	return felix->tag_proto_ops->change_conduit(ds, port, conduit, extack);
 }
 
 static int felix_set_ageing_time(struct dsa_switch *ds,
@@ -953,7 +953,7 @@ static int felix_lag_join(struct dsa_switch *ds, int port,
 	if (!dsa_is_cpu_port(ds, port))
 		return 0;
 
-	return felix_port_change_master(ds, port, lag.dev, extack);
+	return felix_port_change_conduit(ds, port, lag.dev, extack);
 }
 
 static int felix_lag_leave(struct dsa_switch *ds, int port,
@@ -967,7 +967,7 @@ static int felix_lag_leave(struct dsa_switch *ds, int port,
 	if (!dsa_is_cpu_port(ds, port))
 		return 0;
 
-	return felix_port_change_master(ds, port, lag.dev, NULL);
+	return felix_port_change_conduit(ds, port, lag.dev, NULL);
 }
 
 static int felix_lag_change(struct dsa_switch *ds, int port)
@@ -1116,10 +1116,10 @@ static int felix_port_enable(struct dsa_switch *ds, int port,
 		return 0;
 
 	if (ocelot->npi >= 0) {
-		struct net_device *master = dsa_port_to_master(dp);
+		struct net_device *conduit = dsa_port_to_conduit(dp);
 
-		if (felix_cpu_port_for_master(ds, master) != ocelot->npi) {
-			dev_err(ds->dev, "Multiple masters are not allowed\n");
+		if (felix_cpu_port_for_conduit(ds, conduit) != ocelot->npi) {
+			dev_err(ds->dev, "Multiple conduits are not allowed\n");
 			return -EINVAL;
 		}
 	}
@@ -2164,7 +2164,7 @@ const struct dsa_switch_ops felix_switch_ops = {
 	.port_add_dscp_prio		= felix_port_add_dscp_prio,
 	.port_del_dscp_prio		= felix_port_del_dscp_prio,
 	.port_set_host_flood		= felix_port_set_host_flood,
-	.port_change_master		= felix_port_change_master,
+	.port_change_conduit		= felix_port_change_conduit,
 };
 EXPORT_SYMBOL_GPL(felix_switch_ops);
 
@@ -2176,7 +2176,7 @@ struct net_device *felix_port_to_netdev(struct ocelot *ocelot, int port)
 	if (!dsa_is_user_port(ds, port))
 		return NULL;
 
-	return dsa_to_port(ds, port)->slave;
+	return dsa_to_port(ds, port)->user;
 }
 EXPORT_SYMBOL_GPL(felix_port_to_netdev);
 
diff --git a/drivers/net/dsa/ocelot/felix.h b/drivers/net/dsa/ocelot/felix.h
index 1d4befe7cfe8..dbf5872fe367 100644
--- a/drivers/net/dsa/ocelot/felix.h
+++ b/drivers/net/dsa/ocelot/felix.h
@@ -77,9 +77,9 @@ struct felix_tag_proto_ops {
 	int (*setup)(struct dsa_switch *ds);
 	void (*teardown)(struct dsa_switch *ds);
 	unsigned long (*get_host_fwd_mask)(struct dsa_switch *ds);
-	int (*change_master)(struct dsa_switch *ds, int port,
-			     struct net_device *master,
-			     struct netlink_ext_ack *extack);
+	int (*change_conduit)(struct dsa_switch *ds, int port,
+			      struct net_device *conduit,
+			      struct netlink_ext_ack *extack);
 };
 
 extern const struct dsa_switch_ops felix_switch_ops;
diff --git a/drivers/net/dsa/qca/qca8k-8xxx.c b/drivers/net/dsa/qca/qca8k-8xxx.c
index 4ce68e655a63..ec57d9d52072 100644
--- a/drivers/net/dsa/qca/qca8k-8xxx.c
+++ b/drivers/net/dsa/qca/qca8k-8xxx.c
@@ -323,14 +323,14 @@ static int qca8k_read_eth(struct qca8k_priv *priv, u32 reg, u32 *val, int len)
 
 	mutex_lock(&mgmt_eth_data->mutex);
 
-	/* Check mgmt_master if is operational */
-	if (!priv->mgmt_master) {
+	/* Check if the mgmt_conduit if is operational */
+	if (!priv->mgmt_conduit) {
 		kfree_skb(skb);
 		mutex_unlock(&mgmt_eth_data->mutex);
 		return -EINVAL;
 	}
 
-	skb->dev = priv->mgmt_master;
+	skb->dev = priv->mgmt_conduit;
 
 	reinit_completion(&mgmt_eth_data->rw_done);
 
@@ -375,14 +375,14 @@ static int qca8k_write_eth(struct qca8k_priv *priv, u32 reg, u32 *val, int len)
 
 	mutex_lock(&mgmt_eth_data->mutex);
 
-	/* Check mgmt_master if is operational */
-	if (!priv->mgmt_master) {
+	/* Check if the mgmt_conduit if is operational */
+	if (!priv->mgmt_conduit) {
 		kfree_skb(skb);
 		mutex_unlock(&mgmt_eth_data->mutex);
 		return -EINVAL;
 	}
 
-	skb->dev = priv->mgmt_master;
+	skb->dev = priv->mgmt_conduit;
 
 	reinit_completion(&mgmt_eth_data->rw_done);
 
@@ -508,7 +508,7 @@ qca8k_bulk_read(void *ctx, const void *reg_buf, size_t reg_len,
 	struct qca8k_priv *priv = ctx;
 	u32 reg = *(u16 *)reg_buf;
 
-	if (priv->mgmt_master &&
+	if (priv->mgmt_conduit &&
 	    !qca8k_read_eth(priv, reg, val_buf, val_len))
 		return 0;
 
@@ -531,7 +531,7 @@ qca8k_bulk_gather_write(void *ctx, const void *reg_buf, size_t reg_len,
 	u32 reg = *(u16 *)reg_buf;
 	u32 *val = (u32 *)val_buf;
 
-	if (priv->mgmt_master &&
+	if (priv->mgmt_conduit &&
 	    !qca8k_write_eth(priv, reg, val, val_len))
 		return 0;
 
@@ -626,7 +626,7 @@ qca8k_phy_eth_command(struct qca8k_priv *priv, bool read, int phy,
 	struct sk_buff *write_skb, *clear_skb, *read_skb;
 	struct qca8k_mgmt_eth_data *mgmt_eth_data;
 	u32 write_val, clear_val = 0, val;
-	struct net_device *mgmt_master;
+	struct net_device *mgmt_conduit;
 	int ret, ret1;
 	bool ack;
 
@@ -683,18 +683,18 @@ qca8k_phy_eth_command(struct qca8k_priv *priv, bool read, int phy,
 	 */
 	mutex_lock(&mgmt_eth_data->mutex);
 
-	/* Check if mgmt_master is operational */
-	mgmt_master = priv->mgmt_master;
-	if (!mgmt_master) {
+	/* Check if mgmt_conduit is operational */
+	mgmt_conduit = priv->mgmt_conduit;
+	if (!mgmt_conduit) {
 		mutex_unlock(&mgmt_eth_data->mutex);
 		mutex_unlock(&priv->bus->mdio_lock);
 		ret = -EINVAL;
-		goto err_mgmt_master;
+		goto err_mgmt_conduit;
 	}
 
-	read_skb->dev = mgmt_master;
-	clear_skb->dev = mgmt_master;
-	write_skb->dev = mgmt_master;
+	read_skb->dev = mgmt_conduit;
+	clear_skb->dev = mgmt_conduit;
+	write_skb->dev = mgmt_conduit;
 
 	reinit_completion(&mgmt_eth_data->rw_done);
 
@@ -780,7 +780,7 @@ exit:
 	return ret;
 
 	/* Error handling before lock */
-err_mgmt_master:
+err_mgmt_conduit:
 	kfree_skb(read_skb);
 err_read_skb:
 	kfree_skb(clear_skb);
@@ -959,12 +959,12 @@ qca8k_mdio_register(struct qca8k_priv *priv)
 		 ds->dst->index, ds->index);
 	bus->parent = ds->dev;
 	bus->phy_mask = ~ds->phys_mii_mask;
-	ds->slave_mii_bus = bus;
+	ds->user_mii_bus = bus;
 
 	/* Check if the devicetree declare the port:phy mapping */
 	mdio = of_get_child_by_name(priv->dev->of_node, "mdio");
 	if (of_device_is_available(mdio)) {
-		bus->name = "qca8k slave mii";
+		bus->name = "qca8k user mii";
 		bus->read = qca8k_internal_mdio_read;
 		bus->write = qca8k_internal_mdio_write;
 		return devm_of_mdiobus_register(priv->dev, bus, mdio);
@@ -973,7 +973,7 @@ qca8k_mdio_register(struct qca8k_priv *priv)
 	/* If a mapping can't be found the legacy mapping is used,
 	 * using the qca8k_port_to_phy function
 	 */
-	bus->name = "qca8k-legacy slave mii";
+	bus->name = "qca8k-legacy user mii";
 	bus->read = qca8k_legacy_mdio_read;
 	bus->write = qca8k_legacy_mdio_write;
 	return devm_mdiobus_register(priv->dev, bus);
@@ -1728,10 +1728,10 @@ qca8k_get_tag_protocol(struct dsa_switch *ds, int port,
 }
 
 static void
-qca8k_master_change(struct dsa_switch *ds, const struct net_device *master,
-		    bool operational)
+qca8k_conduit_change(struct dsa_switch *ds, const struct net_device *conduit,
+		     bool operational)
 {
-	struct dsa_port *dp = master->dsa_ptr;
+	struct dsa_port *dp = conduit->dsa_ptr;
 	struct qca8k_priv *priv = ds->priv;
 
 	/* Ethernet MIB/MDIO is only supported for CPU port 0 */
@@ -1741,7 +1741,7 @@ qca8k_master_change(struct dsa_switch *ds, const struct net_device *master,
 	mutex_lock(&priv->mgmt_eth_data.mutex);
 	mutex_lock(&priv->mib_eth_data.mutex);
 
-	priv->mgmt_master = operational ? (struct net_device *)master : NULL;
+	priv->mgmt_conduit = operational ? (struct net_device *)conduit : NULL;
 
 	mutex_unlock(&priv->mib_eth_data.mutex);
 	mutex_unlock(&priv->mgmt_eth_data.mutex);
@@ -2016,7 +2016,7 @@ static const struct dsa_switch_ops qca8k_switch_ops = {
 	.get_phy_flags		= qca8k_get_phy_flags,
 	.port_lag_join		= qca8k_port_lag_join,
 	.port_lag_leave		= qca8k_port_lag_leave,
-	.master_state_change	= qca8k_master_change,
+	.conduit_state_change	= qca8k_conduit_change,
 	.connect_tag_protocol	= qca8k_connect_tag_protocol,
 };
 
diff --git a/drivers/net/dsa/qca/qca8k-common.c b/drivers/net/dsa/qca/qca8k-common.c
index 9ff0a3c1cb91..9243eff8918d 100644
--- a/drivers/net/dsa/qca/qca8k-common.c
+++ b/drivers/net/dsa/qca/qca8k-common.c
@@ -499,7 +499,7 @@ void qca8k_get_ethtool_stats(struct dsa_switch *ds, int port,
 	u32 hi = 0;
 	int ret;
 
-	if (priv->mgmt_master && priv->info->ops->autocast_mib &&
+	if (priv->mgmt_conduit && priv->info->ops->autocast_mib &&
 	    priv->info->ops->autocast_mib(ds, port, data) > 0)
 		return;
 
@@ -761,7 +761,7 @@ int qca8k_port_change_mtu(struct dsa_switch *ds, int port, int new_mtu)
 	int ret;
 
 	/* We have only have a general MTU setting.
-	 * DSA always set the CPU port's MTU to the largest MTU of the slave
+	 * DSA always set the CPU port's MTU to the largest MTU of the user
 	 * ports.
 	 * Setting MTU just for the CPU port is sufficient to correctly set a
 	 * value for every port.
diff --git a/drivers/net/dsa/qca/qca8k-leds.c b/drivers/net/dsa/qca/qca8k-leds.c
index e8c16e76e34b..90e30c2909e4 100644
--- a/drivers/net/dsa/qca/qca8k-leds.c
+++ b/drivers/net/dsa/qca/qca8k-leds.c
@@ -356,8 +356,8 @@ static struct device *qca8k_cled_hw_control_get_device(struct led_classdev *ldev
 	dp = dsa_to_port(priv->ds, qca8k_phy_to_port(led->port_num));
 	if (!dp)
 		return NULL;
-	if (dp->slave)
-		return &dp->slave->dev;
+	if (dp->user)
+		return &dp->user->dev;
 	return NULL;
 }
 
@@ -429,7 +429,7 @@ qca8k_parse_port_leds(struct qca8k_priv *priv, struct fwnode_handle *port, int p
 		init_data.default_label = ":port";
 		init_data.fwnode = led;
 		init_data.devname_mandatory = true;
-		init_data.devicename = kasprintf(GFP_KERNEL, "%s:0%d", ds->slave_mii_bus->id,
+		init_data.devicename = kasprintf(GFP_KERNEL, "%s:0%d", ds->user_mii_bus->id,
 						 port_num);
 		if (!init_data.devicename)
 			return -ENOMEM;
diff --git a/drivers/net/dsa/qca/qca8k.h b/drivers/net/dsa/qca/qca8k.h
index 8f88b7db384d..2ac7e88f8da5 100644
--- a/drivers/net/dsa/qca/qca8k.h
+++ b/drivers/net/dsa/qca/qca8k.h
@@ -458,7 +458,7 @@ struct qca8k_priv {
 	struct mutex reg_mutex;
 	struct device *dev;
 	struct gpio_desc *reset_gpio;
-	struct net_device *mgmt_master; /* Track if mdio/mib Ethernet is available */
+	struct net_device *mgmt_conduit; /* Track if mdio/mib Ethernet is available */
 	struct qca8k_mgmt_eth_data mgmt_eth_data;
 	struct qca8k_mib_eth_data mib_eth_data;
 	struct qca8k_mdio_cache mdio_cache;
diff --git a/drivers/net/dsa/realtek/realtek-smi.c b/drivers/net/dsa/realtek/realtek-smi.c
index bfd11591faf4..755546ed8db6 100644
--- a/drivers/net/dsa/realtek/realtek-smi.c
+++ b/drivers/net/dsa/realtek/realtek-smi.c
@@ -378,25 +378,25 @@ static int realtek_smi_setup_mdio(struct dsa_switch *ds)
 		return -ENODEV;
 	}
 
-	priv->slave_mii_bus = devm_mdiobus_alloc(priv->dev);
-	if (!priv->slave_mii_bus) {
+	priv->user_mii_bus = devm_mdiobus_alloc(priv->dev);
+	if (!priv->user_mii_bus) {
 		ret = -ENOMEM;
 		goto err_put_node;
 	}
-	priv->slave_mii_bus->priv = priv;
-	priv->slave_mii_bus->name = "SMI slave MII";
-	priv->slave_mii_bus->read = realtek_smi_mdio_read;
-	priv->slave_mii_bus->write = realtek_smi_mdio_write;
-	snprintf(priv->slave_mii_bus->id, MII_BUS_ID_SIZE, "SMI-%d",
+	priv->user_mii_bus->priv = priv;
+	priv->user_mii_bus->name = "SMI user MII";
+	priv->user_mii_bus->read = realtek_smi_mdio_read;
+	priv->user_mii_bus->write = realtek_smi_mdio_write;
+	snprintf(priv->user_mii_bus->id, MII_BUS_ID_SIZE, "SMI-%d",
 		 ds->index);
-	priv->slave_mii_bus->dev.of_node = mdio_np;
-	priv->slave_mii_bus->parent = priv->dev;
-	ds->slave_mii_bus = priv->slave_mii_bus;
+	priv->user_mii_bus->dev.of_node = mdio_np;
+	priv->user_mii_bus->parent = priv->dev;
+	ds->user_mii_bus = priv->user_mii_bus;
 
-	ret = devm_of_mdiobus_register(priv->dev, priv->slave_mii_bus, mdio_np);
+	ret = devm_of_mdiobus_register(priv->dev, priv->user_mii_bus, mdio_np);
 	if (ret) {
 		dev_err(priv->dev, "unable to register MDIO bus %s\n",
-			priv->slave_mii_bus->id);
+			priv->user_mii_bus->id);
 		goto err_put_node;
 	}
 
@@ -514,8 +514,8 @@ static void realtek_smi_remove(struct platform_device *pdev)
 		return;
 
 	dsa_unregister_switch(priv->ds);
-	if (priv->slave_mii_bus)
-		of_node_put(priv->slave_mii_bus->dev.of_node);
+	if (priv->user_mii_bus)
+		of_node_put(priv->user_mii_bus->dev.of_node);
 
 	/* leave the device reset asserted */
 	if (priv->reset)
diff --git a/drivers/net/dsa/realtek/realtek.h b/drivers/net/dsa/realtek/realtek.h
index 4fa7c6ba874a..790488e9c667 100644
--- a/drivers/net/dsa/realtek/realtek.h
+++ b/drivers/net/dsa/realtek/realtek.h
@@ -54,7 +54,7 @@ struct realtek_priv {
 	struct regmap		*map;
 	struct regmap		*map_nolock;
 	struct mutex		map_lock;
-	struct mii_bus		*slave_mii_bus;
+	struct mii_bus		*user_mii_bus;
 	struct mii_bus		*bus;
 	int			mdio_addr;
 
diff --git a/drivers/net/dsa/realtek/rtl8365mb.c b/drivers/net/dsa/realtek/rtl8365mb.c
index d171c18dd354..0875e4fc9f57 100644
--- a/drivers/net/dsa/realtek/rtl8365mb.c
+++ b/drivers/net/dsa/realtek/rtl8365mb.c
@@ -1144,7 +1144,7 @@ static int rtl8365mb_port_change_mtu(struct dsa_switch *ds, int port,
 	int frame_size;
 
 	/* When a new MTU is set, DSA always sets the CPU port's MTU to the
-	 * largest MTU of the slave ports. Because the switch only has a global
+	 * largest MTU of the user ports. Because the switch only has a global
 	 * RX length register, only allowing CPU port here is enough.
 	 */
 	if (!dsa_is_cpu_port(ds, port))
diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index 1a367e64bc3b..74cee39d73df 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -2688,7 +2688,7 @@ static int sja1105_mgmt_xmit(struct dsa_switch *ds, int port, int slot,
 	}
 
 	/* Transfer skb to the host port. */
-	dsa_enqueue_skb(skb, dsa_to_port(ds, port)->slave);
+	dsa_enqueue_skb(skb, dsa_to_port(ds, port)->user);
 
 	/* Wait until the switch has processed the frame */
 	do {
@@ -3081,7 +3081,7 @@ static int sja1105_port_bridge_flags(struct dsa_switch *ds, int port,
  * ref_clk pin. So port clocking needs to be initialized early, before
  * connecting to PHYs is attempted, otherwise they won't respond through MDIO.
  * Setting correct PHY link speed does not matter now.
- * But dsa_slave_phy_setup is called later than sja1105_setup, so the PHY
+ * But dsa_user_phy_setup is called later than sja1105_setup, so the PHY
  * bindings are not yet parsed by DSA core. We need to parse early so that we
  * can populate the xMII mode parameters table.
  */
diff --git a/drivers/net/dsa/xrs700x/xrs700x.c b/drivers/net/dsa/xrs700x/xrs700x.c
index 5b02e9e426fd..96db032b478f 100644
--- a/drivers/net/dsa/xrs700x/xrs700x.c
+++ b/drivers/net/dsa/xrs700x/xrs700x.c
@@ -554,7 +554,7 @@ static int xrs700x_hsr_join(struct dsa_switch *ds, int port,
 	unsigned int val = XRS_HSR_CFG_HSR_PRP;
 	struct dsa_port *partner = NULL, *dp;
 	struct xrs700x *priv = ds->priv;
-	struct net_device *slave;
+	struct net_device *user;
 	int ret, i, hsr_pair[2];
 	enum hsr_version ver;
 	bool fwd = false;
@@ -638,8 +638,8 @@ static int xrs700x_hsr_join(struct dsa_switch *ds, int port,
 	hsr_pair[0] = port;
 	hsr_pair[1] = partner->index;
 	for (i = 0; i < ARRAY_SIZE(hsr_pair); i++) {
-		slave = dsa_to_port(ds, hsr_pair[i])->slave;
-		slave->features |= XRS7000X_SUPPORTED_HSR_FEATURES;
+		user = dsa_to_port(ds, hsr_pair[i])->user;
+		user->features |= XRS7000X_SUPPORTED_HSR_FEATURES;
 	}
 
 	return 0;
@@ -650,7 +650,7 @@ static int xrs700x_hsr_leave(struct dsa_switch *ds, int port,
 {
 	struct dsa_port *partner = NULL, *dp;
 	struct xrs700x *priv = ds->priv;
-	struct net_device *slave;
+	struct net_device *user;
 	int i, hsr_pair[2];
 	unsigned int val;
 
@@ -692,8 +692,8 @@ static int xrs700x_hsr_leave(struct dsa_switch *ds, int port,
 	hsr_pair[0] = port;
 	hsr_pair[1] = partner->index;
 	for (i = 0; i < ARRAY_SIZE(hsr_pair); i++) {
-		slave = dsa_to_port(ds, hsr_pair[i])->slave;
-		slave->features &= ~XRS7000X_SUPPORTED_HSR_FEATURES;
+		user = dsa_to_port(ds, hsr_pair[i])->user;
+		user->features &= ~XRS7000X_SUPPORTED_HSR_FEATURES;
 	}
 
 	return 0;
diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c
index ab096795e805..c9faa8540859 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -2430,7 +2430,7 @@ static int bcm_sysport_netdevice_event(struct notifier_block *nb,
 		if (dev->netdev_ops != &bcm_sysport_netdev_ops)
 			return NOTIFY_DONE;
 
-		if (!dsa_slave_dev_check(info->upper_dev))
+		if (!dsa_user_dev_check(info->upper_dev))
 			return NOTIFY_DONE;
 
 		if (info->linking)
diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index 60d49b0f595f..3cf6589cfdac 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -3329,7 +3329,7 @@ static int mtk_device_event(struct notifier_block *n, unsigned long event, void
 	return NOTIFY_DONE;
 
 found:
-	if (!dsa_slave_dev_check(dev))
+	if (!dsa_user_dev_check(dev))
 		return NOTIFY_DONE;
 
 	if (__ethtool_get_link_ksettings(dev, &s))
diff --git a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
index e073d2b5542c..fbb5e9d5af13 100644
--- a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
+++ b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
@@ -175,7 +175,7 @@ mtk_flow_get_dsa_port(struct net_device **dev)
 	if (dp->cpu_dp->tag_ops->proto != DSA_TAG_PROTO_MTK)
 		return -ENODEV;
 
-	*dev = dsa_port_to_master(dp);
+	*dev = dsa_port_to_conduit(dp);
 
 	return dp->index;
 #else
diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index c177322f793d..b9dd35d4b8f5 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -28,7 +28,7 @@
 /* Source and Destination MAC of follow-up meta frames.
  * Whereas the choice of SMAC only affects the unique identification of the
  * switch as sender of meta frames, the DMAC must be an address that is present
- * in the DSA master port's multicast MAC filter.
+ * in the DSA conduit port's multicast MAC filter.
  * 01-80-C2-00-00-0E is a good choice for this, as all profiles of IEEE 1588
  * over L2 use this address for some purpose already.
  */
diff --git a/include/net/dsa.h b/include/net/dsa.h
index d98439ea6146..82135fbdb1e6 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -102,11 +102,11 @@ struct dsa_device_ops {
 	const char *name;
 	enum dsa_tag_protocol proto;
 	/* Some tagging protocols either mangle or shift the destination MAC
-	 * address, in which case the DSA master would drop packets on ingress
+	 * address, in which case the DSA conduit would drop packets on ingress
 	 * if what it understands out of the destination MAC address is not in
 	 * its RX filter.
 	 */
-	bool promisc_on_master;
+	bool promisc_on_conduit;
 };
 
 struct dsa_lag {
@@ -236,12 +236,12 @@ struct dsa_bridge {
 };
 
 struct dsa_port {
-	/* A CPU port is physically connected to a master device.
-	 * A user port exposed to userspace has a slave device.
+	/* A CPU port is physically connected to a conduit device. A user port
+	 * exposes a network device to user-space, called 'user' here.
 	 */
 	union {
-		struct net_device *master;
-		struct net_device *slave;
+		struct net_device *conduit;
+		struct net_device *user;
 	};
 
 	/* Copy of the tagging protocol operations, for quicker access
@@ -249,7 +249,7 @@ struct dsa_port {
 	 */
 	const struct dsa_device_ops *tag_ops;
 
-	/* Copies for faster access in master receive hot path */
+	/* Copies for faster access in conduit receive hot path */
 	struct dsa_switch_tree *dst;
 	struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev);
 
@@ -281,9 +281,9 @@ struct dsa_port {
 
 	u8			lag_tx_enabled:1;
 
-	/* Master state bits, valid only on CPU ports */
-	u8			master_admin_up:1;
-	u8			master_oper_up:1;
+	/* conduit state bits, valid only on CPU ports */
+	u8			conduit_admin_up:1;
+	u8			conduit_oper_up:1;
 
 	/* Valid only on user ports */
 	u8			cpu_port_in_lag:1;
@@ -303,7 +303,7 @@ struct dsa_port {
 	struct list_head list;
 
 	/*
-	 * Original copy of the master netdev ethtool_ops
+	 * Original copy of the conduit netdev ethtool_ops
 	 */
 	const struct ethtool_ops *orig_ethtool_ops;
 
@@ -452,10 +452,10 @@ struct dsa_switch {
 	const struct dsa_switch_ops	*ops;
 
 	/*
-	 * Slave mii_bus and devices for the individual ports.
+	 * User mii_bus and devices for the individual ports.
 	 */
 	u32			phys_mii_mask;
-	struct mii_bus		*slave_mii_bus;
+	struct mii_bus		*user_mii_bus;
 
 	/* Ageing Time limits in msecs */
 	unsigned int ageing_time_min;
@@ -520,10 +520,10 @@ static inline bool dsa_port_is_unused(struct dsa_port *dp)
 	return dp->type == DSA_PORT_TYPE_UNUSED;
 }
 
-static inline bool dsa_port_master_is_operational(struct dsa_port *dp)
+static inline bool dsa_port_conduit_is_operational(struct dsa_port *dp)
 {
-	return dsa_port_is_cpu(dp) && dp->master_admin_up &&
-	       dp->master_oper_up;
+	return dsa_port_is_cpu(dp) && dp->conduit_admin_up &&
+	       dp->conduit_oper_up;
 }
 
 static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p)
@@ -713,12 +713,12 @@ static inline bool dsa_port_offloads_lag(struct dsa_port *dp,
 	return dsa_port_lag_dev_get(dp) == lag->dev;
 }
 
-static inline struct net_device *dsa_port_to_master(const struct dsa_port *dp)
+static inline struct net_device *dsa_port_to_conduit(const struct dsa_port *dp)
 {
 	if (dp->cpu_port_in_lag)
 		return dsa_port_lag_dev_get(dp->cpu_dp);
 
-	return dp->cpu_dp->master;
+	return dp->cpu_dp->conduit;
 }
 
 static inline
@@ -732,7 +732,7 @@ struct net_device *dsa_port_to_bridge_port(const struct dsa_port *dp)
 	else if (dp->hsr_dev)
 		return dp->hsr_dev;
 
-	return dp->slave;
+	return dp->user;
 }
 
 static inline struct net_device *
@@ -834,9 +834,9 @@ struct dsa_switch_ops {
 	int	(*connect_tag_protocol)(struct dsa_switch *ds,
 					enum dsa_tag_protocol proto);
 
-	int	(*port_change_master)(struct dsa_switch *ds, int port,
-				      struct net_device *master,
-				      struct netlink_ext_ack *extack);
+	int	(*port_change_conduit)(struct dsa_switch *ds, int port,
+				       struct net_device *conduit,
+				       struct netlink_ext_ack *extack);
 
 	/* Optional switch-wide initialization and destruction methods */
 	int	(*setup)(struct dsa_switch *ds);
@@ -1233,11 +1233,11 @@ struct dsa_switch_ops {
 	int	(*tag_8021q_vlan_del)(struct dsa_switch *ds, int port, u16 vid);
 
 	/*
-	 * DSA master tracking operations
+	 * DSA conduit tracking operations
 	 */
-	void	(*master_state_change)(struct dsa_switch *ds,
-				       const struct net_device *master,
-				       bool operational);
+	void	(*conduit_state_change)(struct dsa_switch *ds,
+					const struct net_device *conduit,
+					bool operational);
 };
 
 #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes)		\
@@ -1374,9 +1374,9 @@ static inline int dsa_switch_resume(struct dsa_switch *ds)
 #endif /* CONFIG_PM_SLEEP */
 
 #if IS_ENABLED(CONFIG_NET_DSA)
-bool dsa_slave_dev_check(const struct net_device *dev);
+bool dsa_user_dev_check(const struct net_device *dev);
 #else
-static inline bool dsa_slave_dev_check(const struct net_device *dev)
+static inline bool dsa_user_dev_check(const struct net_device *dev)
 {
 	return false;
 }
diff --git a/include/net/dsa_stubs.h b/include/net/dsa_stubs.h
index 361811750a54..6f384897f287 100644
--- a/include/net/dsa_stubs.h
+++ b/include/net/dsa_stubs.h
@@ -13,14 +13,14 @@
 extern const struct dsa_stubs *dsa_stubs;
 
 struct dsa_stubs {
-	int (*master_hwtstamp_validate)(struct net_device *dev,
-					const struct kernel_hwtstamp_config *config,
-					struct netlink_ext_ack *extack);
+	int (*conduit_hwtstamp_validate)(struct net_device *dev,
+					 const struct kernel_hwtstamp_config *config,
+					 struct netlink_ext_ack *extack);
 };
 
-static inline int dsa_master_hwtstamp_validate(struct net_device *dev,
-					       const struct kernel_hwtstamp_config *config,
-					       struct netlink_ext_ack *extack)
+static inline int dsa_conduit_hwtstamp_validate(struct net_device *dev,
+						const struct kernel_hwtstamp_config *config,
+						struct netlink_ext_ack *extack)
 {
 	if (!netdev_uses_dsa(dev))
 		return 0;
@@ -29,18 +29,18 @@ static inline int dsa_master_hwtstamp_validate(struct net_device *dev,
 	 * netdev_uses_dsa() returns true, the dsa_core module is still
 	 * registered, and so, dsa_unregister_stubs() couldn't have run.
 	 * For netdev_uses_dsa() to start returning false, it would imply that
-	 * dsa_master_teardown() has executed, which requires rtnl_lock().
+	 * dsa_conduit_teardown() has executed, which requires rtnl_lock().
 	 */
 	ASSERT_RTNL();
 
-	return dsa_stubs->master_hwtstamp_validate(dev, config, extack);
+	return dsa_stubs->conduit_hwtstamp_validate(dev, config, extack);
 }
 
 #else
 
-static inline int dsa_master_hwtstamp_validate(struct net_device *dev,
-					       const struct kernel_hwtstamp_config *config,
-					       struct netlink_ext_ack *extack)
+static inline int dsa_conduit_hwtstamp_validate(struct net_device *dev,
+						const struct kernel_hwtstamp_config *config,
+						struct netlink_ext_ack *extack)
 {
 	return 0;
 }
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index b46aedc36939..feeddf95f450 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -382,7 +382,7 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
 	if (err)
 		return err;
 
-	err = dsa_master_hwtstamp_validate(dev, &kernel_cfg, &extack);
+	err = dsa_conduit_hwtstamp_validate(dev, &kernel_cfg, &extack);
 	if (err) {
 		if (extack._msg)
 			netdev_err(dev, "%s\n", extack._msg);
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index 12e305824a96..8a1894a42552 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -8,16 +8,16 @@ endif
 # the core
 obj-$(CONFIG_NET_DSA) += dsa_core.o
 dsa_core-y += \
+	conduit.o \
 	devlink.o \
 	dsa.o \
-	master.o \
 	netlink.o \
 	port.o \
-	slave.o \
 	switch.o \
 	tag.o \
 	tag_8021q.o \
-	trace.o
+	trace.o \
+	user.o
 
 # tagging formats
 obj-$(CONFIG_NET_DSA_TAG_AR9331) += tag_ar9331.o
diff --git a/net/dsa/conduit.c b/net/dsa/conduit.c
new file mode 100644
index 000000000000..3dfdb3cb47dc
--- /dev/null
+++ b/net/dsa/conduit.c
@@ -0,0 +1,475 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Handling of a conduit device, switching frames via its switch fabric CPU port
+ *
+ * Copyright (c) 2017 Savoir-faire Linux Inc.
+ *	Vivien Didelot <vivien.didelot@savoirfairelinux.com>
+ */
+
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <net/dsa.h>
+
+#include "conduit.h"
+#include "dsa.h"
+#include "port.h"
+#include "tag.h"
+
+static int dsa_conduit_get_regs_len(struct net_device *dev)
+{
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
+	const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
+	struct dsa_switch *ds = cpu_dp->ds;
+	int port = cpu_dp->index;
+	int ret = 0;
+	int len;
+
+	if (ops->get_regs_len) {
+		len = ops->get_regs_len(dev);
+		if (len < 0)
+			return len;
+		ret += len;
+	}
+
+	ret += sizeof(struct ethtool_drvinfo);
+	ret += sizeof(struct ethtool_regs);
+
+	if (ds->ops->get_regs_len) {
+		len = ds->ops->get_regs_len(ds, port);
+		if (len < 0)
+			return len;
+		ret += len;
+	}
+
+	return ret;
+}
+
+static void dsa_conduit_get_regs(struct net_device *dev,
+				 struct ethtool_regs *regs, void *data)
+{
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
+	const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
+	struct dsa_switch *ds = cpu_dp->ds;
+	struct ethtool_drvinfo *cpu_info;
+	struct ethtool_regs *cpu_regs;
+	int port = cpu_dp->index;
+	int len;
+
+	if (ops->get_regs_len && ops->get_regs) {
+		len = ops->get_regs_len(dev);
+		if (len < 0)
+			return;
+		regs->len = len;
+		ops->get_regs(dev, regs, data);
+		data += regs->len;
+	}
+
+	cpu_info = (struct ethtool_drvinfo *)data;
+	strscpy(cpu_info->driver, "dsa", sizeof(cpu_info->driver));
+	data += sizeof(*cpu_info);
+	cpu_regs = (struct ethtool_regs *)data;
+	data += sizeof(*cpu_regs);
+
+	if (ds->ops->get_regs_len && ds->ops->get_regs) {
+		len = ds->ops->get_regs_len(ds, port);
+		if (len < 0)
+			return;
+		cpu_regs->len = len;
+		ds->ops->get_regs(ds, port, cpu_regs, data);
+	}
+}
+
+static void dsa_conduit_get_ethtool_stats(struct net_device *dev,
+					  struct ethtool_stats *stats,
+					  uint64_t *data)
+{
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
+	const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
+	struct dsa_switch *ds = cpu_dp->ds;
+	int port = cpu_dp->index;
+	int count = 0;
+
+	if (ops->get_sset_count && ops->get_ethtool_stats) {
+		count = ops->get_sset_count(dev, ETH_SS_STATS);
+		ops->get_ethtool_stats(dev, stats, data);
+	}
+
+	if (ds->ops->get_ethtool_stats)
+		ds->ops->get_ethtool_stats(ds, port, data + count);
+}
+
+static void dsa_conduit_get_ethtool_phy_stats(struct net_device *dev,
+					      struct ethtool_stats *stats,
+					      uint64_t *data)
+{
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
+	const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
+	struct dsa_switch *ds = cpu_dp->ds;
+	int port = cpu_dp->index;
+	int count = 0;
+
+	if (dev->phydev && !ops->get_ethtool_phy_stats) {
+		count = phy_ethtool_get_sset_count(dev->phydev);
+		if (count >= 0)
+			phy_ethtool_get_stats(dev->phydev, stats, data);
+	} else if (ops->get_sset_count && ops->get_ethtool_phy_stats) {
+		count = ops->get_sset_count(dev, ETH_SS_PHY_STATS);
+		ops->get_ethtool_phy_stats(dev, stats, data);
+	}
+
+	if (count < 0)
+		count = 0;
+
+	if (ds->ops->get_ethtool_phy_stats)
+		ds->ops->get_ethtool_phy_stats(ds, port, data + count);
+}
+
+static int dsa_conduit_get_sset_count(struct net_device *dev, int sset)
+{
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
+	const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
+	struct dsa_switch *ds = cpu_dp->ds;
+	int count = 0;
+
+	if (sset == ETH_SS_PHY_STATS && dev->phydev &&
+	    !ops->get_ethtool_phy_stats)
+		count = phy_ethtool_get_sset_count(dev->phydev);
+	else if (ops->get_sset_count)
+		count = ops->get_sset_count(dev, sset);
+
+	if (count < 0)
+		count = 0;
+
+	if (ds->ops->get_sset_count)
+		count += ds->ops->get_sset_count(ds, cpu_dp->index, sset);
+
+	return count;
+}
+
+static void dsa_conduit_get_strings(struct net_device *dev, uint32_t stringset,
+				    uint8_t *data)
+{
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
+	const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
+	struct dsa_switch *ds = cpu_dp->ds;
+	int port = cpu_dp->index;
+	int len = ETH_GSTRING_LEN;
+	int mcount = 0, count, i;
+	uint8_t pfx[4];
+	uint8_t *ndata;
+
+	snprintf(pfx, sizeof(pfx), "p%.2d", port);
+	/* We do not want to be NULL-terminated, since this is a prefix */
+	pfx[sizeof(pfx) - 1] = '_';
+
+	if (stringset == ETH_SS_PHY_STATS && dev->phydev &&
+	    !ops->get_ethtool_phy_stats) {
+		mcount = phy_ethtool_get_sset_count(dev->phydev);
+		if (mcount < 0)
+			mcount = 0;
+		else
+			phy_ethtool_get_strings(dev->phydev, data);
+	} else if (ops->get_sset_count && ops->get_strings) {
+		mcount = ops->get_sset_count(dev, stringset);
+		if (mcount < 0)
+			mcount = 0;
+		ops->get_strings(dev, stringset, data);
+	}
+
+	if (ds->ops->get_strings) {
+		ndata = data + mcount * len;
+		/* This function copies ETH_GSTRINGS_LEN bytes, we will mangle
+		 * the output after to prepend our CPU port prefix we
+		 * constructed earlier
+		 */
+		ds->ops->get_strings(ds, port, stringset, ndata);
+		count = ds->ops->get_sset_count(ds, port, stringset);
+		if (count < 0)
+			return;
+		for (i = 0; i < count; i++) {
+			memmove(ndata + (i * len + sizeof(pfx)),
+				ndata + i * len, len - sizeof(pfx));
+			memcpy(ndata + i * len, pfx, sizeof(pfx));
+		}
+	}
+}
+
+/* Deny PTP operations on conduit if there is at least one switch in the tree
+ * that is PTP capable.
+ */
+int __dsa_conduit_hwtstamp_validate(struct net_device *dev,
+				    const struct kernel_hwtstamp_config *config,
+				    struct netlink_ext_ack *extack)
+{
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
+	struct dsa_switch *ds = cpu_dp->ds;
+	struct dsa_switch_tree *dst;
+	struct dsa_port *dp;
+
+	dst = ds->dst;
+
+	list_for_each_entry(dp, &dst->ports, list) {
+		if (dsa_port_supports_hwtstamp(dp)) {
+			NL_SET_ERR_MSG(extack,
+				       "HW timestamping not allowed on DSA conduit when switch supports the operation");
+			return -EBUSY;
+		}
+	}
+
+	return 0;
+}
+
+static int dsa_conduit_ethtool_setup(struct net_device *dev)
+{
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
+	struct dsa_switch *ds = cpu_dp->ds;
+	struct ethtool_ops *ops;
+
+	if (netif_is_lag_master(dev))
+		return 0;
+
+	ops = devm_kzalloc(ds->dev, sizeof(*ops), GFP_KERNEL);
+	if (!ops)
+		return -ENOMEM;
+
+	cpu_dp->orig_ethtool_ops = dev->ethtool_ops;
+	if (cpu_dp->orig_ethtool_ops)
+		memcpy(ops, cpu_dp->orig_ethtool_ops, sizeof(*ops));
+
+	ops->get_regs_len = dsa_conduit_get_regs_len;
+	ops->get_regs = dsa_conduit_get_regs;
+	ops->get_sset_count = dsa_conduit_get_sset_count;
+	ops->get_ethtool_stats = dsa_conduit_get_ethtool_stats;
+	ops->get_strings = dsa_conduit_get_strings;
+	ops->get_ethtool_phy_stats = dsa_conduit_get_ethtool_phy_stats;
+
+	dev->ethtool_ops = ops;
+
+	return 0;
+}
+
+static void dsa_conduit_ethtool_teardown(struct net_device *dev)
+{
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
+
+	if (netif_is_lag_master(dev))
+		return;
+
+	dev->ethtool_ops = cpu_dp->orig_ethtool_ops;
+	cpu_dp->orig_ethtool_ops = NULL;
+}
+
+/* Keep the conduit always promiscuous if the tagging protocol requires that
+ * (garbles MAC DA) or if it doesn't support unicast filtering, case in which
+ * it would revert to promiscuous mode as soon as we call dev_uc_add() on it
+ * anyway.
+ */
+static void dsa_conduit_set_promiscuity(struct net_device *dev, int inc)
+{
+	const struct dsa_device_ops *ops = dev->dsa_ptr->tag_ops;
+
+	if ((dev->priv_flags & IFF_UNICAST_FLT) && !ops->promisc_on_conduit)
+		return;
+
+	ASSERT_RTNL();
+
+	dev_set_promiscuity(dev, inc);
+}
+
+static ssize_t tagging_show(struct device *d, struct device_attribute *attr,
+			    char *buf)
+{
+	struct net_device *dev = to_net_dev(d);
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
+
+	return sysfs_emit(buf, "%s\n",
+		       dsa_tag_protocol_to_str(cpu_dp->tag_ops));
+}
+
+static ssize_t tagging_store(struct device *d, struct device_attribute *attr,
+			     const char *buf, size_t count)
+{
+	const struct dsa_device_ops *new_tag_ops, *old_tag_ops;
+	const char *end = strchrnul(buf, '\n'), *name;
+	struct net_device *dev = to_net_dev(d);
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
+	size_t len = end - buf;
+	int err;
+
+	/* Empty string passed */
+	if (!len)
+		return -ENOPROTOOPT;
+
+	name = kstrndup(buf, len, GFP_KERNEL);
+	if (!name)
+		return -ENOMEM;
+
+	old_tag_ops = cpu_dp->tag_ops;
+	new_tag_ops = dsa_tag_driver_get_by_name(name);
+	kfree(name);
+	/* Bad tagger name? */
+	if (IS_ERR(new_tag_ops))
+		return PTR_ERR(new_tag_ops);
+
+	if (new_tag_ops == old_tag_ops)
+		/* Drop the temporarily held duplicate reference, since
+		 * the DSA switch tree uses this tagger.
+		 */
+		goto out;
+
+	err = dsa_tree_change_tag_proto(cpu_dp->ds->dst, new_tag_ops,
+					old_tag_ops);
+	if (err) {
+		/* On failure the old tagger is restored, so we don't need the
+		 * driver for the new one.
+		 */
+		dsa_tag_driver_put(new_tag_ops);
+		return err;
+	}
+
+	/* On success we no longer need the module for the old tagging protocol
+	 */
+out:
+	dsa_tag_driver_put(old_tag_ops);
+	return count;
+}
+static DEVICE_ATTR_RW(tagging);
+
+static struct attribute *dsa_user_attrs[] = {
+	&dev_attr_tagging.attr,
+	NULL
+};
+
+static const struct attribute_group dsa_group = {
+	.name	= "dsa",
+	.attrs	= dsa_user_attrs,
+};
+
+static void dsa_conduit_reset_mtu(struct net_device *dev)
+{
+	int err;
+
+	err = dev_set_mtu(dev, ETH_DATA_LEN);
+	if (err)
+		netdev_dbg(dev,
+			   "Unable to reset MTU to exclude DSA overheads\n");
+}
+
+int dsa_conduit_setup(struct net_device *dev, struct dsa_port *cpu_dp)
+{
+	const struct dsa_device_ops *tag_ops = cpu_dp->tag_ops;
+	struct dsa_switch *ds = cpu_dp->ds;
+	struct device_link *consumer_link;
+	int mtu, ret;
+
+	mtu = ETH_DATA_LEN + dsa_tag_protocol_overhead(tag_ops);
+
+	/* The DSA conduit must use SET_NETDEV_DEV for this to work. */
+	if (!netif_is_lag_master(dev)) {
+		consumer_link = device_link_add(ds->dev, dev->dev.parent,
+						DL_FLAG_AUTOREMOVE_CONSUMER);
+		if (!consumer_link)
+			netdev_err(dev,
+				   "Failed to create a device link to DSA switch %s\n",
+				   dev_name(ds->dev));
+	}
+
+	/* The switch driver may not implement ->port_change_mtu(), case in
+	 * which dsa_user_change_mtu() will not update the conduit MTU either,
+	 * so we need to do that here.
+	 */
+	ret = dev_set_mtu(dev, mtu);
+	if (ret)
+		netdev_warn(dev, "error %d setting MTU to %d to include DSA overhead\n",
+			    ret, mtu);
+
+	/* If we use a tagging format that doesn't have an ethertype
+	 * field, make sure that all packets from this point on get
+	 * sent to the tag format's receive function.
+	 */
+	wmb();
+
+	dev->dsa_ptr = cpu_dp;
+
+	dsa_conduit_set_promiscuity(dev, 1);
+
+	ret = dsa_conduit_ethtool_setup(dev);
+	if (ret)
+		goto out_err_reset_promisc;
+
+	ret = sysfs_create_group(&dev->dev.kobj, &dsa_group);
+	if (ret)
+		goto out_err_ethtool_teardown;
+
+	return ret;
+
+out_err_ethtool_teardown:
+	dsa_conduit_ethtool_teardown(dev);
+out_err_reset_promisc:
+	dsa_conduit_set_promiscuity(dev, -1);
+	return ret;
+}
+
+void dsa_conduit_teardown(struct net_device *dev)
+{
+	sysfs_remove_group(&dev->dev.kobj, &dsa_group);
+	dsa_conduit_ethtool_teardown(dev);
+	dsa_conduit_reset_mtu(dev);
+	dsa_conduit_set_promiscuity(dev, -1);
+
+	dev->dsa_ptr = NULL;
+
+	/* If we used a tagging format that doesn't have an ethertype
+	 * field, make sure that all packets from this point get sent
+	 * without the tag and go through the regular receive path.
+	 */
+	wmb();
+}
+
+int dsa_conduit_lag_setup(struct net_device *lag_dev, struct dsa_port *cpu_dp,
+			  struct netdev_lag_upper_info *uinfo,
+			  struct netlink_ext_ack *extack)
+{
+	bool conduit_setup = false;
+	int err;
+
+	if (!netdev_uses_dsa(lag_dev)) {
+		err = dsa_conduit_setup(lag_dev, cpu_dp);
+		if (err)
+			return err;
+
+		conduit_setup = true;
+	}
+
+	err = dsa_port_lag_join(cpu_dp, lag_dev, uinfo, extack);
+	if (err) {
+		NL_SET_ERR_MSG_WEAK_MOD(extack, "CPU port failed to join LAG");
+		goto out_conduit_teardown;
+	}
+
+	return 0;
+
+out_conduit_teardown:
+	if (conduit_setup)
+		dsa_conduit_teardown(lag_dev);
+	return err;
+}
+
+/* Tear down a conduit if there isn't any other user port on it,
+ * optionally also destroying LAG information.
+ */
+void dsa_conduit_lag_teardown(struct net_device *lag_dev,
+			      struct dsa_port *cpu_dp)
+{
+	struct net_device *upper;
+	struct list_head *iter;
+
+	dsa_port_lag_leave(cpu_dp, lag_dev);
+
+	netdev_for_each_upper_dev_rcu(lag_dev, upper, iter)
+		if (dsa_user_dev_check(upper))
+			return;
+
+	dsa_conduit_teardown(lag_dev);
+}
diff --git a/net/dsa/conduit.h b/net/dsa/conduit.h
new file mode 100644
index 000000000000..31f8834f54bb
--- /dev/null
+++ b/net/dsa/conduit.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __DSA_CONDUIT_H
+#define __DSA_CONDUIT_H
+
+struct dsa_port;
+struct net_device;
+struct netdev_lag_upper_info;
+struct netlink_ext_ack;
+
+int dsa_conduit_setup(struct net_device *dev, struct dsa_port *cpu_dp);
+void dsa_conduit_teardown(struct net_device *dev);
+int dsa_conduit_lag_setup(struct net_device *lag_dev, struct dsa_port *cpu_dp,
+			  struct netdev_lag_upper_info *uinfo,
+			  struct netlink_ext_ack *extack);
+void dsa_conduit_lag_teardown(struct net_device *lag_dev,
+			      struct dsa_port *cpu_dp);
+int __dsa_conduit_hwtstamp_validate(struct net_device *dev,
+				    const struct kernel_hwtstamp_config *config,
+				    struct netlink_ext_ack *extack);
+
+#endif
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index ccbdb98109f8..ac7be864e80d 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -20,14 +20,14 @@
 #include <net/dsa_stubs.h>
 #include <net/sch_generic.h>
 
+#include "conduit.h"
 #include "devlink.h"
 #include "dsa.h"
-#include "master.h"
 #include "netlink.h"
 #include "port.h"
-#include "slave.h"
 #include "switch.h"
 #include "tag.h"
+#include "user.h"
 
 #define DSA_MAX_NUM_OFFLOADING_BRIDGES		BITS_PER_LONG
 
@@ -365,18 +365,18 @@ static struct dsa_port *dsa_tree_find_first_cpu(struct dsa_switch_tree *dst)
 	return NULL;
 }
 
-struct net_device *dsa_tree_find_first_master(struct dsa_switch_tree *dst)
+struct net_device *dsa_tree_find_first_conduit(struct dsa_switch_tree *dst)
 {
 	struct device_node *ethernet;
-	struct net_device *master;
+	struct net_device *conduit;
 	struct dsa_port *cpu_dp;
 
 	cpu_dp = dsa_tree_find_first_cpu(dst);
 	ethernet = of_parse_phandle(cpu_dp->dn, "ethernet", 0);
-	master = of_find_net_device_by_node(ethernet);
+	conduit = of_find_net_device_by_node(ethernet);
 	of_node_put(ethernet);
 
-	return master;
+	return conduit;
 }
 
 /* Assign the default CPU port (the first one in the tree) to all ports of the
@@ -517,7 +517,7 @@ static int dsa_port_setup(struct dsa_port *dp)
 		break;
 	case DSA_PORT_TYPE_USER:
 		of_get_mac_address(dp->dn, dp->mac);
-		err = dsa_slave_create(dp);
+		err = dsa_user_create(dp);
 		break;
 	}
 
@@ -554,9 +554,9 @@ static void dsa_port_teardown(struct dsa_port *dp)
 			dsa_shared_port_link_unregister_of(dp);
 		break;
 	case DSA_PORT_TYPE_USER:
-		if (dp->slave) {
-			dsa_slave_destroy(dp->slave);
-			dp->slave = NULL;
+		if (dp->user) {
+			dsa_user_destroy(dp->user);
+			dp->user = NULL;
 		}
 		break;
 	}
@@ -632,9 +632,9 @@ static int dsa_switch_setup(struct dsa_switch *ds)
 	if (ds->setup)
 		return 0;
 
-	/* Initialize ds->phys_mii_mask before registering the slave MDIO bus
+	/* Initialize ds->phys_mii_mask before registering the user MDIO bus
 	 * driver and before ops->setup() has run, since the switch drivers and
-	 * the slave MDIO bus driver rely on these values for probing PHY
+	 * the user MDIO bus driver rely on these values for probing PHY
 	 * devices or not
 	 */
 	ds->phys_mii_mask |= dsa_user_ports(ds);
@@ -657,21 +657,21 @@ static int dsa_switch_setup(struct dsa_switch *ds)
 	if (err)
 		goto teardown;
 
-	if (!ds->slave_mii_bus && ds->ops->phy_read) {
-		ds->slave_mii_bus = mdiobus_alloc();
-		if (!ds->slave_mii_bus) {
+	if (!ds->user_mii_bus && ds->ops->phy_read) {
+		ds->user_mii_bus = mdiobus_alloc();
+		if (!ds->user_mii_bus) {
 			err = -ENOMEM;
 			goto teardown;
 		}
 
-		dsa_slave_mii_bus_init(ds);
+		dsa_user_mii_bus_init(ds);
 
 		dn = of_get_child_by_name(ds->dev->of_node, "mdio");
 
-		err = of_mdiobus_register(ds->slave_mii_bus, dn);
+		err = of_mdiobus_register(ds->user_mii_bus, dn);
 		of_node_put(dn);
 		if (err < 0)
-			goto free_slave_mii_bus;
+			goto free_user_mii_bus;
 	}
 
 	dsa_switch_devlink_register(ds);
@@ -679,9 +679,9 @@ static int dsa_switch_setup(struct dsa_switch *ds)
 	ds->setup = true;
 	return 0;
 
-free_slave_mii_bus:
-	if (ds->slave_mii_bus && ds->ops->phy_read)
-		mdiobus_free(ds->slave_mii_bus);
+free_user_mii_bus:
+	if (ds->user_mii_bus && ds->ops->phy_read)
+		mdiobus_free(ds->user_mii_bus);
 teardown:
 	if (ds->ops->teardown)
 		ds->ops->teardown(ds);
@@ -699,10 +699,10 @@ static void dsa_switch_teardown(struct dsa_switch *ds)
 
 	dsa_switch_devlink_unregister(ds);
 
-	if (ds->slave_mii_bus && ds->ops->phy_read) {
-		mdiobus_unregister(ds->slave_mii_bus);
-		mdiobus_free(ds->slave_mii_bus);
-		ds->slave_mii_bus = NULL;
+	if (ds->user_mii_bus && ds->ops->phy_read) {
+		mdiobus_unregister(ds->user_mii_bus);
+		mdiobus_free(ds->user_mii_bus);
+		ds->user_mii_bus = NULL;
 	}
 
 	dsa_switch_teardown_tag_protocol(ds);
@@ -793,7 +793,7 @@ static int dsa_tree_setup_switches(struct dsa_switch_tree *dst)
 	return err;
 }
 
-static int dsa_tree_setup_master(struct dsa_switch_tree *dst)
+static int dsa_tree_setup_conduit(struct dsa_switch_tree *dst)
 {
 	struct dsa_port *cpu_dp;
 	int err = 0;
@@ -801,18 +801,18 @@ static int dsa_tree_setup_master(struct dsa_switch_tree *dst)
 	rtnl_lock();
 
 	dsa_tree_for_each_cpu_port(cpu_dp, dst) {
-		struct net_device *master = cpu_dp->master;
-		bool admin_up = (master->flags & IFF_UP) &&
-				!qdisc_tx_is_noop(master);
+		struct net_device *conduit = cpu_dp->conduit;
+		bool admin_up = (conduit->flags & IFF_UP) &&
+				!qdisc_tx_is_noop(conduit);
 
-		err = dsa_master_setup(master, cpu_dp);
+		err = dsa_conduit_setup(conduit, cpu_dp);
 		if (err)
 			break;
 
-		/* Replay master state event */
-		dsa_tree_master_admin_state_change(dst, master, admin_up);
-		dsa_tree_master_oper_state_change(dst, master,
-						  netif_oper_up(master));
+		/* Replay conduit state event */
+		dsa_tree_conduit_admin_state_change(dst, conduit, admin_up);
+		dsa_tree_conduit_oper_state_change(dst, conduit,
+						   netif_oper_up(conduit));
 	}
 
 	rtnl_unlock();
@@ -820,22 +820,22 @@ static int dsa_tree_setup_master(struct dsa_switch_tree *dst)
 	return err;
 }
 
-static void dsa_tree_teardown_master(struct dsa_switch_tree *dst)
+static void dsa_tree_teardown_conduit(struct dsa_switch_tree *dst)
 {
 	struct dsa_port *cpu_dp;
 
 	rtnl_lock();
 
 	dsa_tree_for_each_cpu_port(cpu_dp, dst) {
-		struct net_device *master = cpu_dp->master;
+		struct net_device *conduit = cpu_dp->conduit;
 
 		/* Synthesizing an "admin down" state is sufficient for
-		 * the switches to get a notification if the master is
+		 * the switches to get a notification if the conduit is
 		 * currently up and running.
 		 */
-		dsa_tree_master_admin_state_change(dst, master, false);
+		dsa_tree_conduit_admin_state_change(dst, conduit, false);
 
-		dsa_master_teardown(master);
+		dsa_conduit_teardown(conduit);
 	}
 
 	rtnl_unlock();
@@ -894,13 +894,13 @@ static int dsa_tree_setup(struct dsa_switch_tree *dst)
 	if (err)
 		goto teardown_switches;
 
-	err = dsa_tree_setup_master(dst);
+	err = dsa_tree_setup_conduit(dst);
 	if (err)
 		goto teardown_ports;
 
 	err = dsa_tree_setup_lags(dst);
 	if (err)
-		goto teardown_master;
+		goto teardown_conduit;
 
 	dst->setup = true;
 
@@ -908,8 +908,8 @@ static int dsa_tree_setup(struct dsa_switch_tree *dst)
 
 	return 0;
 
-teardown_master:
-	dsa_tree_teardown_master(dst);
+teardown_conduit:
+	dsa_tree_teardown_conduit(dst);
 teardown_ports:
 	dsa_tree_teardown_ports(dst);
 teardown_switches:
@@ -929,7 +929,7 @@ static void dsa_tree_teardown(struct dsa_switch_tree *dst)
 
 	dsa_tree_teardown_lags(dst);
 
-	dsa_tree_teardown_master(dst);
+	dsa_tree_teardown_conduit(dst);
 
 	dsa_tree_teardown_ports(dst);
 
@@ -978,7 +978,7 @@ out_disconnect:
 	return err;
 }
 
-/* Since the dsa/tagging sysfs device attribute is per master, the assumption
+/* Since the dsa/tagging sysfs device attribute is per conduit, the assumption
  * is that all DSA switches within a tree share the same tagger, otherwise
  * they would have formed disjoint trees (different "dsa,member" values).
  */
@@ -999,10 +999,10 @@ int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst,
 	 * restriction, there needs to be another mutex which serializes this.
 	 */
 	dsa_tree_for_each_user_port(dp, dst) {
-		if (dsa_port_to_master(dp)->flags & IFF_UP)
+		if (dsa_port_to_conduit(dp)->flags & IFF_UP)
 			goto out_unlock;
 
-		if (dp->slave->flags & IFF_UP)
+		if (dp->user->flags & IFF_UP)
 			goto out_unlock;
 	}
 
@@ -1028,62 +1028,62 @@ out_unlock:
 	return err;
 }
 
-static void dsa_tree_master_state_change(struct dsa_switch_tree *dst,
-					 struct net_device *master)
+static void dsa_tree_conduit_state_change(struct dsa_switch_tree *dst,
+					  struct net_device *conduit)
 {
-	struct dsa_notifier_master_state_info info;
-	struct dsa_port *cpu_dp = master->dsa_ptr;
+	struct dsa_notifier_conduit_state_info info;
+	struct dsa_port *cpu_dp = conduit->dsa_ptr;
 
-	info.master = master;
-	info.operational = dsa_port_master_is_operational(cpu_dp);
+	info.conduit = conduit;
+	info.operational = dsa_port_conduit_is_operational(cpu_dp);
 
-	dsa_tree_notify(dst, DSA_NOTIFIER_MASTER_STATE_CHANGE, &info);
+	dsa_tree_notify(dst, DSA_NOTIFIER_CONDUIT_STATE_CHANGE, &info);
 }
 
-void dsa_tree_master_admin_state_change(struct dsa_switch_tree *dst,
-					struct net_device *master,
-					bool up)
+void dsa_tree_conduit_admin_state_change(struct dsa_switch_tree *dst,
+					 struct net_device *conduit,
+					 bool up)
 {
-	struct dsa_port *cpu_dp = master->dsa_ptr;
+	struct dsa_port *cpu_dp = conduit->dsa_ptr;
 	bool notify = false;
 
-	/* Don't keep track of admin state on LAG DSA masters,
-	 * but rather just of physical DSA masters
+	/* Don't keep track of admin state on LAG DSA conduits,
+	 * but rather just of physical DSA conduits
 	 */
-	if (netif_is_lag_master(master))
+	if (netif_is_lag_master(conduit))
 		return;
 
-	if ((dsa_port_master_is_operational(cpu_dp)) !=
-	    (up && cpu_dp->master_oper_up))
+	if ((dsa_port_conduit_is_operational(cpu_dp)) !=
+	    (up && cpu_dp->conduit_oper_up))
 		notify = true;
 
-	cpu_dp->master_admin_up = up;
+	cpu_dp->conduit_admin_up = up;
 
 	if (notify)
-		dsa_tree_master_state_change(dst, master);
+		dsa_tree_conduit_state_change(dst, conduit);
 }
 
-void dsa_tree_master_oper_state_change(struct dsa_switch_tree *dst,
-				       struct net_device *master,
-				       bool up)
+void dsa_tree_conduit_oper_state_change(struct dsa_switch_tree *dst,
+					struct net_device *conduit,
+					bool up)
 {
-	struct dsa_port *cpu_dp = master->dsa_ptr;
+	struct dsa_port *cpu_dp = conduit->dsa_ptr;
 	bool notify = false;
 
-	/* Don't keep track of oper state on LAG DSA masters,
-	 * but rather just of physical DSA masters
+	/* Don't keep track of oper state on LAG DSA conduits,
+	 * but rather just of physical DSA conduits
 	 */
-	if (netif_is_lag_master(master))
+	if (netif_is_lag_master(conduit))
 		return;
 
-	if ((dsa_port_master_is_operational(cpu_dp)) !=
-	    (cpu_dp->master_admin_up && up))
+	if ((dsa_port_conduit_is_operational(cpu_dp)) !=
+	    (cpu_dp->conduit_admin_up && up))
 		notify = true;
 
-	cpu_dp->master_oper_up = up;
+	cpu_dp->conduit_oper_up = up;
 
 	if (notify)
-		dsa_tree_master_state_change(dst, master);
+		dsa_tree_conduit_state_change(dst, conduit);
 }
 
 static struct dsa_port *dsa_port_touch(struct dsa_switch *ds, int index)
@@ -1129,7 +1129,7 @@ static int dsa_port_parse_dsa(struct dsa_port *dp)
 }
 
 static enum dsa_tag_protocol dsa_get_tag_protocol(struct dsa_port *dp,
-						  struct net_device *master)
+						  struct net_device *conduit)
 {
 	enum dsa_tag_protocol tag_protocol = DSA_TAG_PROTO_NONE;
 	struct dsa_switch *mds, *ds = dp->ds;
@@ -1140,21 +1140,21 @@ static enum dsa_tag_protocol dsa_get_tag_protocol(struct dsa_port *dp,
 	 * happens the switch driver may want to know if its tagging protocol
 	 * is going to work in such a configuration.
 	 */
-	if (dsa_slave_dev_check(master)) {
-		mdp = dsa_slave_to_port(master);
+	if (dsa_user_dev_check(conduit)) {
+		mdp = dsa_user_to_port(conduit);
 		mds = mdp->ds;
 		mdp_upstream = dsa_upstream_port(mds, mdp->index);
 		tag_protocol = mds->ops->get_tag_protocol(mds, mdp_upstream,
 							  DSA_TAG_PROTO_NONE);
 	}
 
-	/* If the master device is not itself a DSA slave in a disjoint DSA
+	/* If the conduit device is not itself a DSA user in a disjoint DSA
 	 * tree, then return immediately.
 	 */
 	return ds->ops->get_tag_protocol(ds, dp->index, tag_protocol);
 }
 
-static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master,
+static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *conduit,
 			      const char *user_protocol)
 {
 	const struct dsa_device_ops *tag_ops = NULL;
@@ -1163,7 +1163,7 @@ static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master,
 	enum dsa_tag_protocol default_proto;
 
 	/* Find out which protocol the switch would prefer. */
-	default_proto = dsa_get_tag_protocol(dp, master);
+	default_proto = dsa_get_tag_protocol(dp, conduit);
 	if (dst->default_proto) {
 		if (dst->default_proto != default_proto) {
 			dev_err(ds->dev,
@@ -1218,7 +1218,7 @@ static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master,
 		dst->tag_ops = tag_ops;
 	}
 
-	dp->master = master;
+	dp->conduit = conduit;
 	dp->type = DSA_PORT_TYPE_CPU;
 	dsa_port_set_tag_protocol(dp, dst->tag_ops);
 	dp->dst = dst;
@@ -1248,16 +1248,16 @@ static int dsa_port_parse_of(struct dsa_port *dp, struct device_node *dn)
 	dp->dn = dn;
 
 	if (ethernet) {
-		struct net_device *master;
+		struct net_device *conduit;
 		const char *user_protocol;
 
-		master = of_find_net_device_by_node(ethernet);
+		conduit = of_find_net_device_by_node(ethernet);
 		of_node_put(ethernet);
-		if (!master)
+		if (!conduit)
 			return -EPROBE_DEFER;
 
 		user_protocol = of_get_property(dn, "dsa-tag-protocol", NULL);
-		return dsa_port_parse_cpu(dp, master, user_protocol);
+		return dsa_port_parse_cpu(dp, conduit, user_protocol);
 	}
 
 	if (link)
@@ -1412,15 +1412,15 @@ static int dsa_port_parse(struct dsa_port *dp, const char *name,
 			  struct device *dev)
 {
 	if (!strcmp(name, "cpu")) {
-		struct net_device *master;
+		struct net_device *conduit;
 
-		master = dsa_dev_to_net_device(dev);
-		if (!master)
+		conduit = dsa_dev_to_net_device(dev);
+		if (!conduit)
 			return -EPROBE_DEFER;
 
-		dev_put(master);
+		dev_put(conduit);
 
-		return dsa_port_parse_cpu(dp, master, NULL);
+		return dsa_port_parse_cpu(dp, conduit, NULL);
 	}
 
 	if (!strcmp(name, "dsa"))
@@ -1566,14 +1566,14 @@ void dsa_unregister_switch(struct dsa_switch *ds)
 }
 EXPORT_SYMBOL_GPL(dsa_unregister_switch);
 
-/* If the DSA master chooses to unregister its net_device on .shutdown, DSA is
+/* If the DSA conduit chooses to unregister its net_device on .shutdown, DSA is
  * blocking that operation from completion, due to the dev_hold taken inside
- * netdev_upper_dev_link. Unlink the DSA slave interfaces from being uppers of
- * the DSA master, so that the system can reboot successfully.
+ * netdev_upper_dev_link. Unlink the DSA user interfaces from being uppers of
+ * the DSA conduit, so that the system can reboot successfully.
  */
 void dsa_switch_shutdown(struct dsa_switch *ds)
 {
-	struct net_device *master, *slave_dev;
+	struct net_device *conduit, *user_dev;
 	struct dsa_port *dp;
 
 	mutex_lock(&dsa2_mutex);
@@ -1584,17 +1584,17 @@ void dsa_switch_shutdown(struct dsa_switch *ds)
 	rtnl_lock();
 
 	dsa_switch_for_each_user_port(dp, ds) {
-		master = dsa_port_to_master(dp);
-		slave_dev = dp->slave;
+		conduit = dsa_port_to_conduit(dp);
+		user_dev = dp->user;
 
-		netdev_upper_dev_unlink(master, slave_dev);
+		netdev_upper_dev_unlink(conduit, user_dev);
 	}
 
-	/* Disconnect from further netdevice notifiers on the master,
+	/* Disconnect from further netdevice notifiers on the conduit,
 	 * since netdev_uses_dsa() will now return false.
 	 */
 	dsa_switch_for_each_cpu_port(dp, ds)
-		dp->master->dsa_ptr = NULL;
+		dp->conduit->dsa_ptr = NULL;
 
 	rtnl_unlock();
 out:
@@ -1605,7 +1605,7 @@ EXPORT_SYMBOL_GPL(dsa_switch_shutdown);
 #ifdef CONFIG_PM_SLEEP
 static bool dsa_port_is_initialized(const struct dsa_port *dp)
 {
-	return dp->type == DSA_PORT_TYPE_USER && dp->slave;
+	return dp->type == DSA_PORT_TYPE_USER && dp->user;
 }
 
 int dsa_switch_suspend(struct dsa_switch *ds)
@@ -1613,12 +1613,12 @@ int dsa_switch_suspend(struct dsa_switch *ds)
 	struct dsa_port *dp;
 	int ret = 0;
 
-	/* Suspend slave network devices */
+	/* Suspend user network devices */
 	dsa_switch_for_each_port(dp, ds) {
 		if (!dsa_port_is_initialized(dp))
 			continue;
 
-		ret = dsa_slave_suspend(dp->slave);
+		ret = dsa_user_suspend(dp->user);
 		if (ret)
 			return ret;
 	}
@@ -1641,12 +1641,12 @@ int dsa_switch_resume(struct dsa_switch *ds)
 	if (ret)
 		return ret;
 
-	/* Resume slave network devices */
+	/* Resume user network devices */
 	dsa_switch_for_each_port(dp, ds) {
 		if (!dsa_port_is_initialized(dp))
 			continue;
 
-		ret = dsa_slave_resume(dp->slave);
+		ret = dsa_user_resume(dp->user);
 		if (ret)
 			return ret;
 	}
@@ -1658,10 +1658,10 @@ EXPORT_SYMBOL_GPL(dsa_switch_resume);
 
 struct dsa_port *dsa_port_from_netdev(struct net_device *netdev)
 {
-	if (!netdev || !dsa_slave_dev_check(netdev))
+	if (!netdev || !dsa_user_dev_check(netdev))
 		return ERR_PTR(-ENODEV);
 
-	return dsa_slave_to_port(netdev);
+	return dsa_user_to_port(netdev);
 }
 EXPORT_SYMBOL_GPL(dsa_port_from_netdev);
 
@@ -1726,7 +1726,7 @@ bool dsa_mdb_present_in_other_db(struct dsa_switch *ds, int port,
 EXPORT_SYMBOL_GPL(dsa_mdb_present_in_other_db);
 
 static const struct dsa_stubs __dsa_stubs = {
-	.master_hwtstamp_validate = __dsa_master_hwtstamp_validate,
+	.conduit_hwtstamp_validate = __dsa_conduit_hwtstamp_validate,
 };
 
 static void dsa_register_stubs(void)
@@ -1748,7 +1748,7 @@ static int __init dsa_init_module(void)
 	if (!dsa_owq)
 		return -ENOMEM;
 
-	rc = dsa_slave_register_notifier();
+	rc = dsa_user_register_notifier();
 	if (rc)
 		goto register_notifier_fail;
 
@@ -1763,7 +1763,7 @@ static int __init dsa_init_module(void)
 	return 0;
 
 netlink_register_fail:
-	dsa_slave_unregister_notifier();
+	dsa_user_unregister_notifier();
 	dev_remove_pack(&dsa_pack_type);
 register_notifier_fail:
 	destroy_workqueue(dsa_owq);
@@ -1778,7 +1778,7 @@ static void __exit dsa_cleanup_module(void)
 
 	rtnl_link_unregister(&dsa_link_ops);
 
-	dsa_slave_unregister_notifier();
+	dsa_user_unregister_notifier();
 	dev_remove_pack(&dsa_pack_type);
 	destroy_workqueue(dsa_owq);
 }
diff --git a/net/dsa/dsa.h b/net/dsa/dsa.h
index b7e17ae1094d..3cc7823e9ef3 100644
--- a/net/dsa/dsa.h
+++ b/net/dsa/dsa.h
@@ -21,16 +21,16 @@ void dsa_lag_map(struct dsa_switch_tree *dst, struct dsa_lag *lag);
 void dsa_lag_unmap(struct dsa_switch_tree *dst, struct dsa_lag *lag);
 struct dsa_lag *dsa_tree_lag_find(struct dsa_switch_tree *dst,
 				  const struct net_device *lag_dev);
-struct net_device *dsa_tree_find_first_master(struct dsa_switch_tree *dst);
+struct net_device *dsa_tree_find_first_conduit(struct dsa_switch_tree *dst);
 int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst,
 			      const struct dsa_device_ops *tag_ops,
 			      const struct dsa_device_ops *old_tag_ops);
-void dsa_tree_master_admin_state_change(struct dsa_switch_tree *dst,
-					struct net_device *master,
+void dsa_tree_conduit_admin_state_change(struct dsa_switch_tree *dst,
+					 struct net_device *conduit,
+					 bool up);
+void dsa_tree_conduit_oper_state_change(struct dsa_switch_tree *dst,
+					struct net_device *conduit,
 					bool up);
-void dsa_tree_master_oper_state_change(struct dsa_switch_tree *dst,
-				       struct net_device *master,
-				       bool up);
 unsigned int dsa_bridge_num_get(const struct net_device *bridge_dev, int max);
 void dsa_bridge_num_put(const struct net_device *bridge_dev,
 			unsigned int bridge_num);
diff --git a/net/dsa/master.c b/net/dsa/master.c
deleted file mode 100644
index 6be89ab0cc01..000000000000
--- a/net/dsa/master.c
+++ /dev/null
@@ -1,475 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Handling of a master device, switching frames via its switch fabric CPU port
- *
- * Copyright (c) 2017 Savoir-faire Linux Inc.
- *	Vivien Didelot <vivien.didelot@savoirfairelinux.com>
- */
-
-#include <linux/ethtool.h>
-#include <linux/netdevice.h>
-#include <linux/netlink.h>
-#include <net/dsa.h>
-
-#include "dsa.h"
-#include "master.h"
-#include "port.h"
-#include "tag.h"
-
-static int dsa_master_get_regs_len(struct net_device *dev)
-{
-	struct dsa_port *cpu_dp = dev->dsa_ptr;
-	const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
-	struct dsa_switch *ds = cpu_dp->ds;
-	int port = cpu_dp->index;
-	int ret = 0;
-	int len;
-
-	if (ops->get_regs_len) {
-		len = ops->get_regs_len(dev);
-		if (len < 0)
-			return len;
-		ret += len;
-	}
-
-	ret += sizeof(struct ethtool_drvinfo);
-	ret += sizeof(struct ethtool_regs);
-
-	if (ds->ops->get_regs_len) {
-		len = ds->ops->get_regs_len(ds, port);
-		if (len < 0)
-			return len;
-		ret += len;
-	}
-
-	return ret;
-}
-
-static void dsa_master_get_regs(struct net_device *dev,
-				struct ethtool_regs *regs, void *data)
-{
-	struct dsa_port *cpu_dp = dev->dsa_ptr;
-	const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
-	struct dsa_switch *ds = cpu_dp->ds;
-	struct ethtool_drvinfo *cpu_info;
-	struct ethtool_regs *cpu_regs;
-	int port = cpu_dp->index;
-	int len;
-
-	if (ops->get_regs_len && ops->get_regs) {
-		len = ops->get_regs_len(dev);
-		if (len < 0)
-			return;
-		regs->len = len;
-		ops->get_regs(dev, regs, data);
-		data += regs->len;
-	}
-
-	cpu_info = (struct ethtool_drvinfo *)data;
-	strscpy(cpu_info->driver, "dsa", sizeof(cpu_info->driver));
-	data += sizeof(*cpu_info);
-	cpu_regs = (struct ethtool_regs *)data;
-	data += sizeof(*cpu_regs);
-
-	if (ds->ops->get_regs_len && ds->ops->get_regs) {
-		len = ds->ops->get_regs_len(ds, port);
-		if (len < 0)
-			return;
-		cpu_regs->len = len;
-		ds->ops->get_regs(ds, port, cpu_regs, data);
-	}
-}
-
-static void dsa_master_get_ethtool_stats(struct net_device *dev,
-					 struct ethtool_stats *stats,
-					 uint64_t *data)
-{
-	struct dsa_port *cpu_dp = dev->dsa_ptr;
-	const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
-	struct dsa_switch *ds = cpu_dp->ds;
-	int port = cpu_dp->index;
-	int count = 0;
-
-	if (ops->get_sset_count && ops->get_ethtool_stats) {
-		count = ops->get_sset_count(dev, ETH_SS_STATS);
-		ops->get_ethtool_stats(dev, stats, data);
-	}
-
-	if (ds->ops->get_ethtool_stats)
-		ds->ops->get_ethtool_stats(ds, port, data + count);
-}
-
-static void dsa_master_get_ethtool_phy_stats(struct net_device *dev,
-					     struct ethtool_stats *stats,
-					     uint64_t *data)
-{
-	struct dsa_port *cpu_dp = dev->dsa_ptr;
-	const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
-	struct dsa_switch *ds = cpu_dp->ds;
-	int port = cpu_dp->index;
-	int count = 0;
-
-	if (dev->phydev && !ops->get_ethtool_phy_stats) {
-		count = phy_ethtool_get_sset_count(dev->phydev);
-		if (count >= 0)
-			phy_ethtool_get_stats(dev->phydev, stats, data);
-	} else if (ops->get_sset_count && ops->get_ethtool_phy_stats) {
-		count = ops->get_sset_count(dev, ETH_SS_PHY_STATS);
-		ops->get_ethtool_phy_stats(dev, stats, data);
-	}
-
-	if (count < 0)
-		count = 0;
-
-	if (ds->ops->get_ethtool_phy_stats)
-		ds->ops->get_ethtool_phy_stats(ds, port, data + count);
-}
-
-static int dsa_master_get_sset_count(struct net_device *dev, int sset)
-{
-	struct dsa_port *cpu_dp = dev->dsa_ptr;
-	const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
-	struct dsa_switch *ds = cpu_dp->ds;
-	int count = 0;
-
-	if (sset == ETH_SS_PHY_STATS && dev->phydev &&
-	    !ops->get_ethtool_phy_stats)
-		count = phy_ethtool_get_sset_count(dev->phydev);
-	else if (ops->get_sset_count)
-		count = ops->get_sset_count(dev, sset);
-
-	if (count < 0)
-		count = 0;
-
-	if (ds->ops->get_sset_count)
-		count += ds->ops->get_sset_count(ds, cpu_dp->index, sset);
-
-	return count;
-}
-
-static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset,
-				   uint8_t *data)
-{
-	struct dsa_port *cpu_dp = dev->dsa_ptr;
-	const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
-	struct dsa_switch *ds = cpu_dp->ds;
-	int port = cpu_dp->index;
-	int len = ETH_GSTRING_LEN;
-	int mcount = 0, count, i;
-	uint8_t pfx[4];
-	uint8_t *ndata;
-
-	snprintf(pfx, sizeof(pfx), "p%.2d", port);
-	/* We do not want to be NULL-terminated, since this is a prefix */
-	pfx[sizeof(pfx) - 1] = '_';
-
-	if (stringset == ETH_SS_PHY_STATS && dev->phydev &&
-	    !ops->get_ethtool_phy_stats) {
-		mcount = phy_ethtool_get_sset_count(dev->phydev);
-		if (mcount < 0)
-			mcount = 0;
-		else
-			phy_ethtool_get_strings(dev->phydev, data);
-	} else if (ops->get_sset_count && ops->get_strings) {
-		mcount = ops->get_sset_count(dev, stringset);
-		if (mcount < 0)
-			mcount = 0;
-		ops->get_strings(dev, stringset, data);
-	}
-
-	if (ds->ops->get_strings) {
-		ndata = data + mcount * len;
-		/* This function copies ETH_GSTRINGS_LEN bytes, we will mangle
-		 * the output after to prepend our CPU port prefix we
-		 * constructed earlier
-		 */
-		ds->ops->get_strings(ds, port, stringset, ndata);
-		count = ds->ops->get_sset_count(ds, port, stringset);
-		if (count < 0)
-			return;
-		for (i = 0; i < count; i++) {
-			memmove(ndata + (i * len + sizeof(pfx)),
-				ndata + i * len, len - sizeof(pfx));
-			memcpy(ndata + i * len, pfx, sizeof(pfx));
-		}
-	}
-}
-
-/* Deny PTP operations on master if there is at least one switch in the tree
- * that is PTP capable.
- */
-int __dsa_master_hwtstamp_validate(struct net_device *dev,
-				   const struct kernel_hwtstamp_config *config,
-				   struct netlink_ext_ack *extack)
-{
-	struct dsa_port *cpu_dp = dev->dsa_ptr;
-	struct dsa_switch *ds = cpu_dp->ds;
-	struct dsa_switch_tree *dst;
-	struct dsa_port *dp;
-
-	dst = ds->dst;
-
-	list_for_each_entry(dp, &dst->ports, list) {
-		if (dsa_port_supports_hwtstamp(dp)) {
-			NL_SET_ERR_MSG(extack,
-				       "HW timestamping not allowed on DSA master when switch supports the operation");
-			return -EBUSY;
-		}
-	}
-
-	return 0;
-}
-
-static int dsa_master_ethtool_setup(struct net_device *dev)
-{
-	struct dsa_port *cpu_dp = dev->dsa_ptr;
-	struct dsa_switch *ds = cpu_dp->ds;
-	struct ethtool_ops *ops;
-
-	if (netif_is_lag_master(dev))
-		return 0;
-
-	ops = devm_kzalloc(ds->dev, sizeof(*ops), GFP_KERNEL);
-	if (!ops)
-		return -ENOMEM;
-
-	cpu_dp->orig_ethtool_ops = dev->ethtool_ops;
-	if (cpu_dp->orig_ethtool_ops)
-		memcpy(ops, cpu_dp->orig_ethtool_ops, sizeof(*ops));
-
-	ops->get_regs_len = dsa_master_get_regs_len;
-	ops->get_regs = dsa_master_get_regs;
-	ops->get_sset_count = dsa_master_get_sset_count;
-	ops->get_ethtool_stats = dsa_master_get_ethtool_stats;
-	ops->get_strings = dsa_master_get_strings;
-	ops->get_ethtool_phy_stats = dsa_master_get_ethtool_phy_stats;
-
-	dev->ethtool_ops = ops;
-
-	return 0;
-}
-
-static void dsa_master_ethtool_teardown(struct net_device *dev)
-{
-	struct dsa_port *cpu_dp = dev->dsa_ptr;
-
-	if (netif_is_lag_master(dev))
-		return;
-
-	dev->ethtool_ops = cpu_dp->orig_ethtool_ops;
-	cpu_dp->orig_ethtool_ops = NULL;
-}
-
-/* Keep the master always promiscuous if the tagging protocol requires that
- * (garbles MAC DA) or if it doesn't support unicast filtering, case in which
- * it would revert to promiscuous mode as soon as we call dev_uc_add() on it
- * anyway.
- */
-static void dsa_master_set_promiscuity(struct net_device *dev, int inc)
-{
-	const struct dsa_device_ops *ops = dev->dsa_ptr->tag_ops;
-
-	if ((dev->priv_flags & IFF_UNICAST_FLT) && !ops->promisc_on_master)
-		return;
-
-	ASSERT_RTNL();
-
-	dev_set_promiscuity(dev, inc);
-}
-
-static ssize_t tagging_show(struct device *d, struct device_attribute *attr,
-			    char *buf)
-{
-	struct net_device *dev = to_net_dev(d);
-	struct dsa_port *cpu_dp = dev->dsa_ptr;
-
-	return sysfs_emit(buf, "%s\n",
-		       dsa_tag_protocol_to_str(cpu_dp->tag_ops));
-}
-
-static ssize_t tagging_store(struct device *d, struct device_attribute *attr,
-			     const char *buf, size_t count)
-{
-	const struct dsa_device_ops *new_tag_ops, *old_tag_ops;
-	const char *end = strchrnul(buf, '\n'), *name;
-	struct net_device *dev = to_net_dev(d);
-	struct dsa_port *cpu_dp = dev->dsa_ptr;
-	size_t len = end - buf;
-	int err;
-
-	/* Empty string passed */
-	if (!len)
-		return -ENOPROTOOPT;
-
-	name = kstrndup(buf, len, GFP_KERNEL);
-	if (!name)
-		return -ENOMEM;
-
-	old_tag_ops = cpu_dp->tag_ops;
-	new_tag_ops = dsa_tag_driver_get_by_name(name);
-	kfree(name);
-	/* Bad tagger name? */
-	if (IS_ERR(new_tag_ops))
-		return PTR_ERR(new_tag_ops);
-
-	if (new_tag_ops == old_tag_ops)
-		/* Drop the temporarily held duplicate reference, since
-		 * the DSA switch tree uses this tagger.
-		 */
-		goto out;
-
-	err = dsa_tree_change_tag_proto(cpu_dp->ds->dst, new_tag_ops,
-					old_tag_ops);
-	if (err) {
-		/* On failure the old tagger is restored, so we don't need the
-		 * driver for the new one.
-		 */
-		dsa_tag_driver_put(new_tag_ops);
-		return err;
-	}
-
-	/* On success we no longer need the module for the old tagging protocol
-	 */
-out:
-	dsa_tag_driver_put(old_tag_ops);
-	return count;
-}
-static DEVICE_ATTR_RW(tagging);
-
-static struct attribute *dsa_slave_attrs[] = {
-	&dev_attr_tagging.attr,
-	NULL
-};
-
-static const struct attribute_group dsa_group = {
-	.name	= "dsa",
-	.attrs	= dsa_slave_attrs,
-};
-
-static void dsa_master_reset_mtu(struct net_device *dev)
-{
-	int err;
-
-	err = dev_set_mtu(dev, ETH_DATA_LEN);
-	if (err)
-		netdev_dbg(dev,
-			   "Unable to reset MTU to exclude DSA overheads\n");
-}
-
-int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp)
-{
-	const struct dsa_device_ops *tag_ops = cpu_dp->tag_ops;
-	struct dsa_switch *ds = cpu_dp->ds;
-	struct device_link *consumer_link;
-	int mtu, ret;
-
-	mtu = ETH_DATA_LEN + dsa_tag_protocol_overhead(tag_ops);
-
-	/* The DSA master must use SET_NETDEV_DEV for this to work. */
-	if (!netif_is_lag_master(dev)) {
-		consumer_link = device_link_add(ds->dev, dev->dev.parent,
-						DL_FLAG_AUTOREMOVE_CONSUMER);
-		if (!consumer_link)
-			netdev_err(dev,
-				   "Failed to create a device link to DSA switch %s\n",
-				   dev_name(ds->dev));
-	}
-
-	/* The switch driver may not implement ->port_change_mtu(), case in
-	 * which dsa_slave_change_mtu() will not update the master MTU either,
-	 * so we need to do that here.
-	 */
-	ret = dev_set_mtu(dev, mtu);
-	if (ret)
-		netdev_warn(dev, "error %d setting MTU to %d to include DSA overhead\n",
-			    ret, mtu);
-
-	/* If we use a tagging format that doesn't have an ethertype
-	 * field, make sure that all packets from this point on get
-	 * sent to the tag format's receive function.
-	 */
-	wmb();
-
-	dev->dsa_ptr = cpu_dp;
-
-	dsa_master_set_promiscuity(dev, 1);
-
-	ret = dsa_master_ethtool_setup(dev);
-	if (ret)
-		goto out_err_reset_promisc;
-
-	ret = sysfs_create_group(&dev->dev.kobj, &dsa_group);
-	if (ret)
-		goto out_err_ethtool_teardown;
-
-	return ret;
-
-out_err_ethtool_teardown:
-	dsa_master_ethtool_teardown(dev);
-out_err_reset_promisc:
-	dsa_master_set_promiscuity(dev, -1);
-	return ret;
-}
-
-void dsa_master_teardown(struct net_device *dev)
-{
-	sysfs_remove_group(&dev->dev.kobj, &dsa_group);
-	dsa_master_ethtool_teardown(dev);
-	dsa_master_reset_mtu(dev);
-	dsa_master_set_promiscuity(dev, -1);
-
-	dev->dsa_ptr = NULL;
-
-	/* If we used a tagging format that doesn't have an ethertype
-	 * field, make sure that all packets from this point get sent
-	 * without the tag and go through the regular receive path.
-	 */
-	wmb();
-}
-
-int dsa_master_lag_setup(struct net_device *lag_dev, struct dsa_port *cpu_dp,
-			 struct netdev_lag_upper_info *uinfo,
-			 struct netlink_ext_ack *extack)
-{
-	bool master_setup = false;
-	int err;
-
-	if (!netdev_uses_dsa(lag_dev)) {
-		err = dsa_master_setup(lag_dev, cpu_dp);
-		if (err)
-			return err;
-
-		master_setup = true;
-	}
-
-	err = dsa_port_lag_join(cpu_dp, lag_dev, uinfo, extack);
-	if (err) {
-		NL_SET_ERR_MSG_WEAK_MOD(extack, "CPU port failed to join LAG");
-		goto out_master_teardown;
-	}
-
-	return 0;
-
-out_master_teardown:
-	if (master_setup)
-		dsa_master_teardown(lag_dev);
-	return err;
-}
-
-/* Tear down a master if there isn't any other user port on it,
- * optionally also destroying LAG information.
- */
-void dsa_master_lag_teardown(struct net_device *lag_dev,
-			     struct dsa_port *cpu_dp)
-{
-	struct net_device *upper;
-	struct list_head *iter;
-
-	dsa_port_lag_leave(cpu_dp, lag_dev);
-
-	netdev_for_each_upper_dev_rcu(lag_dev, upper, iter)
-		if (dsa_slave_dev_check(upper))
-			return;
-
-	dsa_master_teardown(lag_dev);
-}
diff --git a/net/dsa/master.h b/net/dsa/master.h
deleted file mode 100644
index 76e39d3ec909..000000000000
--- a/net/dsa/master.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-
-#ifndef __DSA_MASTER_H
-#define __DSA_MASTER_H
-
-struct dsa_port;
-struct net_device;
-struct netdev_lag_upper_info;
-struct netlink_ext_ack;
-
-int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp);
-void dsa_master_teardown(struct net_device *dev);
-int dsa_master_lag_setup(struct net_device *lag_dev, struct dsa_port *cpu_dp,
-			 struct netdev_lag_upper_info *uinfo,
-			 struct netlink_ext_ack *extack);
-void dsa_master_lag_teardown(struct net_device *lag_dev,
-			     struct dsa_port *cpu_dp);
-int __dsa_master_hwtstamp_validate(struct net_device *dev,
-				   const struct kernel_hwtstamp_config *config,
-				   struct netlink_ext_ack *extack);
-
-#endif
diff --git a/net/dsa/netlink.c b/net/dsa/netlink.c
index bd4bbaf851de..f56f90a25b99 100644
--- a/net/dsa/netlink.c
+++ b/net/dsa/netlink.c
@@ -5,7 +5,7 @@
 #include <net/rtnetlink.h>
 
 #include "netlink.h"
-#include "slave.h"
+#include "user.h"
 
 static const struct nla_policy dsa_policy[IFLA_DSA_MAX + 1] = {
 	[IFLA_DSA_MASTER]	= { .type = NLA_U32 },
@@ -22,13 +22,13 @@ static int dsa_changelink(struct net_device *dev, struct nlattr *tb[],
 
 	if (data[IFLA_DSA_MASTER]) {
 		u32 ifindex = nla_get_u32(data[IFLA_DSA_MASTER]);
-		struct net_device *master;
+		struct net_device *conduit;
 
-		master = __dev_get_by_index(dev_net(dev), ifindex);
-		if (!master)
+		conduit = __dev_get_by_index(dev_net(dev), ifindex);
+		if (!conduit)
 			return -EINVAL;
 
-		err = dsa_slave_change_master(dev, master, extack);
+		err = dsa_user_change_conduit(dev, conduit, extack);
 		if (err)
 			return err;
 	}
@@ -44,9 +44,9 @@ static size_t dsa_get_size(const struct net_device *dev)
 
 static int dsa_fill_info(struct sk_buff *skb, const struct net_device *dev)
 {
-	struct net_device *master = dsa_slave_to_master(dev);
+	struct net_device *conduit = dsa_user_to_conduit(dev);
 
-	if (nla_put_u32(skb, IFLA_DSA_MASTER, master->ifindex))
+	if (nla_put_u32(skb, IFLA_DSA_MASTER, conduit->ifindex))
 		return -EMSGSIZE;
 
 	return 0;
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 6e0d000a97c4..c42dac87671b 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -14,9 +14,9 @@
 
 #include "dsa.h"
 #include "port.h"
-#include "slave.h"
 #include "switch.h"
 #include "tag_8021q.h"
+#include "user.h"
 
 /**
  * dsa_port_notify - Notify the switching fabric of changes to a port
@@ -289,7 +289,7 @@ static void dsa_port_reset_vlan_filtering(struct dsa_port *dp,
 	}
 
 	/* If the bridge was vlan_filtering, the bridge core doesn't trigger an
-	 * event for changing vlan_filtering setting upon slave ports leaving
+	 * event for changing vlan_filtering setting upon user ports leaving
 	 * it. That is a good thing, because that lets us handle it and also
 	 * handle the case where the switch's vlan_filtering setting is global
 	 * (not per port). When that happens, the correct moment to trigger the
@@ -489,7 +489,7 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br,
 		.dp = dp,
 		.extack = extack,
 	};
-	struct net_device *dev = dp->slave;
+	struct net_device *dev = dp->user;
 	struct net_device *brport_dev;
 	int err;
 
@@ -514,8 +514,8 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br,
 	dp->bridge->tx_fwd_offload = info.tx_fwd_offload;
 
 	err = switchdev_bridge_port_offload(brport_dev, dev, dp,
-					    &dsa_slave_switchdev_notifier,
-					    &dsa_slave_switchdev_blocking_notifier,
+					    &dsa_user_switchdev_notifier,
+					    &dsa_user_switchdev_blocking_notifier,
 					    dp->bridge->tx_fwd_offload, extack);
 	if (err)
 		goto out_rollback_unbridge;
@@ -528,8 +528,8 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br,
 
 out_rollback_unoffload:
 	switchdev_bridge_port_unoffload(brport_dev, dp,
-					&dsa_slave_switchdev_notifier,
-					&dsa_slave_switchdev_blocking_notifier);
+					&dsa_user_switchdev_notifier,
+					&dsa_user_switchdev_blocking_notifier);
 	dsa_flush_workqueue();
 out_rollback_unbridge:
 	dsa_broadcast(DSA_NOTIFIER_BRIDGE_LEAVE, &info);
@@ -547,8 +547,8 @@ void dsa_port_pre_bridge_leave(struct dsa_port *dp, struct net_device *br)
 		return;
 
 	switchdev_bridge_port_unoffload(brport_dev, dp,
-					&dsa_slave_switchdev_notifier,
-					&dsa_slave_switchdev_blocking_notifier);
+					&dsa_user_switchdev_notifier,
+					&dsa_user_switchdev_blocking_notifier);
 
 	dsa_flush_workqueue();
 }
@@ -741,10 +741,10 @@ static bool dsa_port_can_apply_vlan_filtering(struct dsa_port *dp,
 	 */
 	if (vlan_filtering && dsa_port_is_user(dp)) {
 		struct net_device *br = dsa_port_bridge_dev_get(dp);
-		struct net_device *upper_dev, *slave = dp->slave;
+		struct net_device *upper_dev, *user = dp->user;
 		struct list_head *iter;
 
-		netdev_for_each_upper_dev_rcu(slave, upper_dev, iter) {
+		netdev_for_each_upper_dev_rcu(user, upper_dev, iter) {
 			struct bridge_vlan_info br_info;
 			u16 vid;
 
@@ -803,9 +803,9 @@ int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering,
 	if (!ds->ops->port_vlan_filtering)
 		return -EOPNOTSUPP;
 
-	/* We are called from dsa_slave_switchdev_blocking_event(),
+	/* We are called from dsa_user_switchdev_blocking_event(),
 	 * which is not under rcu_read_lock(), unlike
-	 * dsa_slave_switchdev_event().
+	 * dsa_user_switchdev_event().
 	 */
 	rcu_read_lock();
 	apply = dsa_port_can_apply_vlan_filtering(dp, vlan_filtering, extack);
@@ -827,24 +827,24 @@ int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering,
 		ds->vlan_filtering = vlan_filtering;
 
 		dsa_switch_for_each_user_port(other_dp, ds) {
-			struct net_device *slave = other_dp->slave;
+			struct net_device *user = other_dp->user;
 
 			/* We might be called in the unbind path, so not
-			 * all slave devices might still be registered.
+			 * all user devices might still be registered.
 			 */
-			if (!slave)
+			if (!user)
 				continue;
 
-			err = dsa_slave_manage_vlan_filtering(slave,
-							      vlan_filtering);
+			err = dsa_user_manage_vlan_filtering(user,
+							     vlan_filtering);
 			if (err)
 				goto restore;
 		}
 	} else {
 		dp->vlan_filtering = vlan_filtering;
 
-		err = dsa_slave_manage_vlan_filtering(dp->slave,
-						      vlan_filtering);
+		err = dsa_user_manage_vlan_filtering(dp->user,
+						     vlan_filtering);
 		if (err)
 			goto restore;
 	}
@@ -863,7 +863,7 @@ restore:
 }
 
 /* This enforces legacy behavior for switch drivers which assume they can't
- * receive VLAN configuration when enslaved to a bridge with vlan_filtering=0
+ * receive VLAN configuration when joining a bridge with vlan_filtering=0
  */
 bool dsa_port_skip_vlan_configuration(struct dsa_port *dp)
 {
@@ -1047,7 +1047,7 @@ int dsa_port_standalone_host_fdb_add(struct dsa_port *dp,
 int dsa_port_bridge_host_fdb_add(struct dsa_port *dp,
 				 const unsigned char *addr, u16 vid)
 {
-	struct net_device *master = dsa_port_to_master(dp);
+	struct net_device *conduit = dsa_port_to_conduit(dp);
 	struct dsa_db db = {
 		.type = DSA_DB_BRIDGE,
 		.bridge = *dp->bridge,
@@ -1057,12 +1057,12 @@ int dsa_port_bridge_host_fdb_add(struct dsa_port *dp,
 	if (!dp->ds->fdb_isolation)
 		db.bridge.num = 0;
 
-	/* Avoid a call to __dev_set_promiscuity() on the master, which
+	/* Avoid a call to __dev_set_promiscuity() on the conduit, which
 	 * requires rtnl_lock(), since we can't guarantee that is held here,
 	 * and we can't take it either.
 	 */
-	if (master->priv_flags & IFF_UNICAST_FLT) {
-		err = dev_uc_add(master, addr);
+	if (conduit->priv_flags & IFF_UNICAST_FLT) {
+		err = dev_uc_add(conduit, addr);
 		if (err)
 			return err;
 	}
@@ -1098,7 +1098,7 @@ int dsa_port_standalone_host_fdb_del(struct dsa_port *dp,
 int dsa_port_bridge_host_fdb_del(struct dsa_port *dp,
 				 const unsigned char *addr, u16 vid)
 {
-	struct net_device *master = dsa_port_to_master(dp);
+	struct net_device *conduit = dsa_port_to_conduit(dp);
 	struct dsa_db db = {
 		.type = DSA_DB_BRIDGE,
 		.bridge = *dp->bridge,
@@ -1108,8 +1108,8 @@ int dsa_port_bridge_host_fdb_del(struct dsa_port *dp,
 	if (!dp->ds->fdb_isolation)
 		db.bridge.num = 0;
 
-	if (master->priv_flags & IFF_UNICAST_FLT) {
-		err = dev_uc_del(master, addr);
+	if (conduit->priv_flags & IFF_UNICAST_FLT) {
+		err = dev_uc_del(conduit, addr);
 		if (err)
 			return err;
 	}
@@ -1229,7 +1229,7 @@ int dsa_port_standalone_host_mdb_add(const struct dsa_port *dp,
 int dsa_port_bridge_host_mdb_add(const struct dsa_port *dp,
 				 const struct switchdev_obj_port_mdb *mdb)
 {
-	struct net_device *master = dsa_port_to_master(dp);
+	struct net_device *conduit = dsa_port_to_conduit(dp);
 	struct dsa_db db = {
 		.type = DSA_DB_BRIDGE,
 		.bridge = *dp->bridge,
@@ -1239,7 +1239,7 @@ int dsa_port_bridge_host_mdb_add(const struct dsa_port *dp,
 	if (!dp->ds->fdb_isolation)
 		db.bridge.num = 0;
 
-	err = dev_mc_add(master, mdb->addr);
+	err = dev_mc_add(conduit, mdb->addr);
 	if (err)
 		return err;
 
@@ -1273,7 +1273,7 @@ int dsa_port_standalone_host_mdb_del(const struct dsa_port *dp,
 int dsa_port_bridge_host_mdb_del(const struct dsa_port *dp,
 				 const struct switchdev_obj_port_mdb *mdb)
 {
-	struct net_device *master = dsa_port_to_master(dp);
+	struct net_device *conduit = dsa_port_to_conduit(dp);
 	struct dsa_db db = {
 		.type = DSA_DB_BRIDGE,
 		.bridge = *dp->bridge,
@@ -1283,7 +1283,7 @@ int dsa_port_bridge_host_mdb_del(const struct dsa_port *dp,
 	if (!dp->ds->fdb_isolation)
 		db.bridge.num = 0;
 
-	err = dev_mc_del(master, mdb->addr);
+	err = dev_mc_del(conduit, mdb->addr);
 	if (err)
 		return err;
 
@@ -1318,7 +1318,7 @@ int dsa_port_host_vlan_add(struct dsa_port *dp,
 			   const struct switchdev_obj_port_vlan *vlan,
 			   struct netlink_ext_ack *extack)
 {
-	struct net_device *master = dsa_port_to_master(dp);
+	struct net_device *conduit = dsa_port_to_conduit(dp);
 	struct dsa_notifier_vlan_info info = {
 		.dp = dp,
 		.vlan = vlan,
@@ -1330,7 +1330,7 @@ int dsa_port_host_vlan_add(struct dsa_port *dp,
 	if (err && err != -EOPNOTSUPP)
 		return err;
 
-	vlan_vid_add(master, htons(ETH_P_8021Q), vlan->vid);
+	vlan_vid_add(conduit, htons(ETH_P_8021Q), vlan->vid);
 
 	return err;
 }
@@ -1338,7 +1338,7 @@ int dsa_port_host_vlan_add(struct dsa_port *dp,
 int dsa_port_host_vlan_del(struct dsa_port *dp,
 			   const struct switchdev_obj_port_vlan *vlan)
 {
-	struct net_device *master = dsa_port_to_master(dp);
+	struct net_device *conduit = dsa_port_to_conduit(dp);
 	struct dsa_notifier_vlan_info info = {
 		.dp = dp,
 		.vlan = vlan,
@@ -1349,7 +1349,7 @@ int dsa_port_host_vlan_del(struct dsa_port *dp,
 	if (err && err != -EOPNOTSUPP)
 		return err;
 
-	vlan_vid_del(master, htons(ETH_P_8021Q), vlan->vid);
+	vlan_vid_del(conduit, htons(ETH_P_8021Q), vlan->vid);
 
 	return err;
 }
@@ -1398,24 +1398,24 @@ int dsa_port_mrp_del_ring_role(const struct dsa_port *dp,
 	return ds->ops->port_mrp_del_ring_role(ds, dp->index, mrp);
 }
 
-static int dsa_port_assign_master(struct dsa_port *dp,
-				  struct net_device *master,
-				  struct netlink_ext_ack *extack,
-				  bool fail_on_err)
+static int dsa_port_assign_conduit(struct dsa_port *dp,
+				   struct net_device *conduit,
+				   struct netlink_ext_ack *extack,
+				   bool fail_on_err)
 {
 	struct dsa_switch *ds = dp->ds;
 	int port = dp->index, err;
 
-	err = ds->ops->port_change_master(ds, port, master, extack);
+	err = ds->ops->port_change_conduit(ds, port, conduit, extack);
 	if (err && !fail_on_err)
-		dev_err(ds->dev, "port %d failed to assign master %s: %pe\n",
-			port, master->name, ERR_PTR(err));
+		dev_err(ds->dev, "port %d failed to assign conduit %s: %pe\n",
+			port, conduit->name, ERR_PTR(err));
 
 	if (err && fail_on_err)
 		return err;
 
-	dp->cpu_dp = master->dsa_ptr;
-	dp->cpu_port_in_lag = netif_is_lag_master(master);
+	dp->cpu_dp = conduit->dsa_ptr;
+	dp->cpu_port_in_lag = netif_is_lag_master(conduit);
 
 	return 0;
 }
@@ -1428,12 +1428,12 @@ static int dsa_port_assign_master(struct dsa_port *dp,
  * the old CPU port before changing it, and restore it on errors during the
  * bringup of the new one.
  */
-int dsa_port_change_master(struct dsa_port *dp, struct net_device *master,
-			   struct netlink_ext_ack *extack)
+int dsa_port_change_conduit(struct dsa_port *dp, struct net_device *conduit,
+			    struct netlink_ext_ack *extack)
 {
 	struct net_device *bridge_dev = dsa_port_bridge_dev_get(dp);
-	struct net_device *old_master = dsa_port_to_master(dp);
-	struct net_device *dev = dp->slave;
+	struct net_device *old_conduit = dsa_port_to_conduit(dp);
+	struct net_device *dev = dp->user;
 	struct dsa_switch *ds = dp->ds;
 	bool vlan_filtering;
 	int err, tmp;
@@ -1454,7 +1454,7 @@ int dsa_port_change_master(struct dsa_port *dp, struct net_device *master,
 	 */
 	vlan_filtering = dsa_port_is_vlan_filtering(dp);
 	if (vlan_filtering) {
-		err = dsa_slave_manage_vlan_filtering(dev, false);
+		err = dsa_user_manage_vlan_filtering(dev, false);
 		if (err) {
 			NL_SET_ERR_MSG_MOD(extack,
 					   "Failed to remove standalone VLANs");
@@ -1465,16 +1465,16 @@ int dsa_port_change_master(struct dsa_port *dp, struct net_device *master,
 	/* Standalone addresses, and addresses of upper interfaces like
 	 * VLAN, LAG, HSR need to be migrated.
 	 */
-	dsa_slave_unsync_ha(dev);
+	dsa_user_unsync_ha(dev);
 
-	err = dsa_port_assign_master(dp, master, extack, true);
+	err = dsa_port_assign_conduit(dp, conduit, extack, true);
 	if (err)
 		goto rewind_old_addrs;
 
-	dsa_slave_sync_ha(dev);
+	dsa_user_sync_ha(dev);
 
 	if (vlan_filtering) {
-		err = dsa_slave_manage_vlan_filtering(dev, true);
+		err = dsa_user_manage_vlan_filtering(dev, true);
 		if (err) {
 			NL_SET_ERR_MSG_MOD(extack,
 					   "Failed to restore standalone VLANs");
@@ -1495,19 +1495,19 @@ int dsa_port_change_master(struct dsa_port *dp, struct net_device *master,
 
 rewind_new_vlan:
 	if (vlan_filtering)
-		dsa_slave_manage_vlan_filtering(dev, false);
+		dsa_user_manage_vlan_filtering(dev, false);
 
 rewind_new_addrs:
-	dsa_slave_unsync_ha(dev);
+	dsa_user_unsync_ha(dev);
 
-	dsa_port_assign_master(dp, old_master, NULL, false);
+	dsa_port_assign_conduit(dp, old_conduit, NULL, false);
 
 /* Restore the objects on the old CPU port */
 rewind_old_addrs:
-	dsa_slave_sync_ha(dev);
+	dsa_user_sync_ha(dev);
 
 	if (vlan_filtering) {
-		tmp = dsa_slave_manage_vlan_filtering(dev, true);
+		tmp = dsa_user_manage_vlan_filtering(dev, true);
 		if (tmp) {
 			dev_err(ds->dev,
 				"port %d failed to restore standalone VLANs: %pe\n",
@@ -1620,7 +1620,7 @@ static void dsa_port_phylink_mac_link_down(struct phylink_config *config,
 	struct dsa_switch *ds = dp->ds;
 
 	if (dsa_port_is_user(dp))
-		phydev = dp->slave->phydev;
+		phydev = dp->user->phydev;
 
 	if (!ds->ops->phylink_mac_link_down) {
 		if (ds->ops->adjust_link && phydev)
@@ -1808,7 +1808,7 @@ err_phy_connect:
  * their type.
  *
  * User ports with no phy-handle or fixed-link are expected to connect to an
- * internal PHY located on the ds->slave_mii_bus at an MDIO address equal to
+ * internal PHY located on the ds->user_mii_bus at an MDIO address equal to
  * the port number. This description is still actively supported.
  *
  * Shared (CPU and DSA) ports with no phy-handle or fixed-link are expected to
@@ -1829,7 +1829,7 @@ err_phy_connect:
  * a fixed-link, a phy-handle, or a managed = "in-band-status" property.
  * It becomes the responsibility of the driver to ensure that these ports
  * operate at the maximum speed (whatever this means) and will interoperate
- * with the DSA master or other cascade port, since phylink methods will not be
+ * with the DSA conduit or other cascade port, since phylink methods will not be
  * invoked for them.
  *
  * If you are considering expanding this table for newly introduced switches,
diff --git a/net/dsa/port.h b/net/dsa/port.h
index 334879964e2c..6bc3291573c0 100644
--- a/net/dsa/port.h
+++ b/net/dsa/port.h
@@ -109,7 +109,7 @@ void dsa_port_hsr_leave(struct dsa_port *dp, struct net_device *hsr);
 int dsa_port_tag_8021q_vlan_add(struct dsa_port *dp, u16 vid, bool broadcast);
 void dsa_port_tag_8021q_vlan_del(struct dsa_port *dp, u16 vid, bool broadcast);
 void dsa_port_set_host_flood(struct dsa_port *dp, bool uc, bool mc);
-int dsa_port_change_master(struct dsa_port *dp, struct net_device *master,
-			   struct netlink_ext_ack *extack);
+int dsa_port_change_conduit(struct dsa_port *dp, struct net_device *conduit,
+			    struct netlink_ext_ack *extack);
 
 #endif
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
deleted file mode 100644
index 4c3e502d7e16..000000000000
--- a/net/dsa/slave.c
+++ /dev/null
@@ -1,3727 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * net/dsa/slave.c - Slave device handling
- * Copyright (c) 2008-2009 Marvell Semiconductor
- */
-
-#include <linux/list.h>
-#include <linux/etherdevice.h>
-#include <linux/netdevice.h>
-#include <linux/phy.h>
-#include <linux/phy_fixed.h>
-#include <linux/phylink.h>
-#include <linux/of_net.h>
-#include <linux/of_mdio.h>
-#include <linux/mdio.h>
-#include <net/rtnetlink.h>
-#include <net/pkt_cls.h>
-#include <net/selftests.h>
-#include <net/tc_act/tc_mirred.h>
-#include <linux/if_bridge.h>
-#include <linux/if_hsr.h>
-#include <net/dcbnl.h>
-#include <linux/netpoll.h>
-#include <linux/string.h>
-
-#include "dsa.h"
-#include "port.h"
-#include "master.h"
-#include "netlink.h"
-#include "slave.h"
-#include "switch.h"
-#include "tag.h"
-
-struct dsa_switchdev_event_work {
-	struct net_device *dev;
-	struct net_device *orig_dev;
-	struct work_struct work;
-	unsigned long event;
-	/* Specific for SWITCHDEV_FDB_ADD_TO_DEVICE and
-	 * SWITCHDEV_FDB_DEL_TO_DEVICE
-	 */
-	unsigned char addr[ETH_ALEN];
-	u16 vid;
-	bool host_addr;
-};
-
-enum dsa_standalone_event {
-	DSA_UC_ADD,
-	DSA_UC_DEL,
-	DSA_MC_ADD,
-	DSA_MC_DEL,
-};
-
-struct dsa_standalone_event_work {
-	struct work_struct work;
-	struct net_device *dev;
-	enum dsa_standalone_event event;
-	unsigned char addr[ETH_ALEN];
-	u16 vid;
-};
-
-struct dsa_host_vlan_rx_filtering_ctx {
-	struct net_device *dev;
-	const unsigned char *addr;
-	enum dsa_standalone_event event;
-};
-
-static bool dsa_switch_supports_uc_filtering(struct dsa_switch *ds)
-{
-	return ds->ops->port_fdb_add && ds->ops->port_fdb_del &&
-	       ds->fdb_isolation && !ds->vlan_filtering_is_global &&
-	       !ds->needs_standalone_vlan_filtering;
-}
-
-static bool dsa_switch_supports_mc_filtering(struct dsa_switch *ds)
-{
-	return ds->ops->port_mdb_add && ds->ops->port_mdb_del &&
-	       ds->fdb_isolation && !ds->vlan_filtering_is_global &&
-	       !ds->needs_standalone_vlan_filtering;
-}
-
-static void dsa_slave_standalone_event_work(struct work_struct *work)
-{
-	struct dsa_standalone_event_work *standalone_work =
-		container_of(work, struct dsa_standalone_event_work, work);
-	const unsigned char *addr = standalone_work->addr;
-	struct net_device *dev = standalone_work->dev;
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct switchdev_obj_port_mdb mdb;
-	struct dsa_switch *ds = dp->ds;
-	u16 vid = standalone_work->vid;
-	int err;
-
-	switch (standalone_work->event) {
-	case DSA_UC_ADD:
-		err = dsa_port_standalone_host_fdb_add(dp, addr, vid);
-		if (err) {
-			dev_err(ds->dev,
-				"port %d failed to add %pM vid %d to fdb: %d\n",
-				dp->index, addr, vid, err);
-			break;
-		}
-		break;
-
-	case DSA_UC_DEL:
-		err = dsa_port_standalone_host_fdb_del(dp, addr, vid);
-		if (err) {
-			dev_err(ds->dev,
-				"port %d failed to delete %pM vid %d from fdb: %d\n",
-				dp->index, addr, vid, err);
-		}
-
-		break;
-	case DSA_MC_ADD:
-		ether_addr_copy(mdb.addr, addr);
-		mdb.vid = vid;
-
-		err = dsa_port_standalone_host_mdb_add(dp, &mdb);
-		if (err) {
-			dev_err(ds->dev,
-				"port %d failed to add %pM vid %d to mdb: %d\n",
-				dp->index, addr, vid, err);
-			break;
-		}
-		break;
-	case DSA_MC_DEL:
-		ether_addr_copy(mdb.addr, addr);
-		mdb.vid = vid;
-
-		err = dsa_port_standalone_host_mdb_del(dp, &mdb);
-		if (err) {
-			dev_err(ds->dev,
-				"port %d failed to delete %pM vid %d from mdb: %d\n",
-				dp->index, addr, vid, err);
-		}
-
-		break;
-	}
-
-	kfree(standalone_work);
-}
-
-static int dsa_slave_schedule_standalone_work(struct net_device *dev,
-					      enum dsa_standalone_event event,
-					      const unsigned char *addr,
-					      u16 vid)
-{
-	struct dsa_standalone_event_work *standalone_work;
-
-	standalone_work = kzalloc(sizeof(*standalone_work), GFP_ATOMIC);
-	if (!standalone_work)
-		return -ENOMEM;
-
-	INIT_WORK(&standalone_work->work, dsa_slave_standalone_event_work);
-	standalone_work->event = event;
-	standalone_work->dev = dev;
-
-	ether_addr_copy(standalone_work->addr, addr);
-	standalone_work->vid = vid;
-
-	dsa_schedule_work(&standalone_work->work);
-
-	return 0;
-}
-
-static int dsa_slave_host_vlan_rx_filtering(void *arg, int vid)
-{
-	struct dsa_host_vlan_rx_filtering_ctx *ctx = arg;
-
-	return dsa_slave_schedule_standalone_work(ctx->dev, ctx->event,
-						  ctx->addr, vid);
-}
-
-static int dsa_slave_vlan_for_each(struct net_device *dev,
-				   int (*cb)(void *arg, int vid), void *arg)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_vlan *v;
-	int err;
-
-	lockdep_assert_held(&dev->addr_list_lock);
-
-	err = cb(arg, 0);
-	if (err)
-		return err;
-
-	list_for_each_entry(v, &dp->user_vlans, list) {
-		err = cb(arg, v->vid);
-		if (err)
-			return err;
-	}
-
-	return 0;
-}
-
-static int dsa_slave_sync_uc(struct net_device *dev,
-			     const unsigned char *addr)
-{
-	struct net_device *master = dsa_slave_to_master(dev);
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_host_vlan_rx_filtering_ctx ctx = {
-		.dev = dev,
-		.addr = addr,
-		.event = DSA_UC_ADD,
-	};
-
-	dev_uc_add(master, addr);
-
-	if (!dsa_switch_supports_uc_filtering(dp->ds))
-		return 0;
-
-	return dsa_slave_vlan_for_each(dev, dsa_slave_host_vlan_rx_filtering,
-				       &ctx);
-}
-
-static int dsa_slave_unsync_uc(struct net_device *dev,
-			       const unsigned char *addr)
-{
-	struct net_device *master = dsa_slave_to_master(dev);
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_host_vlan_rx_filtering_ctx ctx = {
-		.dev = dev,
-		.addr = addr,
-		.event = DSA_UC_DEL,
-	};
-
-	dev_uc_del(master, addr);
-
-	if (!dsa_switch_supports_uc_filtering(dp->ds))
-		return 0;
-
-	return dsa_slave_vlan_for_each(dev, dsa_slave_host_vlan_rx_filtering,
-				       &ctx);
-}
-
-static int dsa_slave_sync_mc(struct net_device *dev,
-			     const unsigned char *addr)
-{
-	struct net_device *master = dsa_slave_to_master(dev);
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_host_vlan_rx_filtering_ctx ctx = {
-		.dev = dev,
-		.addr = addr,
-		.event = DSA_MC_ADD,
-	};
-
-	dev_mc_add(master, addr);
-
-	if (!dsa_switch_supports_mc_filtering(dp->ds))
-		return 0;
-
-	return dsa_slave_vlan_for_each(dev, dsa_slave_host_vlan_rx_filtering,
-				       &ctx);
-}
-
-static int dsa_slave_unsync_mc(struct net_device *dev,
-			       const unsigned char *addr)
-{
-	struct net_device *master = dsa_slave_to_master(dev);
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_host_vlan_rx_filtering_ctx ctx = {
-		.dev = dev,
-		.addr = addr,
-		.event = DSA_MC_DEL,
-	};
-
-	dev_mc_del(master, addr);
-
-	if (!dsa_switch_supports_mc_filtering(dp->ds))
-		return 0;
-
-	return dsa_slave_vlan_for_each(dev, dsa_slave_host_vlan_rx_filtering,
-				       &ctx);
-}
-
-void dsa_slave_sync_ha(struct net_device *dev)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	struct netdev_hw_addr *ha;
-
-	netif_addr_lock_bh(dev);
-
-	netdev_for_each_synced_mc_addr(ha, dev)
-		dsa_slave_sync_mc(dev, ha->addr);
-
-	netdev_for_each_synced_uc_addr(ha, dev)
-		dsa_slave_sync_uc(dev, ha->addr);
-
-	netif_addr_unlock_bh(dev);
-
-	if (dsa_switch_supports_uc_filtering(ds) ||
-	    dsa_switch_supports_mc_filtering(ds))
-		dsa_flush_workqueue();
-}
-
-void dsa_slave_unsync_ha(struct net_device *dev)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	struct netdev_hw_addr *ha;
-
-	netif_addr_lock_bh(dev);
-
-	netdev_for_each_synced_uc_addr(ha, dev)
-		dsa_slave_unsync_uc(dev, ha->addr);
-
-	netdev_for_each_synced_mc_addr(ha, dev)
-		dsa_slave_unsync_mc(dev, ha->addr);
-
-	netif_addr_unlock_bh(dev);
-
-	if (dsa_switch_supports_uc_filtering(ds) ||
-	    dsa_switch_supports_mc_filtering(ds))
-		dsa_flush_workqueue();
-}
-
-/* slave mii_bus handling ***************************************************/
-static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg)
-{
-	struct dsa_switch *ds = bus->priv;
-
-	if (ds->phys_mii_mask & (1 << addr))
-		return ds->ops->phy_read(ds, addr, reg);
-
-	return 0xffff;
-}
-
-static int dsa_slave_phy_write(struct mii_bus *bus, int addr, int reg, u16 val)
-{
-	struct dsa_switch *ds = bus->priv;
-
-	if (ds->phys_mii_mask & (1 << addr))
-		return ds->ops->phy_write(ds, addr, reg, val);
-
-	return 0;
-}
-
-void dsa_slave_mii_bus_init(struct dsa_switch *ds)
-{
-	ds->slave_mii_bus->priv = (void *)ds;
-	ds->slave_mii_bus->name = "dsa slave smi";
-	ds->slave_mii_bus->read = dsa_slave_phy_read;
-	ds->slave_mii_bus->write = dsa_slave_phy_write;
-	snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "dsa-%d.%d",
-		 ds->dst->index, ds->index);
-	ds->slave_mii_bus->parent = ds->dev;
-	ds->slave_mii_bus->phy_mask = ~ds->phys_mii_mask;
-}
-
-
-/* slave device handling ****************************************************/
-static int dsa_slave_get_iflink(const struct net_device *dev)
-{
-	return dsa_slave_to_master(dev)->ifindex;
-}
-
-static int dsa_slave_open(struct net_device *dev)
-{
-	struct net_device *master = dsa_slave_to_master(dev);
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	int err;
-
-	err = dev_open(master, NULL);
-	if (err < 0) {
-		netdev_err(dev, "failed to open master %s\n", master->name);
-		goto out;
-	}
-
-	if (dsa_switch_supports_uc_filtering(ds)) {
-		err = dsa_port_standalone_host_fdb_add(dp, dev->dev_addr, 0);
-		if (err)
-			goto out;
-	}
-
-	if (!ether_addr_equal(dev->dev_addr, master->dev_addr)) {
-		err = dev_uc_add(master, dev->dev_addr);
-		if (err < 0)
-			goto del_host_addr;
-	}
-
-	err = dsa_port_enable_rt(dp, dev->phydev);
-	if (err)
-		goto del_unicast;
-
-	return 0;
-
-del_unicast:
-	if (!ether_addr_equal(dev->dev_addr, master->dev_addr))
-		dev_uc_del(master, dev->dev_addr);
-del_host_addr:
-	if (dsa_switch_supports_uc_filtering(ds))
-		dsa_port_standalone_host_fdb_del(dp, dev->dev_addr, 0);
-out:
-	return err;
-}
-
-static int dsa_slave_close(struct net_device *dev)
-{
-	struct net_device *master = dsa_slave_to_master(dev);
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	dsa_port_disable_rt(dp);
-
-	if (!ether_addr_equal(dev->dev_addr, master->dev_addr))
-		dev_uc_del(master, dev->dev_addr);
-
-	if (dsa_switch_supports_uc_filtering(ds))
-		dsa_port_standalone_host_fdb_del(dp, dev->dev_addr, 0);
-
-	return 0;
-}
-
-static void dsa_slave_manage_host_flood(struct net_device *dev)
-{
-	bool mc = dev->flags & (IFF_PROMISC | IFF_ALLMULTI);
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	bool uc = dev->flags & IFF_PROMISC;
-
-	dsa_port_set_host_flood(dp, uc, mc);
-}
-
-static void dsa_slave_change_rx_flags(struct net_device *dev, int change)
-{
-	struct net_device *master = dsa_slave_to_master(dev);
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (change & IFF_ALLMULTI)
-		dev_set_allmulti(master,
-				 dev->flags & IFF_ALLMULTI ? 1 : -1);
-	if (change & IFF_PROMISC)
-		dev_set_promiscuity(master,
-				    dev->flags & IFF_PROMISC ? 1 : -1);
-
-	if (dsa_switch_supports_uc_filtering(ds) &&
-	    dsa_switch_supports_mc_filtering(ds))
-		dsa_slave_manage_host_flood(dev);
-}
-
-static void dsa_slave_set_rx_mode(struct net_device *dev)
-{
-	__dev_mc_sync(dev, dsa_slave_sync_mc, dsa_slave_unsync_mc);
-	__dev_uc_sync(dev, dsa_slave_sync_uc, dsa_slave_unsync_uc);
-}
-
-static int dsa_slave_set_mac_address(struct net_device *dev, void *a)
-{
-	struct net_device *master = dsa_slave_to_master(dev);
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	struct sockaddr *addr = a;
-	int err;
-
-	if (!is_valid_ether_addr(addr->sa_data))
-		return -EADDRNOTAVAIL;
-
-	if (ds->ops->port_set_mac_address) {
-		err = ds->ops->port_set_mac_address(ds, dp->index,
-						    addr->sa_data);
-		if (err)
-			return err;
-	}
-
-	/* If the port is down, the address isn't synced yet to hardware or
-	 * to the DSA master, so there is nothing to change.
-	 */
-	if (!(dev->flags & IFF_UP))
-		goto out_change_dev_addr;
-
-	if (dsa_switch_supports_uc_filtering(ds)) {
-		err = dsa_port_standalone_host_fdb_add(dp, addr->sa_data, 0);
-		if (err)
-			return err;
-	}
-
-	if (!ether_addr_equal(addr->sa_data, master->dev_addr)) {
-		err = dev_uc_add(master, addr->sa_data);
-		if (err < 0)
-			goto del_unicast;
-	}
-
-	if (!ether_addr_equal(dev->dev_addr, master->dev_addr))
-		dev_uc_del(master, dev->dev_addr);
-
-	if (dsa_switch_supports_uc_filtering(ds))
-		dsa_port_standalone_host_fdb_del(dp, dev->dev_addr, 0);
-
-out_change_dev_addr:
-	eth_hw_addr_set(dev, addr->sa_data);
-
-	return 0;
-
-del_unicast:
-	if (dsa_switch_supports_uc_filtering(ds))
-		dsa_port_standalone_host_fdb_del(dp, addr->sa_data, 0);
-
-	return err;
-}
-
-struct dsa_slave_dump_ctx {
-	struct net_device *dev;
-	struct sk_buff *skb;
-	struct netlink_callback *cb;
-	int idx;
-};
-
-static int
-dsa_slave_port_fdb_do_dump(const unsigned char *addr, u16 vid,
-			   bool is_static, void *data)
-{
-	struct dsa_slave_dump_ctx *dump = data;
-	u32 portid = NETLINK_CB(dump->cb->skb).portid;
-	u32 seq = dump->cb->nlh->nlmsg_seq;
-	struct nlmsghdr *nlh;
-	struct ndmsg *ndm;
-
-	if (dump->idx < dump->cb->args[2])
-		goto skip;
-
-	nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH,
-			sizeof(*ndm), NLM_F_MULTI);
-	if (!nlh)
-		return -EMSGSIZE;
-
-	ndm = nlmsg_data(nlh);
-	ndm->ndm_family  = AF_BRIDGE;
-	ndm->ndm_pad1    = 0;
-	ndm->ndm_pad2    = 0;
-	ndm->ndm_flags   = NTF_SELF;
-	ndm->ndm_type    = 0;
-	ndm->ndm_ifindex = dump->dev->ifindex;
-	ndm->ndm_state   = is_static ? NUD_NOARP : NUD_REACHABLE;
-
-	if (nla_put(dump->skb, NDA_LLADDR, ETH_ALEN, addr))
-		goto nla_put_failure;
-
-	if (vid && nla_put_u16(dump->skb, NDA_VLAN, vid))
-		goto nla_put_failure;
-
-	nlmsg_end(dump->skb, nlh);
-
-skip:
-	dump->idx++;
-	return 0;
-
-nla_put_failure:
-	nlmsg_cancel(dump->skb, nlh);
-	return -EMSGSIZE;
-}
-
-static int
-dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
-		   struct net_device *dev, struct net_device *filter_dev,
-		   int *idx)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_slave_dump_ctx dump = {
-		.dev = dev,
-		.skb = skb,
-		.cb = cb,
-		.idx = *idx,
-	};
-	int err;
-
-	err = dsa_port_fdb_dump(dp, dsa_slave_port_fdb_do_dump, &dump);
-	*idx = dump.idx;
-
-	return err;
-}
-
-static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
-{
-	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->dp->ds;
-	int port = p->dp->index;
-
-	/* Pass through to switch driver if it supports timestamping */
-	switch (cmd) {
-	case SIOCGHWTSTAMP:
-		if (ds->ops->port_hwtstamp_get)
-			return ds->ops->port_hwtstamp_get(ds, port, ifr);
-		break;
-	case SIOCSHWTSTAMP:
-		if (ds->ops->port_hwtstamp_set)
-			return ds->ops->port_hwtstamp_set(ds, port, ifr);
-		break;
-	}
-
-	return phylink_mii_ioctl(p->dp->pl, ifr, cmd);
-}
-
-static int dsa_slave_port_attr_set(struct net_device *dev, const void *ctx,
-				   const struct switchdev_attr *attr,
-				   struct netlink_ext_ack *extack)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	int ret;
-
-	if (ctx && ctx != dp)
-		return 0;
-
-	switch (attr->id) {
-	case SWITCHDEV_ATTR_ID_PORT_STP_STATE:
-		if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev))
-			return -EOPNOTSUPP;
-
-		ret = dsa_port_set_state(dp, attr->u.stp_state, true);
-		break;
-	case SWITCHDEV_ATTR_ID_PORT_MST_STATE:
-		if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev))
-			return -EOPNOTSUPP;
-
-		ret = dsa_port_set_mst_state(dp, &attr->u.mst_state, extack);
-		break;
-	case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING:
-		if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev))
-			return -EOPNOTSUPP;
-
-		ret = dsa_port_vlan_filtering(dp, attr->u.vlan_filtering,
-					      extack);
-		break;
-	case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME:
-		if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev))
-			return -EOPNOTSUPP;
-
-		ret = dsa_port_ageing_time(dp, attr->u.ageing_time);
-		break;
-	case SWITCHDEV_ATTR_ID_BRIDGE_MST:
-		if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev))
-			return -EOPNOTSUPP;
-
-		ret = dsa_port_mst_enable(dp, attr->u.mst, extack);
-		break;
-	case SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS:
-		if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev))
-			return -EOPNOTSUPP;
-
-		ret = dsa_port_pre_bridge_flags(dp, attr->u.brport_flags,
-						extack);
-		break;
-	case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS:
-		if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev))
-			return -EOPNOTSUPP;
-
-		ret = dsa_port_bridge_flags(dp, attr->u.brport_flags, extack);
-		break;
-	case SWITCHDEV_ATTR_ID_VLAN_MSTI:
-		if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev))
-			return -EOPNOTSUPP;
-
-		ret = dsa_port_vlan_msti(dp, &attr->u.vlan_msti);
-		break;
-	default:
-		ret = -EOPNOTSUPP;
-		break;
-	}
-
-	return ret;
-}
-
-/* Must be called under rcu_read_lock() */
-static int
-dsa_slave_vlan_check_for_8021q_uppers(struct net_device *slave,
-				      const struct switchdev_obj_port_vlan *vlan)
-{
-	struct net_device *upper_dev;
-	struct list_head *iter;
-
-	netdev_for_each_upper_dev_rcu(slave, upper_dev, iter) {
-		u16 vid;
-
-		if (!is_vlan_dev(upper_dev))
-			continue;
-
-		vid = vlan_dev_vlan_id(upper_dev);
-		if (vid == vlan->vid)
-			return -EBUSY;
-	}
-
-	return 0;
-}
-
-static int dsa_slave_vlan_add(struct net_device *dev,
-			      const struct switchdev_obj *obj,
-			      struct netlink_ext_ack *extack)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct switchdev_obj_port_vlan *vlan;
-	int err;
-
-	if (dsa_port_skip_vlan_configuration(dp)) {
-		NL_SET_ERR_MSG_MOD(extack, "skipping configuration of VLAN");
-		return 0;
-	}
-
-	vlan = SWITCHDEV_OBJ_PORT_VLAN(obj);
-
-	/* Deny adding a bridge VLAN when there is already an 802.1Q upper with
-	 * the same VID.
-	 */
-	if (br_vlan_enabled(dsa_port_bridge_dev_get(dp))) {
-		rcu_read_lock();
-		err = dsa_slave_vlan_check_for_8021q_uppers(dev, vlan);
-		rcu_read_unlock();
-		if (err) {
-			NL_SET_ERR_MSG_MOD(extack,
-					   "Port already has a VLAN upper with this VID");
-			return err;
-		}
-	}
-
-	return dsa_port_vlan_add(dp, vlan, extack);
-}
-
-/* Offload a VLAN installed on the bridge or on a foreign interface by
- * installing it as a VLAN towards the CPU port.
- */
-static int dsa_slave_host_vlan_add(struct net_device *dev,
-				   const struct switchdev_obj *obj,
-				   struct netlink_ext_ack *extack)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct switchdev_obj_port_vlan vlan;
-
-	/* Do nothing if this is a software bridge */
-	if (!dp->bridge)
-		return -EOPNOTSUPP;
-
-	if (dsa_port_skip_vlan_configuration(dp)) {
-		NL_SET_ERR_MSG_MOD(extack, "skipping configuration of VLAN");
-		return 0;
-	}
-
-	vlan = *SWITCHDEV_OBJ_PORT_VLAN(obj);
-
-	/* Even though drivers often handle CPU membership in special ways,
-	 * it doesn't make sense to program a PVID, so clear this flag.
-	 */
-	vlan.flags &= ~BRIDGE_VLAN_INFO_PVID;
-
-	return dsa_port_host_vlan_add(dp, &vlan, extack);
-}
-
-static int dsa_slave_port_obj_add(struct net_device *dev, const void *ctx,
-				  const struct switchdev_obj *obj,
-				  struct netlink_ext_ack *extack)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	int err;
-
-	if (ctx && ctx != dp)
-		return 0;
-
-	switch (obj->id) {
-	case SWITCHDEV_OBJ_ID_PORT_MDB:
-		if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev))
-			return -EOPNOTSUPP;
-
-		err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
-		break;
-	case SWITCHDEV_OBJ_ID_HOST_MDB:
-		if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
-			return -EOPNOTSUPP;
-
-		err = dsa_port_bridge_host_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
-		break;
-	case SWITCHDEV_OBJ_ID_PORT_VLAN:
-		if (dsa_port_offloads_bridge_port(dp, obj->orig_dev))
-			err = dsa_slave_vlan_add(dev, obj, extack);
-		else
-			err = dsa_slave_host_vlan_add(dev, obj, extack);
-		break;
-	case SWITCHDEV_OBJ_ID_MRP:
-		if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
-			return -EOPNOTSUPP;
-
-		err = dsa_port_mrp_add(dp, SWITCHDEV_OBJ_MRP(obj));
-		break;
-	case SWITCHDEV_OBJ_ID_RING_ROLE_MRP:
-		if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
-			return -EOPNOTSUPP;
-
-		err = dsa_port_mrp_add_ring_role(dp,
-						 SWITCHDEV_OBJ_RING_ROLE_MRP(obj));
-		break;
-	default:
-		err = -EOPNOTSUPP;
-		break;
-	}
-
-	return err;
-}
-
-static int dsa_slave_vlan_del(struct net_device *dev,
-			      const struct switchdev_obj *obj)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct switchdev_obj_port_vlan *vlan;
-
-	if (dsa_port_skip_vlan_configuration(dp))
-		return 0;
-
-	vlan = SWITCHDEV_OBJ_PORT_VLAN(obj);
-
-	return dsa_port_vlan_del(dp, vlan);
-}
-
-static int dsa_slave_host_vlan_del(struct net_device *dev,
-				   const struct switchdev_obj *obj)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct switchdev_obj_port_vlan *vlan;
-
-	/* Do nothing if this is a software bridge */
-	if (!dp->bridge)
-		return -EOPNOTSUPP;
-
-	if (dsa_port_skip_vlan_configuration(dp))
-		return 0;
-
-	vlan = SWITCHDEV_OBJ_PORT_VLAN(obj);
-
-	return dsa_port_host_vlan_del(dp, vlan);
-}
-
-static int dsa_slave_port_obj_del(struct net_device *dev, const void *ctx,
-				  const struct switchdev_obj *obj)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	int err;
-
-	if (ctx && ctx != dp)
-		return 0;
-
-	switch (obj->id) {
-	case SWITCHDEV_OBJ_ID_PORT_MDB:
-		if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev))
-			return -EOPNOTSUPP;
-
-		err = dsa_port_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
-		break;
-	case SWITCHDEV_OBJ_ID_HOST_MDB:
-		if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
-			return -EOPNOTSUPP;
-
-		err = dsa_port_bridge_host_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
-		break;
-	case SWITCHDEV_OBJ_ID_PORT_VLAN:
-		if (dsa_port_offloads_bridge_port(dp, obj->orig_dev))
-			err = dsa_slave_vlan_del(dev, obj);
-		else
-			err = dsa_slave_host_vlan_del(dev, obj);
-		break;
-	case SWITCHDEV_OBJ_ID_MRP:
-		if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
-			return -EOPNOTSUPP;
-
-		err = dsa_port_mrp_del(dp, SWITCHDEV_OBJ_MRP(obj));
-		break;
-	case SWITCHDEV_OBJ_ID_RING_ROLE_MRP:
-		if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
-			return -EOPNOTSUPP;
-
-		err = dsa_port_mrp_del_ring_role(dp,
-						 SWITCHDEV_OBJ_RING_ROLE_MRP(obj));
-		break;
-	default:
-		err = -EOPNOTSUPP;
-		break;
-	}
-
-	return err;
-}
-
-static inline netdev_tx_t dsa_slave_netpoll_send_skb(struct net_device *dev,
-						     struct sk_buff *skb)
-{
-#ifdef CONFIG_NET_POLL_CONTROLLER
-	struct dsa_slave_priv *p = netdev_priv(dev);
-
-	return netpoll_send_skb(p->netpoll, skb);
-#else
-	BUG();
-	return NETDEV_TX_OK;
-#endif
-}
-
-static void dsa_skb_tx_timestamp(struct dsa_slave_priv *p,
-				 struct sk_buff *skb)
-{
-	struct dsa_switch *ds = p->dp->ds;
-
-	if (!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP))
-		return;
-
-	if (!ds->ops->port_txtstamp)
-		return;
-
-	ds->ops->port_txtstamp(ds, p->dp->index, skb);
-}
-
-netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev)
-{
-	/* SKB for netpoll still need to be mangled with the protocol-specific
-	 * tag to be successfully transmitted
-	 */
-	if (unlikely(netpoll_tx_running(dev)))
-		return dsa_slave_netpoll_send_skb(dev, skb);
-
-	/* Queue the SKB for transmission on the parent interface, but
-	 * do not modify its EtherType
-	 */
-	skb->dev = dsa_slave_to_master(dev);
-	dev_queue_xmit(skb);
-
-	return NETDEV_TX_OK;
-}
-EXPORT_SYMBOL_GPL(dsa_enqueue_skb);
-
-static int dsa_realloc_skb(struct sk_buff *skb, struct net_device *dev)
-{
-	int needed_headroom = dev->needed_headroom;
-	int needed_tailroom = dev->needed_tailroom;
-
-	/* For tail taggers, we need to pad short frames ourselves, to ensure
-	 * that the tail tag does not fail at its role of being at the end of
-	 * the packet, once the master interface pads the frame. Account for
-	 * that pad length here, and pad later.
-	 */
-	if (unlikely(needed_tailroom && skb->len < ETH_ZLEN))
-		needed_tailroom += ETH_ZLEN - skb->len;
-	/* skb_headroom() returns unsigned int... */
-	needed_headroom = max_t(int, needed_headroom - skb_headroom(skb), 0);
-	needed_tailroom = max_t(int, needed_tailroom - skb_tailroom(skb), 0);
-
-	if (likely(!needed_headroom && !needed_tailroom && !skb_cloned(skb)))
-		/* No reallocation needed, yay! */
-		return 0;
-
-	return pskb_expand_head(skb, needed_headroom, needed_tailroom,
-				GFP_ATOMIC);
-}
-
-static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
-{
-	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct sk_buff *nskb;
-
-	dev_sw_netstats_tx_add(dev, 1, skb->len);
-
-	memset(skb->cb, 0, sizeof(skb->cb));
-
-	/* Handle tx timestamp if any */
-	dsa_skb_tx_timestamp(p, skb);
-
-	if (dsa_realloc_skb(skb, dev)) {
-		dev_kfree_skb_any(skb);
-		return NETDEV_TX_OK;
-	}
-
-	/* needed_tailroom should still be 'warm' in the cache line from
-	 * dsa_realloc_skb(), which has also ensured that padding is safe.
-	 */
-	if (dev->needed_tailroom)
-		eth_skb_pad(skb);
-
-	/* Transmit function may have to reallocate the original SKB,
-	 * in which case it must have freed it. Only free it here on error.
-	 */
-	nskb = p->xmit(skb, dev);
-	if (!nskb) {
-		kfree_skb(skb);
-		return NETDEV_TX_OK;
-	}
-
-	return dsa_enqueue_skb(nskb, dev);
-}
-
-/* ethtool operations *******************************************************/
-
-static void dsa_slave_get_drvinfo(struct net_device *dev,
-				  struct ethtool_drvinfo *drvinfo)
-{
-	strscpy(drvinfo->driver, "dsa", sizeof(drvinfo->driver));
-	strscpy(drvinfo->fw_version, "N/A", sizeof(drvinfo->fw_version));
-	strscpy(drvinfo->bus_info, "platform", sizeof(drvinfo->bus_info));
-}
-
-static int dsa_slave_get_regs_len(struct net_device *dev)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (ds->ops->get_regs_len)
-		return ds->ops->get_regs_len(ds, dp->index);
-
-	return -EOPNOTSUPP;
-}
-
-static void
-dsa_slave_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (ds->ops->get_regs)
-		ds->ops->get_regs(ds, dp->index, regs, _p);
-}
-
-static int dsa_slave_nway_reset(struct net_device *dev)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-
-	return phylink_ethtool_nway_reset(dp->pl);
-}
-
-static int dsa_slave_get_eeprom_len(struct net_device *dev)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (ds->cd && ds->cd->eeprom_len)
-		return ds->cd->eeprom_len;
-
-	if (ds->ops->get_eeprom_len)
-		return ds->ops->get_eeprom_len(ds);
-
-	return 0;
-}
-
-static int dsa_slave_get_eeprom(struct net_device *dev,
-				struct ethtool_eeprom *eeprom, u8 *data)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (ds->ops->get_eeprom)
-		return ds->ops->get_eeprom(ds, eeprom, data);
-
-	return -EOPNOTSUPP;
-}
-
-static int dsa_slave_set_eeprom(struct net_device *dev,
-				struct ethtool_eeprom *eeprom, u8 *data)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (ds->ops->set_eeprom)
-		return ds->ops->set_eeprom(ds, eeprom, data);
-
-	return -EOPNOTSUPP;
-}
-
-static void dsa_slave_get_strings(struct net_device *dev,
-				  uint32_t stringset, uint8_t *data)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (stringset == ETH_SS_STATS) {
-		int len = ETH_GSTRING_LEN;
-
-		strscpy_pad(data, "tx_packets", len);
-		strscpy_pad(data + len, "tx_bytes", len);
-		strscpy_pad(data + 2 * len, "rx_packets", len);
-		strscpy_pad(data + 3 * len, "rx_bytes", len);
-		if (ds->ops->get_strings)
-			ds->ops->get_strings(ds, dp->index, stringset,
-					     data + 4 * len);
-	} else if (stringset ==  ETH_SS_TEST) {
-		net_selftest_get_strings(data);
-	}
-
-}
-
-static void dsa_slave_get_ethtool_stats(struct net_device *dev,
-					struct ethtool_stats *stats,
-					uint64_t *data)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	struct pcpu_sw_netstats *s;
-	unsigned int start;
-	int i;
-
-	for_each_possible_cpu(i) {
-		u64 tx_packets, tx_bytes, rx_packets, rx_bytes;
-
-		s = per_cpu_ptr(dev->tstats, i);
-		do {
-			start = u64_stats_fetch_begin(&s->syncp);
-			tx_packets = u64_stats_read(&s->tx_packets);
-			tx_bytes = u64_stats_read(&s->tx_bytes);
-			rx_packets = u64_stats_read(&s->rx_packets);
-			rx_bytes = u64_stats_read(&s->rx_bytes);
-		} while (u64_stats_fetch_retry(&s->syncp, start));
-		data[0] += tx_packets;
-		data[1] += tx_bytes;
-		data[2] += rx_packets;
-		data[3] += rx_bytes;
-	}
-	if (ds->ops->get_ethtool_stats)
-		ds->ops->get_ethtool_stats(ds, dp->index, data + 4);
-}
-
-static int dsa_slave_get_sset_count(struct net_device *dev, int sset)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (sset == ETH_SS_STATS) {
-		int count = 0;
-
-		if (ds->ops->get_sset_count) {
-			count = ds->ops->get_sset_count(ds, dp->index, sset);
-			if (count < 0)
-				return count;
-		}
-
-		return count + 4;
-	} else if (sset ==  ETH_SS_TEST) {
-		return net_selftest_get_count();
-	}
-
-	return -EOPNOTSUPP;
-}
-
-static void dsa_slave_get_eth_phy_stats(struct net_device *dev,
-					struct ethtool_eth_phy_stats *phy_stats)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (ds->ops->get_eth_phy_stats)
-		ds->ops->get_eth_phy_stats(ds, dp->index, phy_stats);
-}
-
-static void dsa_slave_get_eth_mac_stats(struct net_device *dev,
-					struct ethtool_eth_mac_stats *mac_stats)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (ds->ops->get_eth_mac_stats)
-		ds->ops->get_eth_mac_stats(ds, dp->index, mac_stats);
-}
-
-static void
-dsa_slave_get_eth_ctrl_stats(struct net_device *dev,
-			     struct ethtool_eth_ctrl_stats *ctrl_stats)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (ds->ops->get_eth_ctrl_stats)
-		ds->ops->get_eth_ctrl_stats(ds, dp->index, ctrl_stats);
-}
-
-static void
-dsa_slave_get_rmon_stats(struct net_device *dev,
-			 struct ethtool_rmon_stats *rmon_stats,
-			 const struct ethtool_rmon_hist_range **ranges)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (ds->ops->get_rmon_stats)
-		ds->ops->get_rmon_stats(ds, dp->index, rmon_stats, ranges);
-}
-
-static void dsa_slave_net_selftest(struct net_device *ndev,
-				   struct ethtool_test *etest, u64 *buf)
-{
-	struct dsa_port *dp = dsa_slave_to_port(ndev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (ds->ops->self_test) {
-		ds->ops->self_test(ds, dp->index, etest, buf);
-		return;
-	}
-
-	net_selftest(ndev, etest, buf);
-}
-
-static int dsa_slave_get_mm(struct net_device *dev,
-			    struct ethtool_mm_state *state)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (!ds->ops->get_mm)
-		return -EOPNOTSUPP;
-
-	return ds->ops->get_mm(ds, dp->index, state);
-}
-
-static int dsa_slave_set_mm(struct net_device *dev, struct ethtool_mm_cfg *cfg,
-			    struct netlink_ext_ack *extack)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (!ds->ops->set_mm)
-		return -EOPNOTSUPP;
-
-	return ds->ops->set_mm(ds, dp->index, cfg, extack);
-}
-
-static void dsa_slave_get_mm_stats(struct net_device *dev,
-				   struct ethtool_mm_stats *stats)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (ds->ops->get_mm_stats)
-		ds->ops->get_mm_stats(ds, dp->index, stats);
-}
-
-static void dsa_slave_get_wol(struct net_device *dev, struct ethtool_wolinfo *w)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	phylink_ethtool_get_wol(dp->pl, w);
-
-	if (ds->ops->get_wol)
-		ds->ops->get_wol(ds, dp->index, w);
-}
-
-static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	int ret = -EOPNOTSUPP;
-
-	phylink_ethtool_set_wol(dp->pl, w);
-
-	if (ds->ops->set_wol)
-		ret = ds->ops->set_wol(ds, dp->index, w);
-
-	return ret;
-}
-
-static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	int ret;
-
-	/* Port's PHY and MAC both need to be EEE capable */
-	if (!dev->phydev || !dp->pl)
-		return -ENODEV;
-
-	if (!ds->ops->set_mac_eee)
-		return -EOPNOTSUPP;
-
-	ret = ds->ops->set_mac_eee(ds, dp->index, e);
-	if (ret)
-		return ret;
-
-	return phylink_ethtool_set_eee(dp->pl, e);
-}
-
-static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	int ret;
-
-	/* Port's PHY and MAC both need to be EEE capable */
-	if (!dev->phydev || !dp->pl)
-		return -ENODEV;
-
-	if (!ds->ops->get_mac_eee)
-		return -EOPNOTSUPP;
-
-	ret = ds->ops->get_mac_eee(ds, dp->index, e);
-	if (ret)
-		return ret;
-
-	return phylink_ethtool_get_eee(dp->pl, e);
-}
-
-static int dsa_slave_get_link_ksettings(struct net_device *dev,
-					struct ethtool_link_ksettings *cmd)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-
-	return phylink_ethtool_ksettings_get(dp->pl, cmd);
-}
-
-static int dsa_slave_set_link_ksettings(struct net_device *dev,
-					const struct ethtool_link_ksettings *cmd)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-
-	return phylink_ethtool_ksettings_set(dp->pl, cmd);
-}
-
-static void dsa_slave_get_pause_stats(struct net_device *dev,
-				  struct ethtool_pause_stats *pause_stats)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (ds->ops->get_pause_stats)
-		ds->ops->get_pause_stats(ds, dp->index, pause_stats);
-}
-
-static void dsa_slave_get_pauseparam(struct net_device *dev,
-				     struct ethtool_pauseparam *pause)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-
-	phylink_ethtool_get_pauseparam(dp->pl, pause);
-}
-
-static int dsa_slave_set_pauseparam(struct net_device *dev,
-				    struct ethtool_pauseparam *pause)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-
-	return phylink_ethtool_set_pauseparam(dp->pl, pause);
-}
-
-#ifdef CONFIG_NET_POLL_CONTROLLER
-static int dsa_slave_netpoll_setup(struct net_device *dev,
-				   struct netpoll_info *ni)
-{
-	struct net_device *master = dsa_slave_to_master(dev);
-	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct netpoll *netpoll;
-	int err = 0;
-
-	netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL);
-	if (!netpoll)
-		return -ENOMEM;
-
-	err = __netpoll_setup(netpoll, master);
-	if (err) {
-		kfree(netpoll);
-		goto out;
-	}
-
-	p->netpoll = netpoll;
-out:
-	return err;
-}
-
-static void dsa_slave_netpoll_cleanup(struct net_device *dev)
-{
-	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct netpoll *netpoll = p->netpoll;
-
-	if (!netpoll)
-		return;
-
-	p->netpoll = NULL;
-
-	__netpoll_free(netpoll);
-}
-
-static void dsa_slave_poll_controller(struct net_device *dev)
-{
-}
-#endif
-
-static struct dsa_mall_tc_entry *
-dsa_slave_mall_tc_entry_find(struct net_device *dev, unsigned long cookie)
-{
-	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_mall_tc_entry *mall_tc_entry;
-
-	list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list)
-		if (mall_tc_entry->cookie == cookie)
-			return mall_tc_entry;
-
-	return NULL;
-}
-
-static int
-dsa_slave_add_cls_matchall_mirred(struct net_device *dev,
-				  struct tc_cls_matchall_offload *cls,
-				  bool ingress)
-{
-	struct netlink_ext_ack *extack = cls->common.extack;
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_mall_mirror_tc_entry *mirror;
-	struct dsa_mall_tc_entry *mall_tc_entry;
-	struct dsa_switch *ds = dp->ds;
-	struct flow_action_entry *act;
-	struct dsa_port *to_dp;
-	int err;
-
-	if (!ds->ops->port_mirror_add)
-		return -EOPNOTSUPP;
-
-	if (!flow_action_basic_hw_stats_check(&cls->rule->action,
-					      cls->common.extack))
-		return -EOPNOTSUPP;
-
-	act = &cls->rule->action.entries[0];
-
-	if (!act->dev)
-		return -EINVAL;
-
-	if (!dsa_slave_dev_check(act->dev))
-		return -EOPNOTSUPP;
-
-	mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL);
-	if (!mall_tc_entry)
-		return -ENOMEM;
-
-	mall_tc_entry->cookie = cls->cookie;
-	mall_tc_entry->type = DSA_PORT_MALL_MIRROR;
-	mirror = &mall_tc_entry->mirror;
-
-	to_dp = dsa_slave_to_port(act->dev);
-
-	mirror->to_local_port = to_dp->index;
-	mirror->ingress = ingress;
-
-	err = ds->ops->port_mirror_add(ds, dp->index, mirror, ingress, extack);
-	if (err) {
-		kfree(mall_tc_entry);
-		return err;
-	}
-
-	list_add_tail(&mall_tc_entry->list, &p->mall_tc_list);
-
-	return err;
-}
-
-static int
-dsa_slave_add_cls_matchall_police(struct net_device *dev,
-				  struct tc_cls_matchall_offload *cls,
-				  bool ingress)
-{
-	struct netlink_ext_ack *extack = cls->common.extack;
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_mall_policer_tc_entry *policer;
-	struct dsa_mall_tc_entry *mall_tc_entry;
-	struct dsa_switch *ds = dp->ds;
-	struct flow_action_entry *act;
-	int err;
-
-	if (!ds->ops->port_policer_add) {
-		NL_SET_ERR_MSG_MOD(extack,
-				   "Policing offload not implemented");
-		return -EOPNOTSUPP;
-	}
-
-	if (!ingress) {
-		NL_SET_ERR_MSG_MOD(extack,
-				   "Only supported on ingress qdisc");
-		return -EOPNOTSUPP;
-	}
-
-	if (!flow_action_basic_hw_stats_check(&cls->rule->action,
-					      cls->common.extack))
-		return -EOPNOTSUPP;
-
-	list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list) {
-		if (mall_tc_entry->type == DSA_PORT_MALL_POLICER) {
-			NL_SET_ERR_MSG_MOD(extack,
-					   "Only one port policer allowed");
-			return -EEXIST;
-		}
-	}
-
-	act = &cls->rule->action.entries[0];
-
-	mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL);
-	if (!mall_tc_entry)
-		return -ENOMEM;
-
-	mall_tc_entry->cookie = cls->cookie;
-	mall_tc_entry->type = DSA_PORT_MALL_POLICER;
-	policer = &mall_tc_entry->policer;
-	policer->rate_bytes_per_sec = act->police.rate_bytes_ps;
-	policer->burst = act->police.burst;
-
-	err = ds->ops->port_policer_add(ds, dp->index, policer);
-	if (err) {
-		kfree(mall_tc_entry);
-		return err;
-	}
-
-	list_add_tail(&mall_tc_entry->list, &p->mall_tc_list);
-
-	return err;
-}
-
-static int dsa_slave_add_cls_matchall(struct net_device *dev,
-				      struct tc_cls_matchall_offload *cls,
-				      bool ingress)
-{
-	int err = -EOPNOTSUPP;
-
-	if (cls->common.protocol == htons(ETH_P_ALL) &&
-	    flow_offload_has_one_action(&cls->rule->action) &&
-	    cls->rule->action.entries[0].id == FLOW_ACTION_MIRRED)
-		err = dsa_slave_add_cls_matchall_mirred(dev, cls, ingress);
-	else if (flow_offload_has_one_action(&cls->rule->action) &&
-		 cls->rule->action.entries[0].id == FLOW_ACTION_POLICE)
-		err = dsa_slave_add_cls_matchall_police(dev, cls, ingress);
-
-	return err;
-}
-
-static void dsa_slave_del_cls_matchall(struct net_device *dev,
-				       struct tc_cls_matchall_offload *cls)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_mall_tc_entry *mall_tc_entry;
-	struct dsa_switch *ds = dp->ds;
-
-	mall_tc_entry = dsa_slave_mall_tc_entry_find(dev, cls->cookie);
-	if (!mall_tc_entry)
-		return;
-
-	list_del(&mall_tc_entry->list);
-
-	switch (mall_tc_entry->type) {
-	case DSA_PORT_MALL_MIRROR:
-		if (ds->ops->port_mirror_del)
-			ds->ops->port_mirror_del(ds, dp->index,
-						 &mall_tc_entry->mirror);
-		break;
-	case DSA_PORT_MALL_POLICER:
-		if (ds->ops->port_policer_del)
-			ds->ops->port_policer_del(ds, dp->index);
-		break;
-	default:
-		WARN_ON(1);
-	}
-
-	kfree(mall_tc_entry);
-}
-
-static int dsa_slave_setup_tc_cls_matchall(struct net_device *dev,
-					   struct tc_cls_matchall_offload *cls,
-					   bool ingress)
-{
-	if (cls->common.chain_index)
-		return -EOPNOTSUPP;
-
-	switch (cls->command) {
-	case TC_CLSMATCHALL_REPLACE:
-		return dsa_slave_add_cls_matchall(dev, cls, ingress);
-	case TC_CLSMATCHALL_DESTROY:
-		dsa_slave_del_cls_matchall(dev, cls);
-		return 0;
-	default:
-		return -EOPNOTSUPP;
-	}
-}
-
-static int dsa_slave_add_cls_flower(struct net_device *dev,
-				    struct flow_cls_offload *cls,
-				    bool ingress)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	int port = dp->index;
-
-	if (!ds->ops->cls_flower_add)
-		return -EOPNOTSUPP;
-
-	return ds->ops->cls_flower_add(ds, port, cls, ingress);
-}
-
-static int dsa_slave_del_cls_flower(struct net_device *dev,
-				    struct flow_cls_offload *cls,
-				    bool ingress)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	int port = dp->index;
-
-	if (!ds->ops->cls_flower_del)
-		return -EOPNOTSUPP;
-
-	return ds->ops->cls_flower_del(ds, port, cls, ingress);
-}
-
-static int dsa_slave_stats_cls_flower(struct net_device *dev,
-				      struct flow_cls_offload *cls,
-				      bool ingress)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	int port = dp->index;
-
-	if (!ds->ops->cls_flower_stats)
-		return -EOPNOTSUPP;
-
-	return ds->ops->cls_flower_stats(ds, port, cls, ingress);
-}
-
-static int dsa_slave_setup_tc_cls_flower(struct net_device *dev,
-					 struct flow_cls_offload *cls,
-					 bool ingress)
-{
-	switch (cls->command) {
-	case FLOW_CLS_REPLACE:
-		return dsa_slave_add_cls_flower(dev, cls, ingress);
-	case FLOW_CLS_DESTROY:
-		return dsa_slave_del_cls_flower(dev, cls, ingress);
-	case FLOW_CLS_STATS:
-		return dsa_slave_stats_cls_flower(dev, cls, ingress);
-	default:
-		return -EOPNOTSUPP;
-	}
-}
-
-static int dsa_slave_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
-				       void *cb_priv, bool ingress)
-{
-	struct net_device *dev = cb_priv;
-
-	if (!tc_can_offload(dev))
-		return -EOPNOTSUPP;
-
-	switch (type) {
-	case TC_SETUP_CLSMATCHALL:
-		return dsa_slave_setup_tc_cls_matchall(dev, type_data, ingress);
-	case TC_SETUP_CLSFLOWER:
-		return dsa_slave_setup_tc_cls_flower(dev, type_data, ingress);
-	default:
-		return -EOPNOTSUPP;
-	}
-}
-
-static int dsa_slave_setup_tc_block_cb_ig(enum tc_setup_type type,
-					  void *type_data, void *cb_priv)
-{
-	return dsa_slave_setup_tc_block_cb(type, type_data, cb_priv, true);
-}
-
-static int dsa_slave_setup_tc_block_cb_eg(enum tc_setup_type type,
-					  void *type_data, void *cb_priv)
-{
-	return dsa_slave_setup_tc_block_cb(type, type_data, cb_priv, false);
-}
-
-static LIST_HEAD(dsa_slave_block_cb_list);
-
-static int dsa_slave_setup_tc_block(struct net_device *dev,
-				    struct flow_block_offload *f)
-{
-	struct flow_block_cb *block_cb;
-	flow_setup_cb_t *cb;
-
-	if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
-		cb = dsa_slave_setup_tc_block_cb_ig;
-	else if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS)
-		cb = dsa_slave_setup_tc_block_cb_eg;
-	else
-		return -EOPNOTSUPP;
-
-	f->driver_block_list = &dsa_slave_block_cb_list;
-
-	switch (f->command) {
-	case FLOW_BLOCK_BIND:
-		if (flow_block_cb_is_busy(cb, dev, &dsa_slave_block_cb_list))
-			return -EBUSY;
-
-		block_cb = flow_block_cb_alloc(cb, dev, dev, NULL);
-		if (IS_ERR(block_cb))
-			return PTR_ERR(block_cb);
-
-		flow_block_cb_add(block_cb, f);
-		list_add_tail(&block_cb->driver_list, &dsa_slave_block_cb_list);
-		return 0;
-	case FLOW_BLOCK_UNBIND:
-		block_cb = flow_block_cb_lookup(f->block, cb, dev);
-		if (!block_cb)
-			return -ENOENT;
-
-		flow_block_cb_remove(block_cb, f);
-		list_del(&block_cb->driver_list);
-		return 0;
-	default:
-		return -EOPNOTSUPP;
-	}
-}
-
-static int dsa_slave_setup_ft_block(struct dsa_switch *ds, int port,
-				    void *type_data)
-{
-	struct net_device *master = dsa_port_to_master(dsa_to_port(ds, port));
-
-	if (!master->netdev_ops->ndo_setup_tc)
-		return -EOPNOTSUPP;
-
-	return master->netdev_ops->ndo_setup_tc(master, TC_SETUP_FT, type_data);
-}
-
-static int dsa_slave_setup_tc(struct net_device *dev, enum tc_setup_type type,
-			      void *type_data)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	switch (type) {
-	case TC_SETUP_BLOCK:
-		return dsa_slave_setup_tc_block(dev, type_data);
-	case TC_SETUP_FT:
-		return dsa_slave_setup_ft_block(ds, dp->index, type_data);
-	default:
-		break;
-	}
-
-	if (!ds->ops->port_setup_tc)
-		return -EOPNOTSUPP;
-
-	return ds->ops->port_setup_tc(ds, dp->index, type, type_data);
-}
-
-static int dsa_slave_get_rxnfc(struct net_device *dev,
-			       struct ethtool_rxnfc *nfc, u32 *rule_locs)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (!ds->ops->get_rxnfc)
-		return -EOPNOTSUPP;
-
-	return ds->ops->get_rxnfc(ds, dp->index, nfc, rule_locs);
-}
-
-static int dsa_slave_set_rxnfc(struct net_device *dev,
-			       struct ethtool_rxnfc *nfc)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (!ds->ops->set_rxnfc)
-		return -EOPNOTSUPP;
-
-	return ds->ops->set_rxnfc(ds, dp->index, nfc);
-}
-
-static int dsa_slave_get_ts_info(struct net_device *dev,
-				 struct ethtool_ts_info *ts)
-{
-	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->dp->ds;
-
-	if (!ds->ops->get_ts_info)
-		return -EOPNOTSUPP;
-
-	return ds->ops->get_ts_info(ds, p->dp->index, ts);
-}
-
-static int dsa_slave_vlan_rx_add_vid(struct net_device *dev, __be16 proto,
-				     u16 vid)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct switchdev_obj_port_vlan vlan = {
-		.obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
-		.vid = vid,
-		/* This API only allows programming tagged, non-PVID VIDs */
-		.flags = 0,
-	};
-	struct netlink_ext_ack extack = {0};
-	struct dsa_switch *ds = dp->ds;
-	struct netdev_hw_addr *ha;
-	struct dsa_vlan *v;
-	int ret;
-
-	/* User port... */
-	ret = dsa_port_vlan_add(dp, &vlan, &extack);
-	if (ret) {
-		if (extack._msg)
-			netdev_err(dev, "%s\n", extack._msg);
-		return ret;
-	}
-
-	/* And CPU port... */
-	ret = dsa_port_host_vlan_add(dp, &vlan, &extack);
-	if (ret) {
-		if (extack._msg)
-			netdev_err(dev, "CPU port %d: %s\n", dp->cpu_dp->index,
-				   extack._msg);
-		return ret;
-	}
-
-	if (!dsa_switch_supports_uc_filtering(ds) &&
-	    !dsa_switch_supports_mc_filtering(ds))
-		return 0;
-
-	v = kzalloc(sizeof(*v), GFP_KERNEL);
-	if (!v) {
-		ret = -ENOMEM;
-		goto rollback;
-	}
-
-	netif_addr_lock_bh(dev);
-
-	v->vid = vid;
-	list_add_tail(&v->list, &dp->user_vlans);
-
-	if (dsa_switch_supports_mc_filtering(ds)) {
-		netdev_for_each_synced_mc_addr(ha, dev) {
-			dsa_slave_schedule_standalone_work(dev, DSA_MC_ADD,
-							   ha->addr, vid);
-		}
-	}
-
-	if (dsa_switch_supports_uc_filtering(ds)) {
-		netdev_for_each_synced_uc_addr(ha, dev) {
-			dsa_slave_schedule_standalone_work(dev, DSA_UC_ADD,
-							   ha->addr, vid);
-		}
-	}
-
-	netif_addr_unlock_bh(dev);
-
-	dsa_flush_workqueue();
-
-	return 0;
-
-rollback:
-	dsa_port_host_vlan_del(dp, &vlan);
-	dsa_port_vlan_del(dp, &vlan);
-
-	return ret;
-}
-
-static int dsa_slave_vlan_rx_kill_vid(struct net_device *dev, __be16 proto,
-				      u16 vid)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct switchdev_obj_port_vlan vlan = {
-		.vid = vid,
-		/* This API only allows programming tagged, non-PVID VIDs */
-		.flags = 0,
-	};
-	struct dsa_switch *ds = dp->ds;
-	struct netdev_hw_addr *ha;
-	struct dsa_vlan *v;
-	int err;
-
-	err = dsa_port_vlan_del(dp, &vlan);
-	if (err)
-		return err;
-
-	err = dsa_port_host_vlan_del(dp, &vlan);
-	if (err)
-		return err;
-
-	if (!dsa_switch_supports_uc_filtering(ds) &&
-	    !dsa_switch_supports_mc_filtering(ds))
-		return 0;
-
-	netif_addr_lock_bh(dev);
-
-	v = dsa_vlan_find(&dp->user_vlans, &vlan);
-	if (!v) {
-		netif_addr_unlock_bh(dev);
-		return -ENOENT;
-	}
-
-	list_del(&v->list);
-	kfree(v);
-
-	if (dsa_switch_supports_mc_filtering(ds)) {
-		netdev_for_each_synced_mc_addr(ha, dev) {
-			dsa_slave_schedule_standalone_work(dev, DSA_MC_DEL,
-							   ha->addr, vid);
-		}
-	}
-
-	if (dsa_switch_supports_uc_filtering(ds)) {
-		netdev_for_each_synced_uc_addr(ha, dev) {
-			dsa_slave_schedule_standalone_work(dev, DSA_UC_DEL,
-							   ha->addr, vid);
-		}
-	}
-
-	netif_addr_unlock_bh(dev);
-
-	dsa_flush_workqueue();
-
-	return 0;
-}
-
-static int dsa_slave_restore_vlan(struct net_device *vdev, int vid, void *arg)
-{
-	__be16 proto = vdev ? vlan_dev_vlan_proto(vdev) : htons(ETH_P_8021Q);
-
-	return dsa_slave_vlan_rx_add_vid(arg, proto, vid);
-}
-
-static int dsa_slave_clear_vlan(struct net_device *vdev, int vid, void *arg)
-{
-	__be16 proto = vdev ? vlan_dev_vlan_proto(vdev) : htons(ETH_P_8021Q);
-
-	return dsa_slave_vlan_rx_kill_vid(arg, proto, vid);
-}
-
-/* Keep the VLAN RX filtering list in sync with the hardware only if VLAN
- * filtering is enabled. The baseline is that only ports that offload a
- * VLAN-aware bridge are VLAN-aware, and standalone ports are VLAN-unaware,
- * but there are exceptions for quirky hardware.
- *
- * If ds->vlan_filtering_is_global = true, then standalone ports which share
- * the same switch with other ports that offload a VLAN-aware bridge are also
- * inevitably VLAN-aware.
- *
- * To summarize, a DSA switch port offloads:
- *
- * - If standalone (this includes software bridge, software LAG):
- *     - if ds->needs_standalone_vlan_filtering = true, OR if
- *       (ds->vlan_filtering_is_global = true AND there are bridges spanning
- *       this switch chip which have vlan_filtering=1)
- *         - the 8021q upper VLANs
- *     - else (standalone VLAN filtering is not needed, VLAN filtering is not
- *       global, or it is, but no port is under a VLAN-aware bridge):
- *         - no VLAN (any 8021q upper is a software VLAN)
- *
- * - If under a vlan_filtering=0 bridge which it offload:
- *     - if ds->configure_vlan_while_not_filtering = true (default):
- *         - the bridge VLANs. These VLANs are committed to hardware but inactive.
- *     - else (deprecated):
- *         - no VLAN. The bridge VLANs are not restored when VLAN awareness is
- *           enabled, so this behavior is broken and discouraged.
- *
- * - If under a vlan_filtering=1 bridge which it offload:
- *     - the bridge VLANs
- *     - the 8021q upper VLANs
- */
-int dsa_slave_manage_vlan_filtering(struct net_device *slave,
-				    bool vlan_filtering)
-{
-	int err;
-
-	if (vlan_filtering) {
-		slave->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
-
-		err = vlan_for_each(slave, dsa_slave_restore_vlan, slave);
-		if (err) {
-			vlan_for_each(slave, dsa_slave_clear_vlan, slave);
-			slave->features &= ~NETIF_F_HW_VLAN_CTAG_FILTER;
-			return err;
-		}
-	} else {
-		err = vlan_for_each(slave, dsa_slave_clear_vlan, slave);
-		if (err)
-			return err;
-
-		slave->features &= ~NETIF_F_HW_VLAN_CTAG_FILTER;
-	}
-
-	return 0;
-}
-
-struct dsa_hw_port {
-	struct list_head list;
-	struct net_device *dev;
-	int old_mtu;
-};
-
-static int dsa_hw_port_list_set_mtu(struct list_head *hw_port_list, int mtu)
-{
-	const struct dsa_hw_port *p;
-	int err;
-
-	list_for_each_entry(p, hw_port_list, list) {
-		if (p->dev->mtu == mtu)
-			continue;
-
-		err = dev_set_mtu(p->dev, mtu);
-		if (err)
-			goto rollback;
-	}
-
-	return 0;
-
-rollback:
-	list_for_each_entry_continue_reverse(p, hw_port_list, list) {
-		if (p->dev->mtu == p->old_mtu)
-			continue;
-
-		if (dev_set_mtu(p->dev, p->old_mtu))
-			netdev_err(p->dev, "Failed to restore MTU\n");
-	}
-
-	return err;
-}
-
-static void dsa_hw_port_list_free(struct list_head *hw_port_list)
-{
-	struct dsa_hw_port *p, *n;
-
-	list_for_each_entry_safe(p, n, hw_port_list, list)
-		kfree(p);
-}
-
-/* Make the hardware datapath to/from @dev limited to a common MTU */
-static void dsa_bridge_mtu_normalization(struct dsa_port *dp)
-{
-	struct list_head hw_port_list;
-	struct dsa_switch_tree *dst;
-	int min_mtu = ETH_MAX_MTU;
-	struct dsa_port *other_dp;
-	int err;
-
-	if (!dp->ds->mtu_enforcement_ingress)
-		return;
-
-	if (!dp->bridge)
-		return;
-
-	INIT_LIST_HEAD(&hw_port_list);
-
-	/* Populate the list of ports that are part of the same bridge
-	 * as the newly added/modified port
-	 */
-	list_for_each_entry(dst, &dsa_tree_list, list) {
-		list_for_each_entry(other_dp, &dst->ports, list) {
-			struct dsa_hw_port *hw_port;
-			struct net_device *slave;
-
-			if (other_dp->type != DSA_PORT_TYPE_USER)
-				continue;
-
-			if (!dsa_port_bridge_same(dp, other_dp))
-				continue;
-
-			if (!other_dp->ds->mtu_enforcement_ingress)
-				continue;
-
-			slave = other_dp->slave;
-
-			if (min_mtu > slave->mtu)
-				min_mtu = slave->mtu;
-
-			hw_port = kzalloc(sizeof(*hw_port), GFP_KERNEL);
-			if (!hw_port)
-				goto out;
-
-			hw_port->dev = slave;
-			hw_port->old_mtu = slave->mtu;
-
-			list_add(&hw_port->list, &hw_port_list);
-		}
-	}
-
-	/* Attempt to configure the entire hardware bridge to the newly added
-	 * interface's MTU first, regardless of whether the intention of the
-	 * user was to raise or lower it.
-	 */
-	err = dsa_hw_port_list_set_mtu(&hw_port_list, dp->slave->mtu);
-	if (!err)
-		goto out;
-
-	/* Clearly that didn't work out so well, so just set the minimum MTU on
-	 * all hardware bridge ports now. If this fails too, then all ports will
-	 * still have their old MTU rolled back anyway.
-	 */
-	dsa_hw_port_list_set_mtu(&hw_port_list, min_mtu);
-
-out:
-	dsa_hw_port_list_free(&hw_port_list);
-}
-
-int dsa_slave_change_mtu(struct net_device *dev, int new_mtu)
-{
-	struct net_device *master = dsa_slave_to_master(dev);
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_port *cpu_dp = dp->cpu_dp;
-	struct dsa_switch *ds = dp->ds;
-	struct dsa_port *other_dp;
-	int largest_mtu = 0;
-	int new_master_mtu;
-	int old_master_mtu;
-	int mtu_limit;
-	int overhead;
-	int cpu_mtu;
-	int err;
-
-	if (!ds->ops->port_change_mtu)
-		return -EOPNOTSUPP;
-
-	dsa_tree_for_each_user_port(other_dp, ds->dst) {
-		int slave_mtu;
-
-		/* During probe, this function will be called for each slave
-		 * device, while not all of them have been allocated. That's
-		 * ok, it doesn't change what the maximum is, so ignore it.
-		 */
-		if (!other_dp->slave)
-			continue;
-
-		/* Pretend that we already applied the setting, which we
-		 * actually haven't (still haven't done all integrity checks)
-		 */
-		if (dp == other_dp)
-			slave_mtu = new_mtu;
-		else
-			slave_mtu = other_dp->slave->mtu;
-
-		if (largest_mtu < slave_mtu)
-			largest_mtu = slave_mtu;
-	}
-
-	overhead = dsa_tag_protocol_overhead(cpu_dp->tag_ops);
-	mtu_limit = min_t(int, master->max_mtu, dev->max_mtu + overhead);
-	old_master_mtu = master->mtu;
-	new_master_mtu = largest_mtu + overhead;
-	if (new_master_mtu > mtu_limit)
-		return -ERANGE;
-
-	/* If the master MTU isn't over limit, there's no need to check the CPU
-	 * MTU, since that surely isn't either.
-	 */
-	cpu_mtu = largest_mtu;
-
-	/* Start applying stuff */
-	if (new_master_mtu != old_master_mtu) {
-		err = dev_set_mtu(master, new_master_mtu);
-		if (err < 0)
-			goto out_master_failed;
-
-		/* We only need to propagate the MTU of the CPU port to
-		 * upstream switches, so emit a notifier which updates them.
-		 */
-		err = dsa_port_mtu_change(cpu_dp, cpu_mtu);
-		if (err)
-			goto out_cpu_failed;
-	}
-
-	err = ds->ops->port_change_mtu(ds, dp->index, new_mtu);
-	if (err)
-		goto out_port_failed;
-
-	dev->mtu = new_mtu;
-
-	dsa_bridge_mtu_normalization(dp);
-
-	return 0;
-
-out_port_failed:
-	if (new_master_mtu != old_master_mtu)
-		dsa_port_mtu_change(cpu_dp, old_master_mtu - overhead);
-out_cpu_failed:
-	if (new_master_mtu != old_master_mtu)
-		dev_set_mtu(master, old_master_mtu);
-out_master_failed:
-	return err;
-}
-
-static int __maybe_unused
-dsa_slave_dcbnl_set_default_prio(struct net_device *dev, struct dcb_app *app)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	unsigned long mask, new_prio;
-	int err, port = dp->index;
-
-	if (!ds->ops->port_set_default_prio)
-		return -EOPNOTSUPP;
-
-	err = dcb_ieee_setapp(dev, app);
-	if (err)
-		return err;
-
-	mask = dcb_ieee_getapp_mask(dev, app);
-	new_prio = __fls(mask);
-
-	err = ds->ops->port_set_default_prio(ds, port, new_prio);
-	if (err) {
-		dcb_ieee_delapp(dev, app);
-		return err;
-	}
-
-	return 0;
-}
-
-static int __maybe_unused
-dsa_slave_dcbnl_add_dscp_prio(struct net_device *dev, struct dcb_app *app)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	unsigned long mask, new_prio;
-	int err, port = dp->index;
-	u8 dscp = app->protocol;
-
-	if (!ds->ops->port_add_dscp_prio)
-		return -EOPNOTSUPP;
-
-	if (dscp >= 64) {
-		netdev_err(dev, "DSCP APP entry with protocol value %u is invalid\n",
-			   dscp);
-		return -EINVAL;
-	}
-
-	err = dcb_ieee_setapp(dev, app);
-	if (err)
-		return err;
-
-	mask = dcb_ieee_getapp_mask(dev, app);
-	new_prio = __fls(mask);
-
-	err = ds->ops->port_add_dscp_prio(ds, port, dscp, new_prio);
-	if (err) {
-		dcb_ieee_delapp(dev, app);
-		return err;
-	}
-
-	return 0;
-}
-
-static int __maybe_unused dsa_slave_dcbnl_ieee_setapp(struct net_device *dev,
-						      struct dcb_app *app)
-{
-	switch (app->selector) {
-	case IEEE_8021QAZ_APP_SEL_ETHERTYPE:
-		switch (app->protocol) {
-		case 0:
-			return dsa_slave_dcbnl_set_default_prio(dev, app);
-		default:
-			return -EOPNOTSUPP;
-		}
-		break;
-	case IEEE_8021QAZ_APP_SEL_DSCP:
-		return dsa_slave_dcbnl_add_dscp_prio(dev, app);
-	default:
-		return -EOPNOTSUPP;
-	}
-}
-
-static int __maybe_unused
-dsa_slave_dcbnl_del_default_prio(struct net_device *dev, struct dcb_app *app)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	unsigned long mask, new_prio;
-	int err, port = dp->index;
-
-	if (!ds->ops->port_set_default_prio)
-		return -EOPNOTSUPP;
-
-	err = dcb_ieee_delapp(dev, app);
-	if (err)
-		return err;
-
-	mask = dcb_ieee_getapp_mask(dev, app);
-	new_prio = mask ? __fls(mask) : 0;
-
-	err = ds->ops->port_set_default_prio(ds, port, new_prio);
-	if (err) {
-		dcb_ieee_setapp(dev, app);
-		return err;
-	}
-
-	return 0;
-}
-
-static int __maybe_unused
-dsa_slave_dcbnl_del_dscp_prio(struct net_device *dev, struct dcb_app *app)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	int err, port = dp->index;
-	u8 dscp = app->protocol;
-
-	if (!ds->ops->port_del_dscp_prio)
-		return -EOPNOTSUPP;
-
-	err = dcb_ieee_delapp(dev, app);
-	if (err)
-		return err;
-
-	err = ds->ops->port_del_dscp_prio(ds, port, dscp, app->priority);
-	if (err) {
-		dcb_ieee_setapp(dev, app);
-		return err;
-	}
-
-	return 0;
-}
-
-static int __maybe_unused dsa_slave_dcbnl_ieee_delapp(struct net_device *dev,
-						      struct dcb_app *app)
-{
-	switch (app->selector) {
-	case IEEE_8021QAZ_APP_SEL_ETHERTYPE:
-		switch (app->protocol) {
-		case 0:
-			return dsa_slave_dcbnl_del_default_prio(dev, app);
-		default:
-			return -EOPNOTSUPP;
-		}
-		break;
-	case IEEE_8021QAZ_APP_SEL_DSCP:
-		return dsa_slave_dcbnl_del_dscp_prio(dev, app);
-	default:
-		return -EOPNOTSUPP;
-	}
-}
-
-/* Pre-populate the DCB application priority table with the priorities
- * configured during switch setup, which we read from hardware here.
- */
-static int dsa_slave_dcbnl_init(struct net_device *dev)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	int port = dp->index;
-	int err;
-
-	if (ds->ops->port_get_default_prio) {
-		int prio = ds->ops->port_get_default_prio(ds, port);
-		struct dcb_app app = {
-			.selector = IEEE_8021QAZ_APP_SEL_ETHERTYPE,
-			.protocol = 0,
-			.priority = prio,
-		};
-
-		if (prio < 0)
-			return prio;
-
-		err = dcb_ieee_setapp(dev, &app);
-		if (err)
-			return err;
-	}
-
-	if (ds->ops->port_get_dscp_prio) {
-		int protocol;
-
-		for (protocol = 0; protocol < 64; protocol++) {
-			struct dcb_app app = {
-				.selector = IEEE_8021QAZ_APP_SEL_DSCP,
-				.protocol = protocol,
-			};
-			int prio;
-
-			prio = ds->ops->port_get_dscp_prio(ds, port, protocol);
-			if (prio == -EOPNOTSUPP)
-				continue;
-			if (prio < 0)
-				return prio;
-
-			app.priority = prio;
-
-			err = dcb_ieee_setapp(dev, &app);
-			if (err)
-				return err;
-		}
-	}
-
-	return 0;
-}
-
-static const struct ethtool_ops dsa_slave_ethtool_ops = {
-	.get_drvinfo		= dsa_slave_get_drvinfo,
-	.get_regs_len		= dsa_slave_get_regs_len,
-	.get_regs		= dsa_slave_get_regs,
-	.nway_reset		= dsa_slave_nway_reset,
-	.get_link		= ethtool_op_get_link,
-	.get_eeprom_len		= dsa_slave_get_eeprom_len,
-	.get_eeprom		= dsa_slave_get_eeprom,
-	.set_eeprom		= dsa_slave_set_eeprom,
-	.get_strings		= dsa_slave_get_strings,
-	.get_ethtool_stats	= dsa_slave_get_ethtool_stats,
-	.get_sset_count		= dsa_slave_get_sset_count,
-	.get_eth_phy_stats	= dsa_slave_get_eth_phy_stats,
-	.get_eth_mac_stats	= dsa_slave_get_eth_mac_stats,
-	.get_eth_ctrl_stats	= dsa_slave_get_eth_ctrl_stats,
-	.get_rmon_stats		= dsa_slave_get_rmon_stats,
-	.set_wol		= dsa_slave_set_wol,
-	.get_wol		= dsa_slave_get_wol,
-	.set_eee		= dsa_slave_set_eee,
-	.get_eee		= dsa_slave_get_eee,
-	.get_link_ksettings	= dsa_slave_get_link_ksettings,
-	.set_link_ksettings	= dsa_slave_set_link_ksettings,
-	.get_pause_stats	= dsa_slave_get_pause_stats,
-	.get_pauseparam		= dsa_slave_get_pauseparam,
-	.set_pauseparam		= dsa_slave_set_pauseparam,
-	.get_rxnfc		= dsa_slave_get_rxnfc,
-	.set_rxnfc		= dsa_slave_set_rxnfc,
-	.get_ts_info		= dsa_slave_get_ts_info,
-	.self_test		= dsa_slave_net_selftest,
-	.get_mm			= dsa_slave_get_mm,
-	.set_mm			= dsa_slave_set_mm,
-	.get_mm_stats		= dsa_slave_get_mm_stats,
-};
-
-static const struct dcbnl_rtnl_ops __maybe_unused dsa_slave_dcbnl_ops = {
-	.ieee_setapp		= dsa_slave_dcbnl_ieee_setapp,
-	.ieee_delapp		= dsa_slave_dcbnl_ieee_delapp,
-};
-
-static void dsa_slave_get_stats64(struct net_device *dev,
-				  struct rtnl_link_stats64 *s)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (ds->ops->get_stats64)
-		ds->ops->get_stats64(ds, dp->index, s);
-	else
-		dev_get_tstats64(dev, s);
-}
-
-static int dsa_slave_fill_forward_path(struct net_device_path_ctx *ctx,
-				       struct net_device_path *path)
-{
-	struct dsa_port *dp = dsa_slave_to_port(ctx->dev);
-	struct net_device *master = dsa_port_to_master(dp);
-	struct dsa_port *cpu_dp = dp->cpu_dp;
-
-	path->dev = ctx->dev;
-	path->type = DEV_PATH_DSA;
-	path->dsa.proto = cpu_dp->tag_ops->proto;
-	path->dsa.port = dp->index;
-	ctx->dev = master;
-
-	return 0;
-}
-
-static const struct net_device_ops dsa_slave_netdev_ops = {
-	.ndo_open	 	= dsa_slave_open,
-	.ndo_stop		= dsa_slave_close,
-	.ndo_start_xmit		= dsa_slave_xmit,
-	.ndo_change_rx_flags	= dsa_slave_change_rx_flags,
-	.ndo_set_rx_mode	= dsa_slave_set_rx_mode,
-	.ndo_set_mac_address	= dsa_slave_set_mac_address,
-	.ndo_fdb_dump		= dsa_slave_fdb_dump,
-	.ndo_eth_ioctl		= dsa_slave_ioctl,
-	.ndo_get_iflink		= dsa_slave_get_iflink,
-#ifdef CONFIG_NET_POLL_CONTROLLER
-	.ndo_netpoll_setup	= dsa_slave_netpoll_setup,
-	.ndo_netpoll_cleanup	= dsa_slave_netpoll_cleanup,
-	.ndo_poll_controller	= dsa_slave_poll_controller,
-#endif
-	.ndo_setup_tc		= dsa_slave_setup_tc,
-	.ndo_get_stats64	= dsa_slave_get_stats64,
-	.ndo_vlan_rx_add_vid	= dsa_slave_vlan_rx_add_vid,
-	.ndo_vlan_rx_kill_vid	= dsa_slave_vlan_rx_kill_vid,
-	.ndo_change_mtu		= dsa_slave_change_mtu,
-	.ndo_fill_forward_path	= dsa_slave_fill_forward_path,
-};
-
-static struct device_type dsa_type = {
-	.name	= "dsa",
-};
-
-void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up)
-{
-	const struct dsa_port *dp = dsa_to_port(ds, port);
-
-	if (dp->pl)
-		phylink_mac_change(dp->pl, up);
-}
-EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_change);
-
-static void dsa_slave_phylink_fixed_state(struct phylink_config *config,
-					  struct phylink_link_state *state)
-{
-	struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
-	struct dsa_switch *ds = dp->ds;
-
-	/* No need to check that this operation is valid, the callback would
-	 * not be called if it was not.
-	 */
-	ds->ops->phylink_fixed_state(ds, dp->index, state);
-}
-
-/* slave device setup *******************************************************/
-static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr,
-				 u32 flags)
-{
-	struct dsa_port *dp = dsa_slave_to_port(slave_dev);
-	struct dsa_switch *ds = dp->ds;
-
-	slave_dev->phydev = mdiobus_get_phy(ds->slave_mii_bus, addr);
-	if (!slave_dev->phydev) {
-		netdev_err(slave_dev, "no phy at %d\n", addr);
-		return -ENODEV;
-	}
-
-	slave_dev->phydev->dev_flags |= flags;
-
-	return phylink_connect_phy(dp->pl, slave_dev->phydev);
-}
-
-static int dsa_slave_phy_setup(struct net_device *slave_dev)
-{
-	struct dsa_port *dp = dsa_slave_to_port(slave_dev);
-	struct device_node *port_dn = dp->dn;
-	struct dsa_switch *ds = dp->ds;
-	u32 phy_flags = 0;
-	int ret;
-
-	dp->pl_config.dev = &slave_dev->dev;
-	dp->pl_config.type = PHYLINK_NETDEV;
-
-	/* The get_fixed_state callback takes precedence over polling the
-	 * link GPIO in PHYLINK (see phylink_get_fixed_state).  Only set
-	 * this if the switch provides such a callback.
-	 */
-	if (ds->ops->phylink_fixed_state) {
-		dp->pl_config.get_fixed_state = dsa_slave_phylink_fixed_state;
-		dp->pl_config.poll_fixed_state = true;
-	}
-
-	ret = dsa_port_phylink_create(dp);
-	if (ret)
-		return ret;
-
-	if (ds->ops->get_phy_flags)
-		phy_flags = ds->ops->get_phy_flags(ds, dp->index);
-
-	ret = phylink_of_phy_connect(dp->pl, port_dn, phy_flags);
-	if (ret == -ENODEV && ds->slave_mii_bus) {
-		/* We could not connect to a designated PHY or SFP, so try to
-		 * use the switch internal MDIO bus instead
-		 */
-		ret = dsa_slave_phy_connect(slave_dev, dp->index, phy_flags);
-	}
-	if (ret) {
-		netdev_err(slave_dev, "failed to connect to PHY: %pe\n",
-			   ERR_PTR(ret));
-		dsa_port_phylink_destroy(dp);
-	}
-
-	return ret;
-}
-
-void dsa_slave_setup_tagger(struct net_device *slave)
-{
-	struct dsa_port *dp = dsa_slave_to_port(slave);
-	struct net_device *master = dsa_port_to_master(dp);
-	struct dsa_slave_priv *p = netdev_priv(slave);
-	const struct dsa_port *cpu_dp = dp->cpu_dp;
-	const struct dsa_switch *ds = dp->ds;
-
-	slave->needed_headroom = cpu_dp->tag_ops->needed_headroom;
-	slave->needed_tailroom = cpu_dp->tag_ops->needed_tailroom;
-	/* Try to save one extra realloc later in the TX path (in the master)
-	 * by also inheriting the master's needed headroom and tailroom.
-	 * The 8021q driver also does this.
-	 */
-	slave->needed_headroom += master->needed_headroom;
-	slave->needed_tailroom += master->needed_tailroom;
-
-	p->xmit = cpu_dp->tag_ops->xmit;
-
-	slave->features = master->vlan_features | NETIF_F_HW_TC;
-	slave->hw_features |= NETIF_F_HW_TC;
-	slave->features |= NETIF_F_LLTX;
-	if (slave->needed_tailroom)
-		slave->features &= ~(NETIF_F_SG | NETIF_F_FRAGLIST);
-	if (ds->needs_standalone_vlan_filtering)
-		slave->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
-}
-
-int dsa_slave_suspend(struct net_device *slave_dev)
-{
-	struct dsa_port *dp = dsa_slave_to_port(slave_dev);
-
-	if (!netif_running(slave_dev))
-		return 0;
-
-	netif_device_detach(slave_dev);
-
-	rtnl_lock();
-	phylink_stop(dp->pl);
-	rtnl_unlock();
-
-	return 0;
-}
-
-int dsa_slave_resume(struct net_device *slave_dev)
-{
-	struct dsa_port *dp = dsa_slave_to_port(slave_dev);
-
-	if (!netif_running(slave_dev))
-		return 0;
-
-	netif_device_attach(slave_dev);
-
-	rtnl_lock();
-	phylink_start(dp->pl);
-	rtnl_unlock();
-
-	return 0;
-}
-
-int dsa_slave_create(struct dsa_port *port)
-{
-	struct net_device *master = dsa_port_to_master(port);
-	struct dsa_switch *ds = port->ds;
-	struct net_device *slave_dev;
-	struct dsa_slave_priv *p;
-	const char *name;
-	int assign_type;
-	int ret;
-
-	if (!ds->num_tx_queues)
-		ds->num_tx_queues = 1;
-
-	if (port->name) {
-		name = port->name;
-		assign_type = NET_NAME_PREDICTABLE;
-	} else {
-		name = "eth%d";
-		assign_type = NET_NAME_ENUM;
-	}
-
-	slave_dev = alloc_netdev_mqs(sizeof(struct dsa_slave_priv), name,
-				     assign_type, ether_setup,
-				     ds->num_tx_queues, 1);
-	if (slave_dev == NULL)
-		return -ENOMEM;
-
-	slave_dev->rtnl_link_ops = &dsa_link_ops;
-	slave_dev->ethtool_ops = &dsa_slave_ethtool_ops;
-#if IS_ENABLED(CONFIG_DCB)
-	slave_dev->dcbnl_ops = &dsa_slave_dcbnl_ops;
-#endif
-	if (!is_zero_ether_addr(port->mac))
-		eth_hw_addr_set(slave_dev, port->mac);
-	else
-		eth_hw_addr_inherit(slave_dev, master);
-	slave_dev->priv_flags |= IFF_NO_QUEUE;
-	if (dsa_switch_supports_uc_filtering(ds))
-		slave_dev->priv_flags |= IFF_UNICAST_FLT;
-	slave_dev->netdev_ops = &dsa_slave_netdev_ops;
-	if (ds->ops->port_max_mtu)
-		slave_dev->max_mtu = ds->ops->port_max_mtu(ds, port->index);
-	SET_NETDEV_DEVTYPE(slave_dev, &dsa_type);
-
-	SET_NETDEV_DEV(slave_dev, port->ds->dev);
-	SET_NETDEV_DEVLINK_PORT(slave_dev, &port->devlink_port);
-	slave_dev->dev.of_node = port->dn;
-	slave_dev->vlan_features = master->vlan_features;
-
-	p = netdev_priv(slave_dev);
-	slave_dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
-	if (!slave_dev->tstats) {
-		free_netdev(slave_dev);
-		return -ENOMEM;
-	}
-
-	ret = gro_cells_init(&p->gcells, slave_dev);
-	if (ret)
-		goto out_free;
-
-	p->dp = port;
-	INIT_LIST_HEAD(&p->mall_tc_list);
-	port->slave = slave_dev;
-	dsa_slave_setup_tagger(slave_dev);
-
-	netif_carrier_off(slave_dev);
-
-	ret = dsa_slave_phy_setup(slave_dev);
-	if (ret) {
-		netdev_err(slave_dev,
-			   "error %d setting up PHY for tree %d, switch %d, port %d\n",
-			   ret, ds->dst->index, ds->index, port->index);
-		goto out_gcells;
-	}
-
-	rtnl_lock();
-
-	ret = dsa_slave_change_mtu(slave_dev, ETH_DATA_LEN);
-	if (ret && ret != -EOPNOTSUPP)
-		dev_warn(ds->dev, "nonfatal error %d setting MTU to %d on port %d\n",
-			 ret, ETH_DATA_LEN, port->index);
-
-	ret = register_netdevice(slave_dev);
-	if (ret) {
-		netdev_err(master, "error %d registering interface %s\n",
-			   ret, slave_dev->name);
-		rtnl_unlock();
-		goto out_phy;
-	}
-
-	if (IS_ENABLED(CONFIG_DCB)) {
-		ret = dsa_slave_dcbnl_init(slave_dev);
-		if (ret) {
-			netdev_err(slave_dev,
-				   "failed to initialize DCB: %pe\n",
-				   ERR_PTR(ret));
-			rtnl_unlock();
-			goto out_unregister;
-		}
-	}
-
-	ret = netdev_upper_dev_link(master, slave_dev, NULL);
-
-	rtnl_unlock();
-
-	if (ret)
-		goto out_unregister;
-
-	return 0;
-
-out_unregister:
-	unregister_netdev(slave_dev);
-out_phy:
-	rtnl_lock();
-	phylink_disconnect_phy(p->dp->pl);
-	rtnl_unlock();
-	dsa_port_phylink_destroy(p->dp);
-out_gcells:
-	gro_cells_destroy(&p->gcells);
-out_free:
-	free_percpu(slave_dev->tstats);
-	free_netdev(slave_dev);
-	port->slave = NULL;
-	return ret;
-}
-
-void dsa_slave_destroy(struct net_device *slave_dev)
-{
-	struct net_device *master = dsa_slave_to_master(slave_dev);
-	struct dsa_port *dp = dsa_slave_to_port(slave_dev);
-	struct dsa_slave_priv *p = netdev_priv(slave_dev);
-
-	netif_carrier_off(slave_dev);
-	rtnl_lock();
-	netdev_upper_dev_unlink(master, slave_dev);
-	unregister_netdevice(slave_dev);
-	phylink_disconnect_phy(dp->pl);
-	rtnl_unlock();
-
-	dsa_port_phylink_destroy(dp);
-	gro_cells_destroy(&p->gcells);
-	free_percpu(slave_dev->tstats);
-	free_netdev(slave_dev);
-}
-
-int dsa_slave_change_master(struct net_device *dev, struct net_device *master,
-			    struct netlink_ext_ack *extack)
-{
-	struct net_device *old_master = dsa_slave_to_master(dev);
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	struct net_device *upper;
-	struct list_head *iter;
-	int err;
-
-	if (master == old_master)
-		return 0;
-
-	if (!ds->ops->port_change_master) {
-		NL_SET_ERR_MSG_MOD(extack,
-				   "Driver does not support changing DSA master");
-		return -EOPNOTSUPP;
-	}
-
-	if (!netdev_uses_dsa(master)) {
-		NL_SET_ERR_MSG_MOD(extack,
-				   "Interface not eligible as DSA master");
-		return -EOPNOTSUPP;
-	}
-
-	netdev_for_each_upper_dev_rcu(master, upper, iter) {
-		if (dsa_slave_dev_check(upper))
-			continue;
-		if (netif_is_bridge_master(upper))
-			continue;
-		NL_SET_ERR_MSG_MOD(extack, "Cannot join master with unknown uppers");
-		return -EOPNOTSUPP;
-	}
-
-	/* Since we allow live-changing the DSA master, plus we auto-open the
-	 * DSA master when the user port opens => we need to ensure that the
-	 * new DSA master is open too.
-	 */
-	if (dev->flags & IFF_UP) {
-		err = dev_open(master, extack);
-		if (err)
-			return err;
-	}
-
-	netdev_upper_dev_unlink(old_master, dev);
-
-	err = netdev_upper_dev_link(master, dev, extack);
-	if (err)
-		goto out_revert_old_master_unlink;
-
-	err = dsa_port_change_master(dp, master, extack);
-	if (err)
-		goto out_revert_master_link;
-
-	/* Update the MTU of the new CPU port through cross-chip notifiers */
-	err = dsa_slave_change_mtu(dev, dev->mtu);
-	if (err && err != -EOPNOTSUPP) {
-		netdev_warn(dev,
-			    "nonfatal error updating MTU with new master: %pe\n",
-			    ERR_PTR(err));
-	}
-
-	/* If the port doesn't have its own MAC address and relies on the DSA
-	 * master's one, inherit it again from the new DSA master.
-	 */
-	if (is_zero_ether_addr(dp->mac))
-		eth_hw_addr_inherit(dev, master);
-
-	return 0;
-
-out_revert_master_link:
-	netdev_upper_dev_unlink(master, dev);
-out_revert_old_master_unlink:
-	netdev_upper_dev_link(old_master, dev, NULL);
-	return err;
-}
-
-bool dsa_slave_dev_check(const struct net_device *dev)
-{
-	return dev->netdev_ops == &dsa_slave_netdev_ops;
-}
-EXPORT_SYMBOL_GPL(dsa_slave_dev_check);
-
-static int dsa_slave_changeupper(struct net_device *dev,
-				 struct netdev_notifier_changeupper_info *info)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct netlink_ext_ack *extack;
-	int err = NOTIFY_DONE;
-
-	if (!dsa_slave_dev_check(dev))
-		return err;
-
-	extack = netdev_notifier_info_to_extack(&info->info);
-
-	if (netif_is_bridge_master(info->upper_dev)) {
-		if (info->linking) {
-			err = dsa_port_bridge_join(dp, info->upper_dev, extack);
-			if (!err)
-				dsa_bridge_mtu_normalization(dp);
-			if (err == -EOPNOTSUPP) {
-				NL_SET_ERR_MSG_WEAK_MOD(extack,
-							"Offloading not supported");
-				err = 0;
-			}
-			err = notifier_from_errno(err);
-		} else {
-			dsa_port_bridge_leave(dp, info->upper_dev);
-			err = NOTIFY_OK;
-		}
-	} else if (netif_is_lag_master(info->upper_dev)) {
-		if (info->linking) {
-			err = dsa_port_lag_join(dp, info->upper_dev,
-						info->upper_info, extack);
-			if (err == -EOPNOTSUPP) {
-				NL_SET_ERR_MSG_WEAK_MOD(extack,
-							"Offloading not supported");
-				err = 0;
-			}
-			err = notifier_from_errno(err);
-		} else {
-			dsa_port_lag_leave(dp, info->upper_dev);
-			err = NOTIFY_OK;
-		}
-	} else if (is_hsr_master(info->upper_dev)) {
-		if (info->linking) {
-			err = dsa_port_hsr_join(dp, info->upper_dev, extack);
-			if (err == -EOPNOTSUPP) {
-				NL_SET_ERR_MSG_WEAK_MOD(extack,
-							"Offloading not supported");
-				err = 0;
-			}
-			err = notifier_from_errno(err);
-		} else {
-			dsa_port_hsr_leave(dp, info->upper_dev);
-			err = NOTIFY_OK;
-		}
-	}
-
-	return err;
-}
-
-static int dsa_slave_prechangeupper(struct net_device *dev,
-				    struct netdev_notifier_changeupper_info *info)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-
-	if (!dsa_slave_dev_check(dev))
-		return NOTIFY_DONE;
-
-	if (netif_is_bridge_master(info->upper_dev) && !info->linking)
-		dsa_port_pre_bridge_leave(dp, info->upper_dev);
-	else if (netif_is_lag_master(info->upper_dev) && !info->linking)
-		dsa_port_pre_lag_leave(dp, info->upper_dev);
-	/* dsa_port_pre_hsr_leave is not yet necessary since hsr cannot be
-	 * meaningfully enslaved to a bridge yet
-	 */
-
-	return NOTIFY_DONE;
-}
-
-static int
-dsa_slave_lag_changeupper(struct net_device *dev,
-			  struct netdev_notifier_changeupper_info *info)
-{
-	struct net_device *lower;
-	struct list_head *iter;
-	int err = NOTIFY_DONE;
-	struct dsa_port *dp;
-
-	if (!netif_is_lag_master(dev))
-		return err;
-
-	netdev_for_each_lower_dev(dev, lower, iter) {
-		if (!dsa_slave_dev_check(lower))
-			continue;
-
-		dp = dsa_slave_to_port(lower);
-		if (!dp->lag)
-			/* Software LAG */
-			continue;
-
-		err = dsa_slave_changeupper(lower, info);
-		if (notifier_to_errno(err))
-			break;
-	}
-
-	return err;
-}
-
-/* Same as dsa_slave_lag_changeupper() except that it calls
- * dsa_slave_prechangeupper()
- */
-static int
-dsa_slave_lag_prechangeupper(struct net_device *dev,
-			     struct netdev_notifier_changeupper_info *info)
-{
-	struct net_device *lower;
-	struct list_head *iter;
-	int err = NOTIFY_DONE;
-	struct dsa_port *dp;
-
-	if (!netif_is_lag_master(dev))
-		return err;
-
-	netdev_for_each_lower_dev(dev, lower, iter) {
-		if (!dsa_slave_dev_check(lower))
-			continue;
-
-		dp = dsa_slave_to_port(lower);
-		if (!dp->lag)
-			/* Software LAG */
-			continue;
-
-		err = dsa_slave_prechangeupper(lower, info);
-		if (notifier_to_errno(err))
-			break;
-	}
-
-	return err;
-}
-
-static int
-dsa_prevent_bridging_8021q_upper(struct net_device *dev,
-				 struct netdev_notifier_changeupper_info *info)
-{
-	struct netlink_ext_ack *ext_ack;
-	struct net_device *slave, *br;
-	struct dsa_port *dp;
-
-	ext_ack = netdev_notifier_info_to_extack(&info->info);
-
-	if (!is_vlan_dev(dev))
-		return NOTIFY_DONE;
-
-	slave = vlan_dev_real_dev(dev);
-	if (!dsa_slave_dev_check(slave))
-		return NOTIFY_DONE;
-
-	dp = dsa_slave_to_port(slave);
-	br = dsa_port_bridge_dev_get(dp);
-	if (!br)
-		return NOTIFY_DONE;
-
-	/* Deny enslaving a VLAN device into a VLAN-aware bridge */
-	if (br_vlan_enabled(br) &&
-	    netif_is_bridge_master(info->upper_dev) && info->linking) {
-		NL_SET_ERR_MSG_MOD(ext_ack,
-				   "Cannot enslave VLAN device into VLAN aware bridge");
-		return notifier_from_errno(-EINVAL);
-	}
-
-	return NOTIFY_DONE;
-}
-
-static int
-dsa_slave_check_8021q_upper(struct net_device *dev,
-			    struct netdev_notifier_changeupper_info *info)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct net_device *br = dsa_port_bridge_dev_get(dp);
-	struct bridge_vlan_info br_info;
-	struct netlink_ext_ack *extack;
-	int err = NOTIFY_DONE;
-	u16 vid;
-
-	if (!br || !br_vlan_enabled(br))
-		return NOTIFY_DONE;
-
-	extack = netdev_notifier_info_to_extack(&info->info);
-	vid = vlan_dev_vlan_id(info->upper_dev);
-
-	/* br_vlan_get_info() returns -EINVAL or -ENOENT if the
-	 * device, respectively the VID is not found, returning
-	 * 0 means success, which is a failure for us here.
-	 */
-	err = br_vlan_get_info(br, vid, &br_info);
-	if (err == 0) {
-		NL_SET_ERR_MSG_MOD(extack,
-				   "This VLAN is already configured by the bridge");
-		return notifier_from_errno(-EBUSY);
-	}
-
-	return NOTIFY_DONE;
-}
-
-static int
-dsa_slave_prechangeupper_sanity_check(struct net_device *dev,
-				      struct netdev_notifier_changeupper_info *info)
-{
-	struct dsa_switch *ds;
-	struct dsa_port *dp;
-	int err;
-
-	if (!dsa_slave_dev_check(dev))
-		return dsa_prevent_bridging_8021q_upper(dev, info);
-
-	dp = dsa_slave_to_port(dev);
-	ds = dp->ds;
-
-	if (ds->ops->port_prechangeupper) {
-		err = ds->ops->port_prechangeupper(ds, dp->index, info);
-		if (err)
-			return notifier_from_errno(err);
-	}
-
-	if (is_vlan_dev(info->upper_dev))
-		return dsa_slave_check_8021q_upper(dev, info);
-
-	return NOTIFY_DONE;
-}
-
-/* To be eligible as a DSA master, a LAG must have all lower interfaces be
- * eligible DSA masters. Additionally, all LAG slaves must be DSA masters of
- * switches in the same switch tree.
- */
-static int dsa_lag_master_validate(struct net_device *lag_dev,
-				   struct netlink_ext_ack *extack)
-{
-	struct net_device *lower1, *lower2;
-	struct list_head *iter1, *iter2;
-
-	netdev_for_each_lower_dev(lag_dev, lower1, iter1) {
-		netdev_for_each_lower_dev(lag_dev, lower2, iter2) {
-			if (!netdev_uses_dsa(lower1) ||
-			    !netdev_uses_dsa(lower2)) {
-				NL_SET_ERR_MSG_MOD(extack,
-						   "All LAG ports must be eligible as DSA masters");
-				return notifier_from_errno(-EINVAL);
-			}
-
-			if (lower1 == lower2)
-				continue;
-
-			if (!dsa_port_tree_same(lower1->dsa_ptr,
-						lower2->dsa_ptr)) {
-				NL_SET_ERR_MSG_MOD(extack,
-						   "LAG contains DSA masters of disjoint switch trees");
-				return notifier_from_errno(-EINVAL);
-			}
-		}
-	}
-
-	return NOTIFY_DONE;
-}
-
-static int
-dsa_master_prechangeupper_sanity_check(struct net_device *master,
-				       struct netdev_notifier_changeupper_info *info)
-{
-	struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(&info->info);
-
-	if (!netdev_uses_dsa(master))
-		return NOTIFY_DONE;
-
-	if (!info->linking)
-		return NOTIFY_DONE;
-
-	/* Allow DSA switch uppers */
-	if (dsa_slave_dev_check(info->upper_dev))
-		return NOTIFY_DONE;
-
-	/* Allow bridge uppers of DSA masters, subject to further
-	 * restrictions in dsa_bridge_prechangelower_sanity_check()
-	 */
-	if (netif_is_bridge_master(info->upper_dev))
-		return NOTIFY_DONE;
-
-	/* Allow LAG uppers, subject to further restrictions in
-	 * dsa_lag_master_prechangelower_sanity_check()
-	 */
-	if (netif_is_lag_master(info->upper_dev))
-		return dsa_lag_master_validate(info->upper_dev, extack);
-
-	NL_SET_ERR_MSG_MOD(extack,
-			   "DSA master cannot join unknown upper interfaces");
-	return notifier_from_errno(-EBUSY);
-}
-
-static int
-dsa_lag_master_prechangelower_sanity_check(struct net_device *dev,
-					   struct netdev_notifier_changeupper_info *info)
-{
-	struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(&info->info);
-	struct net_device *lag_dev = info->upper_dev;
-	struct net_device *lower;
-	struct list_head *iter;
-
-	if (!netdev_uses_dsa(lag_dev) || !netif_is_lag_master(lag_dev))
-		return NOTIFY_DONE;
-
-	if (!info->linking)
-		return NOTIFY_DONE;
-
-	if (!netdev_uses_dsa(dev)) {
-		NL_SET_ERR_MSG(extack,
-			       "Only DSA masters can join a LAG DSA master");
-		return notifier_from_errno(-EINVAL);
-	}
-
-	netdev_for_each_lower_dev(lag_dev, lower, iter) {
-		if (!dsa_port_tree_same(dev->dsa_ptr, lower->dsa_ptr)) {
-			NL_SET_ERR_MSG(extack,
-				       "Interface is DSA master for a different switch tree than this LAG");
-			return notifier_from_errno(-EINVAL);
-		}
-
-		break;
-	}
-
-	return NOTIFY_DONE;
-}
-
-/* Don't allow bridging of DSA masters, since the bridge layer rx_handler
- * prevents the DSA fake ethertype handler to be invoked, so we don't get the
- * chance to strip off and parse the DSA switch tag protocol header (the bridge
- * layer just returns RX_HANDLER_CONSUMED, stopping RX processing for these
- * frames).
- * The only case where that would not be an issue is when bridging can already
- * be offloaded, such as when the DSA master is itself a DSA or plain switchdev
- * port, and is bridged only with other ports from the same hardware device.
- */
-static int
-dsa_bridge_prechangelower_sanity_check(struct net_device *new_lower,
-				       struct netdev_notifier_changeupper_info *info)
-{
-	struct net_device *br = info->upper_dev;
-	struct netlink_ext_ack *extack;
-	struct net_device *lower;
-	struct list_head *iter;
-
-	if (!netif_is_bridge_master(br))
-		return NOTIFY_DONE;
-
-	if (!info->linking)
-		return NOTIFY_DONE;
-
-	extack = netdev_notifier_info_to_extack(&info->info);
-
-	netdev_for_each_lower_dev(br, lower, iter) {
-		if (!netdev_uses_dsa(new_lower) && !netdev_uses_dsa(lower))
-			continue;
-
-		if (!netdev_port_same_parent_id(lower, new_lower)) {
-			NL_SET_ERR_MSG(extack,
-				       "Cannot do software bridging with a DSA master");
-			return notifier_from_errno(-EINVAL);
-		}
-	}
-
-	return NOTIFY_DONE;
-}
-
-static void dsa_tree_migrate_ports_from_lag_master(struct dsa_switch_tree *dst,
-						   struct net_device *lag_dev)
-{
-	struct net_device *new_master = dsa_tree_find_first_master(dst);
-	struct dsa_port *dp;
-	int err;
-
-	dsa_tree_for_each_user_port(dp, dst) {
-		if (dsa_port_to_master(dp) != lag_dev)
-			continue;
-
-		err = dsa_slave_change_master(dp->slave, new_master, NULL);
-		if (err) {
-			netdev_err(dp->slave,
-				   "failed to restore master to %s: %pe\n",
-				   new_master->name, ERR_PTR(err));
-		}
-	}
-}
-
-static int dsa_master_lag_join(struct net_device *master,
-			       struct net_device *lag_dev,
-			       struct netdev_lag_upper_info *uinfo,
-			       struct netlink_ext_ack *extack)
-{
-	struct dsa_port *cpu_dp = master->dsa_ptr;
-	struct dsa_switch_tree *dst = cpu_dp->dst;
-	struct dsa_port *dp;
-	int err;
-
-	err = dsa_master_lag_setup(lag_dev, cpu_dp, uinfo, extack);
-	if (err)
-		return err;
-
-	dsa_tree_for_each_user_port(dp, dst) {
-		if (dsa_port_to_master(dp) != master)
-			continue;
-
-		err = dsa_slave_change_master(dp->slave, lag_dev, extack);
-		if (err)
-			goto restore;
-	}
-
-	return 0;
-
-restore:
-	dsa_tree_for_each_user_port_continue_reverse(dp, dst) {
-		if (dsa_port_to_master(dp) != lag_dev)
-			continue;
-
-		err = dsa_slave_change_master(dp->slave, master, NULL);
-		if (err) {
-			netdev_err(dp->slave,
-				   "failed to restore master to %s: %pe\n",
-				   master->name, ERR_PTR(err));
-		}
-	}
-
-	dsa_master_lag_teardown(lag_dev, master->dsa_ptr);
-
-	return err;
-}
-
-static void dsa_master_lag_leave(struct net_device *master,
-				 struct net_device *lag_dev)
-{
-	struct dsa_port *dp, *cpu_dp = lag_dev->dsa_ptr;
-	struct dsa_switch_tree *dst = cpu_dp->dst;
-	struct dsa_port *new_cpu_dp = NULL;
-	struct net_device *lower;
-	struct list_head *iter;
-
-	netdev_for_each_lower_dev(lag_dev, lower, iter) {
-		if (netdev_uses_dsa(lower)) {
-			new_cpu_dp = lower->dsa_ptr;
-			break;
-		}
-	}
-
-	if (new_cpu_dp) {
-		/* Update the CPU port of the user ports still under the LAG
-		 * so that dsa_port_to_master() continues to work properly
-		 */
-		dsa_tree_for_each_user_port(dp, dst)
-			if (dsa_port_to_master(dp) == lag_dev)
-				dp->cpu_dp = new_cpu_dp;
-
-		/* Update the index of the virtual CPU port to match the lowest
-		 * physical CPU port
-		 */
-		lag_dev->dsa_ptr = new_cpu_dp;
-		wmb();
-	} else {
-		/* If the LAG DSA master has no ports left, migrate back all
-		 * user ports to the first physical CPU port
-		 */
-		dsa_tree_migrate_ports_from_lag_master(dst, lag_dev);
-	}
-
-	/* This DSA master has left its LAG in any case, so let
-	 * the CPU port leave the hardware LAG as well
-	 */
-	dsa_master_lag_teardown(lag_dev, master->dsa_ptr);
-}
-
-static int dsa_master_changeupper(struct net_device *dev,
-				  struct netdev_notifier_changeupper_info *info)
-{
-	struct netlink_ext_ack *extack;
-	int err = NOTIFY_DONE;
-
-	if (!netdev_uses_dsa(dev))
-		return err;
-
-	extack = netdev_notifier_info_to_extack(&info->info);
-
-	if (netif_is_lag_master(info->upper_dev)) {
-		if (info->linking) {
-			err = dsa_master_lag_join(dev, info->upper_dev,
-						  info->upper_info, extack);
-			err = notifier_from_errno(err);
-		} else {
-			dsa_master_lag_leave(dev, info->upper_dev);
-			err = NOTIFY_OK;
-		}
-	}
-
-	return err;
-}
-
-static int dsa_slave_netdevice_event(struct notifier_block *nb,
-				     unsigned long event, void *ptr)
-{
-	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
-
-	switch (event) {
-	case NETDEV_PRECHANGEUPPER: {
-		struct netdev_notifier_changeupper_info *info = ptr;
-		int err;
-
-		err = dsa_slave_prechangeupper_sanity_check(dev, info);
-		if (notifier_to_errno(err))
-			return err;
-
-		err = dsa_master_prechangeupper_sanity_check(dev, info);
-		if (notifier_to_errno(err))
-			return err;
-
-		err = dsa_lag_master_prechangelower_sanity_check(dev, info);
-		if (notifier_to_errno(err))
-			return err;
-
-		err = dsa_bridge_prechangelower_sanity_check(dev, info);
-		if (notifier_to_errno(err))
-			return err;
-
-		err = dsa_slave_prechangeupper(dev, ptr);
-		if (notifier_to_errno(err))
-			return err;
-
-		err = dsa_slave_lag_prechangeupper(dev, ptr);
-		if (notifier_to_errno(err))
-			return err;
-
-		break;
-	}
-	case NETDEV_CHANGEUPPER: {
-		int err;
-
-		err = dsa_slave_changeupper(dev, ptr);
-		if (notifier_to_errno(err))
-			return err;
-
-		err = dsa_slave_lag_changeupper(dev, ptr);
-		if (notifier_to_errno(err))
-			return err;
-
-		err = dsa_master_changeupper(dev, ptr);
-		if (notifier_to_errno(err))
-			return err;
-
-		break;
-	}
-	case NETDEV_CHANGELOWERSTATE: {
-		struct netdev_notifier_changelowerstate_info *info = ptr;
-		struct dsa_port *dp;
-		int err = 0;
-
-		if (dsa_slave_dev_check(dev)) {
-			dp = dsa_slave_to_port(dev);
-
-			err = dsa_port_lag_change(dp, info->lower_state_info);
-		}
-
-		/* Mirror LAG port events on DSA masters that are in
-		 * a LAG towards their respective switch CPU ports
-		 */
-		if (netdev_uses_dsa(dev)) {
-			dp = dev->dsa_ptr;
-
-			err = dsa_port_lag_change(dp, info->lower_state_info);
-		}
-
-		return notifier_from_errno(err);
-	}
-	case NETDEV_CHANGE:
-	case NETDEV_UP: {
-		/* Track state of master port.
-		 * DSA driver may require the master port (and indirectly
-		 * the tagger) to be available for some special operation.
-		 */
-		if (netdev_uses_dsa(dev)) {
-			struct dsa_port *cpu_dp = dev->dsa_ptr;
-			struct dsa_switch_tree *dst = cpu_dp->ds->dst;
-
-			/* Track when the master port is UP */
-			dsa_tree_master_oper_state_change(dst, dev,
-							  netif_oper_up(dev));
-
-			/* Track when the master port is ready and can accept
-			 * packet.
-			 * NETDEV_UP event is not enough to flag a port as ready.
-			 * We also have to wait for linkwatch_do_dev to dev_activate
-			 * and emit a NETDEV_CHANGE event.
-			 * We check if a master port is ready by checking if the dev
-			 * have a qdisc assigned and is not noop.
-			 */
-			dsa_tree_master_admin_state_change(dst, dev,
-							   !qdisc_tx_is_noop(dev));
-
-			return NOTIFY_OK;
-		}
-
-		return NOTIFY_DONE;
-	}
-	case NETDEV_GOING_DOWN: {
-		struct dsa_port *dp, *cpu_dp;
-		struct dsa_switch_tree *dst;
-		LIST_HEAD(close_list);
-
-		if (!netdev_uses_dsa(dev))
-			return NOTIFY_DONE;
-
-		cpu_dp = dev->dsa_ptr;
-		dst = cpu_dp->ds->dst;
-
-		dsa_tree_master_admin_state_change(dst, dev, false);
-
-		list_for_each_entry(dp, &dst->ports, list) {
-			if (!dsa_port_is_user(dp))
-				continue;
-
-			if (dp->cpu_dp != cpu_dp)
-				continue;
-
-			list_add(&dp->slave->close_list, &close_list);
-		}
-
-		dev_close_many(&close_list, true);
-
-		return NOTIFY_OK;
-	}
-	default:
-		break;
-	}
-
-	return NOTIFY_DONE;
-}
-
-static void
-dsa_fdb_offload_notify(struct dsa_switchdev_event_work *switchdev_work)
-{
-	struct switchdev_notifier_fdb_info info = {};
-
-	info.addr = switchdev_work->addr;
-	info.vid = switchdev_work->vid;
-	info.offloaded = true;
-	call_switchdev_notifiers(SWITCHDEV_FDB_OFFLOADED,
-				 switchdev_work->orig_dev, &info.info, NULL);
-}
-
-static void dsa_slave_switchdev_event_work(struct work_struct *work)
-{
-	struct dsa_switchdev_event_work *switchdev_work =
-		container_of(work, struct dsa_switchdev_event_work, work);
-	const unsigned char *addr = switchdev_work->addr;
-	struct net_device *dev = switchdev_work->dev;
-	u16 vid = switchdev_work->vid;
-	struct dsa_switch *ds;
-	struct dsa_port *dp;
-	int err;
-
-	dp = dsa_slave_to_port(dev);
-	ds = dp->ds;
-
-	switch (switchdev_work->event) {
-	case SWITCHDEV_FDB_ADD_TO_DEVICE:
-		if (switchdev_work->host_addr)
-			err = dsa_port_bridge_host_fdb_add(dp, addr, vid);
-		else if (dp->lag)
-			err = dsa_port_lag_fdb_add(dp, addr, vid);
-		else
-			err = dsa_port_fdb_add(dp, addr, vid);
-		if (err) {
-			dev_err(ds->dev,
-				"port %d failed to add %pM vid %d to fdb: %d\n",
-				dp->index, addr, vid, err);
-			break;
-		}
-		dsa_fdb_offload_notify(switchdev_work);
-		break;
-
-	case SWITCHDEV_FDB_DEL_TO_DEVICE:
-		if (switchdev_work->host_addr)
-			err = dsa_port_bridge_host_fdb_del(dp, addr, vid);
-		else if (dp->lag)
-			err = dsa_port_lag_fdb_del(dp, addr, vid);
-		else
-			err = dsa_port_fdb_del(dp, addr, vid);
-		if (err) {
-			dev_err(ds->dev,
-				"port %d failed to delete %pM vid %d from fdb: %d\n",
-				dp->index, addr, vid, err);
-		}
-
-		break;
-	}
-
-	kfree(switchdev_work);
-}
-
-static bool dsa_foreign_dev_check(const struct net_device *dev,
-				  const struct net_device *foreign_dev)
-{
-	const struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch_tree *dst = dp->ds->dst;
-
-	if (netif_is_bridge_master(foreign_dev))
-		return !dsa_tree_offloads_bridge_dev(dst, foreign_dev);
-
-	if (netif_is_bridge_port(foreign_dev))
-		return !dsa_tree_offloads_bridge_port(dst, foreign_dev);
-
-	/* Everything else is foreign */
-	return true;
-}
-
-static int dsa_slave_fdb_event(struct net_device *dev,
-			       struct net_device *orig_dev,
-			       unsigned long event, const void *ctx,
-			       const struct switchdev_notifier_fdb_info *fdb_info)
-{
-	struct dsa_switchdev_event_work *switchdev_work;
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	bool host_addr = fdb_info->is_local;
-	struct dsa_switch *ds = dp->ds;
-
-	if (ctx && ctx != dp)
-		return 0;
-
-	if (!dp->bridge)
-		return 0;
-
-	if (switchdev_fdb_is_dynamically_learned(fdb_info)) {
-		if (dsa_port_offloads_bridge_port(dp, orig_dev))
-			return 0;
-
-		/* FDB entries learned by the software bridge or by foreign
-		 * bridge ports should be installed as host addresses only if
-		 * the driver requests assisted learning.
-		 */
-		if (!ds->assisted_learning_on_cpu_port)
-			return 0;
-	}
-
-	/* Also treat FDB entries on foreign interfaces bridged with us as host
-	 * addresses.
-	 */
-	if (dsa_foreign_dev_check(dev, orig_dev))
-		host_addr = true;
-
-	/* Check early that we're not doing work in vain.
-	 * Host addresses on LAG ports still require regular FDB ops,
-	 * since the CPU port isn't in a LAG.
-	 */
-	if (dp->lag && !host_addr) {
-		if (!ds->ops->lag_fdb_add || !ds->ops->lag_fdb_del)
-			return -EOPNOTSUPP;
-	} else {
-		if (!ds->ops->port_fdb_add || !ds->ops->port_fdb_del)
-			return -EOPNOTSUPP;
-	}
-
-	switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC);
-	if (!switchdev_work)
-		return -ENOMEM;
-
-	netdev_dbg(dev, "%s FDB entry towards %s, addr %pM vid %d%s\n",
-		   event == SWITCHDEV_FDB_ADD_TO_DEVICE ? "Adding" : "Deleting",
-		   orig_dev->name, fdb_info->addr, fdb_info->vid,
-		   host_addr ? " as host address" : "");
-
-	INIT_WORK(&switchdev_work->work, dsa_slave_switchdev_event_work);
-	switchdev_work->event = event;
-	switchdev_work->dev = dev;
-	switchdev_work->orig_dev = orig_dev;
-
-	ether_addr_copy(switchdev_work->addr, fdb_info->addr);
-	switchdev_work->vid = fdb_info->vid;
-	switchdev_work->host_addr = host_addr;
-
-	dsa_schedule_work(&switchdev_work->work);
-
-	return 0;
-}
-
-/* Called under rcu_read_lock() */
-static int dsa_slave_switchdev_event(struct notifier_block *unused,
-				     unsigned long event, void *ptr)
-{
-	struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
-	int err;
-
-	switch (event) {
-	case SWITCHDEV_PORT_ATTR_SET:
-		err = switchdev_handle_port_attr_set(dev, ptr,
-						     dsa_slave_dev_check,
-						     dsa_slave_port_attr_set);
-		return notifier_from_errno(err);
-	case SWITCHDEV_FDB_ADD_TO_DEVICE:
-	case SWITCHDEV_FDB_DEL_TO_DEVICE:
-		err = switchdev_handle_fdb_event_to_device(dev, event, ptr,
-							   dsa_slave_dev_check,
-							   dsa_foreign_dev_check,
-							   dsa_slave_fdb_event);
-		return notifier_from_errno(err);
-	default:
-		return NOTIFY_DONE;
-	}
-
-	return NOTIFY_OK;
-}
-
-static int dsa_slave_switchdev_blocking_event(struct notifier_block *unused,
-					      unsigned long event, void *ptr)
-{
-	struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
-	int err;
-
-	switch (event) {
-	case SWITCHDEV_PORT_OBJ_ADD:
-		err = switchdev_handle_port_obj_add_foreign(dev, ptr,
-							    dsa_slave_dev_check,
-							    dsa_foreign_dev_check,
-							    dsa_slave_port_obj_add);
-		return notifier_from_errno(err);
-	case SWITCHDEV_PORT_OBJ_DEL:
-		err = switchdev_handle_port_obj_del_foreign(dev, ptr,
-							    dsa_slave_dev_check,
-							    dsa_foreign_dev_check,
-							    dsa_slave_port_obj_del);
-		return notifier_from_errno(err);
-	case SWITCHDEV_PORT_ATTR_SET:
-		err = switchdev_handle_port_attr_set(dev, ptr,
-						     dsa_slave_dev_check,
-						     dsa_slave_port_attr_set);
-		return notifier_from_errno(err);
-	}
-
-	return NOTIFY_DONE;
-}
-
-static struct notifier_block dsa_slave_nb __read_mostly = {
-	.notifier_call  = dsa_slave_netdevice_event,
-};
-
-struct notifier_block dsa_slave_switchdev_notifier = {
-	.notifier_call = dsa_slave_switchdev_event,
-};
-
-struct notifier_block dsa_slave_switchdev_blocking_notifier = {
-	.notifier_call = dsa_slave_switchdev_blocking_event,
-};
-
-int dsa_slave_register_notifier(void)
-{
-	struct notifier_block *nb;
-	int err;
-
-	err = register_netdevice_notifier(&dsa_slave_nb);
-	if (err)
-		return err;
-
-	err = register_switchdev_notifier(&dsa_slave_switchdev_notifier);
-	if (err)
-		goto err_switchdev_nb;
-
-	nb = &dsa_slave_switchdev_blocking_notifier;
-	err = register_switchdev_blocking_notifier(nb);
-	if (err)
-		goto err_switchdev_blocking_nb;
-
-	return 0;
-
-err_switchdev_blocking_nb:
-	unregister_switchdev_notifier(&dsa_slave_switchdev_notifier);
-err_switchdev_nb:
-	unregister_netdevice_notifier(&dsa_slave_nb);
-	return err;
-}
-
-void dsa_slave_unregister_notifier(void)
-{
-	struct notifier_block *nb;
-	int err;
-
-	nb = &dsa_slave_switchdev_blocking_notifier;
-	err = unregister_switchdev_blocking_notifier(nb);
-	if (err)
-		pr_err("DSA: failed to unregister switchdev blocking notifier (%d)\n", err);
-
-	err = unregister_switchdev_notifier(&dsa_slave_switchdev_notifier);
-	if (err)
-		pr_err("DSA: failed to unregister switchdev notifier (%d)\n", err);
-
-	err = unregister_netdevice_notifier(&dsa_slave_nb);
-	if (err)
-		pr_err("DSA: failed to unregister slave notifier (%d)\n", err);
-}
diff --git a/net/dsa/slave.h b/net/dsa/slave.h
deleted file mode 100644
index d0abe609e00d..000000000000
--- a/net/dsa/slave.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-
-#ifndef __DSA_SLAVE_H
-#define __DSA_SLAVE_H
-
-#include <linux/if_bridge.h>
-#include <linux/if_vlan.h>
-#include <linux/list.h>
-#include <linux/netpoll.h>
-#include <linux/types.h>
-#include <net/dsa.h>
-#include <net/gro_cells.h>
-
-struct net_device;
-struct netlink_ext_ack;
-
-extern struct notifier_block dsa_slave_switchdev_notifier;
-extern struct notifier_block dsa_slave_switchdev_blocking_notifier;
-
-struct dsa_slave_priv {
-	/* Copy of CPU port xmit for faster access in slave transmit hot path */
-	struct sk_buff *	(*xmit)(struct sk_buff *skb,
-					struct net_device *dev);
-
-	struct gro_cells	gcells;
-
-	/* DSA port data, such as switch, port index, etc. */
-	struct dsa_port		*dp;
-
-#ifdef CONFIG_NET_POLL_CONTROLLER
-	struct netpoll		*netpoll;
-#endif
-
-	/* TC context */
-	struct list_head	mall_tc_list;
-};
-
-void dsa_slave_mii_bus_init(struct dsa_switch *ds);
-int dsa_slave_create(struct dsa_port *dp);
-void dsa_slave_destroy(struct net_device *slave_dev);
-int dsa_slave_suspend(struct net_device *slave_dev);
-int dsa_slave_resume(struct net_device *slave_dev);
-int dsa_slave_register_notifier(void);
-void dsa_slave_unregister_notifier(void);
-void dsa_slave_sync_ha(struct net_device *dev);
-void dsa_slave_unsync_ha(struct net_device *dev);
-void dsa_slave_setup_tagger(struct net_device *slave);
-int dsa_slave_change_mtu(struct net_device *dev, int new_mtu);
-int dsa_slave_change_master(struct net_device *dev, struct net_device *master,
-			    struct netlink_ext_ack *extack);
-int dsa_slave_manage_vlan_filtering(struct net_device *dev,
-				    bool vlan_filtering);
-
-static inline struct dsa_port *dsa_slave_to_port(const struct net_device *dev)
-{
-	struct dsa_slave_priv *p = netdev_priv(dev);
-
-	return p->dp;
-}
-
-static inline struct net_device *
-dsa_slave_to_master(const struct net_device *dev)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-
-	return dsa_port_to_master(dp);
-}
-
-#endif
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index 1a42f9317334..3d2feeea897b 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -15,10 +15,10 @@
 #include "dsa.h"
 #include "netlink.h"
 #include "port.h"
-#include "slave.h"
 #include "switch.h"
 #include "tag_8021q.h"
 #include "trace.h"
+#include "user.h"
 
 static unsigned int dsa_switch_fastest_ageing_time(struct dsa_switch *ds,
 						   unsigned int ageing_time)
@@ -894,12 +894,12 @@ static int dsa_switch_change_tag_proto(struct dsa_switch *ds,
 	 * bits that depend on the tagger, such as the MTU.
 	 */
 	dsa_switch_for_each_user_port(dp, ds) {
-		struct net_device *slave = dp->slave;
+		struct net_device *user = dp->user;
 
-		dsa_slave_setup_tagger(slave);
+		dsa_user_setup_tagger(user);
 
 		/* rtnl_mutex is held in dsa_tree_change_tag_proto */
-		dsa_slave_change_mtu(slave, slave->mtu);
+		dsa_user_change_mtu(user, user->mtu);
 	}
 
 	return 0;
@@ -960,13 +960,13 @@ dsa_switch_disconnect_tag_proto(struct dsa_switch *ds,
 }
 
 static int
-dsa_switch_master_state_change(struct dsa_switch *ds,
-			       struct dsa_notifier_master_state_info *info)
+dsa_switch_conduit_state_change(struct dsa_switch *ds,
+				struct dsa_notifier_conduit_state_info *info)
 {
-	if (!ds->ops->master_state_change)
+	if (!ds->ops->conduit_state_change)
 		return 0;
 
-	ds->ops->master_state_change(ds, info->master, info->operational);
+	ds->ops->conduit_state_change(ds, info->conduit, info->operational);
 
 	return 0;
 }
@@ -1056,8 +1056,8 @@ static int dsa_switch_event(struct notifier_block *nb,
 	case DSA_NOTIFIER_TAG_8021Q_VLAN_DEL:
 		err = dsa_switch_tag_8021q_vlan_del(ds, info);
 		break;
-	case DSA_NOTIFIER_MASTER_STATE_CHANGE:
-		err = dsa_switch_master_state_change(ds, info);
+	case DSA_NOTIFIER_CONDUIT_STATE_CHANGE:
+		err = dsa_switch_conduit_state_change(ds, info);
 		break;
 	default:
 		err = -EOPNOTSUPP;
diff --git a/net/dsa/switch.h b/net/dsa/switch.h
index ea034677da15..be0a2749cd97 100644
--- a/net/dsa/switch.h
+++ b/net/dsa/switch.h
@@ -34,7 +34,7 @@ enum {
 	DSA_NOTIFIER_TAG_PROTO_DISCONNECT,
 	DSA_NOTIFIER_TAG_8021Q_VLAN_ADD,
 	DSA_NOTIFIER_TAG_8021Q_VLAN_DEL,
-	DSA_NOTIFIER_MASTER_STATE_CHANGE,
+	DSA_NOTIFIER_CONDUIT_STATE_CHANGE,
 };
 
 /* DSA_NOTIFIER_AGEING_TIME */
@@ -105,9 +105,9 @@ struct dsa_notifier_tag_8021q_vlan_info {
 	u16 vid;
 };
 
-/* DSA_NOTIFIER_MASTER_STATE_CHANGE */
-struct dsa_notifier_master_state_info {
-	const struct net_device *master;
+/* DSA_NOTIFIER_CONDUIT_STATE_CHANGE */
+struct dsa_notifier_conduit_state_info {
+	const struct net_device *conduit;
 	bool operational;
 };
 
diff --git a/net/dsa/tag.c b/net/dsa/tag.c
index 5105a5ff58fa..6e402d49afd3 100644
--- a/net/dsa/tag.c
+++ b/net/dsa/tag.c
@@ -13,8 +13,8 @@
 #include <net/dsa.h>
 #include <net/dst_metadata.h>
 
-#include "slave.h"
 #include "tag.h"
+#include "user.h"
 
 static LIST_HEAD(dsa_tag_drivers_list);
 static DEFINE_MUTEX(dsa_tag_drivers_lock);
@@ -27,7 +27,7 @@ static DEFINE_MUTEX(dsa_tag_drivers_lock);
  * switch, the DSA driver owning the interface to which the packet is
  * delivered is never notified unless we do so here.
  */
-static bool dsa_skb_defer_rx_timestamp(struct dsa_slave_priv *p,
+static bool dsa_skb_defer_rx_timestamp(struct dsa_user_priv *p,
 				       struct sk_buff *skb)
 {
 	struct dsa_switch *ds = p->dp->ds;
@@ -57,7 +57,7 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev,
 	struct metadata_dst *md_dst = skb_metadata_dst(skb);
 	struct dsa_port *cpu_dp = dev->dsa_ptr;
 	struct sk_buff *nskb = NULL;
-	struct dsa_slave_priv *p;
+	struct dsa_user_priv *p;
 
 	if (unlikely(!cpu_dp)) {
 		kfree_skb(skb);
@@ -75,7 +75,7 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev,
 		if (!skb_has_extensions(skb))
 			skb->slow_gro = 0;
 
-		skb->dev = dsa_master_find_slave(dev, 0, port);
+		skb->dev = dsa_conduit_find_user(dev, 0, port);
 		if (likely(skb->dev)) {
 			dsa_default_offload_fwd_mark(skb);
 			nskb = skb;
@@ -94,7 +94,7 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev,
 	skb->pkt_type = PACKET_HOST;
 	skb->protocol = eth_type_trans(skb, skb->dev);
 
-	if (unlikely(!dsa_slave_dev_check(skb->dev))) {
+	if (unlikely(!dsa_user_dev_check(skb->dev))) {
 		/* Packet is to be injected directly on an upper
 		 * device, e.g. a team/bond, so skip all DSA-port
 		 * specific actions.
diff --git a/net/dsa/tag.h b/net/dsa/tag.h
index 32d12f4a9d73..f6b9c73718df 100644
--- a/net/dsa/tag.h
+++ b/net/dsa/tag.h
@@ -9,7 +9,7 @@
 #include <net/dsa.h>
 
 #include "port.h"
-#include "slave.h"
+#include "user.h"
 
 struct dsa_tag_driver {
 	const struct dsa_device_ops *ops;
@@ -29,7 +29,7 @@ static inline int dsa_tag_protocol_overhead(const struct dsa_device_ops *ops)
 	return ops->needed_headroom + ops->needed_tailroom;
 }
 
-static inline struct net_device *dsa_master_find_slave(struct net_device *dev,
+static inline struct net_device *dsa_conduit_find_user(struct net_device *dev,
 						       int device, int port)
 {
 	struct dsa_port *cpu_dp = dev->dsa_ptr;
@@ -39,7 +39,7 @@ static inline struct net_device *dsa_master_find_slave(struct net_device *dev,
 	list_for_each_entry(dp, &dst->ports, list)
 		if (dp->ds->index == device && dp->index == port &&
 		    dp->type == DSA_PORT_TYPE_USER)
-			return dp->slave;
+			return dp->user;
 
 	return NULL;
 }
@@ -49,7 +49,7 @@ static inline struct net_device *dsa_master_find_slave(struct net_device *dev,
  */
 static inline struct sk_buff *dsa_untag_bridge_pvid(struct sk_buff *skb)
 {
-	struct dsa_port *dp = dsa_slave_to_port(skb->dev);
+	struct dsa_port *dp = dsa_user_to_port(skb->dev);
 	struct net_device *br = dsa_port_bridge_dev_get(dp);
 	struct net_device *dev = skb->dev;
 	struct net_device *upper_dev;
@@ -107,12 +107,12 @@ static inline struct sk_buff *dsa_untag_bridge_pvid(struct sk_buff *skb)
  * to support termination through the bridge.
  */
 static inline struct net_device *
-dsa_find_designated_bridge_port_by_vid(struct net_device *master, u16 vid)
+dsa_find_designated_bridge_port_by_vid(struct net_device *conduit, u16 vid)
 {
-	struct dsa_port *cpu_dp = master->dsa_ptr;
+	struct dsa_port *cpu_dp = conduit->dsa_ptr;
 	struct dsa_switch_tree *dst = cpu_dp->dst;
 	struct bridge_vlan_info vinfo;
-	struct net_device *slave;
+	struct net_device *user;
 	struct dsa_port *dp;
 	int err;
 
@@ -134,13 +134,13 @@ dsa_find_designated_bridge_port_by_vid(struct net_device *master, u16 vid)
 		if (dp->cpu_dp != cpu_dp)
 			continue;
 
-		slave = dp->slave;
+		user = dp->user;
 
-		err = br_vlan_get_info_rcu(slave, vid, &vinfo);
+		err = br_vlan_get_info_rcu(user, vid, &vinfo);
 		if (err)
 			continue;
 
-		return slave;
+		return user;
 	}
 
 	return NULL;
@@ -155,7 +155,7 @@ dsa_find_designated_bridge_port_by_vid(struct net_device *master, u16 vid)
  */
 static inline void dsa_default_offload_fwd_mark(struct sk_buff *skb)
 {
-	struct dsa_port *dp = dsa_slave_to_port(skb->dev);
+	struct dsa_port *dp = dsa_user_to_port(skb->dev);
 
 	skb->offload_fwd_mark = !!(dp->bridge);
 }
@@ -215,9 +215,9 @@ static inline void dsa_alloc_etype_header(struct sk_buff *skb, int len)
 	memmove(skb->data, skb->data + len, 2 * ETH_ALEN);
 }
 
-/* On RX, eth_type_trans() on the DSA master pulls ETH_HLEN bytes starting from
+/* On RX, eth_type_trans() on the DSA conduit pulls ETH_HLEN bytes starting from
  * skb_mac_header(skb), which leaves skb->data pointing at the first byte after
- * what the DSA master perceives as the EtherType (the beginning of the L3
+ * what the DSA conduit perceives as the EtherType (the beginning of the L3
  * protocol). Since DSA EtherType header taggers treat the EtherType as part of
  * the DSA tag itself, and the EtherType is 2 bytes in length, the DSA header
  * is located 2 bytes behind skb->data. Note that EtherType in this context
diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c
index cbdfc392f7e0..71b26ae6db39 100644
--- a/net/dsa/tag_8021q.c
+++ b/net/dsa/tag_8021q.c
@@ -73,7 +73,7 @@ struct dsa_tag_8021q_vlan {
 struct dsa_8021q_context {
 	struct dsa_switch *ds;
 	struct list_head vlans;
-	/* EtherType of RX VID, used for filtering on master interface */
+	/* EtherType of RX VID, used for filtering on conduit interface */
 	__be16 proto;
 };
 
@@ -338,7 +338,7 @@ static int dsa_tag_8021q_port_setup(struct dsa_switch *ds, int port)
 	struct dsa_8021q_context *ctx = ds->tag_8021q_ctx;
 	struct dsa_port *dp = dsa_to_port(ds, port);
 	u16 vid = dsa_tag_8021q_standalone_vid(dp);
-	struct net_device *master;
+	struct net_device *conduit;
 	int err;
 
 	/* The CPU port is implicitly configured by
@@ -347,7 +347,7 @@ static int dsa_tag_8021q_port_setup(struct dsa_switch *ds, int port)
 	if (!dsa_port_is_user(dp))
 		return 0;
 
-	master = dsa_port_to_master(dp);
+	conduit = dsa_port_to_conduit(dp);
 
 	err = dsa_port_tag_8021q_vlan_add(dp, vid, false);
 	if (err) {
@@ -357,8 +357,8 @@ static int dsa_tag_8021q_port_setup(struct dsa_switch *ds, int port)
 		return err;
 	}
 
-	/* Add the VLAN to the master's RX filter. */
-	vlan_vid_add(master, ctx->proto, vid);
+	/* Add the VLAN to the conduit's RX filter. */
+	vlan_vid_add(conduit, ctx->proto, vid);
 
 	return err;
 }
@@ -368,7 +368,7 @@ static void dsa_tag_8021q_port_teardown(struct dsa_switch *ds, int port)
 	struct dsa_8021q_context *ctx = ds->tag_8021q_ctx;
 	struct dsa_port *dp = dsa_to_port(ds, port);
 	u16 vid = dsa_tag_8021q_standalone_vid(dp);
-	struct net_device *master;
+	struct net_device *conduit;
 
 	/* The CPU port is implicitly configured by
 	 * configuring the front-panel ports
@@ -376,11 +376,11 @@ static void dsa_tag_8021q_port_teardown(struct dsa_switch *ds, int port)
 	if (!dsa_port_is_user(dp))
 		return;
 
-	master = dsa_port_to_master(dp);
+	conduit = dsa_port_to_conduit(dp);
 
 	dsa_port_tag_8021q_vlan_del(dp, vid, false);
 
-	vlan_vid_del(master, ctx->proto, vid);
+	vlan_vid_del(conduit, ctx->proto, vid);
 }
 
 static int dsa_tag_8021q_setup(struct dsa_switch *ds)
@@ -468,10 +468,10 @@ struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev,
 }
 EXPORT_SYMBOL_GPL(dsa_8021q_xmit);
 
-struct net_device *dsa_tag_8021q_find_port_by_vbid(struct net_device *master,
+struct net_device *dsa_tag_8021q_find_port_by_vbid(struct net_device *conduit,
 						   int vbid)
 {
-	struct dsa_port *cpu_dp = master->dsa_ptr;
+	struct dsa_port *cpu_dp = conduit->dsa_ptr;
 	struct dsa_switch_tree *dst = cpu_dp->dst;
 	struct dsa_port *dp;
 
@@ -490,7 +490,7 @@ struct net_device *dsa_tag_8021q_find_port_by_vbid(struct net_device *master,
 			continue;
 
 		if (dsa_port_bridge_num_get(dp) == vbid)
-			return dp->slave;
+			return dp->user;
 	}
 
 	return NULL;
diff --git a/net/dsa/tag_8021q.h b/net/dsa/tag_8021q.h
index b75cbaa028ef..41f7167ac520 100644
--- a/net/dsa/tag_8021q.h
+++ b/net/dsa/tag_8021q.h
@@ -16,7 +16,7 @@ struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev,
 void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id,
 		   int *vbid);
 
-struct net_device *dsa_tag_8021q_find_port_by_vbid(struct net_device *master,
+struct net_device *dsa_tag_8021q_find_port_by_vbid(struct net_device *conduit,
 						   int vbid);
 
 int dsa_switch_tag_8021q_vlan_add(struct dsa_switch *ds,
diff --git a/net/dsa/tag_ar9331.c b/net/dsa/tag_ar9331.c
index 7f3b7d730b85..92ce67b93a58 100644
--- a/net/dsa/tag_ar9331.c
+++ b/net/dsa/tag_ar9331.c
@@ -29,7 +29,7 @@
 static struct sk_buff *ar9331_tag_xmit(struct sk_buff *skb,
 				       struct net_device *dev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	__le16 *phdr;
 	u16 hdr;
 
@@ -74,7 +74,7 @@ static struct sk_buff *ar9331_tag_rcv(struct sk_buff *skb,
 	/* Get source port information */
 	port = FIELD_GET(AR9331_HDR_PORT_NUM_MASK, hdr);
 
-	skb->dev = dsa_master_find_slave(ndev, 0, port);
+	skb->dev = dsa_conduit_find_user(ndev, 0, port);
 	if (!skb->dev)
 		return NULL;
 
diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c
index cacdafb41200..83d283a5d27e 100644
--- a/net/dsa/tag_brcm.c
+++ b/net/dsa/tag_brcm.c
@@ -85,7 +85,7 @@ static struct sk_buff *brcm_tag_xmit_ll(struct sk_buff *skb,
 					struct net_device *dev,
 					unsigned int offset)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	u16 queue = skb_get_queue_mapping(skb);
 	u8 *brcm_tag;
 
@@ -96,7 +96,7 @@ static struct sk_buff *brcm_tag_xmit_ll(struct sk_buff *skb,
 	 * (including FCS and tag) because the length verification is done after
 	 * the Broadcom tag is stripped off the ingress packet.
 	 *
-	 * Let dsa_slave_xmit() free the SKB
+	 * Let dsa_user_xmit() free the SKB
 	 */
 	if (__skb_put_padto(skb, ETH_ZLEN + BRCM_TAG_LEN, false))
 		return NULL;
@@ -119,7 +119,7 @@ static struct sk_buff *brcm_tag_xmit_ll(struct sk_buff *skb,
 		brcm_tag[2] = BRCM_IG_DSTMAP2_MASK;
 	brcm_tag[3] = (1 << dp->index) & BRCM_IG_DSTMAP1_MASK;
 
-	/* Now tell the master network device about the desired output queue
+	/* Now tell the conduit network device about the desired output queue
 	 * as well
 	 */
 	skb_set_queue_mapping(skb, BRCM_TAG_SET_PORT_QUEUE(dp->index, queue));
@@ -164,7 +164,7 @@ static struct sk_buff *brcm_tag_rcv_ll(struct sk_buff *skb,
 	/* Locate which port this is coming from */
 	source_port = brcm_tag[3] & BRCM_EG_PID_MASK;
 
-	skb->dev = dsa_master_find_slave(dev, 0, source_port);
+	skb->dev = dsa_conduit_find_user(dev, 0, source_port);
 	if (!skb->dev)
 		return NULL;
 
@@ -216,7 +216,7 @@ MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM, BRCM_NAME);
 static struct sk_buff *brcm_leg_tag_xmit(struct sk_buff *skb,
 					 struct net_device *dev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	u8 *brcm_tag;
 
 	/* The Ethernet switch we are interfaced with needs packets to be at
@@ -226,7 +226,7 @@ static struct sk_buff *brcm_leg_tag_xmit(struct sk_buff *skb,
 	 * (including FCS and tag) because the length verification is done after
 	 * the Broadcom tag is stripped off the ingress packet.
 	 *
-	 * Let dsa_slave_xmit() free the SKB
+	 * Let dsa_user_xmit() free the SKB
 	 */
 	if (__skb_put_padto(skb, ETH_ZLEN + BRCM_LEG_TAG_LEN, false))
 		return NULL;
@@ -264,7 +264,7 @@ static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb,
 
 	source_port = brcm_tag[5] & BRCM_LEG_PORT_ID;
 
-	skb->dev = dsa_master_find_slave(dev, 0, source_port);
+	skb->dev = dsa_conduit_find_user(dev, 0, source_port);
 	if (!skb->dev)
 		return NULL;
 
diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c
index 1fd7fa26db64..8ed52dd663ab 100644
--- a/net/dsa/tag_dsa.c
+++ b/net/dsa/tag_dsa.c
@@ -129,7 +129,7 @@ enum dsa_code {
 static struct sk_buff *dsa_xmit_ll(struct sk_buff *skb, struct net_device *dev,
 				   u8 extra)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	struct net_device *br_dev;
 	u8 tag_dev, tag_port;
 	enum dsa_cmd cmd;
@@ -267,14 +267,14 @@ static struct sk_buff *dsa_rcv_ll(struct sk_buff *skb, struct net_device *dev,
 		lag = dsa_lag_by_id(cpu_dp->dst, source_port + 1);
 		skb->dev = lag ? lag->dev : NULL;
 	} else {
-		skb->dev = dsa_master_find_slave(dev, source_device,
+		skb->dev = dsa_conduit_find_user(dev, source_device,
 						 source_port);
 	}
 
 	if (!skb->dev)
 		return NULL;
 
-	/* When using LAG offload, skb->dev is not a DSA slave interface,
+	/* When using LAG offload, skb->dev is not a DSA user interface,
 	 * so we cannot call dsa_default_offload_fwd_mark and we need to
 	 * special-case it.
 	 */
diff --git a/net/dsa/tag_gswip.c b/net/dsa/tag_gswip.c
index e279cd9057b0..3539141b5350 100644
--- a/net/dsa/tag_gswip.c
+++ b/net/dsa/tag_gswip.c
@@ -61,7 +61,7 @@
 static struct sk_buff *gswip_tag_xmit(struct sk_buff *skb,
 				      struct net_device *dev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	u8 *gswip_tag;
 
 	skb_push(skb, GSWIP_TX_HEADER_LEN);
@@ -89,7 +89,7 @@ static struct sk_buff *gswip_tag_rcv(struct sk_buff *skb,
 
 	/* Get source port information */
 	port = (gswip_tag[7] & GSWIP_RX_SPPID_MASK) >> GSWIP_RX_SPPID_SHIFT;
-	skb->dev = dsa_master_find_slave(dev, 0, port);
+	skb->dev = dsa_conduit_find_user(dev, 0, port);
 	if (!skb->dev)
 		return NULL;
 
diff --git a/net/dsa/tag_hellcreek.c b/net/dsa/tag_hellcreek.c
index 03a1fb9c87a9..6e233cd0aa38 100644
--- a/net/dsa/tag_hellcreek.c
+++ b/net/dsa/tag_hellcreek.c
@@ -20,7 +20,7 @@
 static struct sk_buff *hellcreek_xmit(struct sk_buff *skb,
 				      struct net_device *dev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	u8 *tag;
 
 	/* Calculate checksums (if required) before adding the trailer tag to
@@ -45,7 +45,7 @@ static struct sk_buff *hellcreek_rcv(struct sk_buff *skb,
 	u8 *tag = skb_tail_pointer(skb) - HELLCREEK_TAG_LEN;
 	unsigned int port = tag[0] & 0x03;
 
-	skb->dev = dsa_master_find_slave(dev, 0, port);
+	skb->dev = dsa_conduit_find_user(dev, 0, port);
 	if (!skb->dev) {
 		netdev_warn_once(dev, "Failed to get source port: %d\n", port);
 		return NULL;
diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c
index 3632e47dea9e..9be341fa88f0 100644
--- a/net/dsa/tag_ksz.c
+++ b/net/dsa/tag_ksz.c
@@ -87,7 +87,7 @@ static struct sk_buff *ksz_common_rcv(struct sk_buff *skb,
 				      struct net_device *dev,
 				      unsigned int port, unsigned int len)
 {
-	skb->dev = dsa_master_find_slave(dev, 0, port);
+	skb->dev = dsa_conduit_find_user(dev, 0, port);
 	if (!skb->dev)
 		return NULL;
 
@@ -119,7 +119,7 @@ static struct sk_buff *ksz_common_rcv(struct sk_buff *skb,
 
 static struct sk_buff *ksz8795_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	struct ethhdr *hdr;
 	u8 *tag;
 
@@ -256,7 +256,7 @@ static struct sk_buff *ksz_defer_xmit(struct dsa_port *dp, struct sk_buff *skb)
 		return NULL;
 
 	kthread_init_work(&xmit_work->work, xmit_work_fn);
-	/* Increase refcount so the kfree_skb in dsa_slave_xmit
+	/* Increase refcount so the kfree_skb in dsa_user_xmit
 	 * won't really free the packet.
 	 */
 	xmit_work->dp = dp;
@@ -272,7 +272,7 @@ static struct sk_buff *ksz9477_xmit(struct sk_buff *skb,
 {
 	u16 queue_mapping = skb_get_queue_mapping(skb);
 	u8 prio = netdev_txq_to_tc(dev, queue_mapping);
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	struct ethhdr *hdr;
 	__be16 *tag;
 	u16 val;
@@ -344,7 +344,7 @@ static struct sk_buff *ksz9893_xmit(struct sk_buff *skb,
 {
 	u16 queue_mapping = skb_get_queue_mapping(skb);
 	u8 prio = netdev_txq_to_tc(dev, queue_mapping);
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	struct ethhdr *hdr;
 	u8 *tag;
 
@@ -410,7 +410,7 @@ static struct sk_buff *lan937x_xmit(struct sk_buff *skb,
 {
 	u16 queue_mapping = skb_get_queue_mapping(skb);
 	u8 prio = netdev_txq_to_tc(dev, queue_mapping);
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	const struct ethhdr *hdr = eth_hdr(skb);
 	__be16 *tag;
 	u16 val;
diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c
index c25f5536706b..1ed8ee24855d 100644
--- a/net/dsa/tag_lan9303.c
+++ b/net/dsa/tag_lan9303.c
@@ -56,7 +56,7 @@ static int lan9303_xmit_use_arl(struct dsa_port *dp, u8 *dest_addr)
 
 static struct sk_buff *lan9303_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	__be16 *lan9303_tag;
 	u16 tag;
 
@@ -99,7 +99,7 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev)
 
 	source_port = lan9303_tag1 & 0x3;
 
-	skb->dev = dsa_master_find_slave(dev, 0, source_port);
+	skb->dev = dsa_conduit_find_user(dev, 0, source_port);
 	if (!skb->dev) {
 		dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid source port\n");
 		return NULL;
diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c
index 40af80452747..2483785f6ab1 100644
--- a/net/dsa/tag_mtk.c
+++ b/net/dsa/tag_mtk.c
@@ -23,7 +23,7 @@
 static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb,
 				    struct net_device *dev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	u8 xmit_tpid;
 	u8 *mtk_tag;
 
@@ -85,7 +85,7 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev)
 	/* Get source port information */
 	port = (hdr & MTK_HDR_RECV_SOURCE_PORT_MASK);
 
-	skb->dev = dsa_master_find_slave(dev, 0, port);
+	skb->dev = dsa_conduit_find_user(dev, 0, port);
 	if (!skb->dev)
 		return NULL;
 
diff --git a/net/dsa/tag_none.c b/net/dsa/tag_none.c
index d2fd179c4227..9a473624db50 100644
--- a/net/dsa/tag_none.c
+++ b/net/dsa/tag_none.c
@@ -12,8 +12,8 @@
 
 #define NONE_NAME	"none"
 
-static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb,
-					    struct net_device *dev)
+static struct sk_buff *dsa_user_notag_xmit(struct sk_buff *skb,
+					   struct net_device *dev)
 {
 	/* Just return the original SKB */
 	return skb;
@@ -22,7 +22,7 @@ static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb,
 static const struct dsa_device_ops none_ops = {
 	.name	= NONE_NAME,
 	.proto	= DSA_TAG_PROTO_NONE,
-	.xmit	= dsa_slave_notag_xmit,
+	.xmit	= dsa_user_notag_xmit,
 };
 
 module_dsa_tag_driver(none_ops);
diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c
index 20bf7074d5a6..ef2f8fffb2c7 100644
--- a/net/dsa/tag_ocelot.c
+++ b/net/dsa/tag_ocelot.c
@@ -45,7 +45,7 @@ static void ocelot_xmit_get_vlan_info(struct sk_buff *skb, struct dsa_port *dp,
 static void ocelot_xmit_common(struct sk_buff *skb, struct net_device *netdev,
 			       __be32 ifh_prefix, void **ifh)
 {
-	struct dsa_port *dp = dsa_slave_to_port(netdev);
+	struct dsa_port *dp = dsa_user_to_port(netdev);
 	struct dsa_switch *ds = dp->ds;
 	u64 vlan_tci, tag_type;
 	void *injection;
@@ -79,7 +79,7 @@ static void ocelot_xmit_common(struct sk_buff *skb, struct net_device *netdev,
 static struct sk_buff *ocelot_xmit(struct sk_buff *skb,
 				   struct net_device *netdev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(netdev);
+	struct dsa_port *dp = dsa_user_to_port(netdev);
 	void *injection;
 
 	ocelot_xmit_common(skb, netdev, cpu_to_be32(0x8880000a), &injection);
@@ -91,7 +91,7 @@ static struct sk_buff *ocelot_xmit(struct sk_buff *skb,
 static struct sk_buff *seville_xmit(struct sk_buff *skb,
 				    struct net_device *netdev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(netdev);
+	struct dsa_port *dp = dsa_user_to_port(netdev);
 	void *injection;
 
 	ocelot_xmit_common(skb, netdev, cpu_to_be32(0x88800005), &injection);
@@ -111,12 +111,12 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb,
 	u16 vlan_tpid;
 	u64 rew_val;
 
-	/* Revert skb->data by the amount consumed by the DSA master,
+	/* Revert skb->data by the amount consumed by the DSA conduit,
 	 * so it points to the beginning of the frame.
 	 */
 	skb_push(skb, ETH_HLEN);
 	/* We don't care about the short prefix, it is just for easy entrance
-	 * into the DSA master's RX filter. Discard it now by moving it into
+	 * into the DSA conduit's RX filter. Discard it now by moving it into
 	 * the headroom.
 	 */
 	skb_pull(skb, OCELOT_SHORT_PREFIX_LEN);
@@ -141,12 +141,12 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb,
 	ocelot_xfh_get_vlan_tci(extraction, &vlan_tci);
 	ocelot_xfh_get_rew_val(extraction, &rew_val);
 
-	skb->dev = dsa_master_find_slave(netdev, 0, src_port);
+	skb->dev = dsa_conduit_find_user(netdev, 0, src_port);
 	if (!skb->dev)
 		/* The switch will reflect back some frames sent through
-		 * sockets opened on the bare DSA master. These will come back
+		 * sockets opened on the bare DSA conduit. These will come back
 		 * with src_port equal to the index of the CPU port, for which
-		 * there is no slave registered. So don't print any error
+		 * there is no user registered. So don't print any error
 		 * message here (ignore and drop those frames).
 		 */
 		return NULL;
@@ -170,7 +170,7 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb,
 	 * equal to the pvid of the ingress port and should not be used for
 	 * processing.
 	 */
-	dp = dsa_slave_to_port(skb->dev);
+	dp = dsa_user_to_port(skb->dev);
 	vlan_tpid = tag_type ? ETH_P_8021AD : ETH_P_8021Q;
 
 	if (dsa_port_is_vlan_filtering(dp) &&
@@ -192,7 +192,7 @@ static const struct dsa_device_ops ocelot_netdev_ops = {
 	.xmit			= ocelot_xmit,
 	.rcv			= ocelot_rcv,
 	.needed_headroom	= OCELOT_TOTAL_TAG_LEN,
-	.promisc_on_master	= true,
+	.promisc_on_conduit	= true,
 };
 
 DSA_TAG_DRIVER(ocelot_netdev_ops);
@@ -204,7 +204,7 @@ static const struct dsa_device_ops seville_netdev_ops = {
 	.xmit			= seville_xmit,
 	.rcv			= ocelot_rcv,
 	.needed_headroom	= OCELOT_TOTAL_TAG_LEN,
-	.promisc_on_master	= true,
+	.promisc_on_conduit	= true,
 };
 
 DSA_TAG_DRIVER(seville_netdev_ops);
diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c
index 1f0b8c20eba5..210039320888 100644
--- a/net/dsa/tag_ocelot_8021q.c
+++ b/net/dsa/tag_ocelot_8021q.c
@@ -37,8 +37,8 @@ static struct sk_buff *ocelot_defer_xmit(struct dsa_port *dp,
 		return NULL;
 
 	/* PTP over IP packets need UDP checksumming. We may have inherited
-	 * NETIF_F_HW_CSUM from the DSA master, but these packets are not sent
-	 * through the DSA master, so calculate the checksum here.
+	 * NETIF_F_HW_CSUM from the DSA conduit, but these packets are not sent
+	 * through the DSA conduit, so calculate the checksum here.
 	 */
 	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
 		return NULL;
@@ -49,7 +49,7 @@ static struct sk_buff *ocelot_defer_xmit(struct dsa_port *dp,
 
 	/* Calls felix_port_deferred_xmit in felix.c */
 	kthread_init_work(&xmit_work->work, xmit_work_fn);
-	/* Increase refcount so the kfree_skb in dsa_slave_xmit
+	/* Increase refcount so the kfree_skb in dsa_user_xmit
 	 * won't really free the packet.
 	 */
 	xmit_work->dp = dp;
@@ -63,7 +63,7 @@ static struct sk_buff *ocelot_defer_xmit(struct dsa_port *dp,
 static struct sk_buff *ocelot_xmit(struct sk_buff *skb,
 				   struct net_device *netdev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(netdev);
+	struct dsa_port *dp = dsa_user_to_port(netdev);
 	u16 queue_mapping = skb_get_queue_mapping(skb);
 	u8 pcp = netdev_txq_to_tc(netdev, queue_mapping);
 	u16 tx_vid = dsa_tag_8021q_standalone_vid(dp);
@@ -83,7 +83,7 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb,
 
 	dsa_8021q_rcv(skb, &src_port, &switch_id, NULL);
 
-	skb->dev = dsa_master_find_slave(netdev, switch_id, src_port);
+	skb->dev = dsa_conduit_find_user(netdev, switch_id, src_port);
 	if (!skb->dev)
 		return NULL;
 
@@ -130,7 +130,7 @@ static const struct dsa_device_ops ocelot_8021q_netdev_ops = {
 	.connect		= ocelot_connect,
 	.disconnect		= ocelot_disconnect,
 	.needed_headroom	= VLAN_HLEN,
-	.promisc_on_master	= true,
+	.promisc_on_conduit	= true,
 };
 
 MODULE_LICENSE("GPL v2");
diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c
index e5ff7c34e577..6514aa7993ce 100644
--- a/net/dsa/tag_qca.c
+++ b/net/dsa/tag_qca.c
@@ -14,7 +14,7 @@
 
 static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	__be16 *phdr;
 	u16 hdr;
 
@@ -78,7 +78,7 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev)
 	/* Get source port information */
 	port = FIELD_GET(QCA_HDR_RECV_SOURCE_PORT, hdr);
 
-	skb->dev = dsa_master_find_slave(dev, 0, port);
+	skb->dev = dsa_conduit_find_user(dev, 0, port);
 	if (!skb->dev)
 		return NULL;
 
@@ -116,7 +116,7 @@ static const struct dsa_device_ops qca_netdev_ops = {
 	.xmit	= qca_tag_xmit,
 	.rcv	= qca_tag_rcv,
 	.needed_headroom = QCA_HDR_LEN,
-	.promisc_on_master = true,
+	.promisc_on_conduit = true,
 };
 
 MODULE_LICENSE("GPL");
diff --git a/net/dsa/tag_rtl4_a.c b/net/dsa/tag_rtl4_a.c
index c327314b95e3..4da5bad1a7aa 100644
--- a/net/dsa/tag_rtl4_a.c
+++ b/net/dsa/tag_rtl4_a.c
@@ -36,7 +36,7 @@
 static struct sk_buff *rtl4a_tag_xmit(struct sk_buff *skb,
 				      struct net_device *dev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	__be16 *p;
 	u8 *tag;
 	u16 out;
@@ -97,9 +97,9 @@ static struct sk_buff *rtl4a_tag_rcv(struct sk_buff *skb,
 	}
 	port = protport & 0xff;
 
-	skb->dev = dsa_master_find_slave(dev, 0, port);
+	skb->dev = dsa_conduit_find_user(dev, 0, port);
 	if (!skb->dev) {
-		netdev_dbg(dev, "could not find slave for port %d\n", port);
+		netdev_dbg(dev, "could not find user for port %d\n", port);
 		return NULL;
 	}
 
diff --git a/net/dsa/tag_rtl8_4.c b/net/dsa/tag_rtl8_4.c
index 4f67834fd121..07e857debabf 100644
--- a/net/dsa/tag_rtl8_4.c
+++ b/net/dsa/tag_rtl8_4.c
@@ -103,7 +103,7 @@
 static void rtl8_4_write_tag(struct sk_buff *skb, struct net_device *dev,
 			     void *tag)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	__be16 tag16[RTL8_4_TAG_LEN / 2];
 
 	/* Set Realtek EtherType */
@@ -180,10 +180,10 @@ static int rtl8_4_read_tag(struct sk_buff *skb, struct net_device *dev,
 
 	/* Parse TX (switch->CPU) */
 	port = FIELD_GET(RTL8_4_TX, ntohs(tag16[3]));
-	skb->dev = dsa_master_find_slave(dev, 0, port);
+	skb->dev = dsa_conduit_find_user(dev, 0, port);
 	if (!skb->dev) {
 		dev_warn_ratelimited(&dev->dev,
-				     "could not find slave for port %d\n",
+				     "could not find user for port %d\n",
 				     port);
 		return -ENOENT;
 	}
diff --git a/net/dsa/tag_rzn1_a5psw.c b/net/dsa/tag_rzn1_a5psw.c
index 437a6820ac42..2ce866b45615 100644
--- a/net/dsa/tag_rzn1_a5psw.c
+++ b/net/dsa/tag_rzn1_a5psw.c
@@ -39,7 +39,7 @@ struct a5psw_tag {
 
 static struct sk_buff *a5psw_tag_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	struct a5psw_tag *ptag;
 	u32 data2_val;
 
@@ -90,7 +90,7 @@ static struct sk_buff *a5psw_tag_rcv(struct sk_buff *skb,
 
 	port = FIELD_GET(A5PSW_CTRL_DATA_PORT, ntohs(tag->ctrl_data));
 
-	skb->dev = dsa_master_find_slave(dev, 0, port);
+	skb->dev = dsa_conduit_find_user(dev, 0, port);
 	if (!skb->dev)
 		return NULL;
 
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index ade3eeb2f3e6..1fffe8c2b589 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -157,7 +157,7 @@ static struct sk_buff *sja1105_defer_xmit(struct dsa_port *dp,
 		return NULL;
 
 	kthread_init_work(&xmit_work->work, xmit_work_fn);
-	/* Increase refcount so the kfree_skb in dsa_slave_xmit
+	/* Increase refcount so the kfree_skb in dsa_user_xmit
 	 * won't really free the packet.
 	 */
 	xmit_work->dp = dp;
@@ -210,7 +210,7 @@ static u16 sja1105_xmit_tpid(struct dsa_port *dp)
 static struct sk_buff *sja1105_imprecise_xmit(struct sk_buff *skb,
 					      struct net_device *netdev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(netdev);
+	struct dsa_port *dp = dsa_user_to_port(netdev);
 	unsigned int bridge_num = dsa_port_bridge_num_get(dp);
 	struct net_device *br = dsa_port_bridge_dev_get(dp);
 	u16 tx_vid;
@@ -235,7 +235,7 @@ static struct sk_buff *sja1105_imprecise_xmit(struct sk_buff *skb,
 
 /* Transform untagged control packets into pvid-tagged control packets so that
  * all packets sent by this tagger are VLAN-tagged and we can configure the
- * switch to drop untagged packets coming from the DSA master.
+ * switch to drop untagged packets coming from the DSA conduit.
  */
 static struct sk_buff *sja1105_pvid_tag_control_pkt(struct dsa_port *dp,
 						    struct sk_buff *skb, u8 pcp)
@@ -266,7 +266,7 @@ static struct sk_buff *sja1105_pvid_tag_control_pkt(struct dsa_port *dp,
 static struct sk_buff *sja1105_xmit(struct sk_buff *skb,
 				    struct net_device *netdev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(netdev);
+	struct dsa_port *dp = dsa_user_to_port(netdev);
 	u16 queue_mapping = skb_get_queue_mapping(skb);
 	u8 pcp = netdev_txq_to_tc(netdev, queue_mapping);
 	u16 tx_vid = dsa_tag_8021q_standalone_vid(dp);
@@ -294,7 +294,7 @@ static struct sk_buff *sja1110_xmit(struct sk_buff *skb,
 				    struct net_device *netdev)
 {
 	struct sk_buff *clone = SJA1105_SKB_CB(skb)->clone;
-	struct dsa_port *dp = dsa_slave_to_port(netdev);
+	struct dsa_port *dp = dsa_user_to_port(netdev);
 	u16 queue_mapping = skb_get_queue_mapping(skb);
 	u8 pcp = netdev_txq_to_tc(netdev, queue_mapping);
 	u16 tx_vid = dsa_tag_8021q_standalone_vid(dp);
@@ -383,7 +383,7 @@ static struct sk_buff
 	 * Buffer it until we get its meta frame.
 	 */
 	if (is_link_local) {
-		struct dsa_port *dp = dsa_slave_to_port(skb->dev);
+		struct dsa_port *dp = dsa_user_to_port(skb->dev);
 		struct sja1105_tagger_private *priv;
 		struct dsa_switch *ds = dp->ds;
 
@@ -396,7 +396,7 @@ static struct sk_buff
 		if (priv->stampable_skb) {
 			dev_err_ratelimited(ds->dev,
 					    "Expected meta frame, is %12llx "
-					    "in the DSA master multicast filter?\n",
+					    "in the DSA conduit multicast filter?\n",
 					    SJA1105_META_DMAC);
 			kfree_skb(priv->stampable_skb);
 		}
@@ -417,7 +417,7 @@ static struct sk_buff
 	 * frame, which serves no further purpose).
 	 */
 	} else if (is_meta) {
-		struct dsa_port *dp = dsa_slave_to_port(skb->dev);
+		struct dsa_port *dp = dsa_user_to_port(skb->dev);
 		struct sja1105_tagger_private *priv;
 		struct dsa_switch *ds = dp->ds;
 		struct sk_buff *stampable_skb;
@@ -550,7 +550,7 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
 	}
 
 	if (source_port != -1 && switch_id != -1)
-		skb->dev = dsa_master_find_slave(netdev, switch_id, source_port);
+		skb->dev = dsa_conduit_find_user(netdev, switch_id, source_port);
 	else if (vbid >= 1)
 		skb->dev = dsa_tag_8021q_find_port_by_vbid(netdev, vbid);
 	else
@@ -573,16 +573,16 @@ static struct sk_buff *sja1110_rcv_meta(struct sk_buff *skb, u16 rx_header)
 	int switch_id = SJA1110_RX_HEADER_SWITCH_ID(rx_header);
 	int n_ts = SJA1110_RX_HEADER_N_TS(rx_header);
 	struct sja1105_tagger_data *tagger_data;
-	struct net_device *master = skb->dev;
+	struct net_device *conduit = skb->dev;
 	struct dsa_port *cpu_dp;
 	struct dsa_switch *ds;
 	int i;
 
-	cpu_dp = master->dsa_ptr;
+	cpu_dp = conduit->dsa_ptr;
 	ds = dsa_switch_find(cpu_dp->dst->index, switch_id);
 	if (!ds) {
 		net_err_ratelimited("%s: cannot find switch id %d\n",
-				    master->name, switch_id);
+				    conduit->name, switch_id);
 		return NULL;
 	}
 
@@ -649,7 +649,7 @@ static struct sk_buff *sja1110_rcv_inband_control_extension(struct sk_buff *skb,
 
 		/* skb->len counts from skb->data, while start_of_padding
 		 * counts from the destination MAC address. Right now skb->data
-		 * is still as set by the DSA master, so to trim away the
+		 * is still as set by the DSA conduit, so to trim away the
 		 * padding and trailer we need to account for the fact that
 		 * skb->data points to skb_mac_header(skb) + ETH_HLEN.
 		 */
@@ -698,7 +698,7 @@ static struct sk_buff *sja1110_rcv(struct sk_buff *skb,
 	else if (source_port == -1 || switch_id == -1)
 		skb->dev = dsa_find_designated_bridge_port_by_vid(netdev, vid);
 	else
-		skb->dev = dsa_master_find_slave(netdev, switch_id, source_port);
+		skb->dev = dsa_conduit_find_user(netdev, switch_id, source_port);
 	if (!skb->dev) {
 		netdev_warn(netdev, "Couldn't decode source port\n");
 		return NULL;
@@ -778,7 +778,7 @@ static const struct dsa_device_ops sja1105_netdev_ops = {
 	.disconnect = sja1105_disconnect,
 	.needed_headroom = VLAN_HLEN,
 	.flow_dissect = sja1105_flow_dissect,
-	.promisc_on_master = true,
+	.promisc_on_conduit = true,
 };
 
 DSA_TAG_DRIVER(sja1105_netdev_ops);
diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c
index 7361b9106382..1ebb25a8b140 100644
--- a/net/dsa/tag_trailer.c
+++ b/net/dsa/tag_trailer.c
@@ -14,7 +14,7 @@
 
 static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	u8 *trailer;
 
 	trailer = skb_put(skb, 4);
@@ -41,7 +41,7 @@ static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev)
 
 	source_port = trailer[1] & 7;
 
-	skb->dev = dsa_master_find_slave(dev, 0, source_port);
+	skb->dev = dsa_conduit_find_user(dev, 0, source_port);
 	if (!skb->dev)
 		return NULL;
 
diff --git a/net/dsa/tag_xrs700x.c b/net/dsa/tag_xrs700x.c
index af19969f9bc4..c9c163598ef2 100644
--- a/net/dsa/tag_xrs700x.c
+++ b/net/dsa/tag_xrs700x.c
@@ -13,7 +13,7 @@
 
 static struct sk_buff *xrs700x_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-	struct dsa_port *partner, *dp = dsa_slave_to_port(dev);
+	struct dsa_port *partner, *dp = dsa_user_to_port(dev);
 	u8 *trailer;
 
 	trailer = skb_put(skb, 1);
@@ -39,7 +39,7 @@ static struct sk_buff *xrs700x_rcv(struct sk_buff *skb, struct net_device *dev)
 	if (source_port < 0)
 		return NULL;
 
-	skb->dev = dsa_master_find_slave(dev, 0, source_port);
+	skb->dev = dsa_conduit_find_user(dev, 0, source_port);
 	if (!skb->dev)
 		return NULL;
 
diff --git a/net/dsa/user.c b/net/dsa/user.c
new file mode 100644
index 000000000000..d438884a4eb0
--- /dev/null
+++ b/net/dsa/user.c
@@ -0,0 +1,3727 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * net/dsa/user.c - user device handling
+ * Copyright (c) 2008-2009 Marvell Semiconductor
+ */
+
+#include <linux/list.h>
+#include <linux/etherdevice.h>
+#include <linux/netdevice.h>
+#include <linux/phy.h>
+#include <linux/phy_fixed.h>
+#include <linux/phylink.h>
+#include <linux/of_net.h>
+#include <linux/of_mdio.h>
+#include <linux/mdio.h>
+#include <net/rtnetlink.h>
+#include <net/pkt_cls.h>
+#include <net/selftests.h>
+#include <net/tc_act/tc_mirred.h>
+#include <linux/if_bridge.h>
+#include <linux/if_hsr.h>
+#include <net/dcbnl.h>
+#include <linux/netpoll.h>
+#include <linux/string.h>
+
+#include "conduit.h"
+#include "dsa.h"
+#include "netlink.h"
+#include "port.h"
+#include "switch.h"
+#include "tag.h"
+#include "user.h"
+
+struct dsa_switchdev_event_work {
+	struct net_device *dev;
+	struct net_device *orig_dev;
+	struct work_struct work;
+	unsigned long event;
+	/* Specific for SWITCHDEV_FDB_ADD_TO_DEVICE and
+	 * SWITCHDEV_FDB_DEL_TO_DEVICE
+	 */
+	unsigned char addr[ETH_ALEN];
+	u16 vid;
+	bool host_addr;
+};
+
+enum dsa_standalone_event {
+	DSA_UC_ADD,
+	DSA_UC_DEL,
+	DSA_MC_ADD,
+	DSA_MC_DEL,
+};
+
+struct dsa_standalone_event_work {
+	struct work_struct work;
+	struct net_device *dev;
+	enum dsa_standalone_event event;
+	unsigned char addr[ETH_ALEN];
+	u16 vid;
+};
+
+struct dsa_host_vlan_rx_filtering_ctx {
+	struct net_device *dev;
+	const unsigned char *addr;
+	enum dsa_standalone_event event;
+};
+
+static bool dsa_switch_supports_uc_filtering(struct dsa_switch *ds)
+{
+	return ds->ops->port_fdb_add && ds->ops->port_fdb_del &&
+	       ds->fdb_isolation && !ds->vlan_filtering_is_global &&
+	       !ds->needs_standalone_vlan_filtering;
+}
+
+static bool dsa_switch_supports_mc_filtering(struct dsa_switch *ds)
+{
+	return ds->ops->port_mdb_add && ds->ops->port_mdb_del &&
+	       ds->fdb_isolation && !ds->vlan_filtering_is_global &&
+	       !ds->needs_standalone_vlan_filtering;
+}
+
+static void dsa_user_standalone_event_work(struct work_struct *work)
+{
+	struct dsa_standalone_event_work *standalone_work =
+		container_of(work, struct dsa_standalone_event_work, work);
+	const unsigned char *addr = standalone_work->addr;
+	struct net_device *dev = standalone_work->dev;
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct switchdev_obj_port_mdb mdb;
+	struct dsa_switch *ds = dp->ds;
+	u16 vid = standalone_work->vid;
+	int err;
+
+	switch (standalone_work->event) {
+	case DSA_UC_ADD:
+		err = dsa_port_standalone_host_fdb_add(dp, addr, vid);
+		if (err) {
+			dev_err(ds->dev,
+				"port %d failed to add %pM vid %d to fdb: %d\n",
+				dp->index, addr, vid, err);
+			break;
+		}
+		break;
+
+	case DSA_UC_DEL:
+		err = dsa_port_standalone_host_fdb_del(dp, addr, vid);
+		if (err) {
+			dev_err(ds->dev,
+				"port %d failed to delete %pM vid %d from fdb: %d\n",
+				dp->index, addr, vid, err);
+		}
+
+		break;
+	case DSA_MC_ADD:
+		ether_addr_copy(mdb.addr, addr);
+		mdb.vid = vid;
+
+		err = dsa_port_standalone_host_mdb_add(dp, &mdb);
+		if (err) {
+			dev_err(ds->dev,
+				"port %d failed to add %pM vid %d to mdb: %d\n",
+				dp->index, addr, vid, err);
+			break;
+		}
+		break;
+	case DSA_MC_DEL:
+		ether_addr_copy(mdb.addr, addr);
+		mdb.vid = vid;
+
+		err = dsa_port_standalone_host_mdb_del(dp, &mdb);
+		if (err) {
+			dev_err(ds->dev,
+				"port %d failed to delete %pM vid %d from mdb: %d\n",
+				dp->index, addr, vid, err);
+		}
+
+		break;
+	}
+
+	kfree(standalone_work);
+}
+
+static int dsa_user_schedule_standalone_work(struct net_device *dev,
+					     enum dsa_standalone_event event,
+					     const unsigned char *addr,
+					     u16 vid)
+{
+	struct dsa_standalone_event_work *standalone_work;
+
+	standalone_work = kzalloc(sizeof(*standalone_work), GFP_ATOMIC);
+	if (!standalone_work)
+		return -ENOMEM;
+
+	INIT_WORK(&standalone_work->work, dsa_user_standalone_event_work);
+	standalone_work->event = event;
+	standalone_work->dev = dev;
+
+	ether_addr_copy(standalone_work->addr, addr);
+	standalone_work->vid = vid;
+
+	dsa_schedule_work(&standalone_work->work);
+
+	return 0;
+}
+
+static int dsa_user_host_vlan_rx_filtering(void *arg, int vid)
+{
+	struct dsa_host_vlan_rx_filtering_ctx *ctx = arg;
+
+	return dsa_user_schedule_standalone_work(ctx->dev, ctx->event,
+						  ctx->addr, vid);
+}
+
+static int dsa_user_vlan_for_each(struct net_device *dev,
+				  int (*cb)(void *arg, int vid), void *arg)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_vlan *v;
+	int err;
+
+	lockdep_assert_held(&dev->addr_list_lock);
+
+	err = cb(arg, 0);
+	if (err)
+		return err;
+
+	list_for_each_entry(v, &dp->user_vlans, list) {
+		err = cb(arg, v->vid);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int dsa_user_sync_uc(struct net_device *dev,
+			    const unsigned char *addr)
+{
+	struct net_device *conduit = dsa_user_to_conduit(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_host_vlan_rx_filtering_ctx ctx = {
+		.dev = dev,
+		.addr = addr,
+		.event = DSA_UC_ADD,
+	};
+
+	dev_uc_add(conduit, addr);
+
+	if (!dsa_switch_supports_uc_filtering(dp->ds))
+		return 0;
+
+	return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering,
+				       &ctx);
+}
+
+static int dsa_user_unsync_uc(struct net_device *dev,
+			      const unsigned char *addr)
+{
+	struct net_device *conduit = dsa_user_to_conduit(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_host_vlan_rx_filtering_ctx ctx = {
+		.dev = dev,
+		.addr = addr,
+		.event = DSA_UC_DEL,
+	};
+
+	dev_uc_del(conduit, addr);
+
+	if (!dsa_switch_supports_uc_filtering(dp->ds))
+		return 0;
+
+	return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering,
+				       &ctx);
+}
+
+static int dsa_user_sync_mc(struct net_device *dev,
+			    const unsigned char *addr)
+{
+	struct net_device *conduit = dsa_user_to_conduit(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_host_vlan_rx_filtering_ctx ctx = {
+		.dev = dev,
+		.addr = addr,
+		.event = DSA_MC_ADD,
+	};
+
+	dev_mc_add(conduit, addr);
+
+	if (!dsa_switch_supports_mc_filtering(dp->ds))
+		return 0;
+
+	return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering,
+				       &ctx);
+}
+
+static int dsa_user_unsync_mc(struct net_device *dev,
+			      const unsigned char *addr)
+{
+	struct net_device *conduit = dsa_user_to_conduit(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_host_vlan_rx_filtering_ctx ctx = {
+		.dev = dev,
+		.addr = addr,
+		.event = DSA_MC_DEL,
+	};
+
+	dev_mc_del(conduit, addr);
+
+	if (!dsa_switch_supports_mc_filtering(dp->ds))
+		return 0;
+
+	return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering,
+				       &ctx);
+}
+
+void dsa_user_sync_ha(struct net_device *dev)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	struct netdev_hw_addr *ha;
+
+	netif_addr_lock_bh(dev);
+
+	netdev_for_each_synced_mc_addr(ha, dev)
+		dsa_user_sync_mc(dev, ha->addr);
+
+	netdev_for_each_synced_uc_addr(ha, dev)
+		dsa_user_sync_uc(dev, ha->addr);
+
+	netif_addr_unlock_bh(dev);
+
+	if (dsa_switch_supports_uc_filtering(ds) ||
+	    dsa_switch_supports_mc_filtering(ds))
+		dsa_flush_workqueue();
+}
+
+void dsa_user_unsync_ha(struct net_device *dev)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	struct netdev_hw_addr *ha;
+
+	netif_addr_lock_bh(dev);
+
+	netdev_for_each_synced_uc_addr(ha, dev)
+		dsa_user_unsync_uc(dev, ha->addr);
+
+	netdev_for_each_synced_mc_addr(ha, dev)
+		dsa_user_unsync_mc(dev, ha->addr);
+
+	netif_addr_unlock_bh(dev);
+
+	if (dsa_switch_supports_uc_filtering(ds) ||
+	    dsa_switch_supports_mc_filtering(ds))
+		dsa_flush_workqueue();
+}
+
+/* user mii_bus handling ***************************************************/
+static int dsa_user_phy_read(struct mii_bus *bus, int addr, int reg)
+{
+	struct dsa_switch *ds = bus->priv;
+
+	if (ds->phys_mii_mask & (1 << addr))
+		return ds->ops->phy_read(ds, addr, reg);
+
+	return 0xffff;
+}
+
+static int dsa_user_phy_write(struct mii_bus *bus, int addr, int reg, u16 val)
+{
+	struct dsa_switch *ds = bus->priv;
+
+	if (ds->phys_mii_mask & (1 << addr))
+		return ds->ops->phy_write(ds, addr, reg, val);
+
+	return 0;
+}
+
+void dsa_user_mii_bus_init(struct dsa_switch *ds)
+{
+	ds->user_mii_bus->priv = (void *)ds;
+	ds->user_mii_bus->name = "dsa user smi";
+	ds->user_mii_bus->read = dsa_user_phy_read;
+	ds->user_mii_bus->write = dsa_user_phy_write;
+	snprintf(ds->user_mii_bus->id, MII_BUS_ID_SIZE, "dsa-%d.%d",
+		 ds->dst->index, ds->index);
+	ds->user_mii_bus->parent = ds->dev;
+	ds->user_mii_bus->phy_mask = ~ds->phys_mii_mask;
+}
+
+
+/* user device handling ****************************************************/
+static int dsa_user_get_iflink(const struct net_device *dev)
+{
+	return dsa_user_to_conduit(dev)->ifindex;
+}
+
+static int dsa_user_open(struct net_device *dev)
+{
+	struct net_device *conduit = dsa_user_to_conduit(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	int err;
+
+	err = dev_open(conduit, NULL);
+	if (err < 0) {
+		netdev_err(dev, "failed to open conduit %s\n", conduit->name);
+		goto out;
+	}
+
+	if (dsa_switch_supports_uc_filtering(ds)) {
+		err = dsa_port_standalone_host_fdb_add(dp, dev->dev_addr, 0);
+		if (err)
+			goto out;
+	}
+
+	if (!ether_addr_equal(dev->dev_addr, conduit->dev_addr)) {
+		err = dev_uc_add(conduit, dev->dev_addr);
+		if (err < 0)
+			goto del_host_addr;
+	}
+
+	err = dsa_port_enable_rt(dp, dev->phydev);
+	if (err)
+		goto del_unicast;
+
+	return 0;
+
+del_unicast:
+	if (!ether_addr_equal(dev->dev_addr, conduit->dev_addr))
+		dev_uc_del(conduit, dev->dev_addr);
+del_host_addr:
+	if (dsa_switch_supports_uc_filtering(ds))
+		dsa_port_standalone_host_fdb_del(dp, dev->dev_addr, 0);
+out:
+	return err;
+}
+
+static int dsa_user_close(struct net_device *dev)
+{
+	struct net_device *conduit = dsa_user_to_conduit(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	dsa_port_disable_rt(dp);
+
+	if (!ether_addr_equal(dev->dev_addr, conduit->dev_addr))
+		dev_uc_del(conduit, dev->dev_addr);
+
+	if (dsa_switch_supports_uc_filtering(ds))
+		dsa_port_standalone_host_fdb_del(dp, dev->dev_addr, 0);
+
+	return 0;
+}
+
+static void dsa_user_manage_host_flood(struct net_device *dev)
+{
+	bool mc = dev->flags & (IFF_PROMISC | IFF_ALLMULTI);
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	bool uc = dev->flags & IFF_PROMISC;
+
+	dsa_port_set_host_flood(dp, uc, mc);
+}
+
+static void dsa_user_change_rx_flags(struct net_device *dev, int change)
+{
+	struct net_device *conduit = dsa_user_to_conduit(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (change & IFF_ALLMULTI)
+		dev_set_allmulti(conduit,
+				 dev->flags & IFF_ALLMULTI ? 1 : -1);
+	if (change & IFF_PROMISC)
+		dev_set_promiscuity(conduit,
+				    dev->flags & IFF_PROMISC ? 1 : -1);
+
+	if (dsa_switch_supports_uc_filtering(ds) &&
+	    dsa_switch_supports_mc_filtering(ds))
+		dsa_user_manage_host_flood(dev);
+}
+
+static void dsa_user_set_rx_mode(struct net_device *dev)
+{
+	__dev_mc_sync(dev, dsa_user_sync_mc, dsa_user_unsync_mc);
+	__dev_uc_sync(dev, dsa_user_sync_uc, dsa_user_unsync_uc);
+}
+
+static int dsa_user_set_mac_address(struct net_device *dev, void *a)
+{
+	struct net_device *conduit = dsa_user_to_conduit(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	struct sockaddr *addr = a;
+	int err;
+
+	if (!is_valid_ether_addr(addr->sa_data))
+		return -EADDRNOTAVAIL;
+
+	if (ds->ops->port_set_mac_address) {
+		err = ds->ops->port_set_mac_address(ds, dp->index,
+						    addr->sa_data);
+		if (err)
+			return err;
+	}
+
+	/* If the port is down, the address isn't synced yet to hardware or
+	 * to the DSA conduit, so there is nothing to change.
+	 */
+	if (!(dev->flags & IFF_UP))
+		goto out_change_dev_addr;
+
+	if (dsa_switch_supports_uc_filtering(ds)) {
+		err = dsa_port_standalone_host_fdb_add(dp, addr->sa_data, 0);
+		if (err)
+			return err;
+	}
+
+	if (!ether_addr_equal(addr->sa_data, conduit->dev_addr)) {
+		err = dev_uc_add(conduit, addr->sa_data);
+		if (err < 0)
+			goto del_unicast;
+	}
+
+	if (!ether_addr_equal(dev->dev_addr, conduit->dev_addr))
+		dev_uc_del(conduit, dev->dev_addr);
+
+	if (dsa_switch_supports_uc_filtering(ds))
+		dsa_port_standalone_host_fdb_del(dp, dev->dev_addr, 0);
+
+out_change_dev_addr:
+	eth_hw_addr_set(dev, addr->sa_data);
+
+	return 0;
+
+del_unicast:
+	if (dsa_switch_supports_uc_filtering(ds))
+		dsa_port_standalone_host_fdb_del(dp, addr->sa_data, 0);
+
+	return err;
+}
+
+struct dsa_user_dump_ctx {
+	struct net_device *dev;
+	struct sk_buff *skb;
+	struct netlink_callback *cb;
+	int idx;
+};
+
+static int
+dsa_user_port_fdb_do_dump(const unsigned char *addr, u16 vid,
+			  bool is_static, void *data)
+{
+	struct dsa_user_dump_ctx *dump = data;
+	u32 portid = NETLINK_CB(dump->cb->skb).portid;
+	u32 seq = dump->cb->nlh->nlmsg_seq;
+	struct nlmsghdr *nlh;
+	struct ndmsg *ndm;
+
+	if (dump->idx < dump->cb->args[2])
+		goto skip;
+
+	nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH,
+			sizeof(*ndm), NLM_F_MULTI);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	ndm = nlmsg_data(nlh);
+	ndm->ndm_family  = AF_BRIDGE;
+	ndm->ndm_pad1    = 0;
+	ndm->ndm_pad2    = 0;
+	ndm->ndm_flags   = NTF_SELF;
+	ndm->ndm_type    = 0;
+	ndm->ndm_ifindex = dump->dev->ifindex;
+	ndm->ndm_state   = is_static ? NUD_NOARP : NUD_REACHABLE;
+
+	if (nla_put(dump->skb, NDA_LLADDR, ETH_ALEN, addr))
+		goto nla_put_failure;
+
+	if (vid && nla_put_u16(dump->skb, NDA_VLAN, vid))
+		goto nla_put_failure;
+
+	nlmsg_end(dump->skb, nlh);
+
+skip:
+	dump->idx++;
+	return 0;
+
+nla_put_failure:
+	nlmsg_cancel(dump->skb, nlh);
+	return -EMSGSIZE;
+}
+
+static int
+dsa_user_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
+		  struct net_device *dev, struct net_device *filter_dev,
+		  int *idx)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_user_dump_ctx dump = {
+		.dev = dev,
+		.skb = skb,
+		.cb = cb,
+		.idx = *idx,
+	};
+	int err;
+
+	err = dsa_port_fdb_dump(dp, dsa_user_port_fdb_do_dump, &dump);
+	*idx = dump.idx;
+
+	return err;
+}
+
+static int dsa_user_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+	struct dsa_user_priv *p = netdev_priv(dev);
+	struct dsa_switch *ds = p->dp->ds;
+	int port = p->dp->index;
+
+	/* Pass through to switch driver if it supports timestamping */
+	switch (cmd) {
+	case SIOCGHWTSTAMP:
+		if (ds->ops->port_hwtstamp_get)
+			return ds->ops->port_hwtstamp_get(ds, port, ifr);
+		break;
+	case SIOCSHWTSTAMP:
+		if (ds->ops->port_hwtstamp_set)
+			return ds->ops->port_hwtstamp_set(ds, port, ifr);
+		break;
+	}
+
+	return phylink_mii_ioctl(p->dp->pl, ifr, cmd);
+}
+
+static int dsa_user_port_attr_set(struct net_device *dev, const void *ctx,
+				  const struct switchdev_attr *attr,
+				  struct netlink_ext_ack *extack)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	int ret;
+
+	if (ctx && ctx != dp)
+		return 0;
+
+	switch (attr->id) {
+	case SWITCHDEV_ATTR_ID_PORT_STP_STATE:
+		if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev))
+			return -EOPNOTSUPP;
+
+		ret = dsa_port_set_state(dp, attr->u.stp_state, true);
+		break;
+	case SWITCHDEV_ATTR_ID_PORT_MST_STATE:
+		if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev))
+			return -EOPNOTSUPP;
+
+		ret = dsa_port_set_mst_state(dp, &attr->u.mst_state, extack);
+		break;
+	case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING:
+		if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev))
+			return -EOPNOTSUPP;
+
+		ret = dsa_port_vlan_filtering(dp, attr->u.vlan_filtering,
+					      extack);
+		break;
+	case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME:
+		if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev))
+			return -EOPNOTSUPP;
+
+		ret = dsa_port_ageing_time(dp, attr->u.ageing_time);
+		break;
+	case SWITCHDEV_ATTR_ID_BRIDGE_MST:
+		if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev))
+			return -EOPNOTSUPP;
+
+		ret = dsa_port_mst_enable(dp, attr->u.mst, extack);
+		break;
+	case SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS:
+		if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev))
+			return -EOPNOTSUPP;
+
+		ret = dsa_port_pre_bridge_flags(dp, attr->u.brport_flags,
+						extack);
+		break;
+	case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS:
+		if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev))
+			return -EOPNOTSUPP;
+
+		ret = dsa_port_bridge_flags(dp, attr->u.brport_flags, extack);
+		break;
+	case SWITCHDEV_ATTR_ID_VLAN_MSTI:
+		if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev))
+			return -EOPNOTSUPP;
+
+		ret = dsa_port_vlan_msti(dp, &attr->u.vlan_msti);
+		break;
+	default:
+		ret = -EOPNOTSUPP;
+		break;
+	}
+
+	return ret;
+}
+
+/* Must be called under rcu_read_lock() */
+static int
+dsa_user_vlan_check_for_8021q_uppers(struct net_device *user,
+				     const struct switchdev_obj_port_vlan *vlan)
+{
+	struct net_device *upper_dev;
+	struct list_head *iter;
+
+	netdev_for_each_upper_dev_rcu(user, upper_dev, iter) {
+		u16 vid;
+
+		if (!is_vlan_dev(upper_dev))
+			continue;
+
+		vid = vlan_dev_vlan_id(upper_dev);
+		if (vid == vlan->vid)
+			return -EBUSY;
+	}
+
+	return 0;
+}
+
+static int dsa_user_vlan_add(struct net_device *dev,
+			     const struct switchdev_obj *obj,
+			     struct netlink_ext_ack *extack)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct switchdev_obj_port_vlan *vlan;
+	int err;
+
+	if (dsa_port_skip_vlan_configuration(dp)) {
+		NL_SET_ERR_MSG_MOD(extack, "skipping configuration of VLAN");
+		return 0;
+	}
+
+	vlan = SWITCHDEV_OBJ_PORT_VLAN(obj);
+
+	/* Deny adding a bridge VLAN when there is already an 802.1Q upper with
+	 * the same VID.
+	 */
+	if (br_vlan_enabled(dsa_port_bridge_dev_get(dp))) {
+		rcu_read_lock();
+		err = dsa_user_vlan_check_for_8021q_uppers(dev, vlan);
+		rcu_read_unlock();
+		if (err) {
+			NL_SET_ERR_MSG_MOD(extack,
+					   "Port already has a VLAN upper with this VID");
+			return err;
+		}
+	}
+
+	return dsa_port_vlan_add(dp, vlan, extack);
+}
+
+/* Offload a VLAN installed on the bridge or on a foreign interface by
+ * installing it as a VLAN towards the CPU port.
+ */
+static int dsa_user_host_vlan_add(struct net_device *dev,
+				  const struct switchdev_obj *obj,
+				  struct netlink_ext_ack *extack)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct switchdev_obj_port_vlan vlan;
+
+	/* Do nothing if this is a software bridge */
+	if (!dp->bridge)
+		return -EOPNOTSUPP;
+
+	if (dsa_port_skip_vlan_configuration(dp)) {
+		NL_SET_ERR_MSG_MOD(extack, "skipping configuration of VLAN");
+		return 0;
+	}
+
+	vlan = *SWITCHDEV_OBJ_PORT_VLAN(obj);
+
+	/* Even though drivers often handle CPU membership in special ways,
+	 * it doesn't make sense to program a PVID, so clear this flag.
+	 */
+	vlan.flags &= ~BRIDGE_VLAN_INFO_PVID;
+
+	return dsa_port_host_vlan_add(dp, &vlan, extack);
+}
+
+static int dsa_user_port_obj_add(struct net_device *dev, const void *ctx,
+				 const struct switchdev_obj *obj,
+				 struct netlink_ext_ack *extack)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	int err;
+
+	if (ctx && ctx != dp)
+		return 0;
+
+	switch (obj->id) {
+	case SWITCHDEV_OBJ_ID_PORT_MDB:
+		if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev))
+			return -EOPNOTSUPP;
+
+		err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
+		break;
+	case SWITCHDEV_OBJ_ID_HOST_MDB:
+		if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
+			return -EOPNOTSUPP;
+
+		err = dsa_port_bridge_host_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
+		break;
+	case SWITCHDEV_OBJ_ID_PORT_VLAN:
+		if (dsa_port_offloads_bridge_port(dp, obj->orig_dev))
+			err = dsa_user_vlan_add(dev, obj, extack);
+		else
+			err = dsa_user_host_vlan_add(dev, obj, extack);
+		break;
+	case SWITCHDEV_OBJ_ID_MRP:
+		if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
+			return -EOPNOTSUPP;
+
+		err = dsa_port_mrp_add(dp, SWITCHDEV_OBJ_MRP(obj));
+		break;
+	case SWITCHDEV_OBJ_ID_RING_ROLE_MRP:
+		if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
+			return -EOPNOTSUPP;
+
+		err = dsa_port_mrp_add_ring_role(dp,
+						 SWITCHDEV_OBJ_RING_ROLE_MRP(obj));
+		break;
+	default:
+		err = -EOPNOTSUPP;
+		break;
+	}
+
+	return err;
+}
+
+static int dsa_user_vlan_del(struct net_device *dev,
+			     const struct switchdev_obj *obj)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct switchdev_obj_port_vlan *vlan;
+
+	if (dsa_port_skip_vlan_configuration(dp))
+		return 0;
+
+	vlan = SWITCHDEV_OBJ_PORT_VLAN(obj);
+
+	return dsa_port_vlan_del(dp, vlan);
+}
+
+static int dsa_user_host_vlan_del(struct net_device *dev,
+				  const struct switchdev_obj *obj)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct switchdev_obj_port_vlan *vlan;
+
+	/* Do nothing if this is a software bridge */
+	if (!dp->bridge)
+		return -EOPNOTSUPP;
+
+	if (dsa_port_skip_vlan_configuration(dp))
+		return 0;
+
+	vlan = SWITCHDEV_OBJ_PORT_VLAN(obj);
+
+	return dsa_port_host_vlan_del(dp, vlan);
+}
+
+static int dsa_user_port_obj_del(struct net_device *dev, const void *ctx,
+				 const struct switchdev_obj *obj)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	int err;
+
+	if (ctx && ctx != dp)
+		return 0;
+
+	switch (obj->id) {
+	case SWITCHDEV_OBJ_ID_PORT_MDB:
+		if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev))
+			return -EOPNOTSUPP;
+
+		err = dsa_port_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
+		break;
+	case SWITCHDEV_OBJ_ID_HOST_MDB:
+		if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
+			return -EOPNOTSUPP;
+
+		err = dsa_port_bridge_host_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
+		break;
+	case SWITCHDEV_OBJ_ID_PORT_VLAN:
+		if (dsa_port_offloads_bridge_port(dp, obj->orig_dev))
+			err = dsa_user_vlan_del(dev, obj);
+		else
+			err = dsa_user_host_vlan_del(dev, obj);
+		break;
+	case SWITCHDEV_OBJ_ID_MRP:
+		if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
+			return -EOPNOTSUPP;
+
+		err = dsa_port_mrp_del(dp, SWITCHDEV_OBJ_MRP(obj));
+		break;
+	case SWITCHDEV_OBJ_ID_RING_ROLE_MRP:
+		if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
+			return -EOPNOTSUPP;
+
+		err = dsa_port_mrp_del_ring_role(dp,
+						 SWITCHDEV_OBJ_RING_ROLE_MRP(obj));
+		break;
+	default:
+		err = -EOPNOTSUPP;
+		break;
+	}
+
+	return err;
+}
+
+static inline netdev_tx_t dsa_user_netpoll_send_skb(struct net_device *dev,
+						    struct sk_buff *skb)
+{
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	struct dsa_user_priv *p = netdev_priv(dev);
+
+	return netpoll_send_skb(p->netpoll, skb);
+#else
+	BUG();
+	return NETDEV_TX_OK;
+#endif
+}
+
+static void dsa_skb_tx_timestamp(struct dsa_user_priv *p,
+				 struct sk_buff *skb)
+{
+	struct dsa_switch *ds = p->dp->ds;
+
+	if (!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP))
+		return;
+
+	if (!ds->ops->port_txtstamp)
+		return;
+
+	ds->ops->port_txtstamp(ds, p->dp->index, skb);
+}
+
+netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev)
+{
+	/* SKB for netpoll still need to be mangled with the protocol-specific
+	 * tag to be successfully transmitted
+	 */
+	if (unlikely(netpoll_tx_running(dev)))
+		return dsa_user_netpoll_send_skb(dev, skb);
+
+	/* Queue the SKB for transmission on the parent interface, but
+	 * do not modify its EtherType
+	 */
+	skb->dev = dsa_user_to_conduit(dev);
+	dev_queue_xmit(skb);
+
+	return NETDEV_TX_OK;
+}
+EXPORT_SYMBOL_GPL(dsa_enqueue_skb);
+
+static int dsa_realloc_skb(struct sk_buff *skb, struct net_device *dev)
+{
+	int needed_headroom = dev->needed_headroom;
+	int needed_tailroom = dev->needed_tailroom;
+
+	/* For tail taggers, we need to pad short frames ourselves, to ensure
+	 * that the tail tag does not fail at its role of being at the end of
+	 * the packet, once the conduit interface pads the frame. Account for
+	 * that pad length here, and pad later.
+	 */
+	if (unlikely(needed_tailroom && skb->len < ETH_ZLEN))
+		needed_tailroom += ETH_ZLEN - skb->len;
+	/* skb_headroom() returns unsigned int... */
+	needed_headroom = max_t(int, needed_headroom - skb_headroom(skb), 0);
+	needed_tailroom = max_t(int, needed_tailroom - skb_tailroom(skb), 0);
+
+	if (likely(!needed_headroom && !needed_tailroom && !skb_cloned(skb)))
+		/* No reallocation needed, yay! */
+		return 0;
+
+	return pskb_expand_head(skb, needed_headroom, needed_tailroom,
+				GFP_ATOMIC);
+}
+
+static netdev_tx_t dsa_user_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct dsa_user_priv *p = netdev_priv(dev);
+	struct sk_buff *nskb;
+
+	dev_sw_netstats_tx_add(dev, 1, skb->len);
+
+	memset(skb->cb, 0, sizeof(skb->cb));
+
+	/* Handle tx timestamp if any */
+	dsa_skb_tx_timestamp(p, skb);
+
+	if (dsa_realloc_skb(skb, dev)) {
+		dev_kfree_skb_any(skb);
+		return NETDEV_TX_OK;
+	}
+
+	/* needed_tailroom should still be 'warm' in the cache line from
+	 * dsa_realloc_skb(), which has also ensured that padding is safe.
+	 */
+	if (dev->needed_tailroom)
+		eth_skb_pad(skb);
+
+	/* Transmit function may have to reallocate the original SKB,
+	 * in which case it must have freed it. Only free it here on error.
+	 */
+	nskb = p->xmit(skb, dev);
+	if (!nskb) {
+		kfree_skb(skb);
+		return NETDEV_TX_OK;
+	}
+
+	return dsa_enqueue_skb(nskb, dev);
+}
+
+/* ethtool operations *******************************************************/
+
+static void dsa_user_get_drvinfo(struct net_device *dev,
+				 struct ethtool_drvinfo *drvinfo)
+{
+	strscpy(drvinfo->driver, "dsa", sizeof(drvinfo->driver));
+	strscpy(drvinfo->fw_version, "N/A", sizeof(drvinfo->fw_version));
+	strscpy(drvinfo->bus_info, "platform", sizeof(drvinfo->bus_info));
+}
+
+static int dsa_user_get_regs_len(struct net_device *dev)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->get_regs_len)
+		return ds->ops->get_regs_len(ds, dp->index);
+
+	return -EOPNOTSUPP;
+}
+
+static void
+dsa_user_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->get_regs)
+		ds->ops->get_regs(ds, dp->index, regs, _p);
+}
+
+static int dsa_user_nway_reset(struct net_device *dev)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+
+	return phylink_ethtool_nway_reset(dp->pl);
+}
+
+static int dsa_user_get_eeprom_len(struct net_device *dev)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->cd && ds->cd->eeprom_len)
+		return ds->cd->eeprom_len;
+
+	if (ds->ops->get_eeprom_len)
+		return ds->ops->get_eeprom_len(ds);
+
+	return 0;
+}
+
+static int dsa_user_get_eeprom(struct net_device *dev,
+			       struct ethtool_eeprom *eeprom, u8 *data)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->get_eeprom)
+		return ds->ops->get_eeprom(ds, eeprom, data);
+
+	return -EOPNOTSUPP;
+}
+
+static int dsa_user_set_eeprom(struct net_device *dev,
+			       struct ethtool_eeprom *eeprom, u8 *data)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->set_eeprom)
+		return ds->ops->set_eeprom(ds, eeprom, data);
+
+	return -EOPNOTSUPP;
+}
+
+static void dsa_user_get_strings(struct net_device *dev,
+				 uint32_t stringset, uint8_t *data)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (stringset == ETH_SS_STATS) {
+		int len = ETH_GSTRING_LEN;
+
+		strscpy_pad(data, "tx_packets", len);
+		strscpy_pad(data + len, "tx_bytes", len);
+		strscpy_pad(data + 2 * len, "rx_packets", len);
+		strscpy_pad(data + 3 * len, "rx_bytes", len);
+		if (ds->ops->get_strings)
+			ds->ops->get_strings(ds, dp->index, stringset,
+					     data + 4 * len);
+	} else if (stringset ==  ETH_SS_TEST) {
+		net_selftest_get_strings(data);
+	}
+
+}
+
+static void dsa_user_get_ethtool_stats(struct net_device *dev,
+				       struct ethtool_stats *stats,
+				       uint64_t *data)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	struct pcpu_sw_netstats *s;
+	unsigned int start;
+	int i;
+
+	for_each_possible_cpu(i) {
+		u64 tx_packets, tx_bytes, rx_packets, rx_bytes;
+
+		s = per_cpu_ptr(dev->tstats, i);
+		do {
+			start = u64_stats_fetch_begin(&s->syncp);
+			tx_packets = u64_stats_read(&s->tx_packets);
+			tx_bytes = u64_stats_read(&s->tx_bytes);
+			rx_packets = u64_stats_read(&s->rx_packets);
+			rx_bytes = u64_stats_read(&s->rx_bytes);
+		} while (u64_stats_fetch_retry(&s->syncp, start));
+		data[0] += tx_packets;
+		data[1] += tx_bytes;
+		data[2] += rx_packets;
+		data[3] += rx_bytes;
+	}
+	if (ds->ops->get_ethtool_stats)
+		ds->ops->get_ethtool_stats(ds, dp->index, data + 4);
+}
+
+static int dsa_user_get_sset_count(struct net_device *dev, int sset)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (sset == ETH_SS_STATS) {
+		int count = 0;
+
+		if (ds->ops->get_sset_count) {
+			count = ds->ops->get_sset_count(ds, dp->index, sset);
+			if (count < 0)
+				return count;
+		}
+
+		return count + 4;
+	} else if (sset ==  ETH_SS_TEST) {
+		return net_selftest_get_count();
+	}
+
+	return -EOPNOTSUPP;
+}
+
+static void dsa_user_get_eth_phy_stats(struct net_device *dev,
+				       struct ethtool_eth_phy_stats *phy_stats)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->get_eth_phy_stats)
+		ds->ops->get_eth_phy_stats(ds, dp->index, phy_stats);
+}
+
+static void dsa_user_get_eth_mac_stats(struct net_device *dev,
+				       struct ethtool_eth_mac_stats *mac_stats)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->get_eth_mac_stats)
+		ds->ops->get_eth_mac_stats(ds, dp->index, mac_stats);
+}
+
+static void
+dsa_user_get_eth_ctrl_stats(struct net_device *dev,
+			    struct ethtool_eth_ctrl_stats *ctrl_stats)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->get_eth_ctrl_stats)
+		ds->ops->get_eth_ctrl_stats(ds, dp->index, ctrl_stats);
+}
+
+static void
+dsa_user_get_rmon_stats(struct net_device *dev,
+			struct ethtool_rmon_stats *rmon_stats,
+			const struct ethtool_rmon_hist_range **ranges)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->get_rmon_stats)
+		ds->ops->get_rmon_stats(ds, dp->index, rmon_stats, ranges);
+}
+
+static void dsa_user_net_selftest(struct net_device *ndev,
+				  struct ethtool_test *etest, u64 *buf)
+{
+	struct dsa_port *dp = dsa_user_to_port(ndev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->self_test) {
+		ds->ops->self_test(ds, dp->index, etest, buf);
+		return;
+	}
+
+	net_selftest(ndev, etest, buf);
+}
+
+static int dsa_user_get_mm(struct net_device *dev,
+			   struct ethtool_mm_state *state)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (!ds->ops->get_mm)
+		return -EOPNOTSUPP;
+
+	return ds->ops->get_mm(ds, dp->index, state);
+}
+
+static int dsa_user_set_mm(struct net_device *dev, struct ethtool_mm_cfg *cfg,
+			   struct netlink_ext_ack *extack)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (!ds->ops->set_mm)
+		return -EOPNOTSUPP;
+
+	return ds->ops->set_mm(ds, dp->index, cfg, extack);
+}
+
+static void dsa_user_get_mm_stats(struct net_device *dev,
+				  struct ethtool_mm_stats *stats)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->get_mm_stats)
+		ds->ops->get_mm_stats(ds, dp->index, stats);
+}
+
+static void dsa_user_get_wol(struct net_device *dev, struct ethtool_wolinfo *w)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	phylink_ethtool_get_wol(dp->pl, w);
+
+	if (ds->ops->get_wol)
+		ds->ops->get_wol(ds, dp->index, w);
+}
+
+static int dsa_user_set_wol(struct net_device *dev, struct ethtool_wolinfo *w)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	int ret = -EOPNOTSUPP;
+
+	phylink_ethtool_set_wol(dp->pl, w);
+
+	if (ds->ops->set_wol)
+		ret = ds->ops->set_wol(ds, dp->index, w);
+
+	return ret;
+}
+
+static int dsa_user_set_eee(struct net_device *dev, struct ethtool_eee *e)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	int ret;
+
+	/* Port's PHY and MAC both need to be EEE capable */
+	if (!dev->phydev || !dp->pl)
+		return -ENODEV;
+
+	if (!ds->ops->set_mac_eee)
+		return -EOPNOTSUPP;
+
+	ret = ds->ops->set_mac_eee(ds, dp->index, e);
+	if (ret)
+		return ret;
+
+	return phylink_ethtool_set_eee(dp->pl, e);
+}
+
+static int dsa_user_get_eee(struct net_device *dev, struct ethtool_eee *e)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	int ret;
+
+	/* Port's PHY and MAC both need to be EEE capable */
+	if (!dev->phydev || !dp->pl)
+		return -ENODEV;
+
+	if (!ds->ops->get_mac_eee)
+		return -EOPNOTSUPP;
+
+	ret = ds->ops->get_mac_eee(ds, dp->index, e);
+	if (ret)
+		return ret;
+
+	return phylink_ethtool_get_eee(dp->pl, e);
+}
+
+static int dsa_user_get_link_ksettings(struct net_device *dev,
+				       struct ethtool_link_ksettings *cmd)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+
+	return phylink_ethtool_ksettings_get(dp->pl, cmd);
+}
+
+static int dsa_user_set_link_ksettings(struct net_device *dev,
+				       const struct ethtool_link_ksettings *cmd)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+
+	return phylink_ethtool_ksettings_set(dp->pl, cmd);
+}
+
+static void dsa_user_get_pause_stats(struct net_device *dev,
+				     struct ethtool_pause_stats *pause_stats)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->get_pause_stats)
+		ds->ops->get_pause_stats(ds, dp->index, pause_stats);
+}
+
+static void dsa_user_get_pauseparam(struct net_device *dev,
+				    struct ethtool_pauseparam *pause)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+
+	phylink_ethtool_get_pauseparam(dp->pl, pause);
+}
+
+static int dsa_user_set_pauseparam(struct net_device *dev,
+				   struct ethtool_pauseparam *pause)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+
+	return phylink_ethtool_set_pauseparam(dp->pl, pause);
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+static int dsa_user_netpoll_setup(struct net_device *dev,
+				  struct netpoll_info *ni)
+{
+	struct net_device *conduit = dsa_user_to_conduit(dev);
+	struct dsa_user_priv *p = netdev_priv(dev);
+	struct netpoll *netpoll;
+	int err = 0;
+
+	netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL);
+	if (!netpoll)
+		return -ENOMEM;
+
+	err = __netpoll_setup(netpoll, conduit);
+	if (err) {
+		kfree(netpoll);
+		goto out;
+	}
+
+	p->netpoll = netpoll;
+out:
+	return err;
+}
+
+static void dsa_user_netpoll_cleanup(struct net_device *dev)
+{
+	struct dsa_user_priv *p = netdev_priv(dev);
+	struct netpoll *netpoll = p->netpoll;
+
+	if (!netpoll)
+		return;
+
+	p->netpoll = NULL;
+
+	__netpoll_free(netpoll);
+}
+
+static void dsa_user_poll_controller(struct net_device *dev)
+{
+}
+#endif
+
+static struct dsa_mall_tc_entry *
+dsa_user_mall_tc_entry_find(struct net_device *dev, unsigned long cookie)
+{
+	struct dsa_user_priv *p = netdev_priv(dev);
+	struct dsa_mall_tc_entry *mall_tc_entry;
+
+	list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list)
+		if (mall_tc_entry->cookie == cookie)
+			return mall_tc_entry;
+
+	return NULL;
+}
+
+static int
+dsa_user_add_cls_matchall_mirred(struct net_device *dev,
+				 struct tc_cls_matchall_offload *cls,
+				 bool ingress)
+{
+	struct netlink_ext_ack *extack = cls->common.extack;
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_user_priv *p = netdev_priv(dev);
+	struct dsa_mall_mirror_tc_entry *mirror;
+	struct dsa_mall_tc_entry *mall_tc_entry;
+	struct dsa_switch *ds = dp->ds;
+	struct flow_action_entry *act;
+	struct dsa_port *to_dp;
+	int err;
+
+	if (!ds->ops->port_mirror_add)
+		return -EOPNOTSUPP;
+
+	if (!flow_action_basic_hw_stats_check(&cls->rule->action,
+					      cls->common.extack))
+		return -EOPNOTSUPP;
+
+	act = &cls->rule->action.entries[0];
+
+	if (!act->dev)
+		return -EINVAL;
+
+	if (!dsa_user_dev_check(act->dev))
+		return -EOPNOTSUPP;
+
+	mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL);
+	if (!mall_tc_entry)
+		return -ENOMEM;
+
+	mall_tc_entry->cookie = cls->cookie;
+	mall_tc_entry->type = DSA_PORT_MALL_MIRROR;
+	mirror = &mall_tc_entry->mirror;
+
+	to_dp = dsa_user_to_port(act->dev);
+
+	mirror->to_local_port = to_dp->index;
+	mirror->ingress = ingress;
+
+	err = ds->ops->port_mirror_add(ds, dp->index, mirror, ingress, extack);
+	if (err) {
+		kfree(mall_tc_entry);
+		return err;
+	}
+
+	list_add_tail(&mall_tc_entry->list, &p->mall_tc_list);
+
+	return err;
+}
+
+static int
+dsa_user_add_cls_matchall_police(struct net_device *dev,
+				 struct tc_cls_matchall_offload *cls,
+				 bool ingress)
+{
+	struct netlink_ext_ack *extack = cls->common.extack;
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_user_priv *p = netdev_priv(dev);
+	struct dsa_mall_policer_tc_entry *policer;
+	struct dsa_mall_tc_entry *mall_tc_entry;
+	struct dsa_switch *ds = dp->ds;
+	struct flow_action_entry *act;
+	int err;
+
+	if (!ds->ops->port_policer_add) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Policing offload not implemented");
+		return -EOPNOTSUPP;
+	}
+
+	if (!ingress) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Only supported on ingress qdisc");
+		return -EOPNOTSUPP;
+	}
+
+	if (!flow_action_basic_hw_stats_check(&cls->rule->action,
+					      cls->common.extack))
+		return -EOPNOTSUPP;
+
+	list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list) {
+		if (mall_tc_entry->type == DSA_PORT_MALL_POLICER) {
+			NL_SET_ERR_MSG_MOD(extack,
+					   "Only one port policer allowed");
+			return -EEXIST;
+		}
+	}
+
+	act = &cls->rule->action.entries[0];
+
+	mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL);
+	if (!mall_tc_entry)
+		return -ENOMEM;
+
+	mall_tc_entry->cookie = cls->cookie;
+	mall_tc_entry->type = DSA_PORT_MALL_POLICER;
+	policer = &mall_tc_entry->policer;
+	policer->rate_bytes_per_sec = act->police.rate_bytes_ps;
+	policer->burst = act->police.burst;
+
+	err = ds->ops->port_policer_add(ds, dp->index, policer);
+	if (err) {
+		kfree(mall_tc_entry);
+		return err;
+	}
+
+	list_add_tail(&mall_tc_entry->list, &p->mall_tc_list);
+
+	return err;
+}
+
+static int dsa_user_add_cls_matchall(struct net_device *dev,
+				     struct tc_cls_matchall_offload *cls,
+				     bool ingress)
+{
+	int err = -EOPNOTSUPP;
+
+	if (cls->common.protocol == htons(ETH_P_ALL) &&
+	    flow_offload_has_one_action(&cls->rule->action) &&
+	    cls->rule->action.entries[0].id == FLOW_ACTION_MIRRED)
+		err = dsa_user_add_cls_matchall_mirred(dev, cls, ingress);
+	else if (flow_offload_has_one_action(&cls->rule->action) &&
+		 cls->rule->action.entries[0].id == FLOW_ACTION_POLICE)
+		err = dsa_user_add_cls_matchall_police(dev, cls, ingress);
+
+	return err;
+}
+
+static void dsa_user_del_cls_matchall(struct net_device *dev,
+				      struct tc_cls_matchall_offload *cls)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_mall_tc_entry *mall_tc_entry;
+	struct dsa_switch *ds = dp->ds;
+
+	mall_tc_entry = dsa_user_mall_tc_entry_find(dev, cls->cookie);
+	if (!mall_tc_entry)
+		return;
+
+	list_del(&mall_tc_entry->list);
+
+	switch (mall_tc_entry->type) {
+	case DSA_PORT_MALL_MIRROR:
+		if (ds->ops->port_mirror_del)
+			ds->ops->port_mirror_del(ds, dp->index,
+						 &mall_tc_entry->mirror);
+		break;
+	case DSA_PORT_MALL_POLICER:
+		if (ds->ops->port_policer_del)
+			ds->ops->port_policer_del(ds, dp->index);
+		break;
+	default:
+		WARN_ON(1);
+	}
+
+	kfree(mall_tc_entry);
+}
+
+static int dsa_user_setup_tc_cls_matchall(struct net_device *dev,
+					  struct tc_cls_matchall_offload *cls,
+					  bool ingress)
+{
+	if (cls->common.chain_index)
+		return -EOPNOTSUPP;
+
+	switch (cls->command) {
+	case TC_CLSMATCHALL_REPLACE:
+		return dsa_user_add_cls_matchall(dev, cls, ingress);
+	case TC_CLSMATCHALL_DESTROY:
+		dsa_user_del_cls_matchall(dev, cls);
+		return 0;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int dsa_user_add_cls_flower(struct net_device *dev,
+				   struct flow_cls_offload *cls,
+				   bool ingress)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	int port = dp->index;
+
+	if (!ds->ops->cls_flower_add)
+		return -EOPNOTSUPP;
+
+	return ds->ops->cls_flower_add(ds, port, cls, ingress);
+}
+
+static int dsa_user_del_cls_flower(struct net_device *dev,
+				   struct flow_cls_offload *cls,
+				   bool ingress)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	int port = dp->index;
+
+	if (!ds->ops->cls_flower_del)
+		return -EOPNOTSUPP;
+
+	return ds->ops->cls_flower_del(ds, port, cls, ingress);
+}
+
+static int dsa_user_stats_cls_flower(struct net_device *dev,
+				     struct flow_cls_offload *cls,
+				     bool ingress)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	int port = dp->index;
+
+	if (!ds->ops->cls_flower_stats)
+		return -EOPNOTSUPP;
+
+	return ds->ops->cls_flower_stats(ds, port, cls, ingress);
+}
+
+static int dsa_user_setup_tc_cls_flower(struct net_device *dev,
+					struct flow_cls_offload *cls,
+					bool ingress)
+{
+	switch (cls->command) {
+	case FLOW_CLS_REPLACE:
+		return dsa_user_add_cls_flower(dev, cls, ingress);
+	case FLOW_CLS_DESTROY:
+		return dsa_user_del_cls_flower(dev, cls, ingress);
+	case FLOW_CLS_STATS:
+		return dsa_user_stats_cls_flower(dev, cls, ingress);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int dsa_user_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
+				      void *cb_priv, bool ingress)
+{
+	struct net_device *dev = cb_priv;
+
+	if (!tc_can_offload(dev))
+		return -EOPNOTSUPP;
+
+	switch (type) {
+	case TC_SETUP_CLSMATCHALL:
+		return dsa_user_setup_tc_cls_matchall(dev, type_data, ingress);
+	case TC_SETUP_CLSFLOWER:
+		return dsa_user_setup_tc_cls_flower(dev, type_data, ingress);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int dsa_user_setup_tc_block_cb_ig(enum tc_setup_type type,
+					 void *type_data, void *cb_priv)
+{
+	return dsa_user_setup_tc_block_cb(type, type_data, cb_priv, true);
+}
+
+static int dsa_user_setup_tc_block_cb_eg(enum tc_setup_type type,
+					 void *type_data, void *cb_priv)
+{
+	return dsa_user_setup_tc_block_cb(type, type_data, cb_priv, false);
+}
+
+static LIST_HEAD(dsa_user_block_cb_list);
+
+static int dsa_user_setup_tc_block(struct net_device *dev,
+				   struct flow_block_offload *f)
+{
+	struct flow_block_cb *block_cb;
+	flow_setup_cb_t *cb;
+
+	if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
+		cb = dsa_user_setup_tc_block_cb_ig;
+	else if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS)
+		cb = dsa_user_setup_tc_block_cb_eg;
+	else
+		return -EOPNOTSUPP;
+
+	f->driver_block_list = &dsa_user_block_cb_list;
+
+	switch (f->command) {
+	case FLOW_BLOCK_BIND:
+		if (flow_block_cb_is_busy(cb, dev, &dsa_user_block_cb_list))
+			return -EBUSY;
+
+		block_cb = flow_block_cb_alloc(cb, dev, dev, NULL);
+		if (IS_ERR(block_cb))
+			return PTR_ERR(block_cb);
+
+		flow_block_cb_add(block_cb, f);
+		list_add_tail(&block_cb->driver_list, &dsa_user_block_cb_list);
+		return 0;
+	case FLOW_BLOCK_UNBIND:
+		block_cb = flow_block_cb_lookup(f->block, cb, dev);
+		if (!block_cb)
+			return -ENOENT;
+
+		flow_block_cb_remove(block_cb, f);
+		list_del(&block_cb->driver_list);
+		return 0;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int dsa_user_setup_ft_block(struct dsa_switch *ds, int port,
+				   void *type_data)
+{
+	struct net_device *conduit = dsa_port_to_conduit(dsa_to_port(ds, port));
+
+	if (!conduit->netdev_ops->ndo_setup_tc)
+		return -EOPNOTSUPP;
+
+	return conduit->netdev_ops->ndo_setup_tc(conduit, TC_SETUP_FT, type_data);
+}
+
+static int dsa_user_setup_tc(struct net_device *dev, enum tc_setup_type type,
+			     void *type_data)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	switch (type) {
+	case TC_SETUP_BLOCK:
+		return dsa_user_setup_tc_block(dev, type_data);
+	case TC_SETUP_FT:
+		return dsa_user_setup_ft_block(ds, dp->index, type_data);
+	default:
+		break;
+	}
+
+	if (!ds->ops->port_setup_tc)
+		return -EOPNOTSUPP;
+
+	return ds->ops->port_setup_tc(ds, dp->index, type, type_data);
+}
+
+static int dsa_user_get_rxnfc(struct net_device *dev,
+			      struct ethtool_rxnfc *nfc, u32 *rule_locs)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (!ds->ops->get_rxnfc)
+		return -EOPNOTSUPP;
+
+	return ds->ops->get_rxnfc(ds, dp->index, nfc, rule_locs);
+}
+
+static int dsa_user_set_rxnfc(struct net_device *dev,
+			      struct ethtool_rxnfc *nfc)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (!ds->ops->set_rxnfc)
+		return -EOPNOTSUPP;
+
+	return ds->ops->set_rxnfc(ds, dp->index, nfc);
+}
+
+static int dsa_user_get_ts_info(struct net_device *dev,
+				struct ethtool_ts_info *ts)
+{
+	struct dsa_user_priv *p = netdev_priv(dev);
+	struct dsa_switch *ds = p->dp->ds;
+
+	if (!ds->ops->get_ts_info)
+		return -EOPNOTSUPP;
+
+	return ds->ops->get_ts_info(ds, p->dp->index, ts);
+}
+
+static int dsa_user_vlan_rx_add_vid(struct net_device *dev, __be16 proto,
+				    u16 vid)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct switchdev_obj_port_vlan vlan = {
+		.obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
+		.vid = vid,
+		/* This API only allows programming tagged, non-PVID VIDs */
+		.flags = 0,
+	};
+	struct netlink_ext_ack extack = {0};
+	struct dsa_switch *ds = dp->ds;
+	struct netdev_hw_addr *ha;
+	struct dsa_vlan *v;
+	int ret;
+
+	/* User port... */
+	ret = dsa_port_vlan_add(dp, &vlan, &extack);
+	if (ret) {
+		if (extack._msg)
+			netdev_err(dev, "%s\n", extack._msg);
+		return ret;
+	}
+
+	/* And CPU port... */
+	ret = dsa_port_host_vlan_add(dp, &vlan, &extack);
+	if (ret) {
+		if (extack._msg)
+			netdev_err(dev, "CPU port %d: %s\n", dp->cpu_dp->index,
+				   extack._msg);
+		return ret;
+	}
+
+	if (!dsa_switch_supports_uc_filtering(ds) &&
+	    !dsa_switch_supports_mc_filtering(ds))
+		return 0;
+
+	v = kzalloc(sizeof(*v), GFP_KERNEL);
+	if (!v) {
+		ret = -ENOMEM;
+		goto rollback;
+	}
+
+	netif_addr_lock_bh(dev);
+
+	v->vid = vid;
+	list_add_tail(&v->list, &dp->user_vlans);
+
+	if (dsa_switch_supports_mc_filtering(ds)) {
+		netdev_for_each_synced_mc_addr(ha, dev) {
+			dsa_user_schedule_standalone_work(dev, DSA_MC_ADD,
+							  ha->addr, vid);
+		}
+	}
+
+	if (dsa_switch_supports_uc_filtering(ds)) {
+		netdev_for_each_synced_uc_addr(ha, dev) {
+			dsa_user_schedule_standalone_work(dev, DSA_UC_ADD,
+							  ha->addr, vid);
+		}
+	}
+
+	netif_addr_unlock_bh(dev);
+
+	dsa_flush_workqueue();
+
+	return 0;
+
+rollback:
+	dsa_port_host_vlan_del(dp, &vlan);
+	dsa_port_vlan_del(dp, &vlan);
+
+	return ret;
+}
+
+static int dsa_user_vlan_rx_kill_vid(struct net_device *dev, __be16 proto,
+				     u16 vid)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct switchdev_obj_port_vlan vlan = {
+		.vid = vid,
+		/* This API only allows programming tagged, non-PVID VIDs */
+		.flags = 0,
+	};
+	struct dsa_switch *ds = dp->ds;
+	struct netdev_hw_addr *ha;
+	struct dsa_vlan *v;
+	int err;
+
+	err = dsa_port_vlan_del(dp, &vlan);
+	if (err)
+		return err;
+
+	err = dsa_port_host_vlan_del(dp, &vlan);
+	if (err)
+		return err;
+
+	if (!dsa_switch_supports_uc_filtering(ds) &&
+	    !dsa_switch_supports_mc_filtering(ds))
+		return 0;
+
+	netif_addr_lock_bh(dev);
+
+	v = dsa_vlan_find(&dp->user_vlans, &vlan);
+	if (!v) {
+		netif_addr_unlock_bh(dev);
+		return -ENOENT;
+	}
+
+	list_del(&v->list);
+	kfree(v);
+
+	if (dsa_switch_supports_mc_filtering(ds)) {
+		netdev_for_each_synced_mc_addr(ha, dev) {
+			dsa_user_schedule_standalone_work(dev, DSA_MC_DEL,
+							  ha->addr, vid);
+		}
+	}
+
+	if (dsa_switch_supports_uc_filtering(ds)) {
+		netdev_for_each_synced_uc_addr(ha, dev) {
+			dsa_user_schedule_standalone_work(dev, DSA_UC_DEL,
+							  ha->addr, vid);
+		}
+	}
+
+	netif_addr_unlock_bh(dev);
+
+	dsa_flush_workqueue();
+
+	return 0;
+}
+
+static int dsa_user_restore_vlan(struct net_device *vdev, int vid, void *arg)
+{
+	__be16 proto = vdev ? vlan_dev_vlan_proto(vdev) : htons(ETH_P_8021Q);
+
+	return dsa_user_vlan_rx_add_vid(arg, proto, vid);
+}
+
+static int dsa_user_clear_vlan(struct net_device *vdev, int vid, void *arg)
+{
+	__be16 proto = vdev ? vlan_dev_vlan_proto(vdev) : htons(ETH_P_8021Q);
+
+	return dsa_user_vlan_rx_kill_vid(arg, proto, vid);
+}
+
+/* Keep the VLAN RX filtering list in sync with the hardware only if VLAN
+ * filtering is enabled. The baseline is that only ports that offload a
+ * VLAN-aware bridge are VLAN-aware, and standalone ports are VLAN-unaware,
+ * but there are exceptions for quirky hardware.
+ *
+ * If ds->vlan_filtering_is_global = true, then standalone ports which share
+ * the same switch with other ports that offload a VLAN-aware bridge are also
+ * inevitably VLAN-aware.
+ *
+ * To summarize, a DSA switch port offloads:
+ *
+ * - If standalone (this includes software bridge, software LAG):
+ *     - if ds->needs_standalone_vlan_filtering = true, OR if
+ *       (ds->vlan_filtering_is_global = true AND there are bridges spanning
+ *       this switch chip which have vlan_filtering=1)
+ *         - the 8021q upper VLANs
+ *     - else (standalone VLAN filtering is not needed, VLAN filtering is not
+ *       global, or it is, but no port is under a VLAN-aware bridge):
+ *         - no VLAN (any 8021q upper is a software VLAN)
+ *
+ * - If under a vlan_filtering=0 bridge which it offload:
+ *     - if ds->configure_vlan_while_not_filtering = true (default):
+ *         - the bridge VLANs. These VLANs are committed to hardware but inactive.
+ *     - else (deprecated):
+ *         - no VLAN. The bridge VLANs are not restored when VLAN awareness is
+ *           enabled, so this behavior is broken and discouraged.
+ *
+ * - If under a vlan_filtering=1 bridge which it offload:
+ *     - the bridge VLANs
+ *     - the 8021q upper VLANs
+ */
+int dsa_user_manage_vlan_filtering(struct net_device *user,
+				   bool vlan_filtering)
+{
+	int err;
+
+	if (vlan_filtering) {
+		user->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
+
+		err = vlan_for_each(user, dsa_user_restore_vlan, user);
+		if (err) {
+			vlan_for_each(user, dsa_user_clear_vlan, user);
+			user->features &= ~NETIF_F_HW_VLAN_CTAG_FILTER;
+			return err;
+		}
+	} else {
+		err = vlan_for_each(user, dsa_user_clear_vlan, user);
+		if (err)
+			return err;
+
+		user->features &= ~NETIF_F_HW_VLAN_CTAG_FILTER;
+	}
+
+	return 0;
+}
+
+struct dsa_hw_port {
+	struct list_head list;
+	struct net_device *dev;
+	int old_mtu;
+};
+
+static int dsa_hw_port_list_set_mtu(struct list_head *hw_port_list, int mtu)
+{
+	const struct dsa_hw_port *p;
+	int err;
+
+	list_for_each_entry(p, hw_port_list, list) {
+		if (p->dev->mtu == mtu)
+			continue;
+
+		err = dev_set_mtu(p->dev, mtu);
+		if (err)
+			goto rollback;
+	}
+
+	return 0;
+
+rollback:
+	list_for_each_entry_continue_reverse(p, hw_port_list, list) {
+		if (p->dev->mtu == p->old_mtu)
+			continue;
+
+		if (dev_set_mtu(p->dev, p->old_mtu))
+			netdev_err(p->dev, "Failed to restore MTU\n");
+	}
+
+	return err;
+}
+
+static void dsa_hw_port_list_free(struct list_head *hw_port_list)
+{
+	struct dsa_hw_port *p, *n;
+
+	list_for_each_entry_safe(p, n, hw_port_list, list)
+		kfree(p);
+}
+
+/* Make the hardware datapath to/from @dev limited to a common MTU */
+static void dsa_bridge_mtu_normalization(struct dsa_port *dp)
+{
+	struct list_head hw_port_list;
+	struct dsa_switch_tree *dst;
+	int min_mtu = ETH_MAX_MTU;
+	struct dsa_port *other_dp;
+	int err;
+
+	if (!dp->ds->mtu_enforcement_ingress)
+		return;
+
+	if (!dp->bridge)
+		return;
+
+	INIT_LIST_HEAD(&hw_port_list);
+
+	/* Populate the list of ports that are part of the same bridge
+	 * as the newly added/modified port
+	 */
+	list_for_each_entry(dst, &dsa_tree_list, list) {
+		list_for_each_entry(other_dp, &dst->ports, list) {
+			struct dsa_hw_port *hw_port;
+			struct net_device *user;
+
+			if (other_dp->type != DSA_PORT_TYPE_USER)
+				continue;
+
+			if (!dsa_port_bridge_same(dp, other_dp))
+				continue;
+
+			if (!other_dp->ds->mtu_enforcement_ingress)
+				continue;
+
+			user = other_dp->user;
+
+			if (min_mtu > user->mtu)
+				min_mtu = user->mtu;
+
+			hw_port = kzalloc(sizeof(*hw_port), GFP_KERNEL);
+			if (!hw_port)
+				goto out;
+
+			hw_port->dev = user;
+			hw_port->old_mtu = user->mtu;
+
+			list_add(&hw_port->list, &hw_port_list);
+		}
+	}
+
+	/* Attempt to configure the entire hardware bridge to the newly added
+	 * interface's MTU first, regardless of whether the intention of the
+	 * user was to raise or lower it.
+	 */
+	err = dsa_hw_port_list_set_mtu(&hw_port_list, dp->user->mtu);
+	if (!err)
+		goto out;
+
+	/* Clearly that didn't work out so well, so just set the minimum MTU on
+	 * all hardware bridge ports now. If this fails too, then all ports will
+	 * still have their old MTU rolled back anyway.
+	 */
+	dsa_hw_port_list_set_mtu(&hw_port_list, min_mtu);
+
+out:
+	dsa_hw_port_list_free(&hw_port_list);
+}
+
+int dsa_user_change_mtu(struct net_device *dev, int new_mtu)
+{
+	struct net_device *conduit = dsa_user_to_conduit(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_port *cpu_dp = dp->cpu_dp;
+	struct dsa_switch *ds = dp->ds;
+	struct dsa_port *other_dp;
+	int largest_mtu = 0;
+	int new_conduit_mtu;
+	int old_conduit_mtu;
+	int mtu_limit;
+	int overhead;
+	int cpu_mtu;
+	int err;
+
+	if (!ds->ops->port_change_mtu)
+		return -EOPNOTSUPP;
+
+	dsa_tree_for_each_user_port(other_dp, ds->dst) {
+		int user_mtu;
+
+		/* During probe, this function will be called for each user
+		 * device, while not all of them have been allocated. That's
+		 * ok, it doesn't change what the maximum is, so ignore it.
+		 */
+		if (!other_dp->user)
+			continue;
+
+		/* Pretend that we already applied the setting, which we
+		 * actually haven't (still haven't done all integrity checks)
+		 */
+		if (dp == other_dp)
+			user_mtu = new_mtu;
+		else
+			user_mtu = other_dp->user->mtu;
+
+		if (largest_mtu < user_mtu)
+			largest_mtu = user_mtu;
+	}
+
+	overhead = dsa_tag_protocol_overhead(cpu_dp->tag_ops);
+	mtu_limit = min_t(int, conduit->max_mtu, dev->max_mtu + overhead);
+	old_conduit_mtu = conduit->mtu;
+	new_conduit_mtu = largest_mtu + overhead;
+	if (new_conduit_mtu > mtu_limit)
+		return -ERANGE;
+
+	/* If the conduit MTU isn't over limit, there's no need to check the CPU
+	 * MTU, since that surely isn't either.
+	 */
+	cpu_mtu = largest_mtu;
+
+	/* Start applying stuff */
+	if (new_conduit_mtu != old_conduit_mtu) {
+		err = dev_set_mtu(conduit, new_conduit_mtu);
+		if (err < 0)
+			goto out_conduit_failed;
+
+		/* We only need to propagate the MTU of the CPU port to
+		 * upstream switches, so emit a notifier which updates them.
+		 */
+		err = dsa_port_mtu_change(cpu_dp, cpu_mtu);
+		if (err)
+			goto out_cpu_failed;
+	}
+
+	err = ds->ops->port_change_mtu(ds, dp->index, new_mtu);
+	if (err)
+		goto out_port_failed;
+
+	dev->mtu = new_mtu;
+
+	dsa_bridge_mtu_normalization(dp);
+
+	return 0;
+
+out_port_failed:
+	if (new_conduit_mtu != old_conduit_mtu)
+		dsa_port_mtu_change(cpu_dp, old_conduit_mtu - overhead);
+out_cpu_failed:
+	if (new_conduit_mtu != old_conduit_mtu)
+		dev_set_mtu(conduit, old_conduit_mtu);
+out_conduit_failed:
+	return err;
+}
+
+static int __maybe_unused
+dsa_user_dcbnl_set_default_prio(struct net_device *dev, struct dcb_app *app)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	unsigned long mask, new_prio;
+	int err, port = dp->index;
+
+	if (!ds->ops->port_set_default_prio)
+		return -EOPNOTSUPP;
+
+	err = dcb_ieee_setapp(dev, app);
+	if (err)
+		return err;
+
+	mask = dcb_ieee_getapp_mask(dev, app);
+	new_prio = __fls(mask);
+
+	err = ds->ops->port_set_default_prio(ds, port, new_prio);
+	if (err) {
+		dcb_ieee_delapp(dev, app);
+		return err;
+	}
+
+	return 0;
+}
+
+static int __maybe_unused
+dsa_user_dcbnl_add_dscp_prio(struct net_device *dev, struct dcb_app *app)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	unsigned long mask, new_prio;
+	int err, port = dp->index;
+	u8 dscp = app->protocol;
+
+	if (!ds->ops->port_add_dscp_prio)
+		return -EOPNOTSUPP;
+
+	if (dscp >= 64) {
+		netdev_err(dev, "DSCP APP entry with protocol value %u is invalid\n",
+			   dscp);
+		return -EINVAL;
+	}
+
+	err = dcb_ieee_setapp(dev, app);
+	if (err)
+		return err;
+
+	mask = dcb_ieee_getapp_mask(dev, app);
+	new_prio = __fls(mask);
+
+	err = ds->ops->port_add_dscp_prio(ds, port, dscp, new_prio);
+	if (err) {
+		dcb_ieee_delapp(dev, app);
+		return err;
+	}
+
+	return 0;
+}
+
+static int __maybe_unused dsa_user_dcbnl_ieee_setapp(struct net_device *dev,
+						     struct dcb_app *app)
+{
+	switch (app->selector) {
+	case IEEE_8021QAZ_APP_SEL_ETHERTYPE:
+		switch (app->protocol) {
+		case 0:
+			return dsa_user_dcbnl_set_default_prio(dev, app);
+		default:
+			return -EOPNOTSUPP;
+		}
+		break;
+	case IEEE_8021QAZ_APP_SEL_DSCP:
+		return dsa_user_dcbnl_add_dscp_prio(dev, app);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int __maybe_unused
+dsa_user_dcbnl_del_default_prio(struct net_device *dev, struct dcb_app *app)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	unsigned long mask, new_prio;
+	int err, port = dp->index;
+
+	if (!ds->ops->port_set_default_prio)
+		return -EOPNOTSUPP;
+
+	err = dcb_ieee_delapp(dev, app);
+	if (err)
+		return err;
+
+	mask = dcb_ieee_getapp_mask(dev, app);
+	new_prio = mask ? __fls(mask) : 0;
+
+	err = ds->ops->port_set_default_prio(ds, port, new_prio);
+	if (err) {
+		dcb_ieee_setapp(dev, app);
+		return err;
+	}
+
+	return 0;
+}
+
+static int __maybe_unused
+dsa_user_dcbnl_del_dscp_prio(struct net_device *dev, struct dcb_app *app)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	int err, port = dp->index;
+	u8 dscp = app->protocol;
+
+	if (!ds->ops->port_del_dscp_prio)
+		return -EOPNOTSUPP;
+
+	err = dcb_ieee_delapp(dev, app);
+	if (err)
+		return err;
+
+	err = ds->ops->port_del_dscp_prio(ds, port, dscp, app->priority);
+	if (err) {
+		dcb_ieee_setapp(dev, app);
+		return err;
+	}
+
+	return 0;
+}
+
+static int __maybe_unused dsa_user_dcbnl_ieee_delapp(struct net_device *dev,
+						     struct dcb_app *app)
+{
+	switch (app->selector) {
+	case IEEE_8021QAZ_APP_SEL_ETHERTYPE:
+		switch (app->protocol) {
+		case 0:
+			return dsa_user_dcbnl_del_default_prio(dev, app);
+		default:
+			return -EOPNOTSUPP;
+		}
+		break;
+	case IEEE_8021QAZ_APP_SEL_DSCP:
+		return dsa_user_dcbnl_del_dscp_prio(dev, app);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+/* Pre-populate the DCB application priority table with the priorities
+ * configured during switch setup, which we read from hardware here.
+ */
+static int dsa_user_dcbnl_init(struct net_device *dev)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	int port = dp->index;
+	int err;
+
+	if (ds->ops->port_get_default_prio) {
+		int prio = ds->ops->port_get_default_prio(ds, port);
+		struct dcb_app app = {
+			.selector = IEEE_8021QAZ_APP_SEL_ETHERTYPE,
+			.protocol = 0,
+			.priority = prio,
+		};
+
+		if (prio < 0)
+			return prio;
+
+		err = dcb_ieee_setapp(dev, &app);
+		if (err)
+			return err;
+	}
+
+	if (ds->ops->port_get_dscp_prio) {
+		int protocol;
+
+		for (protocol = 0; protocol < 64; protocol++) {
+			struct dcb_app app = {
+				.selector = IEEE_8021QAZ_APP_SEL_DSCP,
+				.protocol = protocol,
+			};
+			int prio;
+
+			prio = ds->ops->port_get_dscp_prio(ds, port, protocol);
+			if (prio == -EOPNOTSUPP)
+				continue;
+			if (prio < 0)
+				return prio;
+
+			app.priority = prio;
+
+			err = dcb_ieee_setapp(dev, &app);
+			if (err)
+				return err;
+		}
+	}
+
+	return 0;
+}
+
+static const struct ethtool_ops dsa_user_ethtool_ops = {
+	.get_drvinfo		= dsa_user_get_drvinfo,
+	.get_regs_len		= dsa_user_get_regs_len,
+	.get_regs		= dsa_user_get_regs,
+	.nway_reset		= dsa_user_nway_reset,
+	.get_link		= ethtool_op_get_link,
+	.get_eeprom_len		= dsa_user_get_eeprom_len,
+	.get_eeprom		= dsa_user_get_eeprom,
+	.set_eeprom		= dsa_user_set_eeprom,
+	.get_strings		= dsa_user_get_strings,
+	.get_ethtool_stats	= dsa_user_get_ethtool_stats,
+	.get_sset_count		= dsa_user_get_sset_count,
+	.get_eth_phy_stats	= dsa_user_get_eth_phy_stats,
+	.get_eth_mac_stats	= dsa_user_get_eth_mac_stats,
+	.get_eth_ctrl_stats	= dsa_user_get_eth_ctrl_stats,
+	.get_rmon_stats		= dsa_user_get_rmon_stats,
+	.set_wol		= dsa_user_set_wol,
+	.get_wol		= dsa_user_get_wol,
+	.set_eee		= dsa_user_set_eee,
+	.get_eee		= dsa_user_get_eee,
+	.get_link_ksettings	= dsa_user_get_link_ksettings,
+	.set_link_ksettings	= dsa_user_set_link_ksettings,
+	.get_pause_stats	= dsa_user_get_pause_stats,
+	.get_pauseparam		= dsa_user_get_pauseparam,
+	.set_pauseparam		= dsa_user_set_pauseparam,
+	.get_rxnfc		= dsa_user_get_rxnfc,
+	.set_rxnfc		= dsa_user_set_rxnfc,
+	.get_ts_info		= dsa_user_get_ts_info,
+	.self_test		= dsa_user_net_selftest,
+	.get_mm			= dsa_user_get_mm,
+	.set_mm			= dsa_user_set_mm,
+	.get_mm_stats		= dsa_user_get_mm_stats,
+};
+
+static const struct dcbnl_rtnl_ops __maybe_unused dsa_user_dcbnl_ops = {
+	.ieee_setapp		= dsa_user_dcbnl_ieee_setapp,
+	.ieee_delapp		= dsa_user_dcbnl_ieee_delapp,
+};
+
+static void dsa_user_get_stats64(struct net_device *dev,
+				 struct rtnl_link_stats64 *s)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->get_stats64)
+		ds->ops->get_stats64(ds, dp->index, s);
+	else
+		dev_get_tstats64(dev, s);
+}
+
+static int dsa_user_fill_forward_path(struct net_device_path_ctx *ctx,
+				      struct net_device_path *path)
+{
+	struct dsa_port *dp = dsa_user_to_port(ctx->dev);
+	struct net_device *conduit = dsa_port_to_conduit(dp);
+	struct dsa_port *cpu_dp = dp->cpu_dp;
+
+	path->dev = ctx->dev;
+	path->type = DEV_PATH_DSA;
+	path->dsa.proto = cpu_dp->tag_ops->proto;
+	path->dsa.port = dp->index;
+	ctx->dev = conduit;
+
+	return 0;
+}
+
+static const struct net_device_ops dsa_user_netdev_ops = {
+	.ndo_open		= dsa_user_open,
+	.ndo_stop		= dsa_user_close,
+	.ndo_start_xmit		= dsa_user_xmit,
+	.ndo_change_rx_flags	= dsa_user_change_rx_flags,
+	.ndo_set_rx_mode	= dsa_user_set_rx_mode,
+	.ndo_set_mac_address	= dsa_user_set_mac_address,
+	.ndo_fdb_dump		= dsa_user_fdb_dump,
+	.ndo_eth_ioctl		= dsa_user_ioctl,
+	.ndo_get_iflink		= dsa_user_get_iflink,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	.ndo_netpoll_setup	= dsa_user_netpoll_setup,
+	.ndo_netpoll_cleanup	= dsa_user_netpoll_cleanup,
+	.ndo_poll_controller	= dsa_user_poll_controller,
+#endif
+	.ndo_setup_tc		= dsa_user_setup_tc,
+	.ndo_get_stats64	= dsa_user_get_stats64,
+	.ndo_vlan_rx_add_vid	= dsa_user_vlan_rx_add_vid,
+	.ndo_vlan_rx_kill_vid	= dsa_user_vlan_rx_kill_vid,
+	.ndo_change_mtu		= dsa_user_change_mtu,
+	.ndo_fill_forward_path	= dsa_user_fill_forward_path,
+};
+
+static struct device_type dsa_type = {
+	.name	= "dsa",
+};
+
+void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up)
+{
+	const struct dsa_port *dp = dsa_to_port(ds, port);
+
+	if (dp->pl)
+		phylink_mac_change(dp->pl, up);
+}
+EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_change);
+
+static void dsa_user_phylink_fixed_state(struct phylink_config *config,
+					 struct phylink_link_state *state)
+{
+	struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
+	struct dsa_switch *ds = dp->ds;
+
+	/* No need to check that this operation is valid, the callback would
+	 * not be called if it was not.
+	 */
+	ds->ops->phylink_fixed_state(ds, dp->index, state);
+}
+
+/* user device setup *******************************************************/
+static int dsa_user_phy_connect(struct net_device *user_dev, int addr,
+				u32 flags)
+{
+	struct dsa_port *dp = dsa_user_to_port(user_dev);
+	struct dsa_switch *ds = dp->ds;
+
+	user_dev->phydev = mdiobus_get_phy(ds->user_mii_bus, addr);
+	if (!user_dev->phydev) {
+		netdev_err(user_dev, "no phy at %d\n", addr);
+		return -ENODEV;
+	}
+
+	user_dev->phydev->dev_flags |= flags;
+
+	return phylink_connect_phy(dp->pl, user_dev->phydev);
+}
+
+static int dsa_user_phy_setup(struct net_device *user_dev)
+{
+	struct dsa_port *dp = dsa_user_to_port(user_dev);
+	struct device_node *port_dn = dp->dn;
+	struct dsa_switch *ds = dp->ds;
+	u32 phy_flags = 0;
+	int ret;
+
+	dp->pl_config.dev = &user_dev->dev;
+	dp->pl_config.type = PHYLINK_NETDEV;
+
+	/* The get_fixed_state callback takes precedence over polling the
+	 * link GPIO in PHYLINK (see phylink_get_fixed_state).  Only set
+	 * this if the switch provides such a callback.
+	 */
+	if (ds->ops->phylink_fixed_state) {
+		dp->pl_config.get_fixed_state = dsa_user_phylink_fixed_state;
+		dp->pl_config.poll_fixed_state = true;
+	}
+
+	ret = dsa_port_phylink_create(dp);
+	if (ret)
+		return ret;
+
+	if (ds->ops->get_phy_flags)
+		phy_flags = ds->ops->get_phy_flags(ds, dp->index);
+
+	ret = phylink_of_phy_connect(dp->pl, port_dn, phy_flags);
+	if (ret == -ENODEV && ds->user_mii_bus) {
+		/* We could not connect to a designated PHY or SFP, so try to
+		 * use the switch internal MDIO bus instead
+		 */
+		ret = dsa_user_phy_connect(user_dev, dp->index, phy_flags);
+	}
+	if (ret) {
+		netdev_err(user_dev, "failed to connect to PHY: %pe\n",
+			   ERR_PTR(ret));
+		dsa_port_phylink_destroy(dp);
+	}
+
+	return ret;
+}
+
+void dsa_user_setup_tagger(struct net_device *user)
+{
+	struct dsa_port *dp = dsa_user_to_port(user);
+	struct net_device *conduit = dsa_port_to_conduit(dp);
+	struct dsa_user_priv *p = netdev_priv(user);
+	const struct dsa_port *cpu_dp = dp->cpu_dp;
+	const struct dsa_switch *ds = dp->ds;
+
+	user->needed_headroom = cpu_dp->tag_ops->needed_headroom;
+	user->needed_tailroom = cpu_dp->tag_ops->needed_tailroom;
+	/* Try to save one extra realloc later in the TX path (in the conduit)
+	 * by also inheriting the conduit's needed headroom and tailroom.
+	 * The 8021q driver also does this.
+	 */
+	user->needed_headroom += conduit->needed_headroom;
+	user->needed_tailroom += conduit->needed_tailroom;
+
+	p->xmit = cpu_dp->tag_ops->xmit;
+
+	user->features = conduit->vlan_features | NETIF_F_HW_TC;
+	user->hw_features |= NETIF_F_HW_TC;
+	user->features |= NETIF_F_LLTX;
+	if (user->needed_tailroom)
+		user->features &= ~(NETIF_F_SG | NETIF_F_FRAGLIST);
+	if (ds->needs_standalone_vlan_filtering)
+		user->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
+}
+
+int dsa_user_suspend(struct net_device *user_dev)
+{
+	struct dsa_port *dp = dsa_user_to_port(user_dev);
+
+	if (!netif_running(user_dev))
+		return 0;
+
+	netif_device_detach(user_dev);
+
+	rtnl_lock();
+	phylink_stop(dp->pl);
+	rtnl_unlock();
+
+	return 0;
+}
+
+int dsa_user_resume(struct net_device *user_dev)
+{
+	struct dsa_port *dp = dsa_user_to_port(user_dev);
+
+	if (!netif_running(user_dev))
+		return 0;
+
+	netif_device_attach(user_dev);
+
+	rtnl_lock();
+	phylink_start(dp->pl);
+	rtnl_unlock();
+
+	return 0;
+}
+
+int dsa_user_create(struct dsa_port *port)
+{
+	struct net_device *conduit = dsa_port_to_conduit(port);
+	struct dsa_switch *ds = port->ds;
+	struct net_device *user_dev;
+	struct dsa_user_priv *p;
+	const char *name;
+	int assign_type;
+	int ret;
+
+	if (!ds->num_tx_queues)
+		ds->num_tx_queues = 1;
+
+	if (port->name) {
+		name = port->name;
+		assign_type = NET_NAME_PREDICTABLE;
+	} else {
+		name = "eth%d";
+		assign_type = NET_NAME_ENUM;
+	}
+
+	user_dev = alloc_netdev_mqs(sizeof(struct dsa_user_priv), name,
+				    assign_type, ether_setup,
+				    ds->num_tx_queues, 1);
+	if (user_dev == NULL)
+		return -ENOMEM;
+
+	user_dev->rtnl_link_ops = &dsa_link_ops;
+	user_dev->ethtool_ops = &dsa_user_ethtool_ops;
+#if IS_ENABLED(CONFIG_DCB)
+	user_dev->dcbnl_ops = &dsa_user_dcbnl_ops;
+#endif
+	if (!is_zero_ether_addr(port->mac))
+		eth_hw_addr_set(user_dev, port->mac);
+	else
+		eth_hw_addr_inherit(user_dev, conduit);
+	user_dev->priv_flags |= IFF_NO_QUEUE;
+	if (dsa_switch_supports_uc_filtering(ds))
+		user_dev->priv_flags |= IFF_UNICAST_FLT;
+	user_dev->netdev_ops = &dsa_user_netdev_ops;
+	if (ds->ops->port_max_mtu)
+		user_dev->max_mtu = ds->ops->port_max_mtu(ds, port->index);
+	SET_NETDEV_DEVTYPE(user_dev, &dsa_type);
+
+	SET_NETDEV_DEV(user_dev, port->ds->dev);
+	SET_NETDEV_DEVLINK_PORT(user_dev, &port->devlink_port);
+	user_dev->dev.of_node = port->dn;
+	user_dev->vlan_features = conduit->vlan_features;
+
+	p = netdev_priv(user_dev);
+	user_dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+	if (!user_dev->tstats) {
+		free_netdev(user_dev);
+		return -ENOMEM;
+	}
+
+	ret = gro_cells_init(&p->gcells, user_dev);
+	if (ret)
+		goto out_free;
+
+	p->dp = port;
+	INIT_LIST_HEAD(&p->mall_tc_list);
+	port->user = user_dev;
+	dsa_user_setup_tagger(user_dev);
+
+	netif_carrier_off(user_dev);
+
+	ret = dsa_user_phy_setup(user_dev);
+	if (ret) {
+		netdev_err(user_dev,
+			   "error %d setting up PHY for tree %d, switch %d, port %d\n",
+			   ret, ds->dst->index, ds->index, port->index);
+		goto out_gcells;
+	}
+
+	rtnl_lock();
+
+	ret = dsa_user_change_mtu(user_dev, ETH_DATA_LEN);
+	if (ret && ret != -EOPNOTSUPP)
+		dev_warn(ds->dev, "nonfatal error %d setting MTU to %d on port %d\n",
+			 ret, ETH_DATA_LEN, port->index);
+
+	ret = register_netdevice(user_dev);
+	if (ret) {
+		netdev_err(conduit, "error %d registering interface %s\n",
+			   ret, user_dev->name);
+		rtnl_unlock();
+		goto out_phy;
+	}
+
+	if (IS_ENABLED(CONFIG_DCB)) {
+		ret = dsa_user_dcbnl_init(user_dev);
+		if (ret) {
+			netdev_err(user_dev,
+				   "failed to initialize DCB: %pe\n",
+				   ERR_PTR(ret));
+			rtnl_unlock();
+			goto out_unregister;
+		}
+	}
+
+	ret = netdev_upper_dev_link(conduit, user_dev, NULL);
+
+	rtnl_unlock();
+
+	if (ret)
+		goto out_unregister;
+
+	return 0;
+
+out_unregister:
+	unregister_netdev(user_dev);
+out_phy:
+	rtnl_lock();
+	phylink_disconnect_phy(p->dp->pl);
+	rtnl_unlock();
+	dsa_port_phylink_destroy(p->dp);
+out_gcells:
+	gro_cells_destroy(&p->gcells);
+out_free:
+	free_percpu(user_dev->tstats);
+	free_netdev(user_dev);
+	port->user = NULL;
+	return ret;
+}
+
+void dsa_user_destroy(struct net_device *user_dev)
+{
+	struct net_device *conduit = dsa_user_to_conduit(user_dev);
+	struct dsa_port *dp = dsa_user_to_port(user_dev);
+	struct dsa_user_priv *p = netdev_priv(user_dev);
+
+	netif_carrier_off(user_dev);
+	rtnl_lock();
+	netdev_upper_dev_unlink(conduit, user_dev);
+	unregister_netdevice(user_dev);
+	phylink_disconnect_phy(dp->pl);
+	rtnl_unlock();
+
+	dsa_port_phylink_destroy(dp);
+	gro_cells_destroy(&p->gcells);
+	free_percpu(user_dev->tstats);
+	free_netdev(user_dev);
+}
+
+int dsa_user_change_conduit(struct net_device *dev, struct net_device *conduit,
+			    struct netlink_ext_ack *extack)
+{
+	struct net_device *old_conduit = dsa_user_to_conduit(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	struct net_device *upper;
+	struct list_head *iter;
+	int err;
+
+	if (conduit == old_conduit)
+		return 0;
+
+	if (!ds->ops->port_change_conduit) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Driver does not support changing DSA conduit");
+		return -EOPNOTSUPP;
+	}
+
+	if (!netdev_uses_dsa(conduit)) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Interface not eligible as DSA conduit");
+		return -EOPNOTSUPP;
+	}
+
+	netdev_for_each_upper_dev_rcu(conduit, upper, iter) {
+		if (dsa_user_dev_check(upper))
+			continue;
+		if (netif_is_bridge_master(upper))
+			continue;
+		NL_SET_ERR_MSG_MOD(extack, "Cannot join conduit with unknown uppers");
+		return -EOPNOTSUPP;
+	}
+
+	/* Since we allow live-changing the DSA conduit, plus we auto-open the
+	 * DSA conduit when the user port opens => we need to ensure that the
+	 * new DSA conduit is open too.
+	 */
+	if (dev->flags & IFF_UP) {
+		err = dev_open(conduit, extack);
+		if (err)
+			return err;
+	}
+
+	netdev_upper_dev_unlink(old_conduit, dev);
+
+	err = netdev_upper_dev_link(conduit, dev, extack);
+	if (err)
+		goto out_revert_old_conduit_unlink;
+
+	err = dsa_port_change_conduit(dp, conduit, extack);
+	if (err)
+		goto out_revert_conduit_link;
+
+	/* Update the MTU of the new CPU port through cross-chip notifiers */
+	err = dsa_user_change_mtu(dev, dev->mtu);
+	if (err && err != -EOPNOTSUPP) {
+		netdev_warn(dev,
+			    "nonfatal error updating MTU with new conduit: %pe\n",
+			    ERR_PTR(err));
+	}
+
+	/* If the port doesn't have its own MAC address and relies on the DSA
+	 * conduit's one, inherit it again from the new DSA conduit.
+	 */
+	if (is_zero_ether_addr(dp->mac))
+		eth_hw_addr_inherit(dev, conduit);
+
+	return 0;
+
+out_revert_conduit_link:
+	netdev_upper_dev_unlink(conduit, dev);
+out_revert_old_conduit_unlink:
+	netdev_upper_dev_link(old_conduit, dev, NULL);
+	return err;
+}
+
+bool dsa_user_dev_check(const struct net_device *dev)
+{
+	return dev->netdev_ops == &dsa_user_netdev_ops;
+}
+EXPORT_SYMBOL_GPL(dsa_user_dev_check);
+
+static int dsa_user_changeupper(struct net_device *dev,
+				struct netdev_notifier_changeupper_info *info)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct netlink_ext_ack *extack;
+	int err = NOTIFY_DONE;
+
+	if (!dsa_user_dev_check(dev))
+		return err;
+
+	extack = netdev_notifier_info_to_extack(&info->info);
+
+	if (netif_is_bridge_master(info->upper_dev)) {
+		if (info->linking) {
+			err = dsa_port_bridge_join(dp, info->upper_dev, extack);
+			if (!err)
+				dsa_bridge_mtu_normalization(dp);
+			if (err == -EOPNOTSUPP) {
+				NL_SET_ERR_MSG_WEAK_MOD(extack,
+							"Offloading not supported");
+				err = 0;
+			}
+			err = notifier_from_errno(err);
+		} else {
+			dsa_port_bridge_leave(dp, info->upper_dev);
+			err = NOTIFY_OK;
+		}
+	} else if (netif_is_lag_master(info->upper_dev)) {
+		if (info->linking) {
+			err = dsa_port_lag_join(dp, info->upper_dev,
+						info->upper_info, extack);
+			if (err == -EOPNOTSUPP) {
+				NL_SET_ERR_MSG_WEAK_MOD(extack,
+							"Offloading not supported");
+				err = 0;
+			}
+			err = notifier_from_errno(err);
+		} else {
+			dsa_port_lag_leave(dp, info->upper_dev);
+			err = NOTIFY_OK;
+		}
+	} else if (is_hsr_master(info->upper_dev)) {
+		if (info->linking) {
+			err = dsa_port_hsr_join(dp, info->upper_dev, extack);
+			if (err == -EOPNOTSUPP) {
+				NL_SET_ERR_MSG_WEAK_MOD(extack,
+							"Offloading not supported");
+				err = 0;
+			}
+			err = notifier_from_errno(err);
+		} else {
+			dsa_port_hsr_leave(dp, info->upper_dev);
+			err = NOTIFY_OK;
+		}
+	}
+
+	return err;
+}
+
+static int dsa_user_prechangeupper(struct net_device *dev,
+				   struct netdev_notifier_changeupper_info *info)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+
+	if (!dsa_user_dev_check(dev))
+		return NOTIFY_DONE;
+
+	if (netif_is_bridge_master(info->upper_dev) && !info->linking)
+		dsa_port_pre_bridge_leave(dp, info->upper_dev);
+	else if (netif_is_lag_master(info->upper_dev) && !info->linking)
+		dsa_port_pre_lag_leave(dp, info->upper_dev);
+	/* dsa_port_pre_hsr_leave is not yet necessary since hsr devices cannot
+	 * meaningfully placed under a bridge yet
+	 */
+
+	return NOTIFY_DONE;
+}
+
+static int
+dsa_user_lag_changeupper(struct net_device *dev,
+			 struct netdev_notifier_changeupper_info *info)
+{
+	struct net_device *lower;
+	struct list_head *iter;
+	int err = NOTIFY_DONE;
+	struct dsa_port *dp;
+
+	if (!netif_is_lag_master(dev))
+		return err;
+
+	netdev_for_each_lower_dev(dev, lower, iter) {
+		if (!dsa_user_dev_check(lower))
+			continue;
+
+		dp = dsa_user_to_port(lower);
+		if (!dp->lag)
+			/* Software LAG */
+			continue;
+
+		err = dsa_user_changeupper(lower, info);
+		if (notifier_to_errno(err))
+			break;
+	}
+
+	return err;
+}
+
+/* Same as dsa_user_lag_changeupper() except that it calls
+ * dsa_user_prechangeupper()
+ */
+static int
+dsa_user_lag_prechangeupper(struct net_device *dev,
+			    struct netdev_notifier_changeupper_info *info)
+{
+	struct net_device *lower;
+	struct list_head *iter;
+	int err = NOTIFY_DONE;
+	struct dsa_port *dp;
+
+	if (!netif_is_lag_master(dev))
+		return err;
+
+	netdev_for_each_lower_dev(dev, lower, iter) {
+		if (!dsa_user_dev_check(lower))
+			continue;
+
+		dp = dsa_user_to_port(lower);
+		if (!dp->lag)
+			/* Software LAG */
+			continue;
+
+		err = dsa_user_prechangeupper(lower, info);
+		if (notifier_to_errno(err))
+			break;
+	}
+
+	return err;
+}
+
+static int
+dsa_prevent_bridging_8021q_upper(struct net_device *dev,
+				 struct netdev_notifier_changeupper_info *info)
+{
+	struct netlink_ext_ack *ext_ack;
+	struct net_device *user, *br;
+	struct dsa_port *dp;
+
+	ext_ack = netdev_notifier_info_to_extack(&info->info);
+
+	if (!is_vlan_dev(dev))
+		return NOTIFY_DONE;
+
+	user = vlan_dev_real_dev(dev);
+	if (!dsa_user_dev_check(user))
+		return NOTIFY_DONE;
+
+	dp = dsa_user_to_port(user);
+	br = dsa_port_bridge_dev_get(dp);
+	if (!br)
+		return NOTIFY_DONE;
+
+	/* Deny enslaving a VLAN device into a VLAN-aware bridge */
+	if (br_vlan_enabled(br) &&
+	    netif_is_bridge_master(info->upper_dev) && info->linking) {
+		NL_SET_ERR_MSG_MOD(ext_ack,
+				   "Cannot make VLAN device join VLAN-aware bridge");
+		return notifier_from_errno(-EINVAL);
+	}
+
+	return NOTIFY_DONE;
+}
+
+static int
+dsa_user_check_8021q_upper(struct net_device *dev,
+			   struct netdev_notifier_changeupper_info *info)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct net_device *br = dsa_port_bridge_dev_get(dp);
+	struct bridge_vlan_info br_info;
+	struct netlink_ext_ack *extack;
+	int err = NOTIFY_DONE;
+	u16 vid;
+
+	if (!br || !br_vlan_enabled(br))
+		return NOTIFY_DONE;
+
+	extack = netdev_notifier_info_to_extack(&info->info);
+	vid = vlan_dev_vlan_id(info->upper_dev);
+
+	/* br_vlan_get_info() returns -EINVAL or -ENOENT if the
+	 * device, respectively the VID is not found, returning
+	 * 0 means success, which is a failure for us here.
+	 */
+	err = br_vlan_get_info(br, vid, &br_info);
+	if (err == 0) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "This VLAN is already configured by the bridge");
+		return notifier_from_errno(-EBUSY);
+	}
+
+	return NOTIFY_DONE;
+}
+
+static int
+dsa_user_prechangeupper_sanity_check(struct net_device *dev,
+				     struct netdev_notifier_changeupper_info *info)
+{
+	struct dsa_switch *ds;
+	struct dsa_port *dp;
+	int err;
+
+	if (!dsa_user_dev_check(dev))
+		return dsa_prevent_bridging_8021q_upper(dev, info);
+
+	dp = dsa_user_to_port(dev);
+	ds = dp->ds;
+
+	if (ds->ops->port_prechangeupper) {
+		err = ds->ops->port_prechangeupper(ds, dp->index, info);
+		if (err)
+			return notifier_from_errno(err);
+	}
+
+	if (is_vlan_dev(info->upper_dev))
+		return dsa_user_check_8021q_upper(dev, info);
+
+	return NOTIFY_DONE;
+}
+
+/* To be eligible as a DSA conduit, a LAG must have all lower interfaces be
+ * eligible DSA conduits. Additionally, all LAG slaves must be DSA conduits of
+ * switches in the same switch tree.
+ */
+static int dsa_lag_conduit_validate(struct net_device *lag_dev,
+				    struct netlink_ext_ack *extack)
+{
+	struct net_device *lower1, *lower2;
+	struct list_head *iter1, *iter2;
+
+	netdev_for_each_lower_dev(lag_dev, lower1, iter1) {
+		netdev_for_each_lower_dev(lag_dev, lower2, iter2) {
+			if (!netdev_uses_dsa(lower1) ||
+			    !netdev_uses_dsa(lower2)) {
+				NL_SET_ERR_MSG_MOD(extack,
+						   "All LAG ports must be eligible as DSA conduits");
+				return notifier_from_errno(-EINVAL);
+			}
+
+			if (lower1 == lower2)
+				continue;
+
+			if (!dsa_port_tree_same(lower1->dsa_ptr,
+						lower2->dsa_ptr)) {
+				NL_SET_ERR_MSG_MOD(extack,
+						   "LAG contains DSA conduits of disjoint switch trees");
+				return notifier_from_errno(-EINVAL);
+			}
+		}
+	}
+
+	return NOTIFY_DONE;
+}
+
+static int
+dsa_conduit_prechangeupper_sanity_check(struct net_device *conduit,
+					struct netdev_notifier_changeupper_info *info)
+{
+	struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(&info->info);
+
+	if (!netdev_uses_dsa(conduit))
+		return NOTIFY_DONE;
+
+	if (!info->linking)
+		return NOTIFY_DONE;
+
+	/* Allow DSA switch uppers */
+	if (dsa_user_dev_check(info->upper_dev))
+		return NOTIFY_DONE;
+
+	/* Allow bridge uppers of DSA conduits, subject to further
+	 * restrictions in dsa_bridge_prechangelower_sanity_check()
+	 */
+	if (netif_is_bridge_master(info->upper_dev))
+		return NOTIFY_DONE;
+
+	/* Allow LAG uppers, subject to further restrictions in
+	 * dsa_lag_conduit_prechangelower_sanity_check()
+	 */
+	if (netif_is_lag_master(info->upper_dev))
+		return dsa_lag_conduit_validate(info->upper_dev, extack);
+
+	NL_SET_ERR_MSG_MOD(extack,
+			   "DSA conduit cannot join unknown upper interfaces");
+	return notifier_from_errno(-EBUSY);
+}
+
+static int
+dsa_lag_conduit_prechangelower_sanity_check(struct net_device *dev,
+					    struct netdev_notifier_changeupper_info *info)
+{
+	struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(&info->info);
+	struct net_device *lag_dev = info->upper_dev;
+	struct net_device *lower;
+	struct list_head *iter;
+
+	if (!netdev_uses_dsa(lag_dev) || !netif_is_lag_master(lag_dev))
+		return NOTIFY_DONE;
+
+	if (!info->linking)
+		return NOTIFY_DONE;
+
+	if (!netdev_uses_dsa(dev)) {
+		NL_SET_ERR_MSG(extack,
+			       "Only DSA conduits can join a LAG DSA conduit");
+		return notifier_from_errno(-EINVAL);
+	}
+
+	netdev_for_each_lower_dev(lag_dev, lower, iter) {
+		if (!dsa_port_tree_same(dev->dsa_ptr, lower->dsa_ptr)) {
+			NL_SET_ERR_MSG(extack,
+				       "Interface is DSA conduit for a different switch tree than this LAG");
+			return notifier_from_errno(-EINVAL);
+		}
+
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
+/* Don't allow bridging of DSA conduits, since the bridge layer rx_handler
+ * prevents the DSA fake ethertype handler to be invoked, so we don't get the
+ * chance to strip off and parse the DSA switch tag protocol header (the bridge
+ * layer just returns RX_HANDLER_CONSUMED, stopping RX processing for these
+ * frames).
+ * The only case where that would not be an issue is when bridging can already
+ * be offloaded, such as when the DSA conduit is itself a DSA or plain switchdev
+ * port, and is bridged only with other ports from the same hardware device.
+ */
+static int
+dsa_bridge_prechangelower_sanity_check(struct net_device *new_lower,
+				       struct netdev_notifier_changeupper_info *info)
+{
+	struct net_device *br = info->upper_dev;
+	struct netlink_ext_ack *extack;
+	struct net_device *lower;
+	struct list_head *iter;
+
+	if (!netif_is_bridge_master(br))
+		return NOTIFY_DONE;
+
+	if (!info->linking)
+		return NOTIFY_DONE;
+
+	extack = netdev_notifier_info_to_extack(&info->info);
+
+	netdev_for_each_lower_dev(br, lower, iter) {
+		if (!netdev_uses_dsa(new_lower) && !netdev_uses_dsa(lower))
+			continue;
+
+		if (!netdev_port_same_parent_id(lower, new_lower)) {
+			NL_SET_ERR_MSG(extack,
+				       "Cannot do software bridging with a DSA conduit");
+			return notifier_from_errno(-EINVAL);
+		}
+	}
+
+	return NOTIFY_DONE;
+}
+
+static void dsa_tree_migrate_ports_from_lag_conduit(struct dsa_switch_tree *dst,
+						    struct net_device *lag_dev)
+{
+	struct net_device *new_conduit = dsa_tree_find_first_conduit(dst);
+	struct dsa_port *dp;
+	int err;
+
+	dsa_tree_for_each_user_port(dp, dst) {
+		if (dsa_port_to_conduit(dp) != lag_dev)
+			continue;
+
+		err = dsa_user_change_conduit(dp->user, new_conduit, NULL);
+		if (err) {
+			netdev_err(dp->user,
+				   "failed to restore conduit to %s: %pe\n",
+				   new_conduit->name, ERR_PTR(err));
+		}
+	}
+}
+
+static int dsa_conduit_lag_join(struct net_device *conduit,
+				struct net_device *lag_dev,
+				struct netdev_lag_upper_info *uinfo,
+				struct netlink_ext_ack *extack)
+{
+	struct dsa_port *cpu_dp = conduit->dsa_ptr;
+	struct dsa_switch_tree *dst = cpu_dp->dst;
+	struct dsa_port *dp;
+	int err;
+
+	err = dsa_conduit_lag_setup(lag_dev, cpu_dp, uinfo, extack);
+	if (err)
+		return err;
+
+	dsa_tree_for_each_user_port(dp, dst) {
+		if (dsa_port_to_conduit(dp) != conduit)
+			continue;
+
+		err = dsa_user_change_conduit(dp->user, lag_dev, extack);
+		if (err)
+			goto restore;
+	}
+
+	return 0;
+
+restore:
+	dsa_tree_for_each_user_port_continue_reverse(dp, dst) {
+		if (dsa_port_to_conduit(dp) != lag_dev)
+			continue;
+
+		err = dsa_user_change_conduit(dp->user, conduit, NULL);
+		if (err) {
+			netdev_err(dp->user,
+				   "failed to restore conduit to %s: %pe\n",
+				   conduit->name, ERR_PTR(err));
+		}
+	}
+
+	dsa_conduit_lag_teardown(lag_dev, conduit->dsa_ptr);
+
+	return err;
+}
+
+static void dsa_conduit_lag_leave(struct net_device *conduit,
+				  struct net_device *lag_dev)
+{
+	struct dsa_port *dp, *cpu_dp = lag_dev->dsa_ptr;
+	struct dsa_switch_tree *dst = cpu_dp->dst;
+	struct dsa_port *new_cpu_dp = NULL;
+	struct net_device *lower;
+	struct list_head *iter;
+
+	netdev_for_each_lower_dev(lag_dev, lower, iter) {
+		if (netdev_uses_dsa(lower)) {
+			new_cpu_dp = lower->dsa_ptr;
+			break;
+		}
+	}
+
+	if (new_cpu_dp) {
+		/* Update the CPU port of the user ports still under the LAG
+		 * so that dsa_port_to_conduit() continues to work properly
+		 */
+		dsa_tree_for_each_user_port(dp, dst)
+			if (dsa_port_to_conduit(dp) == lag_dev)
+				dp->cpu_dp = new_cpu_dp;
+
+		/* Update the index of the virtual CPU port to match the lowest
+		 * physical CPU port
+		 */
+		lag_dev->dsa_ptr = new_cpu_dp;
+		wmb();
+	} else {
+		/* If the LAG DSA conduit has no ports left, migrate back all
+		 * user ports to the first physical CPU port
+		 */
+		dsa_tree_migrate_ports_from_lag_conduit(dst, lag_dev);
+	}
+
+	/* This DSA conduit has left its LAG in any case, so let
+	 * the CPU port leave the hardware LAG as well
+	 */
+	dsa_conduit_lag_teardown(lag_dev, conduit->dsa_ptr);
+}
+
+static int dsa_conduit_changeupper(struct net_device *dev,
+				   struct netdev_notifier_changeupper_info *info)
+{
+	struct netlink_ext_ack *extack;
+	int err = NOTIFY_DONE;
+
+	if (!netdev_uses_dsa(dev))
+		return err;
+
+	extack = netdev_notifier_info_to_extack(&info->info);
+
+	if (netif_is_lag_master(info->upper_dev)) {
+		if (info->linking) {
+			err = dsa_conduit_lag_join(dev, info->upper_dev,
+						   info->upper_info, extack);
+			err = notifier_from_errno(err);
+		} else {
+			dsa_conduit_lag_leave(dev, info->upper_dev);
+			err = NOTIFY_OK;
+		}
+	}
+
+	return err;
+}
+
+static int dsa_user_netdevice_event(struct notifier_block *nb,
+				    unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+	switch (event) {
+	case NETDEV_PRECHANGEUPPER: {
+		struct netdev_notifier_changeupper_info *info = ptr;
+		int err;
+
+		err = dsa_user_prechangeupper_sanity_check(dev, info);
+		if (notifier_to_errno(err))
+			return err;
+
+		err = dsa_conduit_prechangeupper_sanity_check(dev, info);
+		if (notifier_to_errno(err))
+			return err;
+
+		err = dsa_lag_conduit_prechangelower_sanity_check(dev, info);
+		if (notifier_to_errno(err))
+			return err;
+
+		err = dsa_bridge_prechangelower_sanity_check(dev, info);
+		if (notifier_to_errno(err))
+			return err;
+
+		err = dsa_user_prechangeupper(dev, ptr);
+		if (notifier_to_errno(err))
+			return err;
+
+		err = dsa_user_lag_prechangeupper(dev, ptr);
+		if (notifier_to_errno(err))
+			return err;
+
+		break;
+	}
+	case NETDEV_CHANGEUPPER: {
+		int err;
+
+		err = dsa_user_changeupper(dev, ptr);
+		if (notifier_to_errno(err))
+			return err;
+
+		err = dsa_user_lag_changeupper(dev, ptr);
+		if (notifier_to_errno(err))
+			return err;
+
+		err = dsa_conduit_changeupper(dev, ptr);
+		if (notifier_to_errno(err))
+			return err;
+
+		break;
+	}
+	case NETDEV_CHANGELOWERSTATE: {
+		struct netdev_notifier_changelowerstate_info *info = ptr;
+		struct dsa_port *dp;
+		int err = 0;
+
+		if (dsa_user_dev_check(dev)) {
+			dp = dsa_user_to_port(dev);
+
+			err = dsa_port_lag_change(dp, info->lower_state_info);
+		}
+
+		/* Mirror LAG port events on DSA conduits that are in
+		 * a LAG towards their respective switch CPU ports
+		 */
+		if (netdev_uses_dsa(dev)) {
+			dp = dev->dsa_ptr;
+
+			err = dsa_port_lag_change(dp, info->lower_state_info);
+		}
+
+		return notifier_from_errno(err);
+	}
+	case NETDEV_CHANGE:
+	case NETDEV_UP: {
+		/* Track state of conduit port.
+		 * DSA driver may require the conduit port (and indirectly
+		 * the tagger) to be available for some special operation.
+		 */
+		if (netdev_uses_dsa(dev)) {
+			struct dsa_port *cpu_dp = dev->dsa_ptr;
+			struct dsa_switch_tree *dst = cpu_dp->ds->dst;
+
+			/* Track when the conduit port is UP */
+			dsa_tree_conduit_oper_state_change(dst, dev,
+							   netif_oper_up(dev));
+
+			/* Track when the conduit port is ready and can accept
+			 * packet.
+			 * NETDEV_UP event is not enough to flag a port as ready.
+			 * We also have to wait for linkwatch_do_dev to dev_activate
+			 * and emit a NETDEV_CHANGE event.
+			 * We check if a conduit port is ready by checking if the dev
+			 * have a qdisc assigned and is not noop.
+			 */
+			dsa_tree_conduit_admin_state_change(dst, dev,
+							    !qdisc_tx_is_noop(dev));
+
+			return NOTIFY_OK;
+		}
+
+		return NOTIFY_DONE;
+	}
+	case NETDEV_GOING_DOWN: {
+		struct dsa_port *dp, *cpu_dp;
+		struct dsa_switch_tree *dst;
+		LIST_HEAD(close_list);
+
+		if (!netdev_uses_dsa(dev))
+			return NOTIFY_DONE;
+
+		cpu_dp = dev->dsa_ptr;
+		dst = cpu_dp->ds->dst;
+
+		dsa_tree_conduit_admin_state_change(dst, dev, false);
+
+		list_for_each_entry(dp, &dst->ports, list) {
+			if (!dsa_port_is_user(dp))
+				continue;
+
+			if (dp->cpu_dp != cpu_dp)
+				continue;
+
+			list_add(&dp->user->close_list, &close_list);
+		}
+
+		dev_close_many(&close_list, true);
+
+		return NOTIFY_OK;
+	}
+	default:
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
+static void
+dsa_fdb_offload_notify(struct dsa_switchdev_event_work *switchdev_work)
+{
+	struct switchdev_notifier_fdb_info info = {};
+
+	info.addr = switchdev_work->addr;
+	info.vid = switchdev_work->vid;
+	info.offloaded = true;
+	call_switchdev_notifiers(SWITCHDEV_FDB_OFFLOADED,
+				 switchdev_work->orig_dev, &info.info, NULL);
+}
+
+static void dsa_user_switchdev_event_work(struct work_struct *work)
+{
+	struct dsa_switchdev_event_work *switchdev_work =
+		container_of(work, struct dsa_switchdev_event_work, work);
+	const unsigned char *addr = switchdev_work->addr;
+	struct net_device *dev = switchdev_work->dev;
+	u16 vid = switchdev_work->vid;
+	struct dsa_switch *ds;
+	struct dsa_port *dp;
+	int err;
+
+	dp = dsa_user_to_port(dev);
+	ds = dp->ds;
+
+	switch (switchdev_work->event) {
+	case SWITCHDEV_FDB_ADD_TO_DEVICE:
+		if (switchdev_work->host_addr)
+			err = dsa_port_bridge_host_fdb_add(dp, addr, vid);
+		else if (dp->lag)
+			err = dsa_port_lag_fdb_add(dp, addr, vid);
+		else
+			err = dsa_port_fdb_add(dp, addr, vid);
+		if (err) {
+			dev_err(ds->dev,
+				"port %d failed to add %pM vid %d to fdb: %d\n",
+				dp->index, addr, vid, err);
+			break;
+		}
+		dsa_fdb_offload_notify(switchdev_work);
+		break;
+
+	case SWITCHDEV_FDB_DEL_TO_DEVICE:
+		if (switchdev_work->host_addr)
+			err = dsa_port_bridge_host_fdb_del(dp, addr, vid);
+		else if (dp->lag)
+			err = dsa_port_lag_fdb_del(dp, addr, vid);
+		else
+			err = dsa_port_fdb_del(dp, addr, vid);
+		if (err) {
+			dev_err(ds->dev,
+				"port %d failed to delete %pM vid %d from fdb: %d\n",
+				dp->index, addr, vid, err);
+		}
+
+		break;
+	}
+
+	kfree(switchdev_work);
+}
+
+static bool dsa_foreign_dev_check(const struct net_device *dev,
+				  const struct net_device *foreign_dev)
+{
+	const struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch_tree *dst = dp->ds->dst;
+
+	if (netif_is_bridge_master(foreign_dev))
+		return !dsa_tree_offloads_bridge_dev(dst, foreign_dev);
+
+	if (netif_is_bridge_port(foreign_dev))
+		return !dsa_tree_offloads_bridge_port(dst, foreign_dev);
+
+	/* Everything else is foreign */
+	return true;
+}
+
+static int dsa_user_fdb_event(struct net_device *dev,
+			      struct net_device *orig_dev,
+			      unsigned long event, const void *ctx,
+			      const struct switchdev_notifier_fdb_info *fdb_info)
+{
+	struct dsa_switchdev_event_work *switchdev_work;
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	bool host_addr = fdb_info->is_local;
+	struct dsa_switch *ds = dp->ds;
+
+	if (ctx && ctx != dp)
+		return 0;
+
+	if (!dp->bridge)
+		return 0;
+
+	if (switchdev_fdb_is_dynamically_learned(fdb_info)) {
+		if (dsa_port_offloads_bridge_port(dp, orig_dev))
+			return 0;
+
+		/* FDB entries learned by the software bridge or by foreign
+		 * bridge ports should be installed as host addresses only if
+		 * the driver requests assisted learning.
+		 */
+		if (!ds->assisted_learning_on_cpu_port)
+			return 0;
+	}
+
+	/* Also treat FDB entries on foreign interfaces bridged with us as host
+	 * addresses.
+	 */
+	if (dsa_foreign_dev_check(dev, orig_dev))
+		host_addr = true;
+
+	/* Check early that we're not doing work in vain.
+	 * Host addresses on LAG ports still require regular FDB ops,
+	 * since the CPU port isn't in a LAG.
+	 */
+	if (dp->lag && !host_addr) {
+		if (!ds->ops->lag_fdb_add || !ds->ops->lag_fdb_del)
+			return -EOPNOTSUPP;
+	} else {
+		if (!ds->ops->port_fdb_add || !ds->ops->port_fdb_del)
+			return -EOPNOTSUPP;
+	}
+
+	switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC);
+	if (!switchdev_work)
+		return -ENOMEM;
+
+	netdev_dbg(dev, "%s FDB entry towards %s, addr %pM vid %d%s\n",
+		   event == SWITCHDEV_FDB_ADD_TO_DEVICE ? "Adding" : "Deleting",
+		   orig_dev->name, fdb_info->addr, fdb_info->vid,
+		   host_addr ? " as host address" : "");
+
+	INIT_WORK(&switchdev_work->work, dsa_user_switchdev_event_work);
+	switchdev_work->event = event;
+	switchdev_work->dev = dev;
+	switchdev_work->orig_dev = orig_dev;
+
+	ether_addr_copy(switchdev_work->addr, fdb_info->addr);
+	switchdev_work->vid = fdb_info->vid;
+	switchdev_work->host_addr = host_addr;
+
+	dsa_schedule_work(&switchdev_work->work);
+
+	return 0;
+}
+
+/* Called under rcu_read_lock() */
+static int dsa_user_switchdev_event(struct notifier_block *unused,
+				    unsigned long event, void *ptr)
+{
+	struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
+	int err;
+
+	switch (event) {
+	case SWITCHDEV_PORT_ATTR_SET:
+		err = switchdev_handle_port_attr_set(dev, ptr,
+						     dsa_user_dev_check,
+						     dsa_user_port_attr_set);
+		return notifier_from_errno(err);
+	case SWITCHDEV_FDB_ADD_TO_DEVICE:
+	case SWITCHDEV_FDB_DEL_TO_DEVICE:
+		err = switchdev_handle_fdb_event_to_device(dev, event, ptr,
+							   dsa_user_dev_check,
+							   dsa_foreign_dev_check,
+							   dsa_user_fdb_event);
+		return notifier_from_errno(err);
+	default:
+		return NOTIFY_DONE;
+	}
+
+	return NOTIFY_OK;
+}
+
+static int dsa_user_switchdev_blocking_event(struct notifier_block *unused,
+					     unsigned long event, void *ptr)
+{
+	struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
+	int err;
+
+	switch (event) {
+	case SWITCHDEV_PORT_OBJ_ADD:
+		err = switchdev_handle_port_obj_add_foreign(dev, ptr,
+							    dsa_user_dev_check,
+							    dsa_foreign_dev_check,
+							    dsa_user_port_obj_add);
+		return notifier_from_errno(err);
+	case SWITCHDEV_PORT_OBJ_DEL:
+		err = switchdev_handle_port_obj_del_foreign(dev, ptr,
+							    dsa_user_dev_check,
+							    dsa_foreign_dev_check,
+							    dsa_user_port_obj_del);
+		return notifier_from_errno(err);
+	case SWITCHDEV_PORT_ATTR_SET:
+		err = switchdev_handle_port_attr_set(dev, ptr,
+						     dsa_user_dev_check,
+						     dsa_user_port_attr_set);
+		return notifier_from_errno(err);
+	}
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block dsa_user_nb __read_mostly = {
+	.notifier_call  = dsa_user_netdevice_event,
+};
+
+struct notifier_block dsa_user_switchdev_notifier = {
+	.notifier_call = dsa_user_switchdev_event,
+};
+
+struct notifier_block dsa_user_switchdev_blocking_notifier = {
+	.notifier_call = dsa_user_switchdev_blocking_event,
+};
+
+int dsa_user_register_notifier(void)
+{
+	struct notifier_block *nb;
+	int err;
+
+	err = register_netdevice_notifier(&dsa_user_nb);
+	if (err)
+		return err;
+
+	err = register_switchdev_notifier(&dsa_user_switchdev_notifier);
+	if (err)
+		goto err_switchdev_nb;
+
+	nb = &dsa_user_switchdev_blocking_notifier;
+	err = register_switchdev_blocking_notifier(nb);
+	if (err)
+		goto err_switchdev_blocking_nb;
+
+	return 0;
+
+err_switchdev_blocking_nb:
+	unregister_switchdev_notifier(&dsa_user_switchdev_notifier);
+err_switchdev_nb:
+	unregister_netdevice_notifier(&dsa_user_nb);
+	return err;
+}
+
+void dsa_user_unregister_notifier(void)
+{
+	struct notifier_block *nb;
+	int err;
+
+	nb = &dsa_user_switchdev_blocking_notifier;
+	err = unregister_switchdev_blocking_notifier(nb);
+	if (err)
+		pr_err("DSA: failed to unregister switchdev blocking notifier (%d)\n", err);
+
+	err = unregister_switchdev_notifier(&dsa_user_switchdev_notifier);
+	if (err)
+		pr_err("DSA: failed to unregister switchdev notifier (%d)\n", err);
+
+	err = unregister_netdevice_notifier(&dsa_user_nb);
+	if (err)
+		pr_err("DSA: failed to unregister user notifier (%d)\n", err);
+}
diff --git a/net/dsa/user.h b/net/dsa/user.h
new file mode 100644
index 000000000000..996069130bea
--- /dev/null
+++ b/net/dsa/user.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __DSA_USER_H
+#define __DSA_USER_H
+
+#include <linux/if_bridge.h>
+#include <linux/if_vlan.h>
+#include <linux/list.h>
+#include <linux/netpoll.h>
+#include <linux/types.h>
+#include <net/dsa.h>
+#include <net/gro_cells.h>
+
+struct net_device;
+struct netlink_ext_ack;
+
+extern struct notifier_block dsa_user_switchdev_notifier;
+extern struct notifier_block dsa_user_switchdev_blocking_notifier;
+
+struct dsa_user_priv {
+	/* Copy of CPU port xmit for faster access in user transmit hot path */
+	struct sk_buff *	(*xmit)(struct sk_buff *skb,
+					struct net_device *dev);
+
+	struct gro_cells	gcells;
+
+	/* DSA port data, such as switch, port index, etc. */
+	struct dsa_port		*dp;
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	struct netpoll		*netpoll;
+#endif
+
+	/* TC context */
+	struct list_head	mall_tc_list;
+};
+
+void dsa_user_mii_bus_init(struct dsa_switch *ds);
+int dsa_user_create(struct dsa_port *dp);
+void dsa_user_destroy(struct net_device *user_dev);
+int dsa_user_suspend(struct net_device *user_dev);
+int dsa_user_resume(struct net_device *user_dev);
+int dsa_user_register_notifier(void);
+void dsa_user_unregister_notifier(void);
+void dsa_user_sync_ha(struct net_device *dev);
+void dsa_user_unsync_ha(struct net_device *dev);
+void dsa_user_setup_tagger(struct net_device *user);
+int dsa_user_change_mtu(struct net_device *dev, int new_mtu);
+int dsa_user_change_conduit(struct net_device *dev, struct net_device *conduit,
+			    struct netlink_ext_ack *extack);
+int dsa_user_manage_vlan_filtering(struct net_device *dev,
+				   bool vlan_filtering);
+
+static inline struct dsa_port *dsa_user_to_port(const struct net_device *dev)
+{
+	struct dsa_user_priv *p = netdev_priv(dev);
+
+	return p->dp;
+}
+
+static inline struct net_device *
+dsa_user_to_conduit(const struct net_device *dev)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+
+	return dsa_port_to_conduit(dp);
+}
+
+#endif
-- 
cgit v1.2.3-70-g09d2


From 87cd83714f30ef2f19f0390e98beb8d78e173f0f Mon Sep 17 00:00:00 2001
From: Florian Fainelli <florian.fainelli@broadcom.com>
Date: Mon, 23 Oct 2023 11:17:29 -0700
Subject: net: dsa: Rename IFLA_DSA_MASTER to IFLA_DSA_CONDUIT

This preserves the existing IFLA_DSA_MASTER which is part of the uAPI
and creates an alias named IFLA_DSA_CONDUIT.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://lore.kernel.org/r/20231023181729.1191071-3-florian.fainelli@broadcom.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/dsa/configuration.rst |  4 ++--
 include/uapi/linux/if_link.h                   |  4 +++-
 net/dsa/netlink.c                              | 10 +++++-----
 3 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/dsa/configuration.rst b/Documentation/networking/dsa/configuration.rst
index e6c9719874b0..6cc4ded3cc23 100644
--- a/Documentation/networking/dsa/configuration.rst
+++ b/Documentation/networking/dsa/configuration.rst
@@ -393,7 +393,7 @@ description which has an ``ethernet`` property. It is up to the user to
 configure the system for the switch to use other conduits.
 
 DSA uses the ``rtnl_link_ops`` mechanism (with a "dsa" ``kind``) to allow
-changing the DSA conduit of a user port. The ``IFLA_DSA_MASTER`` u32 netlink
+changing the DSA conduit of a user port. The ``IFLA_DSA_CONDUIT`` u32 netlink
 attribute contains the ifindex of the conduit device that handles each user
 device. The DSA conduit must be a valid candidate based on firmware node
 information, or a LAG interface which contains only slaves which are valid
@@ -435,7 +435,7 @@ Using iproute2, the following manipulations are possible:
         dsa master bond0
 
 Notice that in the case of CPU ports under a LAG, the use of the
-``IFLA_DSA_MASTER`` netlink attribute is not strictly needed, but rather, DSA
+``IFLA_DSA_CONDUIT`` netlink attribute is not strictly needed, but rather, DSA
 reacts to the ``IFLA_MASTER`` attribute change of its present conduit (``eth0``)
 and migrates all user ports to the new upper of ``eth0``, ``bond0``. Similarly,
 when ``bond0`` is destroyed using ``RTM_DELLINK``, DSA migrates the user ports
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 9f8a3da0f14f..f4191be137a4 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -1394,7 +1394,9 @@ enum {
 
 enum {
 	IFLA_DSA_UNSPEC,
-	IFLA_DSA_MASTER,
+	IFLA_DSA_CONDUIT,
+	/* Deprecated, use IFLA_DSA_CONDUIT instead */
+	IFLA_DSA_MASTER = IFLA_DSA_CONDUIT,
 	__IFLA_DSA_MAX,
 };
 
diff --git a/net/dsa/netlink.c b/net/dsa/netlink.c
index f56f90a25b99..1332e56349e5 100644
--- a/net/dsa/netlink.c
+++ b/net/dsa/netlink.c
@@ -8,7 +8,7 @@
 #include "user.h"
 
 static const struct nla_policy dsa_policy[IFLA_DSA_MAX + 1] = {
-	[IFLA_DSA_MASTER]	= { .type = NLA_U32 },
+	[IFLA_DSA_CONDUIT]	= { .type = NLA_U32 },
 };
 
 static int dsa_changelink(struct net_device *dev, struct nlattr *tb[],
@@ -20,8 +20,8 @@ static int dsa_changelink(struct net_device *dev, struct nlattr *tb[],
 	if (!data)
 		return 0;
 
-	if (data[IFLA_DSA_MASTER]) {
-		u32 ifindex = nla_get_u32(data[IFLA_DSA_MASTER]);
+	if (data[IFLA_DSA_CONDUIT]) {
+		u32 ifindex = nla_get_u32(data[IFLA_DSA_CONDUIT]);
 		struct net_device *conduit;
 
 		conduit = __dev_get_by_index(dev_net(dev), ifindex);
@@ -38,7 +38,7 @@ static int dsa_changelink(struct net_device *dev, struct nlattr *tb[],
 
 static size_t dsa_get_size(const struct net_device *dev)
 {
-	return nla_total_size(sizeof(u32)) +	/* IFLA_DSA_MASTER  */
+	return nla_total_size(sizeof(u32)) +	/* IFLA_DSA_CONDUIT  */
 	       0;
 }
 
@@ -46,7 +46,7 @@ static int dsa_fill_info(struct sk_buff *skb, const struct net_device *dev)
 {
 	struct net_device *conduit = dsa_user_to_conduit(dev);
 
-	if (nla_put_u32(skb, IFLA_DSA_MASTER, conduit->ifindex))
+	if (nla_put_u32(skb, IFLA_DSA_CONDUIT, conduit->ifindex))
 		return -EMSGSIZE;
 
 	return 0;
-- 
cgit v1.2.3-70-g09d2


From 35dfaad7188cdc043fde31709c796f5a692ba2bd Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 24 Oct 2023 23:48:58 +0200
Subject: netkit, bpf: Add bpf programmable net device
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This work adds a new, minimal BPF-programmable device called "netkit"
(former PoC code-name "meta") we recently presented at LSF/MM/BPF. The
core idea is that BPF programs are executed within the drivers xmit routine
and therefore e.g. in case of containers/Pods moving BPF processing closer
to the source.

One of the goals was that in case of Pod egress traffic, this allows to
move BPF programs from hostns tcx ingress into the device itself, providing
earlier drop or forward mechanisms, for example, if the BPF program
determines that the skb must be sent out of the node, then a redirect to
the physical device can take place directly without going through per-CPU
backlog queue. This helps to shift processing for such traffic from softirq
to process context, leading to better scheduling decisions/performance (see
measurements in the slides).

In this initial version, the netkit device ships as a pair, but we plan to
extend this further so it can also operate in single device mode. The pair
comes with a primary and a peer device. Only the primary device, typically
residing in hostns, can manage BPF programs for itself and its peer. The
peer device is designated for containers/Pods and cannot attach/detach
BPF programs. Upon the device creation, the user can set the default policy
to 'pass' or 'drop' for the case when no BPF program is attached.

Additionally, the device can be operated in L3 (default) or L2 mode. The
management of BPF programs is done via bpf_mprog, so that multi-attach is
supported right from the beginning with similar API and dependency controls
as tcx. For details on the latter see commit 053c8e1f235d ("bpf: Add generic
attach/detach/query API for multi-progs"). tc BPF compatibility is provided,
so that existing programs can be easily migrated.

Going forward, we plan to use netkit devices in Cilium as the main device
type for connecting Pods. They will be operated in L3 mode in order to
simplify a Pod's neighbor management and the peer will operate in default
drop mode, so that no traffic is leaving between the time when a Pod is
brought up by the CNI plugin and programs attached by the agent.
Additionally, the programs we attach via tcx on the physical devices are
using bpf_redirect_peer() for inbound traffic into netkit device, hence the
latter is also supporting the ndo_get_peer_dev callback. Similarly, we use
bpf_redirect_neigh() for the way out, pushing from netkit peer to phys device
directly. Also, BIG TCP is supported on netkit device. For the follow-up
work in single device mode, we plan to convert Cilium's cilium_host/_net
devices into a single one.

An extensive test suite for checking device operations and the BPF program
and link management API comes as BPF selftests in this series.

Co-developed-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: Stanislav Fomichev <sdf@google.com>
Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://github.com/borkmann/iproute2/tree/pr/netkit
Link: http://vger.kernel.org/bpfconf2023_material/tcx_meta_netdev_borkmann.pdf (24ff.)
Link: https://lore.kernel.org/r/20231024214904.29825-2-daniel@iogearbox.net
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 MAINTAINERS                    |   9 +
 drivers/net/Kconfig            |   9 +
 drivers/net/Makefile           |   1 +
 drivers/net/netkit.c           | 940 +++++++++++++++++++++++++++++++++++++++++
 include/net/netkit.h           |  38 ++
 include/uapi/linux/bpf.h       |  14 +
 include/uapi/linux/if_link.h   |  24 ++
 kernel/bpf/syscall.c           |  30 +-
 tools/include/uapi/linux/bpf.h |  14 +
 9 files changed, 1074 insertions(+), 5 deletions(-)
 create mode 100644 drivers/net/netkit.c
 create mode 100644 include/net/netkit.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index ed33b9df8b3d..43be6197e655 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3795,6 +3795,15 @@ L:	bpf@vger.kernel.org
 S:	Odd Fixes
 K:	(?:\b|_)bpf(?:\b|_)
 
+BPF [NETKIT] (BPF-programmable network device)
+M:	Daniel Borkmann <daniel@iogearbox.net>
+M:	Nikolay Aleksandrov <razor@blackwall.org>
+L:	bpf@vger.kernel.org
+L:	netdev@vger.kernel.org
+S:	Supported
+F:	drivers/net/netkit.c
+F:	include/net/netkit.h
+
 BPF [NETWORKING] (struct_ops, reuseport)
 M:	Martin KaFai Lau <martin.lau@linux.dev>
 L:	bpf@vger.kernel.org
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 44eeb5d61ba9..af0da4bb429b 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -448,6 +448,15 @@ config NLMON
 	  diagnostics, etc. This is mostly intended for developers or support
 	  to debug netlink issues. If unsure, say N.
 
+config NETKIT
+	bool "BPF-programmable network device"
+	depends on BPF_SYSCALL
+	help
+	  The netkit device is a virtual networking device where BPF programs
+	  can be attached to the device(s) transmission routine in order to
+	  implement the driver's internal logic. The device can be configured
+	  to operate in L3 or L2 mode. If unsure, say N.
+
 config NET_VRF
 	tristate "Virtual Routing and Forwarding (Lite)"
 	depends on IP_MULTIPLE_TABLES
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 8a83db32509d..7cab36f94782 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -22,6 +22,7 @@ obj-$(CONFIG_MDIO) += mdio.o
 obj-$(CONFIG_NET) += loopback.o
 obj-$(CONFIG_NETDEV_LEGACY_INIT) += Space.o
 obj-$(CONFIG_NETCONSOLE) += netconsole.o
+obj-$(CONFIG_NETKIT) += netkit.o
 obj-y += phy/
 obj-y += pse-pd/
 obj-y += mdio/
diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c
new file mode 100644
index 000000000000..7e484f9fd3ae
--- /dev/null
+++ b/drivers/net/netkit.c
@@ -0,0 +1,940 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2023 Isovalent */
+
+#include <linux/netdevice.h>
+#include <linux/ethtool.h>
+#include <linux/etherdevice.h>
+#include <linux/filter.h>
+#include <linux/netfilter_netdev.h>
+#include <linux/bpf_mprog.h>
+
+#include <net/netkit.h>
+#include <net/dst.h>
+#include <net/tcx.h>
+
+#define DRV_NAME "netkit"
+
+struct netkit {
+	/* Needed in fast-path */
+	struct net_device __rcu *peer;
+	struct bpf_mprog_entry __rcu *active;
+	enum netkit_action policy;
+	struct bpf_mprog_bundle	bundle;
+
+	/* Needed in slow-path */
+	enum netkit_mode mode;
+	bool primary;
+	u32 headroom;
+};
+
+struct netkit_link {
+	struct bpf_link link;
+	struct net_device *dev;
+	u32 location;
+};
+
+static __always_inline int
+netkit_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb,
+	   enum netkit_action ret)
+{
+	const struct bpf_mprog_fp *fp;
+	const struct bpf_prog *prog;
+
+	bpf_mprog_foreach_prog(entry, fp, prog) {
+		bpf_compute_data_pointers(skb);
+		ret = bpf_prog_run(prog, skb);
+		if (ret != NETKIT_NEXT)
+			break;
+	}
+	return ret;
+}
+
+static void netkit_prep_forward(struct sk_buff *skb, bool xnet)
+{
+	skb_scrub_packet(skb, xnet);
+	skb->priority = 0;
+	nf_skip_egress(skb, true);
+}
+
+static struct netkit *netkit_priv(const struct net_device *dev)
+{
+	return netdev_priv(dev);
+}
+
+static netdev_tx_t netkit_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct netkit *nk = netkit_priv(dev);
+	enum netkit_action ret = READ_ONCE(nk->policy);
+	netdev_tx_t ret_dev = NET_XMIT_SUCCESS;
+	const struct bpf_mprog_entry *entry;
+	struct net_device *peer;
+
+	rcu_read_lock();
+	peer = rcu_dereference(nk->peer);
+	if (unlikely(!peer || !(peer->flags & IFF_UP) ||
+		     !pskb_may_pull(skb, ETH_HLEN) ||
+		     skb_orphan_frags(skb, GFP_ATOMIC)))
+		goto drop;
+	netkit_prep_forward(skb, !net_eq(dev_net(dev), dev_net(peer)));
+	skb->dev = peer;
+	entry = rcu_dereference(nk->active);
+	if (entry)
+		ret = netkit_run(entry, skb, ret);
+	switch (ret) {
+	case NETKIT_NEXT:
+	case NETKIT_PASS:
+		skb->protocol = eth_type_trans(skb, skb->dev);
+		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+		__netif_rx(skb);
+		break;
+	case NETKIT_REDIRECT:
+		skb_do_redirect(skb);
+		break;
+	case NETKIT_DROP:
+	default:
+drop:
+		kfree_skb(skb);
+		dev_core_stats_tx_dropped_inc(dev);
+		ret_dev = NET_XMIT_DROP;
+		break;
+	}
+	rcu_read_unlock();
+	return ret_dev;
+}
+
+static int netkit_open(struct net_device *dev)
+{
+	struct netkit *nk = netkit_priv(dev);
+	struct net_device *peer = rtnl_dereference(nk->peer);
+
+	if (!peer)
+		return -ENOTCONN;
+	if (peer->flags & IFF_UP) {
+		netif_carrier_on(dev);
+		netif_carrier_on(peer);
+	}
+	return 0;
+}
+
+static int netkit_close(struct net_device *dev)
+{
+	struct netkit *nk = netkit_priv(dev);
+	struct net_device *peer = rtnl_dereference(nk->peer);
+
+	netif_carrier_off(dev);
+	if (peer)
+		netif_carrier_off(peer);
+	return 0;
+}
+
+static int netkit_get_iflink(const struct net_device *dev)
+{
+	struct netkit *nk = netkit_priv(dev);
+	struct net_device *peer;
+	int iflink = 0;
+
+	rcu_read_lock();
+	peer = rcu_dereference(nk->peer);
+	if (peer)
+		iflink = peer->ifindex;
+	rcu_read_unlock();
+	return iflink;
+}
+
+static void netkit_set_multicast(struct net_device *dev)
+{
+	/* Nothing to do, we receive whatever gets pushed to us! */
+}
+
+static void netkit_set_headroom(struct net_device *dev, int headroom)
+{
+	struct netkit *nk = netkit_priv(dev), *nk2;
+	struct net_device *peer;
+
+	if (headroom < 0)
+		headroom = NET_SKB_PAD;
+
+	rcu_read_lock();
+	peer = rcu_dereference(nk->peer);
+	if (unlikely(!peer))
+		goto out;
+
+	nk2 = netkit_priv(peer);
+	nk->headroom = headroom;
+	headroom = max(nk->headroom, nk2->headroom);
+
+	peer->needed_headroom = headroom;
+	dev->needed_headroom = headroom;
+out:
+	rcu_read_unlock();
+}
+
+static struct net_device *netkit_peer_dev(struct net_device *dev)
+{
+	return rcu_dereference(netkit_priv(dev)->peer);
+}
+
+static void netkit_uninit(struct net_device *dev);
+
+static const struct net_device_ops netkit_netdev_ops = {
+	.ndo_open		= netkit_open,
+	.ndo_stop		= netkit_close,
+	.ndo_start_xmit		= netkit_xmit,
+	.ndo_set_rx_mode	= netkit_set_multicast,
+	.ndo_set_rx_headroom	= netkit_set_headroom,
+	.ndo_get_iflink		= netkit_get_iflink,
+	.ndo_get_peer_dev	= netkit_peer_dev,
+	.ndo_uninit		= netkit_uninit,
+	.ndo_features_check	= passthru_features_check,
+};
+
+static void netkit_get_drvinfo(struct net_device *dev,
+			       struct ethtool_drvinfo *info)
+{
+	strscpy(info->driver, DRV_NAME, sizeof(info->driver));
+}
+
+static const struct ethtool_ops netkit_ethtool_ops = {
+	.get_drvinfo		= netkit_get_drvinfo,
+};
+
+static void netkit_setup(struct net_device *dev)
+{
+	static const netdev_features_t netkit_features_hw_vlan =
+		NETIF_F_HW_VLAN_CTAG_TX |
+		NETIF_F_HW_VLAN_CTAG_RX |
+		NETIF_F_HW_VLAN_STAG_TX |
+		NETIF_F_HW_VLAN_STAG_RX;
+	static const netdev_features_t netkit_features =
+		netkit_features_hw_vlan |
+		NETIF_F_SG |
+		NETIF_F_FRAGLIST |
+		NETIF_F_HW_CSUM |
+		NETIF_F_RXCSUM |
+		NETIF_F_SCTP_CRC |
+		NETIF_F_HIGHDMA |
+		NETIF_F_GSO_SOFTWARE |
+		NETIF_F_GSO_ENCAP_ALL;
+
+	ether_setup(dev);
+	dev->max_mtu = ETH_MAX_MTU;
+
+	dev->flags |= IFF_NOARP;
+	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+	dev->priv_flags |= IFF_PHONY_HEADROOM;
+	dev->priv_flags |= IFF_NO_QUEUE;
+
+	dev->ethtool_ops = &netkit_ethtool_ops;
+	dev->netdev_ops  = &netkit_netdev_ops;
+
+	dev->features |= netkit_features | NETIF_F_LLTX;
+	dev->hw_features = netkit_features;
+	dev->hw_enc_features = netkit_features;
+	dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
+	dev->vlan_features = dev->features & ~netkit_features_hw_vlan;
+
+	dev->needs_free_netdev = true;
+
+	netif_set_tso_max_size(dev, GSO_MAX_SIZE);
+}
+
+static struct net *netkit_get_link_net(const struct net_device *dev)
+{
+	struct netkit *nk = netkit_priv(dev);
+	struct net_device *peer = rtnl_dereference(nk->peer);
+
+	return peer ? dev_net(peer) : dev_net(dev);
+}
+
+static int netkit_check_policy(int policy, struct nlattr *tb,
+			       struct netlink_ext_ack *extack)
+{
+	switch (policy) {
+	case NETKIT_PASS:
+	case NETKIT_DROP:
+		return 0;
+	default:
+		NL_SET_ERR_MSG_ATTR(extack, tb,
+				    "Provided default xmit policy not supported");
+		return -EINVAL;
+	}
+}
+
+static int netkit_check_mode(int mode, struct nlattr *tb,
+			     struct netlink_ext_ack *extack)
+{
+	switch (mode) {
+	case NETKIT_L2:
+	case NETKIT_L3:
+		return 0;
+	default:
+		NL_SET_ERR_MSG_ATTR(extack, tb,
+				    "Provided device mode can only be L2 or L3");
+		return -EINVAL;
+	}
+}
+
+static int netkit_validate(struct nlattr *tb[], struct nlattr *data[],
+			   struct netlink_ext_ack *extack)
+{
+	struct nlattr *attr = tb[IFLA_ADDRESS];
+
+	if (!attr)
+		return 0;
+	NL_SET_ERR_MSG_ATTR(extack, attr,
+			    "Setting Ethernet address is not supported");
+	return -EOPNOTSUPP;
+}
+
+static struct rtnl_link_ops netkit_link_ops;
+
+static int netkit_new_link(struct net *src_net, struct net_device *dev,
+			   struct nlattr *tb[], struct nlattr *data[],
+			   struct netlink_ext_ack *extack)
+{
+	struct nlattr *peer_tb[IFLA_MAX + 1], **tbp = tb, *attr;
+	enum netkit_action default_prim = NETKIT_PASS;
+	enum netkit_action default_peer = NETKIT_PASS;
+	enum netkit_mode mode = NETKIT_L3;
+	unsigned char ifname_assign_type;
+	struct ifinfomsg *ifmp = NULL;
+	struct net_device *peer;
+	char ifname[IFNAMSIZ];
+	struct netkit *nk;
+	struct net *net;
+	int err;
+
+	if (data) {
+		if (data[IFLA_NETKIT_MODE]) {
+			attr = data[IFLA_NETKIT_MODE];
+			mode = nla_get_u32(attr);
+			err = netkit_check_mode(mode, attr, extack);
+			if (err < 0)
+				return err;
+		}
+		if (data[IFLA_NETKIT_PEER_INFO]) {
+			attr = data[IFLA_NETKIT_PEER_INFO];
+			ifmp = nla_data(attr);
+			err = rtnl_nla_parse_ifinfomsg(peer_tb, attr, extack);
+			if (err < 0)
+				return err;
+			err = netkit_validate(peer_tb, NULL, extack);
+			if (err < 0)
+				return err;
+			tbp = peer_tb;
+		}
+		if (data[IFLA_NETKIT_POLICY]) {
+			attr = data[IFLA_NETKIT_POLICY];
+			default_prim = nla_get_u32(attr);
+			err = netkit_check_policy(default_prim, attr, extack);
+			if (err < 0)
+				return err;
+		}
+		if (data[IFLA_NETKIT_PEER_POLICY]) {
+			attr = data[IFLA_NETKIT_PEER_POLICY];
+			default_peer = nla_get_u32(attr);
+			err = netkit_check_policy(default_peer, attr, extack);
+			if (err < 0)
+				return err;
+		}
+	}
+
+	if (ifmp && tbp[IFLA_IFNAME]) {
+		nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ);
+		ifname_assign_type = NET_NAME_USER;
+	} else {
+		strscpy(ifname, "nk%d", IFNAMSIZ);
+		ifname_assign_type = NET_NAME_ENUM;
+	}
+
+	net = rtnl_link_get_net(src_net, tbp);
+	if (IS_ERR(net))
+		return PTR_ERR(net);
+
+	peer = rtnl_create_link(net, ifname, ifname_assign_type,
+				&netkit_link_ops, tbp, extack);
+	if (IS_ERR(peer)) {
+		put_net(net);
+		return PTR_ERR(peer);
+	}
+
+	netif_inherit_tso_max(peer, dev);
+
+	if (mode == NETKIT_L2)
+		eth_hw_addr_random(peer);
+	if (ifmp && dev->ifindex)
+		peer->ifindex = ifmp->ifi_index;
+
+	nk = netkit_priv(peer);
+	nk->primary = false;
+	nk->policy = default_peer;
+	nk->mode = mode;
+	bpf_mprog_bundle_init(&nk->bundle);
+	RCU_INIT_POINTER(nk->active, NULL);
+	RCU_INIT_POINTER(nk->peer, NULL);
+
+	err = register_netdevice(peer);
+	put_net(net);
+	if (err < 0)
+		goto err_register_peer;
+	netif_carrier_off(peer);
+	if (mode == NETKIT_L2)
+		dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL);
+
+	err = rtnl_configure_link(peer, NULL, 0, NULL);
+	if (err < 0)
+		goto err_configure_peer;
+
+	if (mode == NETKIT_L2)
+		eth_hw_addr_random(dev);
+	if (tb[IFLA_IFNAME])
+		nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
+	else
+		strscpy(dev->name, "nk%d", IFNAMSIZ);
+
+	nk = netkit_priv(dev);
+	nk->primary = true;
+	nk->policy = default_prim;
+	nk->mode = mode;
+	bpf_mprog_bundle_init(&nk->bundle);
+	RCU_INIT_POINTER(nk->active, NULL);
+	RCU_INIT_POINTER(nk->peer, NULL);
+
+	err = register_netdevice(dev);
+	if (err < 0)
+		goto err_configure_peer;
+	netif_carrier_off(dev);
+	if (mode == NETKIT_L2)
+		dev_change_flags(dev, dev->flags & ~IFF_NOARP, NULL);
+
+	rcu_assign_pointer(netkit_priv(dev)->peer, peer);
+	rcu_assign_pointer(netkit_priv(peer)->peer, dev);
+	return 0;
+err_configure_peer:
+	unregister_netdevice(peer);
+	return err;
+err_register_peer:
+	free_netdev(peer);
+	return err;
+}
+
+static struct bpf_mprog_entry *netkit_entry_fetch(struct net_device *dev,
+						  bool bundle_fallback)
+{
+	struct netkit *nk = netkit_priv(dev);
+	struct bpf_mprog_entry *entry;
+
+	ASSERT_RTNL();
+	entry = rcu_dereference_rtnl(nk->active);
+	if (entry)
+		return entry;
+	if (bundle_fallback)
+		return &nk->bundle.a;
+	return NULL;
+}
+
+static void netkit_entry_update(struct net_device *dev,
+				struct bpf_mprog_entry *entry)
+{
+	struct netkit *nk = netkit_priv(dev);
+
+	ASSERT_RTNL();
+	rcu_assign_pointer(nk->active, entry);
+}
+
+static void netkit_entry_sync(void)
+{
+	synchronize_rcu();
+}
+
+static struct net_device *netkit_dev_fetch(struct net *net, u32 ifindex, u32 which)
+{
+	struct net_device *dev;
+	struct netkit *nk;
+
+	ASSERT_RTNL();
+
+	switch (which) {
+	case BPF_NETKIT_PRIMARY:
+	case BPF_NETKIT_PEER:
+		break;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+
+	dev = __dev_get_by_index(net, ifindex);
+	if (!dev)
+		return ERR_PTR(-ENODEV);
+	if (dev->netdev_ops != &netkit_netdev_ops)
+		return ERR_PTR(-ENXIO);
+
+	nk = netkit_priv(dev);
+	if (!nk->primary)
+		return ERR_PTR(-EACCES);
+	if (which == BPF_NETKIT_PEER) {
+		dev = rcu_dereference_rtnl(nk->peer);
+		if (!dev)
+			return ERR_PTR(-ENODEV);
+	}
+	return dev;
+}
+
+int netkit_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+	struct bpf_mprog_entry *entry, *entry_new;
+	struct bpf_prog *replace_prog = NULL;
+	struct net_device *dev;
+	int ret;
+
+	rtnl_lock();
+	dev = netkit_dev_fetch(current->nsproxy->net_ns, attr->target_ifindex,
+			       attr->attach_type);
+	if (IS_ERR(dev)) {
+		ret = PTR_ERR(dev);
+		goto out;
+	}
+	entry = netkit_entry_fetch(dev, true);
+	if (attr->attach_flags & BPF_F_REPLACE) {
+		replace_prog = bpf_prog_get_type(attr->replace_bpf_fd,
+						 prog->type);
+		if (IS_ERR(replace_prog)) {
+			ret = PTR_ERR(replace_prog);
+			replace_prog = NULL;
+			goto out;
+		}
+	}
+	ret = bpf_mprog_attach(entry, &entry_new, prog, NULL, replace_prog,
+			       attr->attach_flags, attr->relative_fd,
+			       attr->expected_revision);
+	if (!ret) {
+		if (entry != entry_new) {
+			netkit_entry_update(dev, entry_new);
+			netkit_entry_sync();
+		}
+		bpf_mprog_commit(entry);
+	}
+out:
+	if (replace_prog)
+		bpf_prog_put(replace_prog);
+	rtnl_unlock();
+	return ret;
+}
+
+int netkit_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+	struct bpf_mprog_entry *entry, *entry_new;
+	struct net_device *dev;
+	int ret;
+
+	rtnl_lock();
+	dev = netkit_dev_fetch(current->nsproxy->net_ns, attr->target_ifindex,
+			       attr->attach_type);
+	if (IS_ERR(dev)) {
+		ret = PTR_ERR(dev);
+		goto out;
+	}
+	entry = netkit_entry_fetch(dev, false);
+	if (!entry) {
+		ret = -ENOENT;
+		goto out;
+	}
+	ret = bpf_mprog_detach(entry, &entry_new, prog, NULL, attr->attach_flags,
+			       attr->relative_fd, attr->expected_revision);
+	if (!ret) {
+		if (!bpf_mprog_total(entry_new))
+			entry_new = NULL;
+		netkit_entry_update(dev, entry_new);
+		netkit_entry_sync();
+		bpf_mprog_commit(entry);
+	}
+out:
+	rtnl_unlock();
+	return ret;
+}
+
+int netkit_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr)
+{
+	struct net_device *dev;
+	int ret;
+
+	rtnl_lock();
+	dev = netkit_dev_fetch(current->nsproxy->net_ns,
+			       attr->query.target_ifindex,
+			       attr->query.attach_type);
+	if (IS_ERR(dev)) {
+		ret = PTR_ERR(dev);
+		goto out;
+	}
+	ret = bpf_mprog_query(attr, uattr, netkit_entry_fetch(dev, false));
+out:
+	rtnl_unlock();
+	return ret;
+}
+
+static struct netkit_link *netkit_link(const struct bpf_link *link)
+{
+	return container_of(link, struct netkit_link, link);
+}
+
+static int netkit_link_prog_attach(struct bpf_link *link, u32 flags,
+				   u32 id_or_fd, u64 revision)
+{
+	struct netkit_link *nkl = netkit_link(link);
+	struct bpf_mprog_entry *entry, *entry_new;
+	struct net_device *dev = nkl->dev;
+	int ret;
+
+	ASSERT_RTNL();
+	entry = netkit_entry_fetch(dev, true);
+	ret = bpf_mprog_attach(entry, &entry_new, link->prog, link, NULL, flags,
+			       id_or_fd, revision);
+	if (!ret) {
+		if (entry != entry_new) {
+			netkit_entry_update(dev, entry_new);
+			netkit_entry_sync();
+		}
+		bpf_mprog_commit(entry);
+	}
+	return ret;
+}
+
+static void netkit_link_release(struct bpf_link *link)
+{
+	struct netkit_link *nkl = netkit_link(link);
+	struct bpf_mprog_entry *entry, *entry_new;
+	struct net_device *dev;
+	int ret = 0;
+
+	rtnl_lock();
+	dev = nkl->dev;
+	if (!dev)
+		goto out;
+	entry = netkit_entry_fetch(dev, false);
+	if (!entry) {
+		ret = -ENOENT;
+		goto out;
+	}
+	ret = bpf_mprog_detach(entry, &entry_new, link->prog, link, 0, 0, 0);
+	if (!ret) {
+		if (!bpf_mprog_total(entry_new))
+			entry_new = NULL;
+		netkit_entry_update(dev, entry_new);
+		netkit_entry_sync();
+		bpf_mprog_commit(entry);
+		nkl->dev = NULL;
+	}
+out:
+	WARN_ON_ONCE(ret);
+	rtnl_unlock();
+}
+
+static int netkit_link_update(struct bpf_link *link, struct bpf_prog *nprog,
+			      struct bpf_prog *oprog)
+{
+	struct netkit_link *nkl = netkit_link(link);
+	struct bpf_mprog_entry *entry, *entry_new;
+	struct net_device *dev;
+	int ret = 0;
+
+	rtnl_lock();
+	dev = nkl->dev;
+	if (!dev) {
+		ret = -ENOLINK;
+		goto out;
+	}
+	if (oprog && link->prog != oprog) {
+		ret = -EPERM;
+		goto out;
+	}
+	oprog = link->prog;
+	if (oprog == nprog) {
+		bpf_prog_put(nprog);
+		goto out;
+	}
+	entry = netkit_entry_fetch(dev, false);
+	if (!entry) {
+		ret = -ENOENT;
+		goto out;
+	}
+	ret = bpf_mprog_attach(entry, &entry_new, nprog, link, oprog,
+			       BPF_F_REPLACE | BPF_F_ID,
+			       link->prog->aux->id, 0);
+	if (!ret) {
+		WARN_ON_ONCE(entry != entry_new);
+		oprog = xchg(&link->prog, nprog);
+		bpf_prog_put(oprog);
+		bpf_mprog_commit(entry);
+	}
+out:
+	rtnl_unlock();
+	return ret;
+}
+
+static void netkit_link_dealloc(struct bpf_link *link)
+{
+	kfree(netkit_link(link));
+}
+
+static void netkit_link_fdinfo(const struct bpf_link *link, struct seq_file *seq)
+{
+	const struct netkit_link *nkl = netkit_link(link);
+	u32 ifindex = 0;
+
+	rtnl_lock();
+	if (nkl->dev)
+		ifindex = nkl->dev->ifindex;
+	rtnl_unlock();
+
+	seq_printf(seq, "ifindex:\t%u\n", ifindex);
+	seq_printf(seq, "attach_type:\t%u (%s)\n",
+		   nkl->location,
+		   nkl->location == BPF_NETKIT_PRIMARY ? "primary" : "peer");
+}
+
+static int netkit_link_fill_info(const struct bpf_link *link,
+				 struct bpf_link_info *info)
+{
+	const struct netkit_link *nkl = netkit_link(link);
+	u32 ifindex = 0;
+
+	rtnl_lock();
+	if (nkl->dev)
+		ifindex = nkl->dev->ifindex;
+	rtnl_unlock();
+
+	info->netkit.ifindex = ifindex;
+	info->netkit.attach_type = nkl->location;
+	return 0;
+}
+
+static int netkit_link_detach(struct bpf_link *link)
+{
+	netkit_link_release(link);
+	return 0;
+}
+
+static const struct bpf_link_ops netkit_link_lops = {
+	.release	= netkit_link_release,
+	.detach		= netkit_link_detach,
+	.dealloc	= netkit_link_dealloc,
+	.update_prog	= netkit_link_update,
+	.show_fdinfo	= netkit_link_fdinfo,
+	.fill_link_info	= netkit_link_fill_info,
+};
+
+static int netkit_link_init(struct netkit_link *nkl,
+			    struct bpf_link_primer *link_primer,
+			    const union bpf_attr *attr,
+			    struct net_device *dev,
+			    struct bpf_prog *prog)
+{
+	bpf_link_init(&nkl->link, BPF_LINK_TYPE_NETKIT,
+		      &netkit_link_lops, prog);
+	nkl->location = attr->link_create.attach_type;
+	nkl->dev = dev;
+	return bpf_link_prime(&nkl->link, link_primer);
+}
+
+int netkit_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+	struct bpf_link_primer link_primer;
+	struct netkit_link *nkl;
+	struct net_device *dev;
+	int ret;
+
+	rtnl_lock();
+	dev = netkit_dev_fetch(current->nsproxy->net_ns,
+			       attr->link_create.target_ifindex,
+			       attr->link_create.attach_type);
+	if (IS_ERR(dev)) {
+		ret = PTR_ERR(dev);
+		goto out;
+	}
+	nkl = kzalloc(sizeof(*nkl), GFP_KERNEL_ACCOUNT);
+	if (!nkl) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	ret = netkit_link_init(nkl, &link_primer, attr, dev, prog);
+	if (ret) {
+		kfree(nkl);
+		goto out;
+	}
+	ret = netkit_link_prog_attach(&nkl->link,
+				      attr->link_create.flags,
+				      attr->link_create.netkit.relative_fd,
+				      attr->link_create.netkit.expected_revision);
+	if (ret) {
+		nkl->dev = NULL;
+		bpf_link_cleanup(&link_primer);
+		goto out;
+	}
+	ret = bpf_link_settle(&link_primer);
+out:
+	rtnl_unlock();
+	return ret;
+}
+
+static void netkit_release_all(struct net_device *dev)
+{
+	struct bpf_mprog_entry *entry;
+	struct bpf_tuple tuple = {};
+	struct bpf_mprog_fp *fp;
+	struct bpf_mprog_cp *cp;
+
+	entry = netkit_entry_fetch(dev, false);
+	if (!entry)
+		return;
+	netkit_entry_update(dev, NULL);
+	netkit_entry_sync();
+	bpf_mprog_foreach_tuple(entry, fp, cp, tuple) {
+		if (tuple.link)
+			netkit_link(tuple.link)->dev = NULL;
+		else
+			bpf_prog_put(tuple.prog);
+	}
+}
+
+static void netkit_uninit(struct net_device *dev)
+{
+	netkit_release_all(dev);
+}
+
+static void netkit_del_link(struct net_device *dev, struct list_head *head)
+{
+	struct netkit *nk = netkit_priv(dev);
+	struct net_device *peer = rtnl_dereference(nk->peer);
+
+	RCU_INIT_POINTER(nk->peer, NULL);
+	unregister_netdevice_queue(dev, head);
+	if (peer) {
+		nk = netkit_priv(peer);
+		RCU_INIT_POINTER(nk->peer, NULL);
+		unregister_netdevice_queue(peer, head);
+	}
+}
+
+static int netkit_change_link(struct net_device *dev, struct nlattr *tb[],
+			      struct nlattr *data[],
+			      struct netlink_ext_ack *extack)
+{
+	struct netkit *nk = netkit_priv(dev);
+	struct net_device *peer = rtnl_dereference(nk->peer);
+	enum netkit_action policy;
+	struct nlattr *attr;
+	int err;
+
+	if (!nk->primary) {
+		NL_SET_ERR_MSG(extack,
+			       "netkit link settings can be changed only through the primary device");
+		return -EACCES;
+	}
+
+	if (data[IFLA_NETKIT_MODE]) {
+		NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_MODE],
+				    "netkit link operating mode cannot be changed after device creation");
+		return -EACCES;
+	}
+
+	if (data[IFLA_NETKIT_POLICY]) {
+		attr = data[IFLA_NETKIT_POLICY];
+		policy = nla_get_u32(attr);
+		err = netkit_check_policy(policy, attr, extack);
+		if (err)
+			return err;
+		WRITE_ONCE(nk->policy, policy);
+	}
+
+	if (data[IFLA_NETKIT_PEER_POLICY]) {
+		err = -EOPNOTSUPP;
+		attr = data[IFLA_NETKIT_PEER_POLICY];
+		policy = nla_get_u32(attr);
+		if (peer)
+			err = netkit_check_policy(policy, attr, extack);
+		if (err)
+			return err;
+		nk = netkit_priv(peer);
+		WRITE_ONCE(nk->policy, policy);
+	}
+
+	return 0;
+}
+
+static size_t netkit_get_size(const struct net_device *dev)
+{
+	return nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_POLICY */
+	       nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PEER_POLICY */
+	       nla_total_size(sizeof(u8))  + /* IFLA_NETKIT_PRIMARY */
+	       nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_MODE */
+	       0;
+}
+
+static int netkit_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+	struct netkit *nk = netkit_priv(dev);
+	struct net_device *peer = rtnl_dereference(nk->peer);
+
+	if (nla_put_u8(skb, IFLA_NETKIT_PRIMARY, nk->primary))
+		return -EMSGSIZE;
+	if (nla_put_u32(skb, IFLA_NETKIT_POLICY, nk->policy))
+		return -EMSGSIZE;
+	if (nla_put_u32(skb, IFLA_NETKIT_MODE, nk->mode))
+		return -EMSGSIZE;
+
+	if (peer) {
+		nk = netkit_priv(peer);
+		if (nla_put_u32(skb, IFLA_NETKIT_PEER_POLICY, nk->policy))
+			return -EMSGSIZE;
+	}
+
+	return 0;
+}
+
+static const struct nla_policy netkit_policy[IFLA_NETKIT_MAX + 1] = {
+	[IFLA_NETKIT_PEER_INFO]		= { .len = sizeof(struct ifinfomsg) },
+	[IFLA_NETKIT_POLICY]		= { .type = NLA_U32 },
+	[IFLA_NETKIT_MODE]		= { .type = NLA_U32 },
+	[IFLA_NETKIT_PEER_POLICY]	= { .type = NLA_U32 },
+	[IFLA_NETKIT_PRIMARY]		= { .type = NLA_REJECT,
+					    .reject_message = "Primary attribute is read-only" },
+};
+
+static struct rtnl_link_ops netkit_link_ops = {
+	.kind		= DRV_NAME,
+	.priv_size	= sizeof(struct netkit),
+	.setup		= netkit_setup,
+	.newlink	= netkit_new_link,
+	.dellink	= netkit_del_link,
+	.changelink	= netkit_change_link,
+	.get_link_net	= netkit_get_link_net,
+	.get_size	= netkit_get_size,
+	.fill_info	= netkit_fill_info,
+	.policy		= netkit_policy,
+	.validate	= netkit_validate,
+	.maxtype	= IFLA_NETKIT_MAX,
+};
+
+static __init int netkit_init(void)
+{
+	BUILD_BUG_ON((int)NETKIT_NEXT != (int)TCX_NEXT ||
+		     (int)NETKIT_PASS != (int)TCX_PASS ||
+		     (int)NETKIT_DROP != (int)TCX_DROP ||
+		     (int)NETKIT_REDIRECT != (int)TCX_REDIRECT);
+
+	return rtnl_link_register(&netkit_link_ops);
+}
+
+static __exit void netkit_exit(void)
+{
+	rtnl_link_unregister(&netkit_link_ops);
+}
+
+module_init(netkit_init);
+module_exit(netkit_exit);
+
+MODULE_DESCRIPTION("BPF-programmable network device");
+MODULE_AUTHOR("Daniel Borkmann <daniel@iogearbox.net>");
+MODULE_AUTHOR("Nikolay Aleksandrov <razor@blackwall.org>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK(DRV_NAME);
diff --git a/include/net/netkit.h b/include/net/netkit.h
new file mode 100644
index 000000000000..0ba2e6b847ca
--- /dev/null
+++ b/include/net/netkit.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2023 Isovalent */
+#ifndef __NET_NETKIT_H
+#define __NET_NETKIT_H
+
+#include <linux/bpf.h>
+
+#ifdef CONFIG_NETKIT
+int netkit_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog);
+int netkit_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
+int netkit_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog);
+int netkit_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr);
+#else
+static inline int netkit_prog_attach(const union bpf_attr *attr,
+				     struct bpf_prog *prog)
+{
+	return -EINVAL;
+}
+
+static inline int netkit_link_attach(const union bpf_attr *attr,
+				     struct bpf_prog *prog)
+{
+	return -EINVAL;
+}
+
+static inline int netkit_prog_detach(const union bpf_attr *attr,
+				     struct bpf_prog *prog)
+{
+	return -EINVAL;
+}
+
+static inline int netkit_prog_query(const union bpf_attr *attr,
+				    union bpf_attr __user *uattr)
+{
+	return -EINVAL;
+}
+#endif /* CONFIG_NETKIT */
+#endif /* __NET_NETKIT_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7ba61b75bc0e..0f6cdf52b1da 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1052,6 +1052,8 @@ enum bpf_attach_type {
 	BPF_CGROUP_UNIX_RECVMSG,
 	BPF_CGROUP_UNIX_GETPEERNAME,
 	BPF_CGROUP_UNIX_GETSOCKNAME,
+	BPF_NETKIT_PRIMARY,
+	BPF_NETKIT_PEER,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -1071,6 +1073,7 @@ enum bpf_link_type {
 	BPF_LINK_TYPE_NETFILTER = 10,
 	BPF_LINK_TYPE_TCX = 11,
 	BPF_LINK_TYPE_UPROBE_MULTI = 12,
+	BPF_LINK_TYPE_NETKIT = 13,
 	MAX_BPF_LINK_TYPE,
 };
 
@@ -1656,6 +1659,13 @@ union bpf_attr {
 				__u32		flags;
 				__u32		pid;
 			} uprobe_multi;
+			struct {
+				union {
+					__u32	relative_fd;
+					__u32	relative_id;
+				};
+				__u64		expected_revision;
+			} netkit;
 		};
 	} link_create;
 
@@ -6576,6 +6586,10 @@ struct bpf_link_info {
 			__u32 ifindex;
 			__u32 attach_type;
 		} tcx;
+		struct {
+			__u32 ifindex;
+			__u32 attach_type;
+		} netkit;
 	};
 } __attribute__((aligned(8)));
 
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index fac351a93aed..a0aa05a28cf2 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -756,6 +756,30 @@ struct tunnel_msg {
 	__u32 ifindex;
 };
 
+/* netkit section */
+enum netkit_action {
+	NETKIT_NEXT	= -1,
+	NETKIT_PASS	= 0,
+	NETKIT_DROP	= 2,
+	NETKIT_REDIRECT	= 7,
+};
+
+enum netkit_mode {
+	NETKIT_L2,
+	NETKIT_L3,
+};
+
+enum {
+	IFLA_NETKIT_UNSPEC,
+	IFLA_NETKIT_PEER_INFO,
+	IFLA_NETKIT_PRIMARY,
+	IFLA_NETKIT_POLICY,
+	IFLA_NETKIT_PEER_POLICY,
+	IFLA_NETKIT_MODE,
+	__IFLA_NETKIT_MAX,
+};
+#define IFLA_NETKIT_MAX	(__IFLA_NETKIT_MAX - 1)
+
 /* VXLAN section */
 
 /* include statistics in the dump */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a9b2cb500bf7..0ed286b8a0f0 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -35,8 +35,9 @@
 #include <linux/rcupdate_trace.h>
 #include <linux/memcontrol.h>
 #include <linux/trace_events.h>
-#include <net/netfilter/nf_bpf_link.h>
 
+#include <net/netfilter/nf_bpf_link.h>
+#include <net/netkit.h>
 #include <net/tcx.h>
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
@@ -3730,6 +3731,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
 		return BPF_PROG_TYPE_LSM;
 	case BPF_TCX_INGRESS:
 	case BPF_TCX_EGRESS:
+	case BPF_NETKIT_PRIMARY:
+	case BPF_NETKIT_PEER:
 		return BPF_PROG_TYPE_SCHED_CLS;
 	default:
 		return BPF_PROG_TYPE_UNSPEC;
@@ -3781,7 +3784,9 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
 		return 0;
 	case BPF_PROG_TYPE_SCHED_CLS:
 		if (attach_type != BPF_TCX_INGRESS &&
-		    attach_type != BPF_TCX_EGRESS)
+		    attach_type != BPF_TCX_EGRESS &&
+		    attach_type != BPF_NETKIT_PRIMARY &&
+		    attach_type != BPF_NETKIT_PEER)
 			return -EINVAL;
 		return 0;
 	default:
@@ -3864,7 +3869,11 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 			ret = cgroup_bpf_prog_attach(attr, ptype, prog);
 		break;
 	case BPF_PROG_TYPE_SCHED_CLS:
-		ret = tcx_prog_attach(attr, prog);
+		if (attr->attach_type == BPF_TCX_INGRESS ||
+		    attr->attach_type == BPF_TCX_EGRESS)
+			ret = tcx_prog_attach(attr, prog);
+		else
+			ret = netkit_prog_attach(attr, prog);
 		break;
 	default:
 		ret = -EINVAL;
@@ -3925,7 +3934,11 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 		ret = cgroup_bpf_prog_detach(attr, ptype);
 		break;
 	case BPF_PROG_TYPE_SCHED_CLS:
-		ret = tcx_prog_detach(attr, prog);
+		if (attr->attach_type == BPF_TCX_INGRESS ||
+		    attr->attach_type == BPF_TCX_EGRESS)
+			ret = tcx_prog_detach(attr, prog);
+		else
+			ret = netkit_prog_detach(attr, prog);
 		break;
 	default:
 		ret = -EINVAL;
@@ -3992,6 +4005,9 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_TCX_INGRESS:
 	case BPF_TCX_EGRESS:
 		return tcx_prog_query(attr, uattr);
+	case BPF_NETKIT_PRIMARY:
+	case BPF_NETKIT_PEER:
+		return netkit_prog_query(attr, uattr);
 	default:
 		return -EINVAL;
 	}
@@ -4973,7 +4989,11 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 		ret = bpf_xdp_link_attach(attr, prog);
 		break;
 	case BPF_PROG_TYPE_SCHED_CLS:
-		ret = tcx_link_attach(attr, prog);
+		if (attr->link_create.attach_type == BPF_TCX_INGRESS ||
+		    attr->link_create.attach_type == BPF_TCX_EGRESS)
+			ret = tcx_link_attach(attr, prog);
+		else
+			ret = netkit_link_attach(attr, prog);
 		break;
 	case BPF_PROG_TYPE_NETFILTER:
 		ret = bpf_nf_link_attach(attr, prog);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 7ba61b75bc0e..0f6cdf52b1da 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1052,6 +1052,8 @@ enum bpf_attach_type {
 	BPF_CGROUP_UNIX_RECVMSG,
 	BPF_CGROUP_UNIX_GETPEERNAME,
 	BPF_CGROUP_UNIX_GETSOCKNAME,
+	BPF_NETKIT_PRIMARY,
+	BPF_NETKIT_PEER,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -1071,6 +1073,7 @@ enum bpf_link_type {
 	BPF_LINK_TYPE_NETFILTER = 10,
 	BPF_LINK_TYPE_TCX = 11,
 	BPF_LINK_TYPE_UPROBE_MULTI = 12,
+	BPF_LINK_TYPE_NETKIT = 13,
 	MAX_BPF_LINK_TYPE,
 };
 
@@ -1656,6 +1659,13 @@ union bpf_attr {
 				__u32		flags;
 				__u32		pid;
 			} uprobe_multi;
+			struct {
+				union {
+					__u32	relative_fd;
+					__u32	relative_id;
+				};
+				__u64		expected_revision;
+			} netkit;
 		};
 	} link_create;
 
@@ -6576,6 +6586,10 @@ struct bpf_link_info {
 			__u32 ifindex;
 			__u32 attach_type;
 		} tcx;
+		struct {
+			__u32 ifindex;
+			__u32 attach_type;
+		} netkit;
 	};
 } __attribute__((aligned(8)));
 
-- 
cgit v1.2.3-70-g09d2


From 70f06c115bcca26ceeebf938e48bc8143668e38b Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 24 Oct 2023 13:05:51 +0200
Subject: sched: act_ct: switch to per-action label counting

net->ct.labels_used was meant to convey 'number of ip/nftables rules
that need the label extension allocated'.

act_ct enables this for each net namespace, which voids all attempts
to avoid ct->ext allocation when possible.

Move this increment to the control plane to request label extension
space allocation only when its needed.

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Pedro Tammela <pctammela@mojatatu.com>
Reviewed-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_ct.h |  1 +
 net/sched/act_ct.c         | 41 ++++++++++++++++++-----------------------
 2 files changed, 19 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h
index b24ea2d9400b..8a6dbfb23336 100644
--- a/include/net/tc_act/tc_ct.h
+++ b/include/net/tc_act/tc_ct.h
@@ -22,6 +22,7 @@ struct tcf_ct_params {
 
 	struct nf_nat_range2 range;
 	bool ipv4_range;
+	bool put_labels;
 
 	u16 ct_action;
 
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 7c652d14528b..43b06cb284ce 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -690,7 +690,6 @@ static struct tc_action_ops act_ct_ops;
 
 struct tc_ct_action_net {
 	struct tc_action_net tn; /* Must be first */
-	bool labels;
 };
 
 /* Determine whether skb->_nfct is equal to the result of conntrack lookup. */
@@ -829,8 +828,13 @@ static void tcf_ct_params_free(struct tcf_ct_params *params)
 	}
 	if (params->ct_ft)
 		tcf_ct_flow_table_put(params->ct_ft);
-	if (params->tmpl)
+	if (params->tmpl) {
+		if (params->put_labels)
+			nf_connlabels_put(nf_ct_net(params->tmpl));
+
 		nf_ct_put(params->tmpl);
+	}
+
 	kfree(params);
 }
 
@@ -1154,9 +1158,9 @@ static int tcf_ct_fill_params(struct net *net,
 			      struct nlattr **tb,
 			      struct netlink_ext_ack *extack)
 {
-	struct tc_ct_action_net *tn = net_generic(net, act_ct_ops.net_id);
 	struct nf_conntrack_zone zone;
 	int err, family, proto, len;
+	bool put_labels = false;
 	struct nf_conn *tmpl;
 	char *name;
 
@@ -1186,15 +1190,20 @@ static int tcf_ct_fill_params(struct net *net,
 	}
 
 	if (tb[TCA_CT_LABELS]) {
+		unsigned int n_bits = sizeof_field(struct tcf_ct_params, labels) * 8;
+
 		if (!IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)) {
 			NL_SET_ERR_MSG_MOD(extack, "Conntrack labels isn't enabled.");
 			return -EOPNOTSUPP;
 		}
 
-		if (!tn->labels) {
+		if (nf_connlabels_get(net, n_bits - 1)) {
 			NL_SET_ERR_MSG_MOD(extack, "Failed to set connlabel length");
 			return -EOPNOTSUPP;
+		} else {
+			put_labels = true;
 		}
+
 		tcf_ct_set_key_val(tb,
 				   p->labels, TCA_CT_LABELS,
 				   p->labels_mask, TCA_CT_LABELS_MASK,
@@ -1238,10 +1247,15 @@ static int tcf_ct_fill_params(struct net *net,
 		}
 	}
 
+	p->put_labels = put_labels;
+
 	if (p->ct_action & TCA_CT_ACT_COMMIT)
 		__set_bit(IPS_CONFIRMED_BIT, &tmpl->status);
 	return 0;
 err:
+	if (put_labels)
+		nf_connlabels_put(net);
+
 	nf_ct_put(p->tmpl);
 	p->tmpl = NULL;
 	return err;
@@ -1542,32 +1556,13 @@ static struct tc_action_ops act_ct_ops = {
 
 static __net_init int ct_init_net(struct net *net)
 {
-	unsigned int n_bits = sizeof_field(struct tcf_ct_params, labels) * 8;
 	struct tc_ct_action_net *tn = net_generic(net, act_ct_ops.net_id);
 
-	if (nf_connlabels_get(net, n_bits - 1)) {
-		tn->labels = false;
-		pr_err("act_ct: Failed to set connlabels length");
-	} else {
-		tn->labels = true;
-	}
-
 	return tc_action_net_init(net, &tn->tn, &act_ct_ops);
 }
 
 static void __net_exit ct_exit_net(struct list_head *net_list)
 {
-	struct net *net;
-
-	rtnl_lock();
-	list_for_each_entry(net, net_list, exit_list) {
-		struct tc_ct_action_net *tn = net_generic(net, act_ct_ops.net_id);
-
-		if (tn->labels)
-			nf_connlabels_put(net);
-	}
-	rtnl_unlock();
-
 	tc_action_net_exit(net_list, act_ct_ops.net_id);
 }
 
-- 
cgit v1.2.3-70-g09d2


From 849ee75a38b297187c760bb1d23d8f2a7b1fc73e Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Mon, 23 Oct 2023 13:44:37 -0700
Subject: tcp: define initial scaling factor value as a macro

So that other users could access it. Notably MPTCP will use
it in the next patch.

No functional change intended.

Acked-by: Matthieu Baerts <matttbe@kernel.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Mat Martineau <martineau@kernel.org>
Link: https://lore.kernel.org/r/20231023-send-net-next-20231023-2-v1-4-9dc60939d371@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tcp.h | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 39b731c900dd..993b7fcd4e46 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1489,13 +1489,15 @@ static inline int tcp_space_from_win(const struct sock *sk, int win)
 	return __tcp_space_from_win(tcp_sk(sk)->scaling_ratio, win);
 }
 
+/* Assume a conservative default of 1200 bytes of payload per 4K page.
+ * This may be adjusted later in tcp_measure_rcv_mss().
+ */
+#define TCP_DEFAULT_SCALING_RATIO ((1200 << TCP_RMEM_TO_WIN_SCALE) / \
+				   SKB_TRUESIZE(4096))
+
 static inline void tcp_scaling_ratio_init(struct sock *sk)
 {
-	/* Assume a conservative default of 1200 bytes of payload per 4K page.
-	 * This may be adjusted later in tcp_measure_rcv_mss().
-	 */
-	tcp_sk(sk)->scaling_ratio = (1200 << TCP_RMEM_TO_WIN_SCALE) /
-				    SKB_TRUESIZE(4096);
+	tcp_sk(sk)->scaling_ratio = TCP_DEFAULT_SCALING_RATIO;
 }
 
 /* Note: caller must be prepared to deal with negative returns */
-- 
cgit v1.2.3-70-g09d2


From e57a34478586fe3562560ccebd655b707a5b4a56 Mon Sep 17 00:00:00 2001
From: Yan Zhai <yan@cloudflare.com>
Date: Tue, 24 Oct 2023 07:26:33 -0700
Subject: ipv6: drop feature RTAX_FEATURE_ALLFRAG

RTAX_FEATURE_ALLFRAG was added before the first git commit:

https://www.mail-archive.com/bk-commits-head@vger.kernel.org/msg03399.html

The feature would send packets to the fragmentation path if a box
receives a PMTU value with less than 1280 byte. However, since commit
9d289715eb5c ("ipv6: stop sending PTB packets for MTU < 1280"), such
message would be simply discarded. The feature flag is neither supported
in iproute2 utility. In theory one can still manipulate it with direct
netlink message, but it is not ideal because it was based on obsoleted
guidance of RFC-2460 (replaced by RFC-8200).

The feature would always test false at the moment, so remove related
code or mark them as unused.

Signed-off-by: Yan Zhai <yan@cloudflare.com>
Reviewed-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/d78e44dcd9968a252143ffe78460446476a472a1.1698156966.git.yan@cloudflare.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dst.h                  |  7 -------
 include/net/inet_connection_sock.h |  1 -
 include/net/inet_sock.h            |  1 -
 include/uapi/linux/rtnetlink.h     |  2 +-
 net/ipv4/tcp_output.c              | 20 +-------------------
 net/ipv6/ip6_output.c              | 15 ++-------------
 net/ipv6/tcp_ipv6.c                |  1 -
 net/ipv6/xfrm6_output.c            |  2 +-
 net/mptcp/subflow.c                |  1 -
 9 files changed, 5 insertions(+), 45 deletions(-)

(limited to 'include')

diff --git a/include/net/dst.h b/include/net/dst.h
index f8b8599a0600..f5dfc8fb7b37 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -222,13 +222,6 @@ static inline unsigned long dst_metric_rtt(const struct dst_entry *dst, int metr
 	return msecs_to_jiffies(dst_metric(dst, metric));
 }
 
-static inline u32
-dst_allfrag(const struct dst_entry *dst)
-{
-	int ret = dst_feature(dst,  RTAX_FEATURE_ALLFRAG);
-	return ret;
-}
-
 static inline int
 dst_metric_locked(const struct dst_entry *dst, int metric)
 {
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 086d1193c9ef..d0a2f827d5f2 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -44,7 +44,6 @@ struct inet_connection_sock_af_ops {
 				      struct request_sock *req_unhash,
 				      bool *own_req);
 	u16	    net_header_len;
-	u16	    net_frag_header_len;
 	u16	    sockaddr_len;
 	int	    (*setsockopt)(struct sock *sk, int level, int optname,
 				  sockptr_t optval, unsigned int optlen);
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 98e11958cdff..74db6d97cae1 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -244,7 +244,6 @@ struct inet_sock {
 };
 
 #define IPCORK_OPT	1	/* ip-options has been held in ipcork.opt */
-#define IPCORK_ALLFRAG	2	/* always fragment (for ipv6 for now) */
 
 enum {
 	INET_FLAGS_PKTINFO	= 0,
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index aa2482a0614a..3b687d20c9ed 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -505,7 +505,7 @@ enum {
 #define RTAX_FEATURE_ECN		(1 << 0)
 #define RTAX_FEATURE_SACK		(1 << 1) /* unused */
 #define RTAX_FEATURE_TIMESTAMP		(1 << 2) /* unused */
-#define RTAX_FEATURE_ALLFRAG		(1 << 3)
+#define RTAX_FEATURE_ALLFRAG		(1 << 3) /* unused */
 #define RTAX_FEATURE_TCP_USEC_TS	(1 << 4)
 
 #define RTAX_FEATURE_MASK	(RTAX_FEATURE_ECN |		\
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 2866ccbccde0..ca4d7594efd4 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1698,14 +1698,6 @@ static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
 	 */
 	mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
 
-	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
-	if (icsk->icsk_af_ops->net_frag_header_len) {
-		const struct dst_entry *dst = __sk_dst_get(sk);
-
-		if (dst && dst_allfrag(dst))
-			mss_now -= icsk->icsk_af_ops->net_frag_header_len;
-	}
-
 	/* Clamp it (mss_clamp does not include tcp options) */
 	if (mss_now > tp->rx_opt.mss_clamp)
 		mss_now = tp->rx_opt.mss_clamp;
@@ -1733,21 +1725,11 @@ int tcp_mss_to_mtu(struct sock *sk, int mss)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
-	int mtu;
 
-	mtu = mss +
+	return mss +
 	      tp->tcp_header_len +
 	      icsk->icsk_ext_hdr_len +
 	      icsk->icsk_af_ops->net_header_len;
-
-	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
-	if (icsk->icsk_af_ops->net_frag_header_len) {
-		const struct dst_entry *dst = __sk_dst_get(sk);
-
-		if (dst && dst_allfrag(dst))
-			mtu += icsk->icsk_af_ops->net_frag_header_len;
-	}
-	return mtu;
 }
 EXPORT_SYMBOL(tcp_mss_to_mtu);
 
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 3c7de89d6755..86efd901ee5a 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -191,7 +191,6 @@ static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff
 		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
 
 	if ((skb->len > mtu && !skb_is_gso(skb)) ||
-	    dst_allfrag(skb_dst(skb)) ||
 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
 	else
@@ -1017,9 +1016,6 @@ slow_path:
 	return err;
 
 fail_toobig:
-	if (skb->sk && dst_allfrag(skb_dst(skb)))
-		sk_gso_disable(skb->sk);
-
 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 	err = -EMSGSIZE;
 
@@ -1384,10 +1380,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
 	cork->base.mark = ipc6->sockc.mark;
 	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
 
-	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
-		cork->base.flags |= IPCORK_ALLFRAG;
 	cork->base.length = 0;
-
 	cork->base.transmit_time = ipc6->sockc.transmit_time;
 
 	return 0;
@@ -1444,8 +1437,6 @@ static int __ip6_append_data(struct sock *sk,
 
 	headersize = sizeof(struct ipv6hdr) +
 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
-		     (dst_allfrag(&rt->dst) ?
-		      sizeof(struct frag_hdr) : 0) +
 		     rt->rt6i_nfheader_len;
 
 	if (mtu <= fragheaderlen ||
@@ -1555,7 +1546,7 @@ emsgsize:
 
 	while (length > 0) {
 		/* Check if the remaining data fits into current packet. */
-		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
+		copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len;
 		if (copy < length)
 			copy = maxfraglen - skb->len;
 
@@ -1586,7 +1577,7 @@ alloc_new_skb:
 			 */
 			datalen = length + fraggap;
 
-			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
+			if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen)
 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
 			fraglen = datalen + fragheaderlen;
 			pagedlen = 0;
@@ -1835,7 +1826,6 @@ static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
 	struct dst_entry *dst = cork->base.dst;
 
 	cork->base.dst = NULL;
-	cork->base.flags &= ~IPCORK_ALLFRAG;
 	skb_dst_set(skb, dst);
 }
 
@@ -1856,7 +1846,6 @@ static void ip6_cork_release(struct inet_cork_full *cork,
 	if (cork->base.dst) {
 		dst_release(cork->base.dst);
 		cork->base.dst = NULL;
-		cork->base.flags &= ~IPCORK_ALLFRAG;
 	}
 }
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 0c8a14ba104f..dc27988512a6 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1895,7 +1895,6 @@ const struct inet_connection_sock_af_ops ipv6_specific = {
 	.conn_request	   = tcp_v6_conn_request,
 	.syn_recv_sock	   = tcp_v6_syn_recv_sock,
 	.net_header_len	   = sizeof(struct ipv6hdr),
-	.net_frag_header_len = sizeof(struct frag_hdr),
 	.setsockopt	   = ipv6_setsockopt,
 	.getsockopt	   = ipv6_getsockopt,
 	.addr2sockaddr	   = inet6_csk_addr2sockaddr,
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index ad07904642ca..5f7b1fdbffe6 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -95,7 +95,7 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 		return -EMSGSIZE;
 	}
 
-	if (toobig || dst_allfrag(skb_dst(skb)))
+	if (toobig)
 		return ip6_fragment(net, sk, skb,
 				    __xfrm6_output_finish);
 
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 2b43577f952e..e120e9616454 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -2051,7 +2051,6 @@ void __init mptcp_subflow_init(void)
 	subflow_v6m_specific.send_check = ipv4_specific.send_check;
 	subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
 	subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
-	subflow_v6m_specific.net_frag_header_len = 0;
 	subflow_v6m_specific.rebuild_header = subflow_rebuild_header;
 
 	tcpv6_prot_override = tcpv6_prot;
-- 
cgit v1.2.3-70-g09d2


From ea23fbd2a8f7dadfa9cd9b9d73f3b8a69eec0671 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 25 Oct 2023 09:22:04 -0700
Subject: netlink: make range pointers in policies const

struct nla_policy is usually constant itself, but unless
we make the ranges inside constant we won't be able to
make range structs const. The ranges are not modified
by the core.

Reviewed-by: Johannes Berg <johannes@sipsolutions.net>
Reviewed-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://lore.kernel.org/r/20231025162204.132528-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/bonding/bond_netlink.c | 2 +-
 drivers/net/vxlan/vxlan_mdb.c      | 2 +-
 include/net/netlink.h              | 4 ++--
 net/ipv6/ioam6_iptunnel.c          | 2 +-
 net/sched/sch_fq.c                 | 2 +-
 net/sched/sch_fq_pie.c             | 2 +-
 net/sched/sch_qfq.c                | 2 +-
 net/sched/sch_taprio.c             | 2 +-
 net/wireless/nl80211.c             | 2 +-
 tools/net/ynl/ynl-gen-c.py         | 2 +-
 10 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c
index 27cbe148f0db..cfa74cf8bb1a 100644
--- a/drivers/net/bonding/bond_netlink.c
+++ b/drivers/net/bonding/bond_netlink.c
@@ -85,7 +85,7 @@ nla_put_failure:
 }
 
 /* Limit the max delay range to 300s */
-static struct netlink_range_validation delay_range = {
+static const struct netlink_range_validation delay_range = {
 	.max = 300000,
 };
 
diff --git a/drivers/net/vxlan/vxlan_mdb.c b/drivers/net/vxlan/vxlan_mdb.c
index 5e041622261a..3a21389658ce 100644
--- a/drivers/net/vxlan/vxlan_mdb.c
+++ b/drivers/net/vxlan/vxlan_mdb.c
@@ -311,7 +311,7 @@ vxlan_mdbe_src_list_pol[MDBE_SRC_LIST_MAX + 1] = {
 	[MDBE_SRC_LIST_ENTRY] = NLA_POLICY_NESTED(vxlan_mdbe_src_list_entry_pol),
 };
 
-static struct netlink_range_validation vni_range = {
+static const struct netlink_range_validation vni_range = {
 	.max = VXLAN_N_VID - 1,
 };
 
diff --git a/include/net/netlink.h b/include/net/netlink.h
index aba2b162a226..83bdf787aeee 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -360,8 +360,8 @@ struct nla_policy {
 		const u32 mask;
 		const char *reject_message;
 		const struct nla_policy *nested_policy;
-		struct netlink_range_validation *range;
-		struct netlink_range_validation_signed *range_signed;
+		const struct netlink_range_validation *range;
+		const struct netlink_range_validation_signed *range_signed;
 		struct {
 			s16 min, max;
 		};
diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c
index f6f5b83dd954..7563f8c6aa87 100644
--- a/net/ipv6/ioam6_iptunnel.c
+++ b/net/ipv6/ioam6_iptunnel.c
@@ -46,7 +46,7 @@ struct ioam6_lwt {
 	struct ioam6_lwt_encap	tuninfo;
 };
 
-static struct netlink_range_validation freq_range = {
+static const struct netlink_range_validation freq_range = {
 	.min = IOAM6_IPTUNNEL_FREQ_MIN,
 	.max = IOAM6_IPTUNNEL_FREQ_MAX,
 };
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index bf9d00518a60..0fd18c344ab5 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -897,7 +897,7 @@ static int fq_resize(struct Qdisc *sch, u32 log)
 	return 0;
 }
 
-static struct netlink_range_validation iq_range = {
+static const struct netlink_range_validation iq_range = {
 	.max = INT_MAX,
 };
 
diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c
index 68e6acd0f130..5b595773e59b 100644
--- a/net/sched/sch_fq_pie.c
+++ b/net/sched/sch_fq_pie.c
@@ -202,7 +202,7 @@ out:
 	return NET_XMIT_CN;
 }
 
-static struct netlink_range_validation fq_pie_q_range = {
+static const struct netlink_range_validation fq_pie_q_range = {
 	.min = 1,
 	.max = 1 << 20,
 };
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 5598f8be18ae..28315166fe8e 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -213,7 +213,7 @@ static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid)
 	return container_of(clc, struct qfq_class, common);
 }
 
-static struct netlink_range_validation lmax_range = {
+static const struct netlink_range_validation lmax_range = {
 	.min = QFQ_MIN_LMAX,
 	.max = QFQ_MAX_LMAX,
 };
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index 1cb5e41c0ec7..2e1949de4171 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -1015,7 +1015,7 @@ static const struct nla_policy taprio_tc_policy[TCA_TAPRIO_TC_ENTRY_MAX + 1] = {
 							      TC_FP_PREEMPTIBLE),
 };
 
-static struct netlink_range_validation_signed taprio_cycle_time_range = {
+static const struct netlink_range_validation_signed taprio_cycle_time_range = {
 	.min = 0,
 	.max = INT_MAX,
 };
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 2650543dcebe..2f8353bf603c 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -463,7 +463,7 @@ nl80211_sta_wme_policy[NL80211_STA_WME_MAX + 1] = {
 	[NL80211_STA_WME_MAX_SP] = { .type = NLA_U8 },
 };
 
-static struct netlink_range_validation nl80211_punct_bitmap_range = {
+static const struct netlink_range_validation nl80211_punct_bitmap_range = {
 	.min = 0,
 	.max = 0xffff,
 };
diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py
index 0fee68863db4..31fd96f14fc0 100755
--- a/tools/net/ynl/ynl-gen-c.py
+++ b/tools/net/ynl/ynl-gen-c.py
@@ -2038,7 +2038,7 @@ def print_kernel_policy_ranges(family, cw):
                 first = False
 
             sign = '' if attr.type[0] == 'u' else '_signed'
-            cw.block_start(line=f'struct netlink_range_validation{sign} {c_lower(attr.enum_name)}_range =')
+            cw.block_start(line=f'static const struct netlink_range_validation{sign} {c_lower(attr.enum_name)}_range =')
             members = []
             if 'min' in attr.checks:
                 members.append(('min', attr.get_limit('min')))
-- 
cgit v1.2.3-70-g09d2


From 8c73b26315aadb82218360d0a9a05e515f6e4118 Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 23 Oct 2023 20:21:53 +0100
Subject: net/tcp: Prepare tcp_md5sig_pool for TCP-AO

TCP-AO, similarly to TCP-MD5, needs to allocate tfms on a slow-path,
which is setsockopt() and use crypto ahash requests on fast paths,
which are RX/TX softirqs. Also, it needs a temporary/scratch buffer
for preparing the hash.

Rework tcp_md5sig_pool in order to support other hashing algorithms
than MD5. It will make it possible to share pre-allocated crypto_ahash
descriptors and scratch area between all TCP hash users.

Internally tcp_sigpool calls crypto_clone_ahash() API over pre-allocated
crypto ahash tfm. Kudos to Herbert, who provided this new crypto API.

I was a little concerned over GFP_ATOMIC allocations of ahash and
crypto_request in RX/TX (see tcp_sigpool_start()), so I benchmarked both
"backends" with different algorithms, using patched version of iperf3[2].
On my laptop with i7-7600U @ 2.80GHz:

                         clone-tfm                per-CPU-requests
TCP-MD5                  2.25 Gbits/sec           2.30 Gbits/sec
TCP-AO(hmac(sha1))       2.53 Gbits/sec           2.54 Gbits/sec
TCP-AO(hmac(sha512))     1.67 Gbits/sec           1.64 Gbits/sec
TCP-AO(hmac(sha384))     1.77 Gbits/sec           1.80 Gbits/sec
TCP-AO(hmac(sha224))     1.29 Gbits/sec           1.30 Gbits/sec
TCP-AO(hmac(sha3-512))    481 Mbits/sec            480 Mbits/sec
TCP-AO(hmac(md5))        2.07 Gbits/sec           2.12 Gbits/sec
TCP-AO(hmac(rmd160))     1.01 Gbits/sec            995 Mbits/sec
TCP-AO(cmac(aes128))     [not supporetd yet]      2.11 Gbits/sec

So, it seems that my concerns don't have strong grounds and per-CPU
crypto_request allocation can be dropped/removed from tcp_sigpool once
ciphers get crypto_clone_ahash() support.

[1]: https://lore.kernel.org/all/ZDefxOq6Ax0JeTRH@gondor.apana.org.au/T/#u
[2]: https://github.com/0x7f454c46/iperf/tree/tcp-md5-ao
Signed-off-by: Dmitry Safonov <dima@arista.com>
Reviewed-by: Steen Hegelund <Steen.Hegelund@microchip.com>
Acked-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h        |  50 +++++--
 net/ipv4/Kconfig         |   4 +
 net/ipv4/Makefile        |   1 +
 net/ipv4/tcp.c           | 145 ++++---------------
 net/ipv4/tcp_ipv4.c      |  97 +++++++------
 net/ipv4/tcp_minisocks.c |  21 ++-
 net/ipv4/tcp_sigpool.c   | 358 +++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv6/tcp_ipv6.c      |  60 ++++----
 8 files changed, 525 insertions(+), 211 deletions(-)
 create mode 100644 net/ipv4/tcp_sigpool.c

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 993b7fcd4e46..f6e2db5292b5 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1737,12 +1737,39 @@ union tcp_md5sum_block {
 #endif
 };
 
-/* - pool: digest algorithm, hash description and scratch buffer */
-struct tcp_md5sig_pool {
-	struct ahash_request	*md5_req;
-	void			*scratch;
+/*
+ * struct tcp_sigpool - per-CPU pool of ahash_requests
+ * @scratch: per-CPU temporary area, that can be used between
+ *	     tcp_sigpool_start() and tcp_sigpool_end() to perform
+ *	     crypto request
+ * @req: pre-allocated ahash request
+ */
+struct tcp_sigpool {
+	void *scratch;
+	struct ahash_request *req;
 };
 
+int tcp_sigpool_alloc_ahash(const char *alg, size_t scratch_size);
+void tcp_sigpool_get(unsigned int id);
+void tcp_sigpool_release(unsigned int id);
+int tcp_sigpool_hash_skb_data(struct tcp_sigpool *hp,
+			      const struct sk_buff *skb,
+			      unsigned int header_len);
+
+/**
+ * tcp_sigpool_start - disable bh and start using tcp_sigpool_ahash
+ * @id: tcp_sigpool that was previously allocated by tcp_sigpool_alloc_ahash()
+ * @c: returned tcp_sigpool for usage (uninitialized on failure)
+ *
+ * Returns 0 on success, error otherwise.
+ */
+int tcp_sigpool_start(unsigned int id, struct tcp_sigpool *c);
+/**
+ * tcp_sigpool_end - enable bh and stop using tcp_sigpool
+ * @c: tcp_sigpool context that was returned by tcp_sigpool_start()
+ */
+void tcp_sigpool_end(struct tcp_sigpool *c);
+size_t tcp_sigpool_algo(unsigned int id, char *buf, size_t buf_len);
 /* - functions */
 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
 			const struct sock *sk, const struct sk_buff *skb);
@@ -1798,17 +1825,12 @@ tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
 #define tcp_twsk_md5_key(twsk)	NULL
 #endif
 
-bool tcp_alloc_md5sig_pool(void);
-
-struct tcp_md5sig_pool *tcp_get_md5sig_pool(void);
-static inline void tcp_put_md5sig_pool(void)
-{
-	local_bh_enable();
-}
+int tcp_md5_alloc_sigpool(void);
+void tcp_md5_release_sigpool(void);
+void tcp_md5_add_sigpool(void);
+extern int tcp_md5_sigpool_id;
 
-int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *, const struct sk_buff *,
-			  unsigned int header_len);
-int tcp_md5_hash_key(struct tcp_md5sig_pool *hp,
+int tcp_md5_hash_key(struct tcp_sigpool *hp,
 		     const struct tcp_md5sig_key *key);
 
 /* From tcp_fastopen.c */
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 2dfb12230f08..89e2ab023272 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -741,10 +741,14 @@ config DEFAULT_TCP_CONG
 	default "bbr" if DEFAULT_BBR
 	default "cubic"
 
+config TCP_SIGPOOL
+	tristate
+
 config TCP_MD5SIG
 	bool "TCP: MD5 Signature Option support (RFC2385)"
 	select CRYPTO
 	select CRYPTO_MD5
+	select TCP_SIGPOOL
 	help
 	  RFC2385 specifies a method of giving MD5 protection to TCP sessions.
 	  Its main (only?) use is to protect BGP sessions between core routers
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index b18ba8ef93ad..cd760793cfcb 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -62,6 +62,7 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
 obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
 obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
 obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
+obj-$(CONFIG_TCP_SIGPOOL) += tcp_sigpool.o
 obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o
 obj-$(CONFIG_BPF_SYSCALL) += udp_bpf.o
 obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 156264531124..dca9ca2f1081 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4305,141 +4305,52 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 EXPORT_SYMBOL(tcp_getsockopt);
 
 #ifdef CONFIG_TCP_MD5SIG
-static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool);
-static DEFINE_MUTEX(tcp_md5sig_mutex);
-static bool tcp_md5sig_pool_populated = false;
+int tcp_md5_sigpool_id = -1;
+EXPORT_SYMBOL_GPL(tcp_md5_sigpool_id);
 
-static void __tcp_alloc_md5sig_pool(void)
+int tcp_md5_alloc_sigpool(void)
 {
-	struct crypto_ahash *hash;
-	int cpu;
-
-	hash = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
-	if (IS_ERR(hash))
-		return;
-
-	for_each_possible_cpu(cpu) {
-		void *scratch = per_cpu(tcp_md5sig_pool, cpu).scratch;
-		struct ahash_request *req;
-
-		if (!scratch) {
-			scratch = kmalloc_node(sizeof(union tcp_md5sum_block) +
-					       sizeof(struct tcphdr),
-					       GFP_KERNEL,
-					       cpu_to_node(cpu));
-			if (!scratch)
-				return;
-			per_cpu(tcp_md5sig_pool, cpu).scratch = scratch;
-		}
-		if (per_cpu(tcp_md5sig_pool, cpu).md5_req)
-			continue;
-
-		req = ahash_request_alloc(hash, GFP_KERNEL);
-		if (!req)
-			return;
-
-		ahash_request_set_callback(req, 0, NULL, NULL);
-
-		per_cpu(tcp_md5sig_pool, cpu).md5_req = req;
-	}
-	/* before setting tcp_md5sig_pool_populated, we must commit all writes
-	 * to memory. See smp_rmb() in tcp_get_md5sig_pool()
-	 */
-	smp_wmb();
-	/* Paired with READ_ONCE() from tcp_alloc_md5sig_pool()
-	 * and tcp_get_md5sig_pool().
-	*/
-	WRITE_ONCE(tcp_md5sig_pool_populated, true);
-}
-
-bool tcp_alloc_md5sig_pool(void)
-{
-	/* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */
-	if (unlikely(!READ_ONCE(tcp_md5sig_pool_populated))) {
-		mutex_lock(&tcp_md5sig_mutex);
-
-		if (!tcp_md5sig_pool_populated)
-			__tcp_alloc_md5sig_pool();
+	size_t scratch_size;
+	int ret;
 
-		mutex_unlock(&tcp_md5sig_mutex);
+	scratch_size = sizeof(union tcp_md5sum_block) + sizeof(struct tcphdr);
+	ret = tcp_sigpool_alloc_ahash("md5", scratch_size);
+	if (ret >= 0) {
+		/* As long as any md5 sigpool was allocated, the return
+		 * id would stay the same. Re-write the id only for the case
+		 * when previously all MD5 keys were deleted and this call
+		 * allocates the first MD5 key, which may return a different
+		 * sigpool id than was used previously.
+		 */
+		WRITE_ONCE(tcp_md5_sigpool_id, ret); /* Avoids the compiler potentially being smart here */
+		return 0;
 	}
-	/* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */
-	return READ_ONCE(tcp_md5sig_pool_populated);
+	return ret;
 }
-EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
-
 
-/**
- *	tcp_get_md5sig_pool - get md5sig_pool for this user
- *
- *	We use percpu structure, so if we succeed, we exit with preemption
- *	and BH disabled, to make sure another thread or softirq handling
- *	wont try to get same context.
- */
-struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
+void tcp_md5_release_sigpool(void)
 {
-	local_bh_disable();
-
-	/* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */
-	if (READ_ONCE(tcp_md5sig_pool_populated)) {
-		/* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */
-		smp_rmb();
-		return this_cpu_ptr(&tcp_md5sig_pool);
-	}
-	local_bh_enable();
-	return NULL;
+	tcp_sigpool_release(READ_ONCE(tcp_md5_sigpool_id));
 }
-EXPORT_SYMBOL(tcp_get_md5sig_pool);
 
-int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
-			  const struct sk_buff *skb, unsigned int header_len)
+void tcp_md5_add_sigpool(void)
 {
-	struct scatterlist sg;
-	const struct tcphdr *tp = tcp_hdr(skb);
-	struct ahash_request *req = hp->md5_req;
-	unsigned int i;
-	const unsigned int head_data_len = skb_headlen(skb) > header_len ?
-					   skb_headlen(skb) - header_len : 0;
-	const struct skb_shared_info *shi = skb_shinfo(skb);
-	struct sk_buff *frag_iter;
-
-	sg_init_table(&sg, 1);
-
-	sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
-	ahash_request_set_crypt(req, &sg, NULL, head_data_len);
-	if (crypto_ahash_update(req))
-		return 1;
-
-	for (i = 0; i < shi->nr_frags; ++i) {
-		const skb_frag_t *f = &shi->frags[i];
-		unsigned int offset = skb_frag_off(f);
-		struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
-
-		sg_set_page(&sg, page, skb_frag_size(f),
-			    offset_in_page(offset));
-		ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f));
-		if (crypto_ahash_update(req))
-			return 1;
-	}
-
-	skb_walk_frags(skb, frag_iter)
-		if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
-			return 1;
-
-	return 0;
+	tcp_sigpool_get(READ_ONCE(tcp_md5_sigpool_id));
 }
-EXPORT_SYMBOL(tcp_md5_hash_skb_data);
 
-int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
+int tcp_md5_hash_key(struct tcp_sigpool *hp,
+		     const struct tcp_md5sig_key *key)
 {
 	u8 keylen = READ_ONCE(key->keylen); /* paired with WRITE_ONCE() in tcp_md5_do_add */
 	struct scatterlist sg;
 
 	sg_init_one(&sg, key->key, keylen);
-	ahash_request_set_crypt(hp->md5_req, &sg, NULL, keylen);
+	ahash_request_set_crypt(hp->req, &sg, NULL, keylen);
 
-	/* We use data_race() because tcp_md5_do_add() might change key->key under us */
-	return data_race(crypto_ahash_update(hp->md5_req));
+	/* We use data_race() because tcp_md5_do_add() might change
+	 * key->key under us
+	 */
+	return data_race(crypto_ahash_update(hp->req));
 }
 EXPORT_SYMBOL(tcp_md5_hash_key);
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 7583d4e34c8c..7d81e90b6f5c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1221,10 +1221,6 @@ static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
 	if (!key)
 		return -ENOMEM;
-	if (!tcp_alloc_md5sig_pool()) {
-		sock_kfree_s(sk, key, sizeof(*key));
-		return -ENOMEM;
-	}
 
 	memcpy(key->key, newkey, newkeylen);
 	key->keylen = newkeylen;
@@ -1246,8 +1242,13 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
-		if (tcp_md5sig_info_add(sk, GFP_KERNEL))
+		if (tcp_md5_alloc_sigpool())
+			return -ENOMEM;
+
+		if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
+			tcp_md5_release_sigpool();
 			return -ENOMEM;
+		}
 
 		if (!static_branch_inc(&tcp_md5_needed.key)) {
 			struct tcp_md5sig_info *md5sig;
@@ -1255,6 +1256,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
 			rcu_assign_pointer(tp->md5sig_info, NULL);
 			kfree_rcu(md5sig, rcu);
+			tcp_md5_release_sigpool();
 			return -EUSERS;
 		}
 	}
@@ -1271,8 +1273,12 @@ int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
-		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
+		tcp_md5_add_sigpool();
+
+		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
+			tcp_md5_release_sigpool();
 			return -ENOMEM;
+		}
 
 		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
 			struct tcp_md5sig_info *md5sig;
@@ -1281,6 +1287,7 @@ int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
 			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
 			rcu_assign_pointer(tp->md5sig_info, NULL);
 			kfree_rcu(md5sig, rcu);
+			tcp_md5_release_sigpool();
 			return -EUSERS;
 		}
 	}
@@ -1380,7 +1387,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
 			      cmd.tcpm_key, cmd.tcpm_keylen);
 }
 
-static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
+static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
 				   __be32 daddr, __be32 saddr,
 				   const struct tcphdr *th, int nbytes)
 {
@@ -1400,38 +1407,35 @@ static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
 	_th->check = 0;
 
 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
-	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
+	ahash_request_set_crypt(hp->req, &sg, NULL,
 				sizeof(*bp) + sizeof(*th));
-	return crypto_ahash_update(hp->md5_req);
+	return crypto_ahash_update(hp->req);
 }
 
 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
 {
-	struct tcp_md5sig_pool *hp;
-	struct ahash_request *req;
+	struct tcp_sigpool hp;
 
-	hp = tcp_get_md5sig_pool();
-	if (!hp)
-		goto clear_hash_noput;
-	req = hp->md5_req;
+	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
+		goto clear_hash_nostart;
 
-	if (crypto_ahash_init(req))
+	if (crypto_ahash_init(hp.req))
 		goto clear_hash;
-	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
+	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
 		goto clear_hash;
-	if (tcp_md5_hash_key(hp, key))
+	if (tcp_md5_hash_key(&hp, key))
 		goto clear_hash;
-	ahash_request_set_crypt(req, NULL, md5_hash, 0);
-	if (crypto_ahash_final(req))
+	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
+	if (crypto_ahash_final(hp.req))
 		goto clear_hash;
 
-	tcp_put_md5sig_pool();
+	tcp_sigpool_end(&hp);
 	return 0;
 
 clear_hash:
-	tcp_put_md5sig_pool();
-clear_hash_noput:
+	tcp_sigpool_end(&hp);
+clear_hash_nostart:
 	memset(md5_hash, 0, 16);
 	return 1;
 }
@@ -1440,9 +1444,8 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
 			const struct sock *sk,
 			const struct sk_buff *skb)
 {
-	struct tcp_md5sig_pool *hp;
-	struct ahash_request *req;
 	const struct tcphdr *th = tcp_hdr(skb);
+	struct tcp_sigpool hp;
 	__be32 saddr, daddr;
 
 	if (sk) { /* valid for establish/request sockets */
@@ -1454,30 +1457,28 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
 		daddr = iph->daddr;
 	}
 
-	hp = tcp_get_md5sig_pool();
-	if (!hp)
-		goto clear_hash_noput;
-	req = hp->md5_req;
+	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
+		goto clear_hash_nostart;
 
-	if (crypto_ahash_init(req))
+	if (crypto_ahash_init(hp.req))
 		goto clear_hash;
 
-	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
+	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
 		goto clear_hash;
-	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
+	if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
 		goto clear_hash;
-	if (tcp_md5_hash_key(hp, key))
+	if (tcp_md5_hash_key(&hp, key))
 		goto clear_hash;
-	ahash_request_set_crypt(req, NULL, md5_hash, 0);
-	if (crypto_ahash_final(req))
+	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
+	if (crypto_ahash_final(hp.req))
 		goto clear_hash;
 
-	tcp_put_md5sig_pool();
+	tcp_sigpool_end(&hp);
 	return 0;
 
 clear_hash:
-	tcp_put_md5sig_pool();
-clear_hash_noput:
+	tcp_sigpool_end(&hp);
+clear_hash_nostart:
 	memset(md5_hash, 0, 16);
 	return 1;
 }
@@ -2296,6 +2297,18 @@ static int tcp_v4_init_sock(struct sock *sk)
 	return 0;
 }
 
+#ifdef CONFIG_TCP_MD5SIG
+static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
+{
+	struct tcp_md5sig_info *md5sig;
+
+	md5sig = container_of(head, struct tcp_md5sig_info, rcu);
+	kfree(md5sig);
+	static_branch_slow_dec_deferred(&tcp_md5_needed);
+	tcp_md5_release_sigpool();
+}
+#endif
+
 void tcp_v4_destroy_sock(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -2320,10 +2333,12 @@ void tcp_v4_destroy_sock(struct sock *sk)
 #ifdef CONFIG_TCP_MD5SIG
 	/* Clean up the MD5 key list, if any */
 	if (tp->md5sig_info) {
+		struct tcp_md5sig_info *md5sig;
+
+		md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
 		tcp_clear_md5_list(sk);
-		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
-		tp->md5sig_info = NULL;
-		static_branch_slow_dec_deferred(&tcp_md5_needed);
+		call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
+		rcu_assign_pointer(tp->md5sig_info, NULL);
 	}
 #endif
 
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index ace806c5bd0c..3dcb3fc36e64 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -261,10 +261,9 @@ static void tcp_time_wait_init(struct sock *sk, struct tcp_timewait_sock *tcptw)
 		tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC);
 		if (!tcptw->tw_md5_key)
 			return;
-		if (!tcp_alloc_md5sig_pool())
-			goto out_free;
 		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key))
 			goto out_free;
+		tcp_md5_add_sigpool();
 	}
 	return;
 out_free:
@@ -349,16 +348,26 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 }
 EXPORT_SYMBOL(tcp_time_wait);
 
+#ifdef CONFIG_TCP_MD5SIG
+static void tcp_md5_twsk_free_rcu(struct rcu_head *head)
+{
+	struct tcp_md5sig_key *key;
+
+	key = container_of(head, struct tcp_md5sig_key, rcu);
+	kfree(key);
+	static_branch_slow_dec_deferred(&tcp_md5_needed);
+	tcp_md5_release_sigpool();
+}
+#endif
+
 void tcp_twsk_destructor(struct sock *sk)
 {
 #ifdef CONFIG_TCP_MD5SIG
 	if (static_branch_unlikely(&tcp_md5_needed.key)) {
 		struct tcp_timewait_sock *twsk = tcp_twsk(sk);
 
-		if (twsk->tw_md5_key) {
-			kfree_rcu(twsk->tw_md5_key, rcu);
-			static_branch_slow_dec_deferred(&tcp_md5_needed);
-		}
+		if (twsk->tw_md5_key)
+			call_rcu(&twsk->tw_md5_key->rcu, tcp_md5_twsk_free_rcu);
 	}
 #endif
 }
diff --git a/net/ipv4/tcp_sigpool.c b/net/ipv4/tcp_sigpool.c
new file mode 100644
index 000000000000..65a8eaae2fec
--- /dev/null
+++ b/net/ipv4/tcp_sigpool.c
@@ -0,0 +1,358 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <crypto/hash.h>
+#include <linux/cpu.h>
+#include <linux/kref.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/workqueue.h>
+#include <net/tcp.h>
+
+static size_t __scratch_size;
+static DEFINE_PER_CPU(void __rcu *, sigpool_scratch);
+
+struct sigpool_entry {
+	struct crypto_ahash	*hash;
+	const char		*alg;
+	struct kref		kref;
+	uint16_t		needs_key:1,
+				reserved:15;
+};
+
+#define CPOOL_SIZE (PAGE_SIZE / sizeof(struct sigpool_entry))
+static struct sigpool_entry cpool[CPOOL_SIZE];
+static unsigned int cpool_populated;
+static DEFINE_MUTEX(cpool_mutex);
+
+/* Slow-path */
+struct scratches_to_free {
+	struct rcu_head rcu;
+	unsigned int cnt;
+	void *scratches[];
+};
+
+static void free_old_scratches(struct rcu_head *head)
+{
+	struct scratches_to_free *stf;
+
+	stf = container_of(head, struct scratches_to_free, rcu);
+	while (stf->cnt--)
+		kfree(stf->scratches[stf->cnt]);
+	kfree(stf);
+}
+
+/**
+ * sigpool_reserve_scratch - re-allocates scratch buffer, slow-path
+ * @size: request size for the scratch/temp buffer
+ */
+static int sigpool_reserve_scratch(size_t size)
+{
+	struct scratches_to_free *stf;
+	size_t stf_sz = struct_size(stf, scratches, num_possible_cpus());
+	int cpu, err = 0;
+
+	lockdep_assert_held(&cpool_mutex);
+	if (__scratch_size >= size)
+		return 0;
+
+	stf = kmalloc(stf_sz, GFP_KERNEL);
+	if (!stf)
+		return -ENOMEM;
+	stf->cnt = 0;
+
+	size = max(size, __scratch_size);
+	cpus_read_lock();
+	for_each_possible_cpu(cpu) {
+		void *scratch, *old_scratch;
+
+		scratch = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu));
+		if (!scratch) {
+			err = -ENOMEM;
+			break;
+		}
+
+		old_scratch = rcu_replace_pointer(per_cpu(sigpool_scratch, cpu),
+					scratch, lockdep_is_held(&cpool_mutex));
+		if (!cpu_online(cpu) || !old_scratch) {
+			kfree(old_scratch);
+			continue;
+		}
+		stf->scratches[stf->cnt++] = old_scratch;
+	}
+	cpus_read_unlock();
+	if (!err)
+		__scratch_size = size;
+
+	call_rcu(&stf->rcu, free_old_scratches);
+	return err;
+}
+
+static void sigpool_scratch_free(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		kfree(rcu_replace_pointer(per_cpu(sigpool_scratch, cpu),
+					  NULL, lockdep_is_held(&cpool_mutex)));
+	__scratch_size = 0;
+}
+
+static int __cpool_try_clone(struct crypto_ahash *hash)
+{
+	struct crypto_ahash *tmp;
+
+	tmp = crypto_clone_ahash(hash);
+	if (IS_ERR(tmp))
+		return PTR_ERR(tmp);
+
+	crypto_free_ahash(tmp);
+	return 0;
+}
+
+static int __cpool_alloc_ahash(struct sigpool_entry *e, const char *alg)
+{
+	struct crypto_ahash *cpu0_hash;
+	int ret;
+
+	e->alg = kstrdup(alg, GFP_KERNEL);
+	if (!e->alg)
+		return -ENOMEM;
+
+	cpu0_hash = crypto_alloc_ahash(alg, 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(cpu0_hash)) {
+		ret = PTR_ERR(cpu0_hash);
+		goto out_free_alg;
+	}
+
+	e->needs_key = crypto_ahash_get_flags(cpu0_hash) & CRYPTO_TFM_NEED_KEY;
+
+	ret = __cpool_try_clone(cpu0_hash);
+	if (ret)
+		goto out_free_cpu0_hash;
+	e->hash = cpu0_hash;
+	kref_init(&e->kref);
+	return 0;
+
+out_free_cpu0_hash:
+	crypto_free_ahash(cpu0_hash);
+out_free_alg:
+	kfree(e->alg);
+	e->alg = NULL;
+	return ret;
+}
+
+/**
+ * tcp_sigpool_alloc_ahash - allocates pool for ahash requests
+ * @alg: name of async hash algorithm
+ * @scratch_size: reserve a tcp_sigpool::scratch buffer of this size
+ */
+int tcp_sigpool_alloc_ahash(const char *alg, size_t scratch_size)
+{
+	int i, ret;
+
+	/* slow-path */
+	mutex_lock(&cpool_mutex);
+	ret = sigpool_reserve_scratch(scratch_size);
+	if (ret)
+		goto out;
+	for (i = 0; i < cpool_populated; i++) {
+		if (!cpool[i].alg)
+			continue;
+		if (strcmp(cpool[i].alg, alg))
+			continue;
+
+		if (kref_read(&cpool[i].kref) > 0)
+			kref_get(&cpool[i].kref);
+		else
+			kref_init(&cpool[i].kref);
+		ret = i;
+		goto out;
+	}
+
+	for (i = 0; i < cpool_populated; i++) {
+		if (!cpool[i].alg)
+			break;
+	}
+	if (i >= CPOOL_SIZE) {
+		ret = -ENOSPC;
+		goto out;
+	}
+
+	ret = __cpool_alloc_ahash(&cpool[i], alg);
+	if (!ret) {
+		ret = i;
+		if (i == cpool_populated)
+			cpool_populated++;
+	}
+out:
+	mutex_unlock(&cpool_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_sigpool_alloc_ahash);
+
+static void __cpool_free_entry(struct sigpool_entry *e)
+{
+	crypto_free_ahash(e->hash);
+	kfree(e->alg);
+	memset(e, 0, sizeof(*e));
+}
+
+static void cpool_cleanup_work_cb(struct work_struct *work)
+{
+	bool free_scratch = true;
+	unsigned int i;
+
+	mutex_lock(&cpool_mutex);
+	for (i = 0; i < cpool_populated; i++) {
+		if (kref_read(&cpool[i].kref) > 0) {
+			free_scratch = false;
+			continue;
+		}
+		if (!cpool[i].alg)
+			continue;
+		__cpool_free_entry(&cpool[i]);
+	}
+	if (free_scratch)
+		sigpool_scratch_free();
+	mutex_unlock(&cpool_mutex);
+}
+
+static DECLARE_WORK(cpool_cleanup_work, cpool_cleanup_work_cb);
+static void cpool_schedule_cleanup(struct kref *kref)
+{
+	schedule_work(&cpool_cleanup_work);
+}
+
+/**
+ * tcp_sigpool_release - decreases number of users for a pool. If it was
+ * the last user of the pool, releases any memory that was consumed.
+ * @id: tcp_sigpool that was previously allocated by tcp_sigpool_alloc_ahash()
+ */
+void tcp_sigpool_release(unsigned int id)
+{
+	if (WARN_ON_ONCE(id > cpool_populated || !cpool[id].alg))
+		return;
+
+	/* slow-path */
+	kref_put(&cpool[id].kref, cpool_schedule_cleanup);
+}
+EXPORT_SYMBOL_GPL(tcp_sigpool_release);
+
+/**
+ * tcp_sigpool_get - increases number of users (refcounter) for a pool
+ * @id: tcp_sigpool that was previously allocated by tcp_sigpool_alloc_ahash()
+ */
+void tcp_sigpool_get(unsigned int id)
+{
+	if (WARN_ON_ONCE(id > cpool_populated || !cpool[id].alg))
+		return;
+	kref_get(&cpool[id].kref);
+}
+EXPORT_SYMBOL_GPL(tcp_sigpool_get);
+
+int tcp_sigpool_start(unsigned int id, struct tcp_sigpool *c) __cond_acquires(RCU_BH)
+{
+	struct crypto_ahash *hash;
+
+	rcu_read_lock_bh();
+	if (WARN_ON_ONCE(id > cpool_populated || !cpool[id].alg)) {
+		rcu_read_unlock_bh();
+		return -EINVAL;
+	}
+
+	hash = crypto_clone_ahash(cpool[id].hash);
+	if (IS_ERR(hash)) {
+		rcu_read_unlock_bh();
+		return PTR_ERR(hash);
+	}
+
+	c->req = ahash_request_alloc(hash, GFP_ATOMIC);
+	if (!c->req) {
+		crypto_free_ahash(hash);
+		rcu_read_unlock_bh();
+		return -ENOMEM;
+	}
+	ahash_request_set_callback(c->req, 0, NULL, NULL);
+
+	/* Pairs with tcp_sigpool_reserve_scratch(), scratch area is
+	 * valid (allocated) until tcp_sigpool_end().
+	 */
+	c->scratch = rcu_dereference_bh(*this_cpu_ptr(&sigpool_scratch));
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tcp_sigpool_start);
+
+void tcp_sigpool_end(struct tcp_sigpool *c) __releases(RCU_BH)
+{
+	struct crypto_ahash *hash = crypto_ahash_reqtfm(c->req);
+
+	rcu_read_unlock_bh();
+	ahash_request_free(c->req);
+	crypto_free_ahash(hash);
+}
+EXPORT_SYMBOL_GPL(tcp_sigpool_end);
+
+/**
+ * tcp_sigpool_algo - return algorithm of tcp_sigpool
+ * @id: tcp_sigpool that was previously allocated by tcp_sigpool_alloc_ahash()
+ * @buf: buffer to return name of algorithm
+ * @buf_len: size of @buf
+ */
+size_t tcp_sigpool_algo(unsigned int id, char *buf, size_t buf_len)
+{
+	if (WARN_ON_ONCE(id > cpool_populated || !cpool[id].alg))
+		return -EINVAL;
+
+	return strscpy(buf, cpool[id].alg, buf_len);
+}
+EXPORT_SYMBOL_GPL(tcp_sigpool_algo);
+
+/**
+ * tcp_sigpool_hash_skb_data - hash data in skb with initialized tcp_sigpool
+ * @hp: tcp_sigpool pointer
+ * @skb: buffer to add sign for
+ * @header_len: TCP header length for this segment
+ */
+int tcp_sigpool_hash_skb_data(struct tcp_sigpool *hp,
+			      const struct sk_buff *skb,
+			      unsigned int header_len)
+{
+	const unsigned int head_data_len = skb_headlen(skb) > header_len ?
+					   skb_headlen(skb) - header_len : 0;
+	const struct skb_shared_info *shi = skb_shinfo(skb);
+	const struct tcphdr *tp = tcp_hdr(skb);
+	struct ahash_request *req = hp->req;
+	struct sk_buff *frag_iter;
+	struct scatterlist sg;
+	unsigned int i;
+
+	sg_init_table(&sg, 1);
+
+	sg_set_buf(&sg, ((u8 *)tp) + header_len, head_data_len);
+	ahash_request_set_crypt(req, &sg, NULL, head_data_len);
+	if (crypto_ahash_update(req))
+		return 1;
+
+	for (i = 0; i < shi->nr_frags; ++i) {
+		const skb_frag_t *f = &shi->frags[i];
+		unsigned int offset = skb_frag_off(f);
+		struct page *page;
+
+		page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
+		sg_set_page(&sg, page, skb_frag_size(f), offset_in_page(offset));
+		ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f));
+		if (crypto_ahash_update(req))
+			return 1;
+	}
+
+	skb_walk_frags(skb, frag_iter)
+		if (tcp_sigpool_hash_skb_data(hp, frag_iter, 0))
+			return 1;
+
+	return 0;
+}
+EXPORT_SYMBOL(tcp_sigpool_hash_skb_data);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Per-CPU pool of crypto requests");
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index dc27988512a6..ee53dad20a59 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -671,7 +671,7 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname,
 			      cmd.tcpm_key, cmd.tcpm_keylen);
 }
 
-static int tcp_v6_md5_hash_headers(struct tcp_md5sig_pool *hp,
+static int tcp_v6_md5_hash_headers(struct tcp_sigpool *hp,
 				   const struct in6_addr *daddr,
 				   const struct in6_addr *saddr,
 				   const struct tcphdr *th, int nbytes)
@@ -692,39 +692,36 @@ static int tcp_v6_md5_hash_headers(struct tcp_md5sig_pool *hp,
 	_th->check = 0;
 
 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
-	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
+	ahash_request_set_crypt(hp->req, &sg, NULL,
 				sizeof(*bp) + sizeof(*th));
-	return crypto_ahash_update(hp->md5_req);
+	return crypto_ahash_update(hp->req);
 }
 
 static int tcp_v6_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
 			       const struct in6_addr *daddr, struct in6_addr *saddr,
 			       const struct tcphdr *th)
 {
-	struct tcp_md5sig_pool *hp;
-	struct ahash_request *req;
+	struct tcp_sigpool hp;
 
-	hp = tcp_get_md5sig_pool();
-	if (!hp)
-		goto clear_hash_noput;
-	req = hp->md5_req;
+	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
+		goto clear_hash_nostart;
 
-	if (crypto_ahash_init(req))
+	if (crypto_ahash_init(hp.req))
 		goto clear_hash;
-	if (tcp_v6_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
+	if (tcp_v6_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
 		goto clear_hash;
-	if (tcp_md5_hash_key(hp, key))
+	if (tcp_md5_hash_key(&hp, key))
 		goto clear_hash;
-	ahash_request_set_crypt(req, NULL, md5_hash, 0);
-	if (crypto_ahash_final(req))
+	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
+	if (crypto_ahash_final(hp.req))
 		goto clear_hash;
 
-	tcp_put_md5sig_pool();
+	tcp_sigpool_end(&hp);
 	return 0;
 
 clear_hash:
-	tcp_put_md5sig_pool();
-clear_hash_noput:
+	tcp_sigpool_end(&hp);
+clear_hash_nostart:
 	memset(md5_hash, 0, 16);
 	return 1;
 }
@@ -734,10 +731,9 @@ static int tcp_v6_md5_hash_skb(char *md5_hash,
 			       const struct sock *sk,
 			       const struct sk_buff *skb)
 {
-	const struct in6_addr *saddr, *daddr;
-	struct tcp_md5sig_pool *hp;
-	struct ahash_request *req;
 	const struct tcphdr *th = tcp_hdr(skb);
+	const struct in6_addr *saddr, *daddr;
+	struct tcp_sigpool hp;
 
 	if (sk) { /* valid for establish/request sockets */
 		saddr = &sk->sk_v6_rcv_saddr;
@@ -748,30 +744,28 @@ static int tcp_v6_md5_hash_skb(char *md5_hash,
 		daddr = &ip6h->daddr;
 	}
 
-	hp = tcp_get_md5sig_pool();
-	if (!hp)
-		goto clear_hash_noput;
-	req = hp->md5_req;
+	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
+		goto clear_hash_nostart;
 
-	if (crypto_ahash_init(req))
+	if (crypto_ahash_init(hp.req))
 		goto clear_hash;
 
-	if (tcp_v6_md5_hash_headers(hp, daddr, saddr, th, skb->len))
+	if (tcp_v6_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
 		goto clear_hash;
-	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
+	if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
 		goto clear_hash;
-	if (tcp_md5_hash_key(hp, key))
+	if (tcp_md5_hash_key(&hp, key))
 		goto clear_hash;
-	ahash_request_set_crypt(req, NULL, md5_hash, 0);
-	if (crypto_ahash_final(req))
+	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
+	if (crypto_ahash_final(hp.req))
 		goto clear_hash;
 
-	tcp_put_md5sig_pool();
+	tcp_sigpool_end(&hp);
 	return 0;
 
 clear_hash:
-	tcp_put_md5sig_pool();
-clear_hash_noput:
+	tcp_sigpool_end(&hp);
+clear_hash_nostart:
 	memset(md5_hash, 0, 16);
 	return 1;
 }
-- 
cgit v1.2.3-70-g09d2


From c845f5f3590ef4669fe5464f8a42be6442cd174b Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 23 Oct 2023 20:21:54 +0100
Subject: net/tcp: Add TCP-AO config and structures

Introduce new kernel config option and common structures as well as
helpers to be used by TCP-AO code.

Co-developed-by: Francesco Ruggeri <fruggeri@arista.com>
Signed-off-by: Francesco Ruggeri <fruggeri@arista.com>
Co-developed-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
Acked-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      |  9 +++--
 include/net/tcp.h        |  8 ++---
 include/net/tcp_ao.h     | 90 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/tcp.h |  2 ++
 net/ipv4/Kconfig         | 13 +++++++
 5 files changed, 114 insertions(+), 8 deletions(-)
 create mode 100644 include/net/tcp_ao.h

(limited to 'include')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 6df715b6e51d..64e7b560fa79 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -447,13 +447,18 @@ struct tcp_sock {
 	bool	syn_smc;	/* SYN includes SMC */
 #endif
 
-#ifdef CONFIG_TCP_MD5SIG
-/* TCP AF-Specific parts; only used by MD5 Signature support so far */
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
+/* TCP AF-Specific parts; only used by TCP-AO/MD5 Signature support so far */
 	const struct tcp_sock_af_ops	*af_specific;
 
+#ifdef CONFIG_TCP_MD5SIG
 /* TCP MD5 Signature Option information */
 	struct tcp_md5sig_info	__rcu *md5sig_info;
 #endif
+#ifdef CONFIG_TCP_AO
+	struct tcp_ao_info	__rcu *ao_info;
+#endif
+#endif
 
 /* TCP fastopen related information */
 	struct tcp_fastopen_request *fastopen_req;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index f6e2db5292b5..3153712faa3d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -37,6 +37,7 @@
 #include <net/snmp.h>
 #include <net/ip.h>
 #include <net/tcp_states.h>
+#include <net/tcp_ao.h>
 #include <net/inet_ecn.h>
 #include <net/dst.h>
 #include <net/mptcp.h>
@@ -1688,12 +1689,7 @@ static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp)
 	tp->retransmit_skb_hint = NULL;
 }
 
-union tcp_md5_addr {
-	struct in_addr  a4;
-#if IS_ENABLED(CONFIG_IPV6)
-	struct in6_addr	a6;
-#endif
-};
+#define tcp_md5_addr tcp_ao_addr
 
 /* - key database */
 struct tcp_md5sig_key {
diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h
new file mode 100644
index 000000000000..af76e1c47bea
--- /dev/null
+++ b/include/net/tcp_ao.h
@@ -0,0 +1,90 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _TCP_AO_H
+#define _TCP_AO_H
+
+#define TCP_AO_KEY_ALIGN	1
+#define __tcp_ao_key_align __aligned(TCP_AO_KEY_ALIGN)
+
+union tcp_ao_addr {
+	struct in_addr  a4;
+#if IS_ENABLED(CONFIG_IPV6)
+	struct in6_addr	a6;
+#endif
+};
+
+struct tcp_ao_hdr {
+	u8	kind;
+	u8	length;
+	u8	keyid;
+	u8	rnext_keyid;
+};
+
+struct tcp_ao_key {
+	struct hlist_node	node;
+	union tcp_ao_addr	addr;
+	u8			key[TCP_AO_MAXKEYLEN] __tcp_ao_key_align;
+	unsigned int		tcp_sigpool_id;
+	unsigned int		digest_size;
+	u8			prefixlen;
+	u8			family;
+	u8			keylen;
+	u8			keyflags;
+	u8			sndid;
+	u8			rcvid;
+	u8			maclen;
+	struct rcu_head		rcu;
+	u8			traffic_keys[];
+};
+
+static inline u8 *rcv_other_key(struct tcp_ao_key *key)
+{
+	return key->traffic_keys;
+}
+
+static inline u8 *snd_other_key(struct tcp_ao_key *key)
+{
+	return key->traffic_keys + key->digest_size;
+}
+
+static inline int tcp_ao_maclen(const struct tcp_ao_key *key)
+{
+	return key->maclen;
+}
+
+static inline int tcp_ao_len(const struct tcp_ao_key *key)
+{
+	return tcp_ao_maclen(key) + sizeof(struct tcp_ao_hdr);
+}
+
+static inline unsigned int tcp_ao_digest_size(struct tcp_ao_key *key)
+{
+	return key->digest_size;
+}
+
+static inline int tcp_ao_sizeof_key(const struct tcp_ao_key *key)
+{
+	return sizeof(struct tcp_ao_key) + (key->digest_size << 1);
+}
+
+struct tcp_ao_info {
+	/* List of tcp_ao_key's */
+	struct hlist_head	head;
+	/* current_key and rnext_key aren't maintained on listen sockets.
+	 * Their purpose is to cache keys on established connections,
+	 * saving needless lookups. Never dereference any of them from
+	 * listen sockets.
+	 * ::current_key may change in RX to the key that was requested by
+	 * the peer, please use READ_ONCE()/WRITE_ONCE() in order to avoid
+	 * load/store tearing.
+	 * Do the same for ::rnext_key, if you don't hold socket lock
+	 * (it's changed only by userspace request in setsockopt()).
+	 */
+	struct tcp_ao_key	*current_key;
+	struct tcp_ao_key	*rnext_key;
+	u32			flags;
+	__be32			lisn;
+	__be32			risn;
+	struct rcu_head		rcu;
+};
+
+#endif /* _TCP_AO_H */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 8aa3916e14f6..74a1267baac5 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -361,6 +361,8 @@ struct tcp_diag_md5sig {
 	__u8	tcpm_key[TCP_MD5SIG_MAXKEYLEN];
 };
 
+#define TCP_AO_MAXKEYLEN	80
+
 /* setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */
 
 #define TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT 0x1
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 89e2ab023272..8e94ed7c56a0 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -744,6 +744,19 @@ config DEFAULT_TCP_CONG
 config TCP_SIGPOOL
 	tristate
 
+config TCP_AO
+	bool "TCP: Authentication Option (RFC5925)"
+	select CRYPTO
+	select TCP_SIGPOOL
+	depends on 64BIT && IPV6 != m # seq-number extension needs WRITE_ONCE(u64)
+	help
+	  TCP-AO specifies the use of stronger Message Authentication Codes (MACs),
+	  protects against replays for long-lived TCP connections, and
+	  provides more details on the association of security with TCP
+	  connections than TCP MD5 (See RFC5925)
+
+	  If unsure, say N.
+
 config TCP_MD5SIG
 	bool "TCP: MD5 Signature Option support (RFC2385)"
 	select CRYPTO
-- 
cgit v1.2.3-70-g09d2


From 4954f17ddefc51d218625dcdfaf422a253dad3fa Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 23 Oct 2023 20:21:55 +0100
Subject: net/tcp: Introduce TCP_AO setsockopt()s

Add 3 setsockopt()s:
1. TCP_AO_ADD_KEY to add a new Master Key Tuple (MKT) on a socket
2. TCP_AO_DEL_KEY to delete present MKT from a socket
3. TCP_AO_INFO to change flags, Current_key/RNext_key on a TCP-AO sk

Userspace has to introduce keys on every socket it wants to use TCP-AO
option on, similarly to TCP_MD5SIG/TCP_MD5SIG_EXT.
RFC5925 prohibits definition of MKTs that would match the same peer,
so do sanity checks on the data provided by userspace. Be as
conservative as possible, including refusal of defining MKT on
an established connection with no AO, removing the key in-use and etc.

(1) and (2) are to be used by userspace key manager to add/remove keys.
(3) main purpose is to set RNext_key, which (as prescribed by RFC5925)
is the KeyID that will be requested in TCP-AO header from the peer to
sign their segments with.

At this moment the life of ao_info ends in tcp_v4_destroy_sock().

Co-developed-by: Francesco Ruggeri <fruggeri@arista.com>
Signed-off-by: Francesco Ruggeri <fruggeri@arista.com>
Co-developed-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
Acked-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sockptr.h  |  23 ++
 include/net/tcp.h        |   3 +
 include/net/tcp_ao.h     |  17 +-
 include/uapi/linux/tcp.h |  46 +++
 net/ipv4/Makefile        |   1 +
 net/ipv4/tcp.c           |  17 +
 net/ipv4/tcp_ao.c        | 794 +++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_ipv4.c      |  10 +-
 net/ipv6/Makefile        |   1 +
 net/ipv6/tcp_ao.c        |  19 ++
 net/ipv6/tcp_ipv6.c      |  39 ++-
 11 files changed, 952 insertions(+), 18 deletions(-)
 create mode 100644 net/ipv4/tcp_ao.c
 create mode 100644 net/ipv6/tcp_ao.c

(limited to 'include')

diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h
index bae5e2369b4f..307961b41541 100644
--- a/include/linux/sockptr.h
+++ b/include/linux/sockptr.h
@@ -55,6 +55,29 @@ static inline int copy_from_sockptr(void *dst, sockptr_t src, size_t size)
 	return copy_from_sockptr_offset(dst, src, 0, size);
 }
 
+static inline int copy_struct_from_sockptr(void *dst, size_t ksize,
+		sockptr_t src, size_t usize)
+{
+	size_t size = min(ksize, usize);
+	size_t rest = max(ksize, usize) - size;
+
+	if (!sockptr_is_kernel(src))
+		return copy_struct_from_user(dst, ksize, src.user, size);
+
+	if (usize < ksize) {
+		memset(dst + size, 0, rest);
+	} else if (usize > ksize) {
+		char *p = src.kernel;
+
+		while (rest--) {
+			if (*p++)
+				return -E2BIG;
+		}
+	}
+	memcpy(dst, src.kernel, size);
+	return 0;
+}
+
 static inline int copy_to_sockptr_offset(sockptr_t dst, size_t offset,
 		const void *src, size_t size)
 {
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3153712faa3d..ff204471d451 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2175,6 +2175,9 @@ struct tcp_sock_af_ops {
 				     sockptr_t optval,
 				     int optlen);
 #endif
+#ifdef CONFIG_TCP_AO
+	int (*ao_parse)(struct sock *sk, int optname, sockptr_t optval, int optlen);
+#endif
 };
 
 struct tcp_request_sock_ops {
diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h
index af76e1c47bea..a81e40fd255a 100644
--- a/include/net/tcp_ao.h
+++ b/include/net/tcp_ao.h
@@ -81,10 +81,25 @@ struct tcp_ao_info {
 	 */
 	struct tcp_ao_key	*current_key;
 	struct tcp_ao_key	*rnext_key;
-	u32			flags;
+	u32			ao_required	:1,
+				__unused	:31;
 	__be32			lisn;
 	__be32			risn;
 	struct rcu_head		rcu;
 };
 
+#ifdef CONFIG_TCP_AO
+int tcp_parse_ao(struct sock *sk, int cmd, unsigned short int family,
+		 sockptr_t optval, int optlen);
+void tcp_ao_destroy_sock(struct sock *sk);
+/* ipv4 specific functions */
+int tcp_v4_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen);
+/* ipv6 specific functions */
+int tcp_v6_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen);
+#else
+static inline void tcp_ao_destroy_sock(struct sock *sk)
+{
+}
+#endif
+
 #endif /* _TCP_AO_H */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 74a1267baac5..fa49f03e62fe 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -129,6 +129,9 @@ enum {
 
 #define TCP_TX_DELAY		37	/* delay outgoing packets by XX usec */
 
+#define TCP_AO_ADD_KEY		38	/* Add/Set MKT */
+#define TCP_AO_DEL_KEY		39	/* Delete MKT */
+#define TCP_AO_INFO		40	/* Modify TCP-AO per-socket options */
 
 #define TCP_REPAIR_ON		1
 #define TCP_REPAIR_OFF		0
@@ -363,6 +366,49 @@ struct tcp_diag_md5sig {
 
 #define TCP_AO_MAXKEYLEN	80
 
+#define TCP_AO_KEYF_IFINDEX	(1 << 0)	/* L3 ifindex for VRF */
+
+struct tcp_ao_add { /* setsockopt(TCP_AO_ADD_KEY) */
+	struct __kernel_sockaddr_storage addr;	/* peer's address for the key */
+	char	alg_name[64];		/* crypto hash algorithm to use */
+	__s32	ifindex;		/* L3 dev index for VRF */
+	__u32   set_current	:1,	/* set key as Current_key at once */
+		set_rnext	:1,	/* request it from peer with RNext_key */
+		reserved	:30;	/* must be 0 */
+	__u16	reserved2;		/* padding, must be 0 */
+	__u8	prefix;			/* peer's address prefix */
+	__u8	sndid;			/* SendID for outgoing segments */
+	__u8	rcvid;			/* RecvID to match for incoming seg */
+	__u8	maclen;			/* length of authentication code (hash) */
+	__u8	keyflags;		/* see TCP_AO_KEYF_ */
+	__u8	keylen;			/* length of ::key */
+	__u8	key[TCP_AO_MAXKEYLEN];
+} __attribute__((aligned(8)));
+
+struct tcp_ao_del { /* setsockopt(TCP_AO_DEL_KEY) */
+	struct __kernel_sockaddr_storage addr;	/* peer's address for the key */
+	__s32	ifindex;		/* L3 dev index for VRF */
+	__u32   set_current	:1,	/* corresponding ::current_key */
+		set_rnext	:1,	/* corresponding ::rnext */
+		reserved	:30;	/* must be 0 */
+	__u16	reserved2;		/* padding, must be 0 */
+	__u8	prefix;			/* peer's address prefix */
+	__u8	sndid;			/* SendID for outgoing segments */
+	__u8	rcvid;			/* RecvID to match for incoming seg */
+	__u8	current_key;		/* KeyID to set as Current_key */
+	__u8	rnext;			/* KeyID to set as Rnext_key */
+	__u8	keyflags;		/* see TCP_AO_KEYF_ */
+} __attribute__((aligned(8)));
+
+struct tcp_ao_info_opt { /* setsockopt(TCP_AO_INFO) */
+	__u32   set_current	:1,	/* corresponding ::current_key */
+		set_rnext	:1,	/* corresponding ::rnext */
+		ao_required	:1,	/* don't accept non-AO connects */
+		reserved	:29;	/* must be 0 */
+	__u8	current_key;		/* KeyID to set as Current_key */
+	__u8	rnext;			/* KeyID to set as Rnext_key */
+} __attribute__((aligned(8)));
+
 /* setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */
 
 #define TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT 0x1
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index cd760793cfcb..e144a02a6a61 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -69,6 +69,7 @@ obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
 		      xfrm4_output.o xfrm4_protocol.o
+obj-$(CONFIG_TCP_AO) += tcp_ao.o
 
 ifeq ($(CONFIG_BPF_JIT),y)
 obj-$(CONFIG_BPF_SYSCALL) += bpf_tcp_ca.o
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index dca9ca2f1081..b6faee8a1e67 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3593,6 +3593,23 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
 		__tcp_sock_set_quickack(sk, val);
 		break;
 
+#ifdef CONFIG_TCP_AO
+	case TCP_AO_ADD_KEY:
+	case TCP_AO_DEL_KEY:
+	case TCP_AO_INFO: {
+		/* If this is the first TCP-AO setsockopt() on the socket,
+		 * sk_state has to be LISTEN or CLOSE
+		 */
+		if (((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) ||
+		    rcu_dereference_protected(tcp_sk(sk)->ao_info,
+					      lockdep_sock_is_held(sk)))
+			err = tp->af_specific->ao_parse(sk, optname, optval,
+							optlen);
+		else
+			err = -EISCONN;
+		break;
+	}
+#endif
 #ifdef CONFIG_TCP_MD5SIG
 	case TCP_MD5SIG:
 	case TCP_MD5SIG_EXT:
diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c
new file mode 100644
index 000000000000..3c2d005a37ce
--- /dev/null
+++ b/net/ipv4/tcp_ao.c
@@ -0,0 +1,794 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * INET		An implementation of the TCP Authentication Option (TCP-AO).
+ *		See RFC5925.
+ *
+ * Authors:	Dmitry Safonov <dima@arista.com>
+ *		Francesco Ruggeri <fruggeri@arista.com>
+ *		Salam Noureddine <noureddine@arista.com>
+ */
+#define pr_fmt(fmt) "TCP: " fmt
+
+#include <crypto/hash.h>
+#include <linux/inetdevice.h>
+#include <linux/tcp.h>
+
+#include <net/tcp.h>
+#include <net/ipv6.h>
+
+/* Optimized version of tcp_ao_do_lookup(): only for sockets for which
+ * it's known that the keys in ao_info are matching peer's
+ * family/address/VRF/etc.
+ */
+static struct tcp_ao_key *tcp_ao_established_key(struct tcp_ao_info *ao,
+						 int sndid, int rcvid)
+{
+	struct tcp_ao_key *key;
+
+	hlist_for_each_entry_rcu(key, &ao->head, node) {
+		if ((sndid >= 0 && key->sndid != sndid) ||
+		    (rcvid >= 0 && key->rcvid != rcvid))
+			continue;
+		return key;
+	}
+
+	return NULL;
+}
+
+static int ipv4_prefix_cmp(const struct in_addr *addr1,
+			   const struct in_addr *addr2,
+			   unsigned int prefixlen)
+{
+	__be32 mask = inet_make_mask(prefixlen);
+	__be32 a1 = addr1->s_addr & mask;
+	__be32 a2 = addr2->s_addr & mask;
+
+	if (a1 == a2)
+		return 0;
+	return memcmp(&a1, &a2, sizeof(a1));
+}
+
+static int __tcp_ao_key_cmp(const struct tcp_ao_key *key,
+			    const union tcp_ao_addr *addr, u8 prefixlen,
+			    int family, int sndid, int rcvid)
+{
+	if (sndid >= 0 && key->sndid != sndid)
+		return (key->sndid > sndid) ? 1 : -1;
+	if (rcvid >= 0 && key->rcvid != rcvid)
+		return (key->rcvid > rcvid) ? 1 : -1;
+
+	if (family == AF_UNSPEC)
+		return 0;
+	if (key->family != family)
+		return (key->family > family) ? 1 : -1;
+
+	if (family == AF_INET) {
+		if (ntohl(key->addr.a4.s_addr) == INADDR_ANY)
+			return 0;
+		if (ntohl(addr->a4.s_addr) == INADDR_ANY)
+			return 0;
+		return ipv4_prefix_cmp(&key->addr.a4, &addr->a4, prefixlen);
+#if IS_ENABLED(CONFIG_IPV6)
+	} else {
+		if (ipv6_addr_any(&key->addr.a6) || ipv6_addr_any(&addr->a6))
+			return 0;
+		if (ipv6_prefix_equal(&key->addr.a6, &addr->a6, prefixlen))
+			return 0;
+		return memcmp(&key->addr.a6, &addr->a6, sizeof(addr->a6));
+#endif
+	}
+	return -1;
+}
+
+static int tcp_ao_key_cmp(const struct tcp_ao_key *key,
+			  const union tcp_ao_addr *addr, u8 prefixlen,
+			  int family, int sndid, int rcvid)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	if (family == AF_INET6 && ipv6_addr_v4mapped(&addr->a6)) {
+		__be32 addr4 = addr->a6.s6_addr32[3];
+
+		return __tcp_ao_key_cmp(key, (union tcp_ao_addr *)&addr4,
+					prefixlen, AF_INET, sndid, rcvid);
+	}
+#endif
+	return __tcp_ao_key_cmp(key, addr, prefixlen, family, sndid, rcvid);
+}
+
+static struct tcp_ao_key *__tcp_ao_do_lookup(const struct sock *sk,
+		const union tcp_ao_addr *addr, int family, u8 prefix,
+		int sndid, int rcvid)
+{
+	struct tcp_ao_key *key;
+	struct tcp_ao_info *ao;
+
+	ao = rcu_dereference_check(tcp_sk(sk)->ao_info,
+				   lockdep_sock_is_held(sk));
+	if (!ao)
+		return NULL;
+
+	hlist_for_each_entry_rcu(key, &ao->head, node) {
+		u8 prefixlen = min(prefix, key->prefixlen);
+
+		if (!tcp_ao_key_cmp(key, addr, prefixlen, family, sndid, rcvid))
+			return key;
+	}
+	return NULL;
+}
+
+static struct tcp_ao_info *tcp_ao_alloc_info(gfp_t flags)
+{
+	struct tcp_ao_info *ao;
+
+	ao = kzalloc(sizeof(*ao), flags);
+	if (!ao)
+		return NULL;
+	INIT_HLIST_HEAD(&ao->head);
+
+	return ao;
+}
+
+static void tcp_ao_link_mkt(struct tcp_ao_info *ao, struct tcp_ao_key *mkt)
+{
+	hlist_add_head_rcu(&mkt->node, &ao->head);
+}
+
+static void tcp_ao_key_free_rcu(struct rcu_head *head)
+{
+	struct tcp_ao_key *key = container_of(head, struct tcp_ao_key, rcu);
+
+	tcp_sigpool_release(key->tcp_sigpool_id);
+	kfree_sensitive(key);
+}
+
+void tcp_ao_destroy_sock(struct sock *sk)
+{
+	struct tcp_ao_info *ao;
+	struct tcp_ao_key *key;
+	struct hlist_node *n;
+
+	ao = rcu_dereference_protected(tcp_sk(sk)->ao_info, 1);
+	tcp_sk(sk)->ao_info = NULL;
+
+	if (!ao)
+		return;
+
+	hlist_for_each_entry_safe(key, n, &ao->head, node) {
+		hlist_del_rcu(&key->node);
+		atomic_sub(tcp_ao_sizeof_key(key), &sk->sk_omem_alloc);
+		call_rcu(&key->rcu, tcp_ao_key_free_rcu);
+	}
+
+	kfree_rcu(ao, rcu);
+}
+
+static bool tcp_ao_can_set_current_rnext(struct sock *sk)
+{
+	/* There aren't current/rnext keys on TCP_LISTEN sockets */
+	if (sk->sk_state == TCP_LISTEN)
+		return false;
+	return true;
+}
+
+static int tcp_ao_verify_ipv4(struct sock *sk, struct tcp_ao_add *cmd,
+			      union tcp_ao_addr **addr)
+{
+	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd->addr;
+	struct inet_sock *inet = inet_sk(sk);
+
+	if (sin->sin_family != AF_INET)
+		return -EINVAL;
+
+	/* Currently matching is not performed on port (or port ranges) */
+	if (sin->sin_port != 0)
+		return -EINVAL;
+
+	/* Check prefix and trailing 0's in addr */
+	if (cmd->prefix != 0) {
+		__be32 mask;
+
+		if (ntohl(sin->sin_addr.s_addr) == INADDR_ANY)
+			return -EINVAL;
+		if (cmd->prefix > 32)
+			return -EINVAL;
+
+		mask = inet_make_mask(cmd->prefix);
+		if (sin->sin_addr.s_addr & ~mask)
+			return -EINVAL;
+
+		/* Check that MKT address is consistent with socket */
+		if (ntohl(inet->inet_daddr) != INADDR_ANY &&
+		    (inet->inet_daddr & mask) != sin->sin_addr.s_addr)
+			return -EINVAL;
+	} else {
+		if (ntohl(sin->sin_addr.s_addr) != INADDR_ANY)
+			return -EINVAL;
+	}
+
+	*addr = (union tcp_ao_addr *)&sin->sin_addr;
+	return 0;
+}
+
+static int tcp_ao_parse_crypto(struct tcp_ao_add *cmd, struct tcp_ao_key *key)
+{
+	unsigned int syn_tcp_option_space;
+	bool is_kdf_aes_128_cmac = false;
+	struct crypto_ahash *tfm;
+	struct tcp_sigpool hp;
+	void *tmp_key = NULL;
+	int err;
+
+	/* RFC5926, 3.1.1.2. KDF_AES_128_CMAC */
+	if (!strcmp("cmac(aes128)", cmd->alg_name)) {
+		strscpy(cmd->alg_name, "cmac(aes)", sizeof(cmd->alg_name));
+		is_kdf_aes_128_cmac = (cmd->keylen != 16);
+		tmp_key = kmalloc(cmd->keylen, GFP_KERNEL);
+		if (!tmp_key)
+			return -ENOMEM;
+	}
+
+	key->maclen = cmd->maclen ?: 12; /* 12 is the default in RFC5925 */
+
+	/* Check: maclen + tcp-ao header <= (MAX_TCP_OPTION_SPACE - mss
+	 *					- tstamp - wscale - sackperm),
+	 * see tcp_syn_options(), tcp_synack_options(), commit 33ad798c924b.
+	 *
+	 * In order to allow D-SACK with TCP-AO, the header size should be:
+	 * (MAX_TCP_OPTION_SPACE - TCPOLEN_TSTAMP_ALIGNED
+	 *			- TCPOLEN_SACK_BASE_ALIGNED
+	 *			- 2 * TCPOLEN_SACK_PERBLOCK) = 8 (maclen = 4),
+	 * see tcp_established_options().
+	 *
+	 * RFC5925, 2.2:
+	 * Typical MACs are 96-128 bits (12-16 bytes), but any length
+	 * that fits in the header of the segment being authenticated
+	 * is allowed.
+	 *
+	 * RFC5925, 7.6:
+	 * TCP-AO continues to consume 16 bytes in non-SYN segments,
+	 * leaving a total of 24 bytes for other options, of which
+	 * the timestamp consumes 10.  This leaves 14 bytes, of which 10
+	 * are used for a single SACK block. When two SACK blocks are used,
+	 * such as to handle D-SACK, a smaller TCP-AO MAC would be required
+	 * to make room for the additional SACK block (i.e., to leave 18
+	 * bytes for the D-SACK variant of the SACK option) [RFC2883].
+	 * Note that D-SACK is not supportable in TCP MD5 in the presence
+	 * of timestamps, because TCP MD5’s MAC length is fixed and too
+	 * large to leave sufficient option space.
+	 */
+	syn_tcp_option_space = MAX_TCP_OPTION_SPACE;
+	syn_tcp_option_space -= TCPOLEN_TSTAMP_ALIGNED;
+	syn_tcp_option_space -= TCPOLEN_WSCALE_ALIGNED;
+	syn_tcp_option_space -= TCPOLEN_SACKPERM_ALIGNED;
+	if (tcp_ao_len(key) > syn_tcp_option_space) {
+		err = -EMSGSIZE;
+		goto err_kfree;
+	}
+
+	key->keylen = cmd->keylen;
+	memcpy(key->key, cmd->key, cmd->keylen);
+
+	err = tcp_sigpool_start(key->tcp_sigpool_id, &hp);
+	if (err)
+		goto err_kfree;
+
+	tfm = crypto_ahash_reqtfm(hp.req);
+	if (is_kdf_aes_128_cmac) {
+		void *scratch = hp.scratch;
+		struct scatterlist sg;
+
+		memcpy(tmp_key, cmd->key, cmd->keylen);
+		sg_init_one(&sg, tmp_key, cmd->keylen);
+
+		/* Using zero-key of 16 bytes as described in RFC5926 */
+		memset(scratch, 0, 16);
+		err = crypto_ahash_setkey(tfm, scratch, 16);
+		if (err)
+			goto err_pool_end;
+
+		err = crypto_ahash_init(hp.req);
+		if (err)
+			goto err_pool_end;
+
+		ahash_request_set_crypt(hp.req, &sg, key->key, cmd->keylen);
+		err = crypto_ahash_update(hp.req);
+		if (err)
+			goto err_pool_end;
+
+		err |= crypto_ahash_final(hp.req);
+		if (err)
+			goto err_pool_end;
+		key->keylen = 16;
+	}
+
+	err = crypto_ahash_setkey(tfm, key->key, key->keylen);
+	if (err)
+		goto err_pool_end;
+
+	tcp_sigpool_end(&hp);
+	kfree_sensitive(tmp_key);
+
+	if (tcp_ao_maclen(key) > key->digest_size)
+		return -EINVAL;
+
+	return 0;
+
+err_pool_end:
+	tcp_sigpool_end(&hp);
+err_kfree:
+	kfree_sensitive(tmp_key);
+	return err;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int tcp_ao_verify_ipv6(struct sock *sk, struct tcp_ao_add *cmd,
+			      union tcp_ao_addr **paddr,
+			      unsigned short int *family)
+{
+	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd->addr;
+	struct in6_addr *addr = &sin6->sin6_addr;
+	u8 prefix = cmd->prefix;
+
+	if (sin6->sin6_family != AF_INET6)
+		return -EINVAL;
+
+	/* Currently matching is not performed on port (or port ranges) */
+	if (sin6->sin6_port != 0)
+		return -EINVAL;
+
+	/* Check prefix and trailing 0's in addr */
+	if (cmd->prefix != 0 && ipv6_addr_v4mapped(addr)) {
+		__be32 addr4 = addr->s6_addr32[3];
+		__be32 mask;
+
+		if (prefix > 32 || ntohl(addr4) == INADDR_ANY)
+			return -EINVAL;
+
+		mask = inet_make_mask(prefix);
+		if (addr4 & ~mask)
+			return -EINVAL;
+
+		/* Check that MKT address is consistent with socket */
+		if (!ipv6_addr_any(&sk->sk_v6_daddr)) {
+			__be32 daddr4 = sk->sk_v6_daddr.s6_addr32[3];
+
+			if (!ipv6_addr_v4mapped(&sk->sk_v6_daddr))
+				return -EINVAL;
+			if ((daddr4 & mask) != addr4)
+				return -EINVAL;
+		}
+
+		*paddr = (union tcp_ao_addr *)&addr->s6_addr32[3];
+		*family = AF_INET;
+		return 0;
+	} else if (cmd->prefix != 0) {
+		struct in6_addr pfx;
+
+		if (ipv6_addr_any(addr) || prefix > 128)
+			return -EINVAL;
+
+		ipv6_addr_prefix(&pfx, addr, prefix);
+		if (ipv6_addr_cmp(&pfx, addr))
+			return -EINVAL;
+
+		/* Check that MKT address is consistent with socket */
+		if (!ipv6_addr_any(&sk->sk_v6_daddr) &&
+		    !ipv6_prefix_equal(&sk->sk_v6_daddr, addr, prefix))
+
+			return -EINVAL;
+	} else {
+		if (!ipv6_addr_any(addr))
+			return -EINVAL;
+	}
+
+	*paddr = (union tcp_ao_addr *)addr;
+	return 0;
+}
+#else
+static int tcp_ao_verify_ipv6(struct sock *sk, struct tcp_ao_add *cmd,
+			      union tcp_ao_addr **paddr,
+			      unsigned short int *family)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
+static struct tcp_ao_info *setsockopt_ao_info(struct sock *sk)
+{
+	if (sk_fullsock(sk)) {
+		return rcu_dereference_protected(tcp_sk(sk)->ao_info,
+						 lockdep_sock_is_held(sk));
+	}
+	return ERR_PTR(-ESOCKTNOSUPPORT);
+}
+
+#define TCP_AO_KEYF_ALL		(0)
+
+static struct tcp_ao_key *tcp_ao_key_alloc(struct sock *sk,
+					   struct tcp_ao_add *cmd)
+{
+	const char *algo = cmd->alg_name;
+	unsigned int digest_size;
+	struct crypto_ahash *tfm;
+	struct tcp_ao_key *key;
+	struct tcp_sigpool hp;
+	int err, pool_id;
+	size_t size;
+
+	/* Force null-termination of alg_name */
+	cmd->alg_name[ARRAY_SIZE(cmd->alg_name) - 1] = '\0';
+
+	/* RFC5926, 3.1.1.2. KDF_AES_128_CMAC */
+	if (!strcmp("cmac(aes128)", algo))
+		algo = "cmac(aes)";
+
+	/* Full TCP header (th->doff << 2) should fit into scratch area,
+	 * see tcp_ao_hash_header().
+	 */
+	pool_id = tcp_sigpool_alloc_ahash(algo, 60);
+	if (pool_id < 0)
+		return ERR_PTR(pool_id);
+
+	err = tcp_sigpool_start(pool_id, &hp);
+	if (err)
+		goto err_free_pool;
+
+	tfm = crypto_ahash_reqtfm(hp.req);
+	if (crypto_ahash_alignmask(tfm) > TCP_AO_KEY_ALIGN) {
+		err = -EOPNOTSUPP;
+		goto err_pool_end;
+	}
+	digest_size = crypto_ahash_digestsize(tfm);
+	tcp_sigpool_end(&hp);
+
+	size = sizeof(struct tcp_ao_key) + (digest_size << 1);
+	key = sock_kmalloc(sk, size, GFP_KERNEL);
+	if (!key) {
+		err = -ENOMEM;
+		goto err_free_pool;
+	}
+
+	key->tcp_sigpool_id = pool_id;
+	key->digest_size = digest_size;
+	return key;
+
+err_pool_end:
+	tcp_sigpool_end(&hp);
+err_free_pool:
+	tcp_sigpool_release(pool_id);
+	return ERR_PTR(err);
+}
+
+static int tcp_ao_add_cmd(struct sock *sk, unsigned short int family,
+			  sockptr_t optval, int optlen)
+{
+	struct tcp_ao_info *ao_info;
+	union tcp_ao_addr *addr;
+	struct tcp_ao_key *key;
+	struct tcp_ao_add cmd;
+	bool first = false;
+	int ret;
+
+	if (optlen < sizeof(cmd))
+		return -EINVAL;
+
+	ret = copy_struct_from_sockptr(&cmd, sizeof(cmd), optval, optlen);
+	if (ret)
+		return ret;
+
+	if (cmd.keylen > TCP_AO_MAXKEYLEN)
+		return -EINVAL;
+
+	if (cmd.reserved != 0 || cmd.reserved2 != 0)
+		return -EINVAL;
+
+	if (family == AF_INET)
+		ret = tcp_ao_verify_ipv4(sk, &cmd, &addr);
+	else
+		ret = tcp_ao_verify_ipv6(sk, &cmd, &addr, &family);
+	if (ret)
+		return ret;
+
+	if (cmd.keyflags & ~TCP_AO_KEYF_ALL)
+		return -EINVAL;
+
+	if (cmd.set_current || cmd.set_rnext) {
+		if (!tcp_ao_can_set_current_rnext(sk))
+			return -EINVAL;
+	}
+
+	ao_info = setsockopt_ao_info(sk);
+	if (IS_ERR(ao_info))
+		return PTR_ERR(ao_info);
+
+	if (!ao_info) {
+		ao_info = tcp_ao_alloc_info(GFP_KERNEL);
+		if (!ao_info)
+			return -ENOMEM;
+		first = true;
+	} else {
+		/* Check that neither RecvID nor SendID match any
+		 * existing key for the peer, RFC5925 3.1:
+		 * > The IDs of MKTs MUST NOT overlap where their
+		 * > TCP connection identifiers overlap.
+		 */
+		if (__tcp_ao_do_lookup(sk, addr, family,
+				       cmd.prefix, -1, cmd.rcvid))
+			return -EEXIST;
+		if (__tcp_ao_do_lookup(sk, addr, family,
+				       cmd.prefix, cmd.sndid, -1))
+			return -EEXIST;
+	}
+
+	key = tcp_ao_key_alloc(sk, &cmd);
+	if (IS_ERR(key)) {
+		ret = PTR_ERR(key);
+		goto err_free_ao;
+	}
+
+	INIT_HLIST_NODE(&key->node);
+	memcpy(&key->addr, addr, (family == AF_INET) ? sizeof(struct in_addr) :
+						       sizeof(struct in6_addr));
+	key->prefixlen	= cmd.prefix;
+	key->family	= family;
+	key->keyflags	= cmd.keyflags;
+	key->sndid	= cmd.sndid;
+	key->rcvid	= cmd.rcvid;
+
+	ret = tcp_ao_parse_crypto(&cmd, key);
+	if (ret < 0)
+		goto err_free_sock;
+
+	tcp_ao_link_mkt(ao_info, key);
+	if (first) {
+		sk_gso_disable(sk);
+		rcu_assign_pointer(tcp_sk(sk)->ao_info, ao_info);
+	}
+
+	if (cmd.set_current)
+		WRITE_ONCE(ao_info->current_key, key);
+	if (cmd.set_rnext)
+		WRITE_ONCE(ao_info->rnext_key, key);
+	return 0;
+
+err_free_sock:
+	atomic_sub(tcp_ao_sizeof_key(key), &sk->sk_omem_alloc);
+	tcp_sigpool_release(key->tcp_sigpool_id);
+	kfree_sensitive(key);
+err_free_ao:
+	if (first)
+		kfree(ao_info);
+	return ret;
+}
+
+static int tcp_ao_delete_key(struct sock *sk, struct tcp_ao_info *ao_info,
+			     struct tcp_ao_key *key,
+			     struct tcp_ao_key *new_current,
+			     struct tcp_ao_key *new_rnext)
+{
+	int err;
+
+	hlist_del_rcu(&key->node);
+
+	/* At this moment another CPU could have looked this key up
+	 * while it was unlinked from the list. Wait for RCU grace period,
+	 * after which the key is off-list and can't be looked up again;
+	 * the rx path [just before RCU came] might have used it and set it
+	 * as current_key (very unlikely).
+	 */
+	synchronize_rcu();
+	if (new_current)
+		WRITE_ONCE(ao_info->current_key, new_current);
+	if (new_rnext)
+		WRITE_ONCE(ao_info->rnext_key, new_rnext);
+
+	if (unlikely(READ_ONCE(ao_info->current_key) == key ||
+		     READ_ONCE(ao_info->rnext_key) == key)) {
+		err = -EBUSY;
+		goto add_key;
+	}
+
+	atomic_sub(tcp_ao_sizeof_key(key), &sk->sk_omem_alloc);
+	call_rcu(&key->rcu, tcp_ao_key_free_rcu);
+
+	return 0;
+add_key:
+	hlist_add_head_rcu(&key->node, &ao_info->head);
+	return err;
+}
+
+static int tcp_ao_del_cmd(struct sock *sk, unsigned short int family,
+			  sockptr_t optval, int optlen)
+{
+	struct tcp_ao_key *key, *new_current = NULL, *new_rnext = NULL;
+	struct tcp_ao_info *ao_info;
+	union tcp_ao_addr *addr;
+	struct tcp_ao_del cmd;
+	int addr_len;
+	__u8 prefix;
+	u16 port;
+	int err;
+
+	if (optlen < sizeof(cmd))
+		return -EINVAL;
+
+	err = copy_struct_from_sockptr(&cmd, sizeof(cmd), optval, optlen);
+	if (err)
+		return err;
+
+	if (cmd.reserved != 0 || cmd.reserved2 != 0)
+		return -EINVAL;
+
+	if (cmd.set_current || cmd.set_rnext) {
+		if (!tcp_ao_can_set_current_rnext(sk))
+			return -EINVAL;
+	}
+
+	ao_info = setsockopt_ao_info(sk);
+	if (IS_ERR(ao_info))
+		return PTR_ERR(ao_info);
+	if (!ao_info)
+		return -ENOENT;
+
+	/* For sockets in TCP_CLOSED it's possible set keys that aren't
+	 * matching the future peer (address/VRF/etc),
+	 * tcp_ao_connect_init() will choose a correct matching MKT
+	 * if there's any.
+	 */
+	if (cmd.set_current) {
+		new_current = tcp_ao_established_key(ao_info, cmd.current_key, -1);
+		if (!new_current)
+			return -ENOENT;
+	}
+	if (cmd.set_rnext) {
+		new_rnext = tcp_ao_established_key(ao_info, -1, cmd.rnext);
+		if (!new_rnext)
+			return -ENOENT;
+	}
+
+	if (family == AF_INET) {
+		struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.addr;
+
+		addr = (union tcp_ao_addr *)&sin->sin_addr;
+		addr_len = sizeof(struct in_addr);
+		port = ntohs(sin->sin_port);
+	} else {
+		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.addr;
+		struct in6_addr *addr6 = &sin6->sin6_addr;
+
+		if (ipv6_addr_v4mapped(addr6)) {
+			addr = (union tcp_ao_addr *)&addr6->s6_addr32[3];
+			addr_len = sizeof(struct in_addr);
+			family = AF_INET;
+		} else {
+			addr = (union tcp_ao_addr *)addr6;
+			addr_len = sizeof(struct in6_addr);
+		}
+		port = ntohs(sin6->sin6_port);
+	}
+	prefix = cmd.prefix;
+
+	/* Currently matching is not performed on port (or port ranges) */
+	if (port != 0)
+		return -EINVAL;
+
+	/* We could choose random present key here for current/rnext
+	 * but that's less predictable. Let's be strict and don't
+	 * allow removing a key that's in use. RFC5925 doesn't
+	 * specify how-to coordinate key removal, but says:
+	 * "It is presumed that an MKT affecting a particular
+	 * connection cannot be destroyed during an active connection"
+	 */
+	hlist_for_each_entry_rcu(key, &ao_info->head, node) {
+		if (cmd.sndid != key->sndid ||
+		    cmd.rcvid != key->rcvid)
+			continue;
+
+		if (family != key->family ||
+		    prefix != key->prefixlen ||
+		    memcmp(addr, &key->addr, addr_len))
+			continue;
+
+		if (key == new_current || key == new_rnext)
+			continue;
+
+		return tcp_ao_delete_key(sk, ao_info, key,
+					  new_current, new_rnext);
+	}
+	return -ENOENT;
+}
+
+static int tcp_ao_info_cmd(struct sock *sk, unsigned short int family,
+			   sockptr_t optval, int optlen)
+{
+	struct tcp_ao_key *new_current = NULL, *new_rnext = NULL;
+	struct tcp_ao_info *ao_info;
+	struct tcp_ao_info_opt cmd;
+	bool first = false;
+	int err;
+
+	if (optlen < sizeof(cmd))
+		return -EINVAL;
+
+	err = copy_struct_from_sockptr(&cmd, sizeof(cmd), optval, optlen);
+	if (err)
+		return err;
+
+	if (cmd.set_current || cmd.set_rnext) {
+		if (!tcp_ao_can_set_current_rnext(sk))
+			return -EINVAL;
+	}
+
+	if (cmd.reserved != 0)
+		return -EINVAL;
+
+	ao_info = setsockopt_ao_info(sk);
+	if (IS_ERR(ao_info))
+		return PTR_ERR(ao_info);
+	if (!ao_info) {
+		ao_info = tcp_ao_alloc_info(GFP_KERNEL);
+		if (!ao_info)
+			return -ENOMEM;
+		first = true;
+	}
+
+	/* For sockets in TCP_CLOSED it's possible set keys that aren't
+	 * matching the future peer (address/port/VRF/etc),
+	 * tcp_ao_connect_init() will choose a correct matching MKT
+	 * if there's any.
+	 */
+	if (cmd.set_current) {
+		new_current = tcp_ao_established_key(ao_info, cmd.current_key, -1);
+		if (!new_current) {
+			err = -ENOENT;
+			goto out;
+		}
+	}
+	if (cmd.set_rnext) {
+		new_rnext = tcp_ao_established_key(ao_info, -1, cmd.rnext);
+		if (!new_rnext) {
+			err = -ENOENT;
+			goto out;
+		}
+	}
+
+	ao_info->ao_required = cmd.ao_required;
+	if (new_current)
+		WRITE_ONCE(ao_info->current_key, new_current);
+	if (new_rnext)
+		WRITE_ONCE(ao_info->rnext_key, new_rnext);
+	if (first) {
+		sk_gso_disable(sk);
+		rcu_assign_pointer(tcp_sk(sk)->ao_info, ao_info);
+	}
+	return 0;
+out:
+	if (first)
+		kfree(ao_info);
+	return err;
+}
+
+int tcp_parse_ao(struct sock *sk, int cmd, unsigned short int family,
+		 sockptr_t optval, int optlen)
+{
+	if (WARN_ON_ONCE(family != AF_INET && family != AF_INET6))
+		return -EAFNOSUPPORT;
+
+	switch (cmd) {
+	case TCP_AO_ADD_KEY:
+		return tcp_ao_add_cmd(sk, family, optval, optlen);
+	case TCP_AO_DEL_KEY:
+		return tcp_ao_del_cmd(sk, family, optval, optlen);
+	case TCP_AO_INFO:
+		return tcp_ao_info_cmd(sk, family, optval, optlen);
+	default:
+		WARN_ON_ONCE(1);
+		return -EINVAL;
+	}
+}
+
+int tcp_v4_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen)
+{
+	return tcp_parse_ao(sk, cmd, AF_INET, optval, optlen);
+}
+
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 7d81e90b6f5c..a4746c27f38a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2271,11 +2271,16 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
 };
 EXPORT_SYMBOL(ipv4_specific);
 
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
+#ifdef CONFIG_TCP_MD5SIG
 	.md5_lookup		= tcp_v4_md5_lookup,
 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
 	.md5_parse		= tcp_v4_parse_md5_keys,
+#endif
+#ifdef CONFIG_TCP_AO
+	.ao_parse		= tcp_v4_parse_ao,
+#endif
 };
 #endif
 
@@ -2290,7 +2295,7 @@ static int tcp_v4_init_sock(struct sock *sk)
 
 	icsk->icsk_af_ops = &ipv4_specific;
 
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
 #endif
 
@@ -2341,6 +2346,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
 		rcu_assign_pointer(tp->md5sig_info, NULL);
 	}
 #endif
+	tcp_ao_destroy_sock(sk);
 
 	/* Clean up a referenced TCP bind bucket. */
 	if (inet_csk(sk)->icsk_bind_hash)
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 3036a45e8a1e..d283c59df4c1 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -52,4 +52,5 @@ obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o
 ifneq ($(CONFIG_IPV6),)
 obj-$(CONFIG_NET_UDP_TUNNEL) += ip6_udp_tunnel.o
 obj-y += mcast_snoop.o
+obj-$(CONFIG_TCP_AO) += tcp_ao.o
 endif
diff --git a/net/ipv6/tcp_ao.c b/net/ipv6/tcp_ao.c
new file mode 100644
index 000000000000..049ddbabe049
--- /dev/null
+++ b/net/ipv6/tcp_ao.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * INET		An implementation of the TCP Authentication Option (TCP-AO).
+ *		See RFC5925.
+ *
+ * Authors:	Dmitry Safonov <dima@arista.com>
+ *		Francesco Ruggeri <fruggeri@arista.com>
+ *		Salam Noureddine <noureddine@arista.com>
+ */
+#include <linux/tcp.h>
+
+#include <net/tcp.h>
+#include <net/ipv6.h>
+
+int tcp_v6_parse_ao(struct sock *sk, int cmd,
+		    sockptr_t optval, int optlen)
+{
+	return tcp_parse_ao(sk, cmd, AF_INET6, optval, optlen);
+}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index ee53dad20a59..30bd17d03239 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -76,16 +76,9 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
 
 static const struct inet_connection_sock_af_ops ipv6_mapped;
 const struct inet_connection_sock_af_ops ipv6_specific;
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
 static const struct tcp_sock_af_ops tcp_sock_ipv6_specific;
 static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific;
-#else
-static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk,
-						   const struct in6_addr *addr,
-						   int l3index)
-{
-	return NULL;
-}
 #endif
 
 /* Helper returning the inet6 address from a given tcp socket.
@@ -239,7 +232,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 		if (sk_is_mptcp(sk))
 			mptcpv6_handle_mapped(sk, true);
 		sk->sk_backlog_rcv = tcp_v4_do_rcv;
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
 		tp->af_specific = &tcp_sock_ipv6_mapped_specific;
 #endif
 
@@ -252,7 +245,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 			if (sk_is_mptcp(sk))
 				mptcpv6_handle_mapped(sk, false);
 			sk->sk_backlog_rcv = tcp_v6_do_rcv;
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
 			tp->af_specific = &tcp_sock_ipv6_specific;
 #endif
 			goto failure;
@@ -769,7 +762,13 @@ clear_hash_nostart:
 	memset(md5_hash, 0, 16);
 	return 1;
 }
-
+#else /* CONFIG_TCP_MD5SIG */
+static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk,
+						   const struct in6_addr *addr,
+						   int l3index)
+{
+	return NULL;
+}
 #endif
 
 static void tcp_v6_init_req(struct request_sock *req,
@@ -1228,7 +1227,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
 		if (sk_is_mptcp(newsk))
 			mptcpv6_handle_mapped(newsk, true);
 		newsk->sk_backlog_rcv = tcp_v4_do_rcv;
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
 		newtp->af_specific = &tcp_sock_ipv6_mapped_specific;
 #endif
 
@@ -1896,11 +1895,16 @@ const struct inet_connection_sock_af_ops ipv6_specific = {
 	.mtu_reduced	   = tcp_v6_mtu_reduced,
 };
 
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
 static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = {
+#ifdef CONFIG_TCP_MD5SIG
 	.md5_lookup	=	tcp_v6_md5_lookup,
 	.calc_md5_hash	=	tcp_v6_md5_hash_skb,
 	.md5_parse	=	tcp_v6_parse_md5_keys,
+#endif
+#ifdef CONFIG_TCP_AO
+	.ao_parse	=	tcp_v6_parse_ao,
+#endif
 };
 #endif
 
@@ -1922,11 +1926,16 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = {
 	.mtu_reduced	   = tcp_v4_mtu_reduced,
 };
 
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
 static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = {
+#ifdef CONFIG_TCP_MD5SIG
 	.md5_lookup	=	tcp_v4_md5_lookup,
 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
 	.md5_parse	=	tcp_v6_parse_md5_keys,
+#endif
+#ifdef CONFIG_TCP_AO
+	.ao_parse	=	tcp_v6_parse_ao,
+#endif
 };
 #endif
 
@@ -1941,7 +1950,7 @@ static int tcp_v6_init_sock(struct sock *sk)
 
 	icsk->icsk_af_ops = &ipv6_specific;
 
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
 	tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific;
 #endif
 
-- 
cgit v1.2.3-70-g09d2


From 0aadc73995d08f6b0dc061c14a564ffa46f5914e Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 23 Oct 2023 20:21:56 +0100
Subject: net/tcp: Prevent TCP-MD5 with TCP-AO being set

Be as conservative as possible: if there is TCP-MD5 key for a given peer
regardless of L3 interface - don't allow setting TCP-AO key for the same
peer. According to RFC5925, TCP-AO is supposed to replace TCP-MD5 and
there can't be any switch between both on any connected tuple.
Later it can be relaxed, if there's a use, but in the beginning restrict
any intersection.

Note: it's still should be possible to set both TCP-MD5 and TCP-AO keys
on a listening socket for *different* peers.

Co-developed-by: Francesco Ruggeri <fruggeri@arista.com>
Signed-off-by: Francesco Ruggeri <fruggeri@arista.com>
Co-developed-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
Acked-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     | 43 +++++++++++++++++++++++++++++++++++++++++--
 include/net/tcp_ao.h  | 13 +++++++++++++
 net/ipv4/tcp_ao.c     | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_ipv4.c   | 14 +++++++++++---
 net/ipv4/tcp_output.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv6/tcp_ao.c     | 17 +++++++++++++++++
 net/ipv6/tcp_ipv6.c   | 26 ++++++++++++++++++++++----
 7 files changed, 198 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index ff204471d451..0272117511ea 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1778,6 +1778,7 @@ int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
 
 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr,
 		   int family, u8 prefixlen, int l3index, u8 flags);
+void tcp_clear_md5_list(struct sock *sk);
 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
 					 const struct sock *addr_sk);
 
@@ -1786,14 +1787,23 @@ struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
 extern struct static_key_false_deferred tcp_md5_needed;
 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
 					   const union tcp_md5_addr *addr,
-					   int family);
+					   int family, bool any_l3index);
 static inline struct tcp_md5sig_key *
 tcp_md5_do_lookup(const struct sock *sk, int l3index,
 		  const union tcp_md5_addr *addr, int family)
 {
 	if (!static_branch_unlikely(&tcp_md5_needed.key))
 		return NULL;
-	return __tcp_md5_do_lookup(sk, l3index, addr, family);
+	return __tcp_md5_do_lookup(sk, l3index, addr, family, false);
+}
+
+static inline struct tcp_md5sig_key *
+tcp_md5_do_lookup_any_l3index(const struct sock *sk,
+			      const union tcp_md5_addr *addr, int family)
+{
+	if (!static_branch_unlikely(&tcp_md5_needed.key))
+		return NULL;
+	return __tcp_md5_do_lookup(sk, 0, addr, family, true);
 }
 
 enum skb_drop_reason
@@ -1811,6 +1821,13 @@ tcp_md5_do_lookup(const struct sock *sk, int l3index,
 	return NULL;
 }
 
+static inline struct tcp_md5sig_key *
+tcp_md5_do_lookup_any_l3index(const struct sock *sk,
+			      const union tcp_md5_addr *addr, int family)
+{
+	return NULL;
+}
+
 static inline enum skb_drop_reason
 tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
 		     const void *saddr, const void *daddr,
@@ -2177,6 +2194,9 @@ struct tcp_sock_af_ops {
 #endif
 #ifdef CONFIG_TCP_AO
 	int (*ao_parse)(struct sock *sk, int optname, sockptr_t optval, int optlen);
+	struct tcp_ao_key *(*ao_lookup)(const struct sock *sk,
+					struct sock *addr_sk,
+					int sndid, int rcvid);
 #endif
 };
 
@@ -2588,4 +2608,23 @@ static inline u64 tcp_transmit_time(const struct sock *sk)
 	return 0;
 }
 
+static inline bool tcp_ao_required(struct sock *sk, const void *saddr,
+				   int family)
+{
+#ifdef CONFIG_TCP_AO
+	struct tcp_ao_info *ao_info;
+	struct tcp_ao_key *ao_key;
+
+	ao_info = rcu_dereference_check(tcp_sk(sk)->ao_info,
+					lockdep_sock_is_held(sk));
+	if (!ao_info)
+		return false;
+
+	ao_key = tcp_ao_do_lookup(sk, saddr, family, -1, -1);
+	if (ao_info->ao_required || ao_key)
+		return true;
+#endif
+	return false;
+}
+
 #endif	/* _TCP_H */
diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h
index a81e40fd255a..3c7f576376f9 100644
--- a/include/net/tcp_ao.h
+++ b/include/net/tcp_ao.h
@@ -92,11 +92,24 @@ struct tcp_ao_info {
 int tcp_parse_ao(struct sock *sk, int cmd, unsigned short int family,
 		 sockptr_t optval, int optlen);
 void tcp_ao_destroy_sock(struct sock *sk);
+struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk,
+				    const union tcp_ao_addr *addr,
+				    int family, int sndid, int rcvid);
 /* ipv4 specific functions */
 int tcp_v4_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen);
+struct tcp_ao_key *tcp_v4_ao_lookup(const struct sock *sk, struct sock *addr_sk,
+				    int sndid, int rcvid);
 /* ipv6 specific functions */
 int tcp_v6_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen);
+struct tcp_ao_key *tcp_v6_ao_lookup(const struct sock *sk,
+				    struct sock *addr_sk, int sndid, int rcvid);
 #else
+static inline struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk,
+		const union tcp_ao_addr *addr, int family, int sndid, int rcvid)
+{
+	return NULL;
+}
+
 static inline void tcp_ao_destroy_sock(struct sock *sk)
 {
 }
diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c
index 3c2d005a37ce..ee23356101f4 100644
--- a/net/ipv4/tcp_ao.c
+++ b/net/ipv4/tcp_ao.c
@@ -116,6 +116,13 @@ static struct tcp_ao_key *__tcp_ao_do_lookup(const struct sock *sk,
 	return NULL;
 }
 
+struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk,
+				    const union tcp_ao_addr *addr,
+				    int family, int sndid, int rcvid)
+{
+	return __tcp_ao_do_lookup(sk, addr, family, U8_MAX, sndid, rcvid);
+}
+
 static struct tcp_ao_info *tcp_ao_alloc_info(gfp_t flags)
 {
 	struct tcp_ao_info *ao;
@@ -162,6 +169,14 @@ void tcp_ao_destroy_sock(struct sock *sk)
 	kfree_rcu(ao, rcu);
 }
 
+struct tcp_ao_key *tcp_v4_ao_lookup(const struct sock *sk, struct sock *addr_sk,
+				    int sndid, int rcvid)
+{
+	union tcp_ao_addr *addr = (union tcp_ao_addr *)&addr_sk->sk_daddr;
+
+	return tcp_ao_do_lookup(sk, addr, AF_INET, sndid, rcvid);
+}
+
 static bool tcp_ao_can_set_current_rnext(struct sock *sk)
 {
 	/* There aren't current/rnext keys on TCP_LISTEN sockets */
@@ -497,6 +512,10 @@ static int tcp_ao_add_cmd(struct sock *sk, unsigned short int family,
 			return -EINVAL;
 	}
 
+	/* Don't allow keys for peers that have a matching TCP-MD5 key */
+	if (tcp_md5_do_lookup_any_l3index(sk, addr, family))
+		return -EKEYREJECTED;
+
 	ao_info = setsockopt_ao_info(sk);
 	if (IS_ERR(ao_info))
 		return PTR_ERR(ao_info);
@@ -698,6 +717,31 @@ static int tcp_ao_del_cmd(struct sock *sk, unsigned short int family,
 	return -ENOENT;
 }
 
+/* cmd.ao_required makes a socket TCP-AO only.
+ * Don't allow any md5 keys for any l3intf on the socket together with it.
+ * Restricting it early in setsockopt() removes a check for
+ * ao_info->ao_required on inbound tcp segment fast-path.
+ */
+static int tcp_ao_required_verify(struct sock *sk)
+{
+#ifdef CONFIG_TCP_MD5SIG
+	const struct tcp_md5sig_info *md5sig;
+
+	if (!static_branch_unlikely(&tcp_md5_needed.key))
+		return 0;
+
+	md5sig = rcu_dereference_check(tcp_sk(sk)->md5sig_info,
+				       lockdep_sock_is_held(sk));
+	if (!md5sig)
+		return 0;
+
+	if (rcu_dereference_check(hlist_first_rcu(&md5sig->head),
+				  lockdep_sock_is_held(sk)))
+		return 1;
+#endif
+	return 0;
+}
+
 static int tcp_ao_info_cmd(struct sock *sk, unsigned short int family,
 			   sockptr_t optval, int optlen)
 {
@@ -732,6 +776,9 @@ static int tcp_ao_info_cmd(struct sock *sk, unsigned short int family,
 		first = true;
 	}
 
+	if (cmd.ao_required && tcp_ao_required_verify(sk))
+		return -EKEYREJECTED;
+
 	/* For sockets in TCP_CLOSED it's possible set keys that aren't
 	 * matching the future peer (address/port/VRF/etc),
 	 * tcp_ao_connect_init() will choose a correct matching MKT
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a4746c27f38a..698e58a3ccec 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1082,7 +1082,7 @@ static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *
 /* Find the Key structure for an address.  */
 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
 					   const union tcp_md5_addr *addr,
-					   int family)
+					   int family, bool any_l3index)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_md5sig_key *key;
@@ -1101,7 +1101,8 @@ struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
 				 lockdep_sock_is_held(sk)) {
 		if (key->family != family)
 			continue;
-		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
+		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
+		    key->l3index != l3index)
 			continue;
 		if (family == AF_INET) {
 			mask = inet_make_mask(key->prefixlen);
@@ -1313,7 +1314,7 @@ int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
 }
 EXPORT_SYMBOL(tcp_md5_do_del);
 
-static void tcp_clear_md5_list(struct sock *sk)
+void tcp_clear_md5_list(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_md5sig_key *key;
@@ -1383,6 +1384,12 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
 		return -EINVAL;
 
+	/* Don't allow keys for peers that have a matching TCP-AO key.
+	 * See the comment in tcp_ao_add_cmd()
+	 */
+	if (tcp_ao_required(sk, addr, AF_INET))
+		return -EKEYREJECTED;
+
 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
 			      cmd.tcpm_key, cmd.tcpm_keylen);
 }
@@ -2279,6 +2286,7 @@ static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
 	.md5_parse		= tcp_v4_parse_md5_keys,
 #endif
 #ifdef CONFIG_TCP_AO
+	.ao_lookup		= tcp_v4_ao_lookup,
 	.ao_parse		= tcp_v4_parse_ao,
 #endif
 };
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index ca4d7594efd4..1b90107f7038 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3931,6 +3931,53 @@ int tcp_connect(struct sock *sk)
 
 	tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL);
 
+#if defined(CONFIG_TCP_MD5SIG) && defined(CONFIG_TCP_AO)
+	/* Has to be checked late, after setting daddr/saddr/ops.
+	 * Return error if the peer has both a md5 and a tcp-ao key
+	 * configured as this is ambiguous.
+	 */
+	if (unlikely(rcu_dereference_protected(tp->md5sig_info,
+					       lockdep_sock_is_held(sk)))) {
+		bool needs_ao = !!tp->af_specific->ao_lookup(sk, sk, -1, -1);
+		bool needs_md5 = !!tp->af_specific->md5_lookup(sk, sk);
+		struct tcp_ao_info *ao_info;
+
+		ao_info = rcu_dereference_check(tp->ao_info,
+						lockdep_sock_is_held(sk));
+		if (ao_info) {
+			/* This is an extra check: tcp_ao_required() in
+			 * tcp_v{4,6}_parse_md5_keys() should prevent adding
+			 * md5 keys on ao_required socket.
+			 */
+			needs_ao |= ao_info->ao_required;
+			WARN_ON_ONCE(ao_info->ao_required && needs_md5);
+		}
+		if (needs_md5 && needs_ao)
+			return -EKEYREJECTED;
+
+		/* If we have a matching md5 key and no matching tcp-ao key
+		 * then free up ao_info if allocated.
+		 */
+		if (needs_md5) {
+			tcp_ao_destroy_sock(sk);
+		} else if (needs_ao) {
+			tcp_clear_md5_list(sk);
+			kfree(rcu_replace_pointer(tp->md5sig_info, NULL,
+						  lockdep_sock_is_held(sk)));
+		}
+	}
+#endif
+#ifdef CONFIG_TCP_AO
+	if (unlikely(rcu_dereference_protected(tp->ao_info,
+					       lockdep_sock_is_held(sk)))) {
+		/* Don't allow connecting if ao is configured but no
+		 * matching key is found.
+		 */
+		if (!tp->af_specific->ao_lookup(sk, sk, -1, -1))
+			return -EKEYREJECTED;
+	}
+#endif
+
 	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
 		return -EHOSTUNREACH; /* Routing failure or similar. */
 
diff --git a/net/ipv6/tcp_ao.c b/net/ipv6/tcp_ao.c
index 049ddbabe049..0640acaee67b 100644
--- a/net/ipv6/tcp_ao.c
+++ b/net/ipv6/tcp_ao.c
@@ -12,6 +12,23 @@
 #include <net/tcp.h>
 #include <net/ipv6.h>
 
+static struct tcp_ao_key *tcp_v6_ao_do_lookup(const struct sock *sk,
+					      const struct in6_addr *addr,
+					      int sndid, int rcvid)
+{
+	return tcp_ao_do_lookup(sk, (union tcp_ao_addr *)addr, AF_INET6,
+				sndid, rcvid);
+}
+
+struct tcp_ao_key *tcp_v6_ao_lookup(const struct sock *sk,
+				    struct sock *addr_sk,
+				    int sndid, int rcvid)
+{
+	struct in6_addr *addr = &addr_sk->sk_v6_daddr;
+
+	return tcp_v6_ao_do_lookup(sk, addr, sndid, rcvid);
+}
+
 int tcp_v6_parse_ao(struct sock *sk, int cmd,
 		    sockptr_t optval, int optlen)
 {
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 30bd17d03239..70a3842f47fa 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -600,6 +600,7 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname,
 {
 	struct tcp_md5sig cmd;
 	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.tcpm_addr;
+	union tcp_ao_addr *addr;
 	int l3index = 0;
 	u8 prefixlen;
 	u8 flags;
@@ -654,13 +655,28 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname,
 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
 		return -EINVAL;
 
-	if (ipv6_addr_v4mapped(&sin6->sin6_addr))
-		return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3],
+	if (ipv6_addr_v4mapped(&sin6->sin6_addr)) {
+		addr = (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3];
+
+		/* Don't allow keys for peers that have a matching TCP-AO key.
+		 * See the comment in tcp_ao_add_cmd()
+		 */
+		if (tcp_ao_required(sk, addr, AF_INET))
+			return -EKEYREJECTED;
+		return tcp_md5_do_add(sk, addr,
 				      AF_INET, prefixlen, l3index, flags,
 				      cmd.tcpm_key, cmd.tcpm_keylen);
+	}
+
+	addr = (union tcp_md5_addr *)&sin6->sin6_addr;
+
+	/* Don't allow keys for peers that have a matching TCP-AO key.
+	 * See the comment in tcp_ao_add_cmd()
+	 */
+	if (tcp_ao_required(sk, addr, AF_INET6))
+		return -EKEYREJECTED;
 
-	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr,
-			      AF_INET6, prefixlen, l3index, flags,
+	return tcp_md5_do_add(sk, addr, AF_INET6, prefixlen, l3index, flags,
 			      cmd.tcpm_key, cmd.tcpm_keylen);
 }
 
@@ -1903,6 +1919,7 @@ static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = {
 	.md5_parse	=	tcp_v6_parse_md5_keys,
 #endif
 #ifdef CONFIG_TCP_AO
+	.ao_lookup	=	tcp_v6_ao_lookup,
 	.ao_parse	=	tcp_v6_parse_ao,
 #endif
 };
@@ -1934,6 +1951,7 @@ static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = {
 	.md5_parse	=	tcp_v6_parse_md5_keys,
 #endif
 #ifdef CONFIG_TCP_AO
+	.ao_lookup	=	tcp_v6_ao_lookup,
 	.ao_parse	=	tcp_v6_parse_ao,
 #endif
 };
-- 
cgit v1.2.3-70-g09d2


From 7c2ffaf21bd67f73d21560995ce17eaf5fc1d37f Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 23 Oct 2023 20:21:57 +0100
Subject: net/tcp: Calculate TCP-AO traffic keys

Add traffic key calculation the way it's described in RFC5926.
Wire it up to tcp_finish_connect() and cache the new keys straight away
on already established TCP connections.

Co-developed-by: Francesco Ruggeri <fruggeri@arista.com>
Signed-off-by: Francesco Ruggeri <fruggeri@arista.com>
Co-developed-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
Acked-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     |   3 +
 include/net/tcp_ao.h  |  51 ++++++++++++-
 net/ipv4/tcp_ao.c     | 206 ++++++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_input.c  |   2 +
 net/ipv4/tcp_ipv4.c   |   1 +
 net/ipv4/tcp_output.c |   2 +
 net/ipv6/tcp_ao.c     |  50 ++++++++++++
 net/ipv6/tcp_ipv6.c   |   1 +
 8 files changed, 314 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0272117511ea..b72c46cf229b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2197,6 +2197,9 @@ struct tcp_sock_af_ops {
 	struct tcp_ao_key *(*ao_lookup)(const struct sock *sk,
 					struct sock *addr_sk,
 					int sndid, int rcvid);
+	int (*ao_calc_key_sk)(struct tcp_ao_key *mkt, u8 *key,
+			      const struct sock *sk,
+			      __be32 sisn, __be32 disn, bool send);
 #endif
 };
 
diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h
index 3c7f576376f9..b021a811511b 100644
--- a/include/net/tcp_ao.h
+++ b/include/net/tcp_ao.h
@@ -89,8 +89,32 @@ struct tcp_ao_info {
 };
 
 #ifdef CONFIG_TCP_AO
+/* TCP-AO structures and functions */
+
+struct tcp4_ao_context {
+	__be32		saddr;
+	__be32		daddr;
+	__be16		sport;
+	__be16		dport;
+	__be32		sisn;
+	__be32		disn;
+};
+
+struct tcp6_ao_context {
+	struct in6_addr	saddr;
+	struct in6_addr	daddr;
+	__be16		sport;
+	__be16		dport;
+	__be32		sisn;
+	__be32		disn;
+};
+
+struct tcp_sigpool;
+
 int tcp_parse_ao(struct sock *sk, int cmd, unsigned short int family,
 		 sockptr_t optval, int optlen);
+int tcp_ao_calc_traffic_key(struct tcp_ao_key *mkt, u8 *key, void *ctx,
+			    unsigned int len, struct tcp_sigpool *hp);
 void tcp_ao_destroy_sock(struct sock *sk);
 struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk,
 				    const union tcp_ao_addr *addr,
@@ -99,11 +123,22 @@ struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk,
 int tcp_v4_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen);
 struct tcp_ao_key *tcp_v4_ao_lookup(const struct sock *sk, struct sock *addr_sk,
 				    int sndid, int rcvid);
+int tcp_v4_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key,
+			  const struct sock *sk,
+			  __be32 sisn, __be32 disn, bool send);
 /* ipv6 specific functions */
-int tcp_v6_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen);
+int tcp_v6_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key,
+			  const struct sock *sk, __be32 sisn,
+			  __be32 disn, bool send);
 struct tcp_ao_key *tcp_v6_ao_lookup(const struct sock *sk,
 				    struct sock *addr_sk, int sndid, int rcvid);
-#else
+int tcp_v6_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen);
+void tcp_ao_established(struct sock *sk);
+void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb);
+void tcp_ao_connect_init(struct sock *sk);
+
+#else /* CONFIG_TCP_AO */
+
 static inline struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk,
 		const union tcp_ao_addr *addr, int family, int sndid, int rcvid)
 {
@@ -113,6 +148,18 @@ static inline struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk,
 static inline void tcp_ao_destroy_sock(struct sock *sk)
 {
 }
+
+static inline void tcp_ao_established(struct sock *sk)
+{
+}
+
+static inline void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb)
+{
+}
+
+static inline void tcp_ao_connect_init(struct sock *sk)
+{
+}
 #endif
 
 #endif /* _TCP_AO_H */
diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c
index ee23356101f4..e478341fc336 100644
--- a/net/ipv4/tcp_ao.c
+++ b/net/ipv4/tcp_ao.c
@@ -16,6 +16,34 @@
 #include <net/tcp.h>
 #include <net/ipv6.h>
 
+int tcp_ao_calc_traffic_key(struct tcp_ao_key *mkt, u8 *key, void *ctx,
+			    unsigned int len, struct tcp_sigpool *hp)
+{
+	struct scatterlist sg;
+	int ret;
+
+	if (crypto_ahash_setkey(crypto_ahash_reqtfm(hp->req),
+				mkt->key, mkt->keylen))
+		goto clear_hash;
+
+	ret = crypto_ahash_init(hp->req);
+	if (ret)
+		goto clear_hash;
+
+	sg_init_one(&sg, ctx, len);
+	ahash_request_set_crypt(hp->req, &sg, key, len);
+	crypto_ahash_update(hp->req);
+
+	ret = crypto_ahash_final(hp->req);
+	if (ret)
+		goto clear_hash;
+
+	return 0;
+clear_hash:
+	memset(key, 0, tcp_ao_digest_size(mkt));
+	return 1;
+}
+
 /* Optimized version of tcp_ao_do_lookup(): only for sockets for which
  * it's known that the keys in ao_info are matching peer's
  * family/address/VRF/etc.
@@ -169,6 +197,71 @@ void tcp_ao_destroy_sock(struct sock *sk)
 	kfree_rcu(ao, rcu);
 }
 
+/* 4 tuple and ISNs are expected in NBO */
+static int tcp_v4_ao_calc_key(struct tcp_ao_key *mkt, u8 *key,
+			      __be32 saddr, __be32 daddr,
+			      __be16 sport, __be16 dport,
+			      __be32 sisn,  __be32 disn)
+{
+	/* See RFC5926 3.1.1 */
+	struct kdf_input_block {
+		u8                      counter;
+		u8                      label[6];
+		struct tcp4_ao_context	ctx;
+		__be16                  outlen;
+	} __packed * tmp;
+	struct tcp_sigpool hp;
+	int err;
+
+	err = tcp_sigpool_start(mkt->tcp_sigpool_id, &hp);
+	if (err)
+		return err;
+
+	tmp = hp.scratch;
+	tmp->counter	= 1;
+	memcpy(tmp->label, "TCP-AO", 6);
+	tmp->ctx.saddr	= saddr;
+	tmp->ctx.daddr	= daddr;
+	tmp->ctx.sport	= sport;
+	tmp->ctx.dport	= dport;
+	tmp->ctx.sisn	= sisn;
+	tmp->ctx.disn	= disn;
+	tmp->outlen	= htons(tcp_ao_digest_size(mkt) * 8); /* in bits */
+
+	err = tcp_ao_calc_traffic_key(mkt, key, tmp, sizeof(*tmp), &hp);
+	tcp_sigpool_end(&hp);
+
+	return err;
+}
+
+int tcp_v4_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key,
+			  const struct sock *sk,
+			  __be32 sisn, __be32 disn, bool send)
+{
+	if (send)
+		return tcp_v4_ao_calc_key(mkt, key, sk->sk_rcv_saddr,
+					  sk->sk_daddr, htons(sk->sk_num),
+					  sk->sk_dport, sisn, disn);
+	else
+		return tcp_v4_ao_calc_key(mkt, key, sk->sk_daddr,
+					  sk->sk_rcv_saddr, sk->sk_dport,
+					  htons(sk->sk_num), disn, sisn);
+}
+
+static int tcp_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key,
+			      const struct sock *sk,
+			      __be32 sisn, __be32 disn, bool send)
+{
+	if (mkt->family == AF_INET)
+		return tcp_v4_ao_calc_key_sk(mkt, key, sk, sisn, disn, send);
+#if IS_ENABLED(CONFIG_IPV6)
+	else if (mkt->family == AF_INET6)
+		return tcp_v6_ao_calc_key_sk(mkt, key, sk, sisn, disn, send);
+#endif
+	else
+		return -EOPNOTSUPP;
+}
+
 struct tcp_ao_key *tcp_v4_ao_lookup(const struct sock *sk, struct sock *addr_sk,
 				    int sndid, int rcvid)
 {
@@ -177,6 +270,113 @@ struct tcp_ao_key *tcp_v4_ao_lookup(const struct sock *sk, struct sock *addr_sk,
 	return tcp_ao_do_lookup(sk, addr, AF_INET, sndid, rcvid);
 }
 
+static int tcp_ao_cache_traffic_keys(const struct sock *sk,
+				     struct tcp_ao_info *ao,
+				     struct tcp_ao_key *ao_key)
+{
+	u8 *traffic_key = snd_other_key(ao_key);
+	int ret;
+
+	ret = tcp_ao_calc_key_sk(ao_key, traffic_key, sk,
+				 ao->lisn, ao->risn, true);
+	if (ret)
+		return ret;
+
+	traffic_key = rcv_other_key(ao_key);
+	ret = tcp_ao_calc_key_sk(ao_key, traffic_key, sk,
+				 ao->lisn, ao->risn, false);
+	return ret;
+}
+
+void tcp_ao_connect_init(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_ao_info *ao_info;
+	union tcp_ao_addr *addr;
+	struct tcp_ao_key *key;
+	int family;
+
+	ao_info = rcu_dereference_protected(tp->ao_info,
+					    lockdep_sock_is_held(sk));
+	if (!ao_info)
+		return;
+
+	/* Remove all keys that don't match the peer */
+	family = sk->sk_family;
+	if (family == AF_INET)
+		addr = (union tcp_ao_addr *)&sk->sk_daddr;
+#if IS_ENABLED(CONFIG_IPV6)
+	else if (family == AF_INET6)
+		addr = (union tcp_ao_addr *)&sk->sk_v6_daddr;
+#endif
+	else
+		return;
+
+	hlist_for_each_entry_rcu(key, &ao_info->head, node) {
+		if (!tcp_ao_key_cmp(key, addr, key->prefixlen, family, -1, -1))
+			continue;
+
+		if (key == ao_info->current_key)
+			ao_info->current_key = NULL;
+		if (key == ao_info->rnext_key)
+			ao_info->rnext_key = NULL;
+		hlist_del_rcu(&key->node);
+		atomic_sub(tcp_ao_sizeof_key(key), &sk->sk_omem_alloc);
+		call_rcu(&key->rcu, tcp_ao_key_free_rcu);
+	}
+
+	key = tp->af_specific->ao_lookup(sk, sk, -1, -1);
+	if (key) {
+		/* if current_key or rnext_key were not provided,
+		 * use the first key matching the peer
+		 */
+		if (!ao_info->current_key)
+			ao_info->current_key = key;
+		if (!ao_info->rnext_key)
+			ao_info->rnext_key = key;
+		tp->tcp_header_len += tcp_ao_len(key);
+
+		ao_info->lisn = htonl(tp->write_seq);
+	} else {
+		/* Can't happen: tcp_connect() verifies that there's
+		 * at least one tcp-ao key that matches the remote peer.
+		 */
+		WARN_ON_ONCE(1);
+		rcu_assign_pointer(tp->ao_info, NULL);
+		kfree(ao_info);
+	}
+}
+
+void tcp_ao_established(struct sock *sk)
+{
+	struct tcp_ao_info *ao;
+	struct tcp_ao_key *key;
+
+	ao = rcu_dereference_protected(tcp_sk(sk)->ao_info,
+				       lockdep_sock_is_held(sk));
+	if (!ao)
+		return;
+
+	hlist_for_each_entry_rcu(key, &ao->head, node)
+		tcp_ao_cache_traffic_keys(sk, ao, key);
+}
+
+void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_ao_info *ao;
+	struct tcp_ao_key *key;
+
+	ao = rcu_dereference_protected(tcp_sk(sk)->ao_info,
+				       lockdep_sock_is_held(sk));
+	if (!ao)
+		return;
+
+	WRITE_ONCE(ao->risn, tcp_hdr(skb)->seq);
+
+	hlist_for_each_entry_rcu(key, &ao->head, node)
+		tcp_ao_cache_traffic_keys(sk, ao, key);
+}
+
 static bool tcp_ao_can_set_current_rnext(struct sock *sk)
 {
 	/* There aren't current/rnext keys on TCP_LISTEN sockets */
@@ -558,6 +758,12 @@ static int tcp_ao_add_cmd(struct sock *sk, unsigned short int family,
 	if (ret < 0)
 		goto err_free_sock;
 
+	/* Change this condition if we allow adding keys in states
+	 * like close_wait, syn_sent or fin_wait...
+	 */
+	if (sk->sk_state == TCP_ESTABLISHED)
+		tcp_ao_cache_traffic_keys(sk, ao_info, key);
+
 	tcp_ao_link_mkt(ao_info, key);
 	if (first) {
 		sk_gso_disable(sk);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 00d04ab68958..6ee0342b5338 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6151,6 +6151,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
+	tcp_ao_finish_connect(sk, skb);
 	tcp_set_state(sk, TCP_ESTABLISHED);
 	icsk->icsk_ack.lrcvtime = tcp_jiffies32;
 
@@ -6648,6 +6649,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 					  skb);
 			WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
 		}
+		tcp_ao_established(sk);
 		smp_mb();
 		tcp_set_state(sk, TCP_ESTABLISHED);
 		sk->sk_state_change(sk);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 698e58a3ccec..3c73b5829377 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2288,6 +2288,7 @@ static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
 #ifdef CONFIG_TCP_AO
 	.ao_lookup		= tcp_v4_ao_lookup,
 	.ao_parse		= tcp_v4_parse_ao,
+	.ao_calc_key_sk		= tcp_v4_ao_calc_key_sk,
 #endif
 };
 #endif
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 1b90107f7038..9fbf1b2e2025 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3749,6 +3749,8 @@ static void tcp_connect_init(struct sock *sk)
 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps))
 		tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;
 
+	tcp_ao_connect_init(sk);
+
 	/* If user gave his TCP_MAXSEG, record it to clamp */
 	if (tp->rx_opt.user_mss)
 		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
diff --git a/net/ipv6/tcp_ao.c b/net/ipv6/tcp_ao.c
index 0640acaee67b..9ab594fadbd9 100644
--- a/net/ipv6/tcp_ao.c
+++ b/net/ipv6/tcp_ao.c
@@ -12,6 +12,56 @@
 #include <net/tcp.h>
 #include <net/ipv6.h>
 
+static int tcp_v6_ao_calc_key(struct tcp_ao_key *mkt, u8 *key,
+			      const struct in6_addr *saddr,
+			      const struct in6_addr *daddr,
+			      __be16 sport, __be16 dport,
+			      __be32 sisn, __be32 disn)
+{
+	struct kdf_input_block {
+		u8			counter;
+		u8			label[6];
+		struct tcp6_ao_context	ctx;
+		__be16			outlen;
+	} __packed * tmp;
+	struct tcp_sigpool hp;
+	int err;
+
+	err = tcp_sigpool_start(mkt->tcp_sigpool_id, &hp);
+	if (err)
+		return err;
+
+	tmp = hp.scratch;
+	tmp->counter	= 1;
+	memcpy(tmp->label, "TCP-AO", 6);
+	tmp->ctx.saddr	= *saddr;
+	tmp->ctx.daddr	= *daddr;
+	tmp->ctx.sport	= sport;
+	tmp->ctx.dport	= dport;
+	tmp->ctx.sisn	= sisn;
+	tmp->ctx.disn	= disn;
+	tmp->outlen	= htons(tcp_ao_digest_size(mkt) * 8); /* in bits */
+
+	err = tcp_ao_calc_traffic_key(mkt, key, tmp, sizeof(*tmp), &hp);
+	tcp_sigpool_end(&hp);
+
+	return err;
+}
+
+int tcp_v6_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key,
+			  const struct sock *sk, __be32 sisn,
+			  __be32 disn, bool send)
+{
+	if (send)
+		return tcp_v6_ao_calc_key(mkt, key, &sk->sk_v6_rcv_saddr,
+					  &sk->sk_v6_daddr, htons(sk->sk_num),
+					  sk->sk_dport, sisn, disn);
+	else
+		return tcp_v6_ao_calc_key(mkt, key, &sk->sk_v6_daddr,
+					  &sk->sk_v6_rcv_saddr, sk->sk_dport,
+					  htons(sk->sk_num), disn, sisn);
+}
+
 static struct tcp_ao_key *tcp_v6_ao_do_lookup(const struct sock *sk,
 					      const struct in6_addr *addr,
 					      int sndid, int rcvid)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 70a3842f47fa..074e16fe00e0 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1921,6 +1921,7 @@ static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = {
 #ifdef CONFIG_TCP_AO
 	.ao_lookup	=	tcp_v6_ao_lookup,
 	.ao_parse	=	tcp_v6_parse_ao,
+	.ao_calc_key_sk	=	tcp_v6_ao_calc_key_sk,
 #endif
 };
 #endif
-- 
cgit v1.2.3-70-g09d2


From 1e03d32bea8e782b7d31769c25a5fae8a5044488 Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 23 Oct 2023 20:21:58 +0100
Subject: net/tcp: Add TCP-AO sign to outgoing packets

Using precalculated traffic keys, sign TCP segments as prescribed by
RFC5925. Per RFC, TCP header options are included in sign calculation:
"The TCP header, by default including options, and where the TCP
checksum and TCP-AO MAC fields are set to zero, all in network-
byte order." (5.1.3)

tcp_ao_hash_header() has exclude_options parameter to optionally exclude
TCP header from hash calculation, as described in RFC5925 (9.1), this is
needed for interaction with middleboxes that may change "some TCP
options". This is wired up to AO key flags and setsockopt() later.

Similarly to TCP-MD5 hash TCP segment fragments.

From this moment a user can start sending TCP-AO signed segments with
one of crypto ahash algorithms from supported by Linux kernel. It can
have a user-specified MAC length, to either save TCP option header space
or provide higher protection using a longer signature.
The inbound segments are not yet verified, TCP-AO option is ignored and
they are accepted.

Co-developed-by: Francesco Ruggeri <fruggeri@arista.com>
Signed-off-by: Francesco Ruggeri <fruggeri@arista.com>
Co-developed-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
Acked-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     |  64 ++++++++++++++++
 include/net/tcp_ao.h  |  23 ++++++
 net/ipv4/tcp_ao.c     | 199 ++++++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_ipv4.c   |   1 +
 net/ipv4/tcp_output.c | 112 ++++++++++++++++++----------
 net/ipv6/tcp_ao.c     |  28 +++++++
 net/ipv6/tcp_ipv6.c   |   2 +
 7 files changed, 391 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index b72c46cf229b..96e83159f1be 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -195,6 +195,7 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX);
 #define TCPOPT_SACK             5       /* SACK Block */
 #define TCPOPT_TIMESTAMP	8	/* Better RTT estimations/PAWS */
 #define TCPOPT_MD5SIG		19	/* MD5 Signature (RFC2385) */
+#define TCPOPT_AO		29	/* Authentication Option (RFC5925) */
 #define TCPOPT_MPTCP		30	/* Multipath TCP (RFC6824) */
 #define TCPOPT_FASTOPEN		34	/* Fast open (RFC7413) */
 #define TCPOPT_EXP		254	/* Experimental */
@@ -2200,6 +2201,9 @@ struct tcp_sock_af_ops {
 	int (*ao_calc_key_sk)(struct tcp_ao_key *mkt, u8 *key,
 			      const struct sock *sk,
 			      __be32 sisn, __be32 disn, bool send);
+	int (*calc_ao_hash)(char *location, struct tcp_ao_key *ao,
+			    const struct sock *sk, const struct sk_buff *skb,
+			    const u8 *tkey, int hash_offset, u32 sne);
 #endif
 };
 
@@ -2253,6 +2257,66 @@ static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
 }
 #endif
 
+struct tcp_key {
+	union {
+		struct tcp_ao_key *ao_key;
+		struct tcp_md5sig_key *md5_key;
+	};
+	enum {
+		TCP_KEY_NONE = 0,
+		TCP_KEY_MD5,
+		TCP_KEY_AO,
+	} type;
+};
+
+static inline void tcp_get_current_key(const struct sock *sk,
+				       struct tcp_key *out)
+{
+#if defined(CONFIG_TCP_AO) || defined(CONFIG_TCP_MD5SIG)
+	const struct tcp_sock *tp = tcp_sk(sk);
+#endif
+#ifdef CONFIG_TCP_AO
+	struct tcp_ao_info *ao;
+
+	ao = rcu_dereference_protected(tp->ao_info, lockdep_sock_is_held(sk));
+	if (ao) {
+		out->ao_key = READ_ONCE(ao->current_key);
+		out->type = TCP_KEY_AO;
+		return;
+	}
+#endif
+#ifdef CONFIG_TCP_MD5SIG
+	if (static_branch_unlikely(&tcp_md5_needed.key) &&
+	    rcu_access_pointer(tp->md5sig_info)) {
+		out->md5_key = tp->af_specific->md5_lookup(sk, sk);
+		if (out->md5_key) {
+			out->type = TCP_KEY_MD5;
+			return;
+		}
+	}
+#endif
+	out->type = TCP_KEY_NONE;
+}
+
+static inline bool tcp_key_is_md5(const struct tcp_key *key)
+{
+#ifdef CONFIG_TCP_MD5SIG
+	if (static_branch_unlikely(&tcp_md5_needed.key) &&
+	    key->type == TCP_KEY_MD5)
+		return true;
+#endif
+	return false;
+}
+
+static inline bool tcp_key_is_ao(const struct tcp_key *key)
+{
+#ifdef CONFIG_TCP_AO
+	if (key->type == TCP_KEY_AO)
+		return true;
+#endif
+	return false;
+}
+
 int tcpv4_offload_init(void);
 
 void tcp_v4_init(void);
diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h
index b021a811511b..0b86bc05d8cf 100644
--- a/include/net/tcp_ao.h
+++ b/include/net/tcp_ao.h
@@ -111,6 +111,13 @@ struct tcp6_ao_context {
 
 struct tcp_sigpool;
 
+int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb,
+			struct tcp_ao_key *key, struct tcphdr *th,
+			__u8 *hash_location);
+int tcp_ao_hash_skb(unsigned short int family,
+		    char *ao_hash, struct tcp_ao_key *key,
+		    const struct sock *sk, const struct sk_buff *skb,
+		    const u8 *tkey, int hash_offset, u32 sne);
 int tcp_parse_ao(struct sock *sk, int cmd, unsigned short int family,
 		 sockptr_t optval, int optlen);
 int tcp_ao_calc_traffic_key(struct tcp_ao_key *mkt, u8 *key, void *ctx,
@@ -126,12 +133,21 @@ struct tcp_ao_key *tcp_v4_ao_lookup(const struct sock *sk, struct sock *addr_sk,
 int tcp_v4_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key,
 			  const struct sock *sk,
 			  __be32 sisn, __be32 disn, bool send);
+int tcp_v4_ao_hash_skb(char *ao_hash, struct tcp_ao_key *key,
+		       const struct sock *sk, const struct sk_buff *skb,
+		       const u8 *tkey, int hash_offset, u32 sne);
 /* ipv6 specific functions */
+int tcp_v6_ao_hash_pseudoheader(struct tcp_sigpool *hp,
+				const struct in6_addr *daddr,
+				const struct in6_addr *saddr, int nbytes);
 int tcp_v6_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key,
 			  const struct sock *sk, __be32 sisn,
 			  __be32 disn, bool send);
 struct tcp_ao_key *tcp_v6_ao_lookup(const struct sock *sk,
 				    struct sock *addr_sk, int sndid, int rcvid);
+int tcp_v6_ao_hash_skb(char *ao_hash, struct tcp_ao_key *key,
+		       const struct sock *sk, const struct sk_buff *skb,
+		       const u8 *tkey, int hash_offset, u32 sne);
 int tcp_v6_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen);
 void tcp_ao_established(struct sock *sk);
 void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb);
@@ -139,6 +155,13 @@ void tcp_ao_connect_init(struct sock *sk);
 
 #else /* CONFIG_TCP_AO */
 
+static inline int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb,
+				      struct tcp_ao_key *key, struct tcphdr *th,
+				      __u8 *hash_location)
+{
+	return 0;
+}
+
 static inline struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk,
 		const union tcp_ao_addr *addr, int family, int sndid, int rcvid)
 {
diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c
index e478341fc336..007f29a2531f 100644
--- a/net/ipv4/tcp_ao.c
+++ b/net/ipv4/tcp_ao.c
@@ -262,6 +262,171 @@ static int tcp_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key,
 		return -EOPNOTSUPP;
 }
 
+static int tcp_v4_ao_hash_pseudoheader(struct tcp_sigpool *hp,
+				       __be32 daddr, __be32 saddr,
+				       int nbytes)
+{
+	struct tcp4_pseudohdr *bp;
+	struct scatterlist sg;
+
+	bp = hp->scratch;
+	bp->saddr = saddr;
+	bp->daddr = daddr;
+	bp->pad = 0;
+	bp->protocol = IPPROTO_TCP;
+	bp->len = cpu_to_be16(nbytes);
+
+	sg_init_one(&sg, bp, sizeof(*bp));
+	ahash_request_set_crypt(hp->req, &sg, NULL, sizeof(*bp));
+	return crypto_ahash_update(hp->req);
+}
+
+static int tcp_ao_hash_pseudoheader(unsigned short int family,
+				    const struct sock *sk,
+				    const struct sk_buff *skb,
+				    struct tcp_sigpool *hp, int nbytes)
+{
+	const struct tcphdr *th = tcp_hdr(skb);
+
+	/* TODO: Can we rely on checksum being zero to mean outbound pkt? */
+	if (!th->check) {
+		if (family == AF_INET)
+			return tcp_v4_ao_hash_pseudoheader(hp, sk->sk_daddr,
+					sk->sk_rcv_saddr, skb->len);
+#if IS_ENABLED(CONFIG_IPV6)
+		else if (family == AF_INET6)
+			return tcp_v6_ao_hash_pseudoheader(hp, &sk->sk_v6_daddr,
+					&sk->sk_v6_rcv_saddr, skb->len);
+#endif
+		else
+			return -EAFNOSUPPORT;
+	}
+
+	if (family == AF_INET) {
+		const struct iphdr *iph = ip_hdr(skb);
+
+		return tcp_v4_ao_hash_pseudoheader(hp, iph->daddr,
+				iph->saddr, skb->len);
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (family == AF_INET6) {
+		const struct ipv6hdr *iph = ipv6_hdr(skb);
+
+		return tcp_v6_ao_hash_pseudoheader(hp, &iph->daddr,
+				&iph->saddr, skb->len);
+#endif
+	}
+	return -EAFNOSUPPORT;
+}
+
+/* tcp_ao_hash_sne(struct tcp_sigpool *hp)
+ * @hp	- used for hashing
+ * @sne - sne value
+ */
+static int tcp_ao_hash_sne(struct tcp_sigpool *hp, u32 sne)
+{
+	struct scatterlist sg;
+	__be32 *bp;
+
+	bp = (__be32 *)hp->scratch;
+	*bp = htonl(sne);
+
+	sg_init_one(&sg, bp, sizeof(*bp));
+	ahash_request_set_crypt(hp->req, &sg, NULL, sizeof(*bp));
+	return crypto_ahash_update(hp->req);
+}
+
+static int tcp_ao_hash_header(struct tcp_sigpool *hp,
+			      const struct tcphdr *th,
+			      bool exclude_options, u8 *hash,
+			      int hash_offset, int hash_len)
+{
+	int err, len = th->doff << 2;
+	struct scatterlist sg;
+	u8 *hdr = hp->scratch;
+
+	/* We are not allowed to change tcphdr, make a local copy */
+	if (exclude_options) {
+		len = sizeof(*th) + sizeof(struct tcp_ao_hdr) + hash_len;
+		memcpy(hdr, th, sizeof(*th));
+		memcpy(hdr + sizeof(*th),
+		       (u8 *)th + hash_offset - sizeof(struct tcp_ao_hdr),
+		       sizeof(struct tcp_ao_hdr));
+		memset(hdr + sizeof(*th) + sizeof(struct tcp_ao_hdr),
+		       0, hash_len);
+		((struct tcphdr *)hdr)->check = 0;
+	} else {
+		len = th->doff << 2;
+		memcpy(hdr, th, len);
+		/* zero out tcp-ao hash */
+		((struct tcphdr *)hdr)->check = 0;
+		memset(hdr + hash_offset, 0, hash_len);
+	}
+
+	sg_init_one(&sg, hdr, len);
+	ahash_request_set_crypt(hp->req, &sg, NULL, len);
+	err = crypto_ahash_update(hp->req);
+	WARN_ON_ONCE(err != 0);
+	return err;
+}
+
+int tcp_ao_hash_skb(unsigned short int family,
+		    char *ao_hash, struct tcp_ao_key *key,
+		    const struct sock *sk, const struct sk_buff *skb,
+		    const u8 *tkey, int hash_offset, u32 sne)
+{
+	const struct tcphdr *th = tcp_hdr(skb);
+	int tkey_len = tcp_ao_digest_size(key);
+	struct tcp_sigpool hp;
+	void *hash_buf = NULL;
+
+	hash_buf = kmalloc(tkey_len, GFP_ATOMIC);
+	if (!hash_buf)
+		goto clear_hash_noput;
+
+	if (tcp_sigpool_start(key->tcp_sigpool_id, &hp))
+		goto clear_hash_noput;
+
+	if (crypto_ahash_setkey(crypto_ahash_reqtfm(hp.req), tkey, tkey_len))
+		goto clear_hash;
+
+	/* For now use sha1 by default. Depends on alg in tcp_ao_key */
+	if (crypto_ahash_init(hp.req))
+		goto clear_hash;
+
+	if (tcp_ao_hash_sne(&hp, sne))
+		goto clear_hash;
+	if (tcp_ao_hash_pseudoheader(family, sk, skb, &hp, skb->len))
+		goto clear_hash;
+	if (tcp_ao_hash_header(&hp, th, false,
+			       ao_hash, hash_offset, tcp_ao_maclen(key)))
+		goto clear_hash;
+	if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
+		goto clear_hash;
+	ahash_request_set_crypt(hp.req, NULL, hash_buf, 0);
+	if (crypto_ahash_final(hp.req))
+		goto clear_hash;
+
+	memcpy(ao_hash, hash_buf, tcp_ao_maclen(key));
+	tcp_sigpool_end(&hp);
+	kfree(hash_buf);
+	return 0;
+
+clear_hash:
+	tcp_sigpool_end(&hp);
+clear_hash_noput:
+	memset(ao_hash, 0, tcp_ao_maclen(key));
+	kfree(hash_buf);
+	return 1;
+}
+
+int tcp_v4_ao_hash_skb(char *ao_hash, struct tcp_ao_key *key,
+		       const struct sock *sk, const struct sk_buff *skb,
+		       const u8 *tkey, int hash_offset, u32 sne)
+{
+	return tcp_ao_hash_skb(AF_INET, ao_hash, key, sk, skb,
+			       tkey, hash_offset, sne);
+}
+
 struct tcp_ao_key *tcp_v4_ao_lookup(const struct sock *sk, struct sock *addr_sk,
 				    int sndid, int rcvid)
 {
@@ -270,6 +435,40 @@ struct tcp_ao_key *tcp_v4_ao_lookup(const struct sock *sk, struct sock *addr_sk,
 	return tcp_ao_do_lookup(sk, addr, AF_INET, sndid, rcvid);
 }
 
+int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb,
+			struct tcp_ao_key *key, struct tcphdr *th,
+			__u8 *hash_location)
+{
+	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_ao_info *ao;
+	void *tkey_buf = NULL;
+	u8 *traffic_key;
+
+	ao = rcu_dereference_protected(tcp_sk(sk)->ao_info,
+				       lockdep_sock_is_held(sk));
+	traffic_key = snd_other_key(key);
+	if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
+		__be32 disn;
+
+		if (!(tcb->tcp_flags & TCPHDR_ACK)) {
+			disn = 0;
+			tkey_buf = kmalloc(tcp_ao_digest_size(key), GFP_ATOMIC);
+			if (!tkey_buf)
+				return -ENOMEM;
+			traffic_key = tkey_buf;
+		} else {
+			disn = ao->risn;
+		}
+		tp->af_specific->ao_calc_key_sk(key, traffic_key,
+						sk, ao->lisn, disn, true);
+	}
+	tp->af_specific->calc_ao_hash(hash_location, key, sk, skb, traffic_key,
+				      hash_location - (u8 *)th, 0);
+	kfree(tkey_buf);
+	return 0;
+}
+
 static int tcp_ao_cache_traffic_keys(const struct sock *sk,
 				     struct tcp_ao_info *ao,
 				     struct tcp_ao_key *ao_key)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 3c73b5829377..b002f6497d19 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2287,6 +2287,7 @@ static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
 #endif
 #ifdef CONFIG_TCP_AO
 	.ao_lookup		= tcp_v4_ao_lookup,
+	.calc_ao_hash		= tcp_v4_ao_hash_skb,
 	.ao_parse		= tcp_v4_parse_ao,
 	.ao_calc_key_sk		= tcp_v4_ao_calc_key_sk,
 #endif
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 9fbf1b2e2025..fa3a630f3629 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -422,6 +422,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
 #define OPTION_FAST_OPEN_COOKIE	BIT(8)
 #define OPTION_SMC		BIT(9)
 #define OPTION_MPTCP		BIT(10)
+#define OPTION_AO		BIT(11)
 
 static void smc_options_write(__be32 *ptr, u16 *options)
 {
@@ -614,19 +615,43 @@ static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
  * (but it may well be that other scenarios fail similarly).
  */
 static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
-			      struct tcp_out_options *opts)
+			      struct tcp_out_options *opts,
+			      struct tcp_key *key)
 {
 	__be32 *ptr = (__be32 *)(th + 1);
 	u16 options = opts->options;	/* mungable copy */
 
-	if (unlikely(OPTION_MD5 & options)) {
+	if (tcp_key_is_md5(key)) {
 		*ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 			       (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
 		/* overload cookie hash location */
 		opts->hash_location = (__u8 *)ptr;
 		ptr += 4;
-	}
+	} else if (tcp_key_is_ao(key)) {
+#ifdef CONFIG_TCP_AO
+		struct tcp_ao_key *rnext_key;
+		struct tcp_ao_info *ao_info;
+		u8 maclen;
 
+		ao_info = rcu_dereference_check(tp->ao_info,
+				lockdep_sock_is_held(&tp->inet_conn.icsk_inet.sk));
+		rnext_key = READ_ONCE(ao_info->rnext_key);
+		if (WARN_ON_ONCE(!rnext_key))
+			goto out_ao;
+		maclen = tcp_ao_maclen(key->ao_key);
+		*ptr++ = htonl((TCPOPT_AO << 24) |
+				(tcp_ao_len(key->ao_key) << 16) |
+				(key->ao_key->sndid << 8) |
+				(rnext_key->rcvid));
+		opts->hash_location = (__u8 *)ptr;
+		ptr += maclen / sizeof(*ptr);
+		if (unlikely(maclen % sizeof(*ptr))) {
+			memset(ptr, TCPOPT_NOP, sizeof(*ptr));
+			ptr++;
+		}
+out_ao:
+#endif
+	}
 	if (unlikely(opts->mss)) {
 		*ptr++ = htonl((TCPOPT_MSS << 24) |
 			       (TCPOLEN_MSS << 16) |
@@ -767,23 +792,25 @@ static void mptcp_set_option_cond(const struct request_sock *req,
  */
 static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 				struct tcp_out_options *opts,
-				struct tcp_md5sig_key **md5)
+				struct tcp_key *key)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned int remaining = MAX_TCP_OPTION_SPACE;
 	struct tcp_fastopen_request *fastopen = tp->fastopen_req;
+	bool timestamps;
 
-	*md5 = NULL;
-#ifdef CONFIG_TCP_MD5SIG
-	if (static_branch_unlikely(&tcp_md5_needed.key) &&
-	    rcu_access_pointer(tp->md5sig_info)) {
-		*md5 = tp->af_specific->md5_lookup(sk, sk);
-		if (*md5) {
-			opts->options |= OPTION_MD5;
-			remaining -= TCPOLEN_MD5SIG_ALIGNED;
+	/* Better than switch (key.type) as it has static branches */
+	if (tcp_key_is_md5(key)) {
+		timestamps = false;
+		opts->options |= OPTION_MD5;
+		remaining -= TCPOLEN_MD5SIG_ALIGNED;
+	} else {
+		timestamps = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps);
+		if (tcp_key_is_ao(key)) {
+			opts->options |= OPTION_AO;
+			remaining -= tcp_ao_len(key->ao_key);
 		}
 	}
-#endif
 
 	/* We always get an MSS option.  The option bytes which will be seen in
 	 * normal data packets should timestamps be used, must be in the MSS
@@ -797,7 +824,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 	opts->mss = tcp_advertise_mss(sk);
 	remaining -= TCPOLEN_MSS_ALIGNED;
 
-	if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps) && !*md5)) {
+	if (likely(timestamps)) {
 		opts->options |= OPTION_TS;
 		opts->tsval = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) + tp->tsoffset;
 		opts->tsecr = tp->rx_opt.ts_recent;
@@ -922,7 +949,7 @@ static unsigned int tcp_synack_options(const struct sock *sk,
  */
 static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,
 					struct tcp_out_options *opts,
-					struct tcp_md5sig_key **md5)
+					struct tcp_key *key)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned int size = 0;
@@ -930,17 +957,14 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
 
 	opts->options = 0;
 
-	*md5 = NULL;
-#ifdef CONFIG_TCP_MD5SIG
-	if (static_branch_unlikely(&tcp_md5_needed.key) &&
-	    rcu_access_pointer(tp->md5sig_info)) {
-		*md5 = tp->af_specific->md5_lookup(sk, sk);
-		if (*md5) {
-			opts->options |= OPTION_MD5;
-			size += TCPOLEN_MD5SIG_ALIGNED;
-		}
+	/* Better than switch (key.type) as it has static branches */
+	if (tcp_key_is_md5(key)) {
+		opts->options |= OPTION_MD5;
+		size += TCPOLEN_MD5SIG_ALIGNED;
+	} else if (tcp_key_is_ao(key)) {
+		opts->options |= OPTION_AO;
+		size += tcp_ao_len(key->ao_key);
 	}
-#endif
 
 	if (likely(tp->rx_opt.tstamp_ok)) {
 		opts->options |= OPTION_TS;
@@ -1245,7 +1269,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 	struct tcp_out_options opts;
 	unsigned int tcp_options_size, tcp_header_size;
 	struct sk_buff *oskb = NULL;
-	struct tcp_md5sig_key *md5;
+	struct tcp_key key;
 	struct tcphdr *th;
 	u64 prior_wstamp;
 	int err;
@@ -1277,11 +1301,11 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 	tcb = TCP_SKB_CB(skb);
 	memset(&opts, 0, sizeof(opts));
 
+	tcp_get_current_key(sk, &key);
 	if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
-		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
+		tcp_options_size = tcp_syn_options(sk, skb, &opts, &key);
 	} else {
-		tcp_options_size = tcp_established_options(sk, skb, &opts,
-							   &md5);
+		tcp_options_size = tcp_established_options(sk, skb, &opts, &key);
 		/* Force a PSH flag on all (GSO) packets to expedite GRO flush
 		 * at receiver : This slightly improve GRO performance.
 		 * Note that we do not force the PSH flag for non GSO packets,
@@ -1362,16 +1386,25 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 		th->window	= htons(min(tp->rcv_wnd, 65535U));
 	}
 
-	tcp_options_write(th, tp, &opts);
+	tcp_options_write(th, tp, &opts, &key);
 
+	if (tcp_key_is_md5(&key)) {
 #ifdef CONFIG_TCP_MD5SIG
-	/* Calculate the MD5 hash, as we have all we need now */
-	if (md5) {
+		/* Calculate the MD5 hash, as we have all we need now */
 		sk_gso_disable(sk);
 		tp->af_specific->calc_md5_hash(opts.hash_location,
-					       md5, sk, skb);
-	}
+					       key.md5_key, sk, skb);
 #endif
+	} else if (tcp_key_is_ao(&key)) {
+		int err;
+
+		err = tcp_ao_transmit_skb(sk, skb, key.ao_key, th,
+					  opts.hash_location);
+		if (err) {
+			kfree_skb_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED);
+			return -ENOMEM;
+		}
+	}
 
 	/* BPF prog is the last one writing header option */
 	bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts);
@@ -1804,7 +1837,7 @@ unsigned int tcp_current_mss(struct sock *sk)
 	u32 mss_now;
 	unsigned int header_len;
 	struct tcp_out_options opts;
-	struct tcp_md5sig_key *md5;
+	struct tcp_key key;
 
 	mss_now = tp->mss_cache;
 
@@ -1813,8 +1846,8 @@ unsigned int tcp_current_mss(struct sock *sk)
 		if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
 			mss_now = tcp_sync_mss(sk, mtu);
 	}
-
-	header_len = tcp_established_options(sk, NULL, &opts, &md5) +
+	tcp_get_current_key(sk, &key);
+	header_len = tcp_established_options(sk, NULL, &opts, &key) +
 		     sizeof(struct tcphdr);
 	/* The mss_cache is sized based on tp->tcp_header_len, which assumes
 	 * some common options. If this is an odd packet (because we have SACK
@@ -3613,6 +3646,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	const struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_md5sig_key *md5 = NULL;
 	struct tcp_out_options opts;
+	struct tcp_key key = {};
 	struct sk_buff *skb;
 	int tcp_header_size;
 	struct tcphdr *th;
@@ -3667,6 +3701,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 #ifdef CONFIG_TCP_MD5SIG
 	rcu_read_lock();
 	md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
+	if (md5)
+		key.type = TCP_KEY_MD5;
 #endif
 	skb_set_hash(skb, READ_ONCE(tcp_rsk(req)->txhash), PKT_HASH_TYPE_L4);
 	/* bpf program will be interested in the tcp_flags */
@@ -3693,7 +3729,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 
 	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
 	th->window = htons(min(req->rsk_rcv_wnd, 65535U));
-	tcp_options_write(th, NULL, &opts);
+	tcp_options_write(th, NULL, &opts, &key);
 	th->doff = (tcp_header_size >> 2);
 	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
 
diff --git a/net/ipv6/tcp_ao.c b/net/ipv6/tcp_ao.c
index 9ab594fadbd9..d08735b6f3c5 100644
--- a/net/ipv6/tcp_ao.c
+++ b/net/ipv6/tcp_ao.c
@@ -7,6 +7,7 @@
  *		Francesco Ruggeri <fruggeri@arista.com>
  *		Salam Noureddine <noureddine@arista.com>
  */
+#include <crypto/hash.h>
 #include <linux/tcp.h>
 
 #include <net/tcp.h>
@@ -79,6 +80,33 @@ struct tcp_ao_key *tcp_v6_ao_lookup(const struct sock *sk,
 	return tcp_v6_ao_do_lookup(sk, addr, sndid, rcvid);
 }
 
+int tcp_v6_ao_hash_pseudoheader(struct tcp_sigpool *hp,
+				const struct in6_addr *daddr,
+				const struct in6_addr *saddr, int nbytes)
+{
+	struct tcp6_pseudohdr *bp;
+	struct scatterlist sg;
+
+	bp = hp->scratch;
+	/* 1. TCP pseudo-header (RFC2460) */
+	bp->saddr = *saddr;
+	bp->daddr = *daddr;
+	bp->len = cpu_to_be32(nbytes);
+	bp->protocol = cpu_to_be32(IPPROTO_TCP);
+
+	sg_init_one(&sg, bp, sizeof(*bp));
+	ahash_request_set_crypt(hp->req, &sg, NULL, sizeof(*bp));
+	return crypto_ahash_update(hp->req);
+}
+
+int tcp_v6_ao_hash_skb(char *ao_hash, struct tcp_ao_key *key,
+		       const struct sock *sk, const struct sk_buff *skb,
+		       const u8 *tkey, int hash_offset, u32 sne)
+{
+	return tcp_ao_hash_skb(AF_INET6, ao_hash, key, sk, skb, tkey,
+			hash_offset, sne);
+}
+
 int tcp_v6_parse_ao(struct sock *sk, int cmd,
 		    sockptr_t optval, int optlen)
 {
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 074e16fe00e0..301b2498d793 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1920,6 +1920,7 @@ static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = {
 #endif
 #ifdef CONFIG_TCP_AO
 	.ao_lookup	=	tcp_v6_ao_lookup,
+	.calc_ao_hash	=	tcp_v6_ao_hash_skb,
 	.ao_parse	=	tcp_v6_parse_ao,
 	.ao_calc_key_sk	=	tcp_v6_ao_calc_key_sk,
 #endif
@@ -1953,6 +1954,7 @@ static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = {
 #endif
 #ifdef CONFIG_TCP_AO
 	.ao_lookup	=	tcp_v6_ao_lookup,
+	.calc_ao_hash	=	tcp_v4_ao_hash_skb,
 	.ao_parse	=	tcp_v6_parse_ao,
 #endif
 };
-- 
cgit v1.2.3-70-g09d2


From f7dca36fc54afa2eb76bff8d0589a2ef18caea91 Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 23 Oct 2023 20:21:59 +0100
Subject: net/tcp: Add tcp_parse_auth_options()

Introduce a helper that:
(1) shares the common code with TCP-MD5 header options parsing
(2) looks for hash signature only once for both TCP-MD5 and TCP-AO
(3) fails with -EEXIST if any TCP sign option is present twice, see
    RFC5925 (2.2):
    ">> A single TCP segment MUST NOT have more than one TCP-AO in its
    options sequence. When multiple TCP-AOs appear, TCP MUST discard
    the segment."

Co-developed-by: Francesco Ruggeri <fruggeri@arista.com>
Signed-off-by: Francesco Ruggeri <fruggeri@arista.com>
Co-developed-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
Acked-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dropreason-core.h |  6 ++++++
 include/net/tcp.h             | 24 +++++++++++++++++++++++-
 include/net/tcp_ao.h          | 17 ++++++++++++++++-
 net/ipv4/tcp.c                |  3 ++-
 net/ipv4/tcp_input.c          | 39 +++++++++++++++++++++++++++++----------
 net/ipv4/tcp_ipv4.c           | 15 ++++++++++-----
 net/ipv6/tcp_ipv6.c           | 11 +++++++----
 7 files changed, 93 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
index 845dce805de7..3af4464a9c5b 100644
--- a/include/net/dropreason-core.h
+++ b/include/net/dropreason-core.h
@@ -20,6 +20,7 @@
 	FN(IP_NOPROTO)			\
 	FN(SOCKET_RCVBUFF)		\
 	FN(PROTO_MEM)			\
+	FN(TCP_AUTH_HDR)		\
 	FN(TCP_MD5NOTFOUND)		\
 	FN(TCP_MD5UNEXPECTED)		\
 	FN(TCP_MD5FAILURE)		\
@@ -142,6 +143,11 @@ enum skb_drop_reason {
 	 * drop out of udp_memory_allocated.
 	 */
 	SKB_DROP_REASON_PROTO_MEM,
+	/**
+	 * @SKB_DROP_REASON_TCP_AUTH_HDR: TCP-MD5 or TCP-AO hashes are met
+	 * twice or set incorrectly.
+	 */
+	SKB_DROP_REASON_TCP_AUTH_HDR,
 	/**
 	 * @SKB_DROP_REASON_TCP_MD5NOTFOUND: no MD5 hash and one expected,
 	 * corresponding to LINUX_MIB_TCPMD5NOTFOUND
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 96e83159f1be..423807ae3e37 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -438,7 +438,6 @@ int tcp_mmap(struct file *file, struct socket *sock,
 void tcp_parse_options(const struct net *net, const struct sk_buff *skb,
 		       struct tcp_options_received *opt_rx,
 		       int estab, struct tcp_fastopen_cookie *foc);
-const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
 
 /*
  *	BPF SKB-less helpers
@@ -2675,6 +2674,29 @@ static inline u64 tcp_transmit_time(const struct sock *sk)
 	return 0;
 }
 
+static inline int tcp_parse_auth_options(const struct tcphdr *th,
+		const u8 **md5_hash, const struct tcp_ao_hdr **aoh)
+{
+	const u8 *md5_tmp, *ao_tmp;
+	int ret;
+
+	ret = tcp_do_parse_auth_options(th, &md5_tmp, &ao_tmp);
+	if (ret)
+		return ret;
+
+	if (md5_hash)
+		*md5_hash = md5_tmp;
+
+	if (aoh) {
+		if (!ao_tmp)
+			*aoh = NULL;
+		else
+			*aoh = (struct tcp_ao_hdr *)(ao_tmp - 2);
+	}
+
+	return 0;
+}
+
 static inline bool tcp_ao_required(struct sock *sk, const void *saddr,
 				   int family)
 {
diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h
index 0b86bc05d8cf..fdd2f5091b98 100644
--- a/include/net/tcp_ao.h
+++ b/include/net/tcp_ao.h
@@ -152,7 +152,9 @@ int tcp_v6_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen);
 void tcp_ao_established(struct sock *sk);
 void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb);
 void tcp_ao_connect_init(struct sock *sk);
-
+void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb,
+		      struct tcp_request_sock *treq,
+		      unsigned short int family);
 #else /* CONFIG_TCP_AO */
 
 static inline int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb,
@@ -185,4 +187,17 @@ static inline void tcp_ao_connect_init(struct sock *sk)
 }
 #endif
 
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
+int tcp_do_parse_auth_options(const struct tcphdr *th,
+			      const u8 **md5_hash, const u8 **ao_hash);
+#else
+static inline int tcp_do_parse_auth_options(const struct tcphdr *th,
+		const u8 **md5_hash, const u8 **ao_hash)
+{
+	*md5_hash = NULL;
+	*ao_hash = NULL;
+	return 0;
+}
+#endif
+
 #endif /* _TCP_AO_H */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b6faee8a1e67..369e2a41bc1b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4398,7 +4398,8 @@ tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
 	l3index = sdif ? dif : 0;
 
 	hash_expected = tcp_md5_do_lookup(sk, l3index, saddr, family);
-	hash_location = tcp_parse_md5sig_option(th);
+	if (tcp_parse_auth_options(th, &hash_location, NULL))
+		return SKB_DROP_REASON_TCP_AUTH_HDR;
 
 	/* We've parsed the options - do we have a hash? */
 	if (!hash_expected && !hash_location)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 6ee0342b5338..fc42b172abf6 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4255,39 +4255,58 @@ static bool tcp_fast_parse_options(const struct net *net,
 	return true;
 }
 
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
 /*
- * Parse MD5 Signature option
+ * Parse Signature options
  */
-const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
+int tcp_do_parse_auth_options(const struct tcphdr *th,
+			      const u8 **md5_hash, const u8 **ao_hash)
 {
 	int length = (th->doff << 2) - sizeof(*th);
 	const u8 *ptr = (const u8 *)(th + 1);
+	unsigned int minlen = TCPOLEN_MD5SIG;
+
+	if (IS_ENABLED(CONFIG_TCP_AO))
+		minlen = sizeof(struct tcp_ao_hdr) + 1;
+
+	*md5_hash = NULL;
+	*ao_hash = NULL;
 
 	/* If not enough data remaining, we can short cut */
-	while (length >= TCPOLEN_MD5SIG) {
+	while (length >= minlen) {
 		int opcode = *ptr++;
 		int opsize;
 
 		switch (opcode) {
 		case TCPOPT_EOL:
-			return NULL;
+			return 0;
 		case TCPOPT_NOP:
 			length--;
 			continue;
 		default:
 			opsize = *ptr++;
 			if (opsize < 2 || opsize > length)
-				return NULL;
-			if (opcode == TCPOPT_MD5SIG)
-				return opsize == TCPOLEN_MD5SIG ? ptr : NULL;
+				return -EINVAL;
+			if (opcode == TCPOPT_MD5SIG) {
+				if (opsize != TCPOLEN_MD5SIG)
+					return -EINVAL;
+				if (unlikely(*md5_hash || *ao_hash))
+					return -EEXIST;
+				*md5_hash = ptr;
+			} else if (opcode == TCPOPT_AO) {
+				if (opsize <= sizeof(struct tcp_ao_hdr))
+					return -EINVAL;
+				if (unlikely(*md5_hash || *ao_hash))
+					return -EEXIST;
+				*ao_hash = ptr;
+			}
 		}
 		ptr += opsize - 2;
 		length -= opsize;
 	}
-	return NULL;
+	return 0;
 }
-EXPORT_SYMBOL(tcp_parse_md5sig_option);
+EXPORT_SYMBOL(tcp_do_parse_auth_options);
 #endif
 
 /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b002f6497d19..83e069d0f778 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -670,7 +670,9 @@ EXPORT_SYMBOL(tcp_v4_send_check);
  *	Exception: precedence violation. We do not implement it in any case.
  */
 
-#ifdef CONFIG_TCP_MD5SIG
+#ifdef CONFIG_TCP_AO
+#define OPTION_BYTES MAX_TCP_OPTION_SPACE
+#elif defined(CONFIG_TCP_MD5SIG)
 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
 #else
 #define OPTION_BYTES sizeof(__be32)
@@ -685,8 +687,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 	} rep;
 	struct ip_reply_arg arg;
 #ifdef CONFIG_TCP_MD5SIG
+	const __u8 *md5_hash_location = NULL;
 	struct tcp_md5sig_key *key = NULL;
-	const __u8 *hash_location = NULL;
 	unsigned char newhash[16];
 	int genhash;
 	struct sock *sk1 = NULL;
@@ -727,8 +729,11 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 
 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 #ifdef CONFIG_TCP_MD5SIG
+	/* Invalid TCP option size or twice included auth */
+	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, NULL))
+		return;
+
 	rcu_read_lock();
-	hash_location = tcp_parse_md5sig_option(th);
 	if (sk && sk_fullsock(sk)) {
 		const union tcp_md5_addr *addr;
 		int l3index;
@@ -739,7 +744,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
-	} else if (hash_location) {
+	} else if (md5_hash_location) {
 		const union tcp_md5_addr *addr;
 		int sdif = tcp_v4_sdif(skb);
 		int dif = inet_iif(skb);
@@ -771,7 +776,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 
 
 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
-		if (genhash || memcmp(hash_location, newhash, 16) != 0)
+		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
 			goto out;
 
 	}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 301b2498d793..5dd016bdf44b 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -990,7 +990,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
 	u32 seq = 0, ack_seq = 0;
 	struct tcp_md5sig_key *key = NULL;
 #ifdef CONFIG_TCP_MD5SIG
-	const __u8 *hash_location = NULL;
+	const __u8 *md5_hash_location = NULL;
 	unsigned char newhash[16];
 	int genhash;
 	struct sock *sk1 = NULL;
@@ -1012,8 +1012,11 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
 
 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 #ifdef CONFIG_TCP_MD5SIG
+	/* Invalid TCP option size or twice included auth */
+	if (tcp_parse_auth_options(th, &md5_hash_location, NULL))
+		return;
+
 	rcu_read_lock();
-	hash_location = tcp_parse_md5sig_option(th);
 	if (sk && sk_fullsock(sk)) {
 		int l3index;
 
@@ -1022,7 +1025,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
 		 */
 		l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0;
 		key = tcp_v6_md5_do_lookup(sk, &ipv6h->saddr, l3index);
-	} else if (hash_location) {
+	} else if (md5_hash_location) {
 		int dif = tcp_v6_iif_l3_slave(skb);
 		int sdif = tcp_v6_sdif(skb);
 		int l3index;
@@ -1051,7 +1054,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
 			goto out;
 
 		genhash = tcp_v6_md5_hash_skb(newhash, key, NULL, skb);
-		if (genhash || memcmp(hash_location, newhash, 16) != 0)
+		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
 			goto out;
 	}
 #endif
-- 
cgit v1.2.3-70-g09d2


From ba7783ad45c8f0fb7a70640f6b6fcdc54ed48412 Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 23 Oct 2023 20:22:00 +0100
Subject: net/tcp: Add AO sign to RST packets

Wire up sending resets to TCP-AO hashing.

Co-developed-by: Francesco Ruggeri <fruggeri@arista.com>
Signed-off-by: Francesco Ruggeri <fruggeri@arista.com>
Co-developed-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
Acked-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h    |   7 +++-
 include/net/tcp_ao.h |  12 ++++++
 net/ipv4/tcp_ao.c    | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 net/ipv4/tcp_ipv4.c  |  69 +++++++++++++++++++++++++++-------
 net/ipv6/tcp_ipv6.c  |  98 +++++++++++++++++++++++++++++++++++--------------
 5 files changed, 245 insertions(+), 43 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 423807ae3e37..17b3ecd2760f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2258,7 +2258,12 @@ static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
 
 struct tcp_key {
 	union {
-		struct tcp_ao_key *ao_key;
+		struct {
+			struct tcp_ao_key *ao_key;
+			char *traffic_key;
+			u32 sne;
+			u8 rcv_next;
+		};
 		struct tcp_md5sig_key *md5_key;
 	};
 	enum {
diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h
index fdd2f5091b98..629ab0365b83 100644
--- a/include/net/tcp_ao.h
+++ b/include/net/tcp_ao.h
@@ -120,12 +120,24 @@ int tcp_ao_hash_skb(unsigned short int family,
 		    const u8 *tkey, int hash_offset, u32 sne);
 int tcp_parse_ao(struct sock *sk, int cmd, unsigned short int family,
 		 sockptr_t optval, int optlen);
+struct tcp_ao_key *tcp_ao_established_key(struct tcp_ao_info *ao,
+					  int sndid, int rcvid);
 int tcp_ao_calc_traffic_key(struct tcp_ao_key *mkt, u8 *key, void *ctx,
 			    unsigned int len, struct tcp_sigpool *hp);
 void tcp_ao_destroy_sock(struct sock *sk);
 struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk,
 				    const union tcp_ao_addr *addr,
 				    int family, int sndid, int rcvid);
+int tcp_ao_hash_hdr(unsigned short family, char *ao_hash,
+		    struct tcp_ao_key *key, const u8 *tkey,
+		    const union tcp_ao_addr *daddr,
+		    const union tcp_ao_addr *saddr,
+		    const struct tcphdr *th, u32 sne);
+int tcp_ao_prepare_reset(const struct sock *sk, struct sk_buff *skb,
+			 const struct tcp_ao_hdr *aoh, int l3index,
+			 struct tcp_ao_key **key, char **traffic_key,
+			 bool *allocated_traffic_key, u8 *keyid, u32 *sne);
+
 /* ipv4 specific functions */
 int tcp_v4_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen);
 struct tcp_ao_key *tcp_v4_ao_lookup(const struct sock *sk, struct sock *addr_sk,
diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c
index 007f29a2531f..b8afe78ff057 100644
--- a/net/ipv4/tcp_ao.c
+++ b/net/ipv4/tcp_ao.c
@@ -48,8 +48,8 @@ clear_hash:
  * it's known that the keys in ao_info are matching peer's
  * family/address/VRF/etc.
  */
-static struct tcp_ao_key *tcp_ao_established_key(struct tcp_ao_info *ao,
-						 int sndid, int rcvid)
+struct tcp_ao_key *tcp_ao_established_key(struct tcp_ao_info *ao,
+					  int sndid, int rcvid)
 {
 	struct tcp_ao_key *key;
 
@@ -369,6 +369,66 @@ static int tcp_ao_hash_header(struct tcp_sigpool *hp,
 	return err;
 }
 
+int tcp_ao_hash_hdr(unsigned short int family, char *ao_hash,
+		    struct tcp_ao_key *key, const u8 *tkey,
+		    const union tcp_ao_addr *daddr,
+		    const union tcp_ao_addr *saddr,
+		    const struct tcphdr *th, u32 sne)
+{
+	int tkey_len = tcp_ao_digest_size(key);
+	int hash_offset = ao_hash - (char *)th;
+	struct tcp_sigpool hp;
+	void *hash_buf = NULL;
+
+	hash_buf = kmalloc(tkey_len, GFP_ATOMIC);
+	if (!hash_buf)
+		goto clear_hash_noput;
+
+	if (tcp_sigpool_start(key->tcp_sigpool_id, &hp))
+		goto clear_hash_noput;
+
+	if (crypto_ahash_setkey(crypto_ahash_reqtfm(hp.req), tkey, tkey_len))
+		goto clear_hash;
+
+	if (crypto_ahash_init(hp.req))
+		goto clear_hash;
+
+	if (tcp_ao_hash_sne(&hp, sne))
+		goto clear_hash;
+	if (family == AF_INET) {
+		if (tcp_v4_ao_hash_pseudoheader(&hp, daddr->a4.s_addr,
+						saddr->a4.s_addr, th->doff * 4))
+			goto clear_hash;
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (family == AF_INET6) {
+		if (tcp_v6_ao_hash_pseudoheader(&hp, &daddr->a6,
+						&saddr->a6, th->doff * 4))
+			goto clear_hash;
+#endif
+	} else {
+		WARN_ON_ONCE(1);
+		goto clear_hash;
+	}
+	if (tcp_ao_hash_header(&hp, th, false,
+			       ao_hash, hash_offset, tcp_ao_maclen(key)))
+		goto clear_hash;
+	ahash_request_set_crypt(hp.req, NULL, hash_buf, 0);
+	if (crypto_ahash_final(hp.req))
+		goto clear_hash;
+
+	memcpy(ao_hash, hash_buf, tcp_ao_maclen(key));
+	tcp_sigpool_end(&hp);
+	kfree(hash_buf);
+	return 0;
+
+clear_hash:
+	tcp_sigpool_end(&hp);
+clear_hash_noput:
+	memset(ao_hash, 0, tcp_ao_maclen(key));
+	kfree(hash_buf);
+	return 1;
+}
+
 int tcp_ao_hash_skb(unsigned short int family,
 		    char *ao_hash, struct tcp_ao_key *key,
 		    const struct sock *sk, const struct sk_buff *skb,
@@ -435,6 +495,44 @@ struct tcp_ao_key *tcp_v4_ao_lookup(const struct sock *sk, struct sock *addr_sk,
 	return tcp_ao_do_lookup(sk, addr, AF_INET, sndid, rcvid);
 }
 
+int tcp_ao_prepare_reset(const struct sock *sk, struct sk_buff *skb,
+			 const struct tcp_ao_hdr *aoh, int l3index,
+			 struct tcp_ao_key **key, char **traffic_key,
+			 bool *allocated_traffic_key, u8 *keyid, u32 *sne)
+{
+	struct tcp_ao_key *rnext_key;
+	struct tcp_ao_info *ao_info;
+
+	*allocated_traffic_key = false;
+	/* If there's no socket - than initial sisn/disn are unknown.
+	 * Drop the segment. RFC5925 (7.7) advises to require graceful
+	 * restart [RFC4724]. Alternatively, the RFC5925 advises to
+	 * save/restore traffic keys before/after reboot.
+	 * Linux TCP-AO support provides TCP_AO_ADD_KEY and TCP_AO_REPAIR
+	 * options to restore a socket post-reboot.
+	 */
+	if (!sk)
+		return -ENOTCONN;
+
+	if ((1 << sk->sk_state) &
+	    (TCPF_LISTEN | TCPF_NEW_SYN_RECV | TCPF_TIME_WAIT))
+		return -1;
+
+	ao_info = rcu_dereference(tcp_sk(sk)->ao_info);
+	if (!ao_info)
+		return -ENOENT;
+
+	*key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1);
+	if (!*key)
+		return -ENOENT;
+	*traffic_key = snd_other_key(*key);
+	rnext_key = READ_ONCE(ao_info->rnext_key);
+	*keyid = rnext_key->rcvid;
+	*sne = 0;
+
+	return 0;
+}
+
 int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb,
 			struct tcp_ao_key *key, struct tcphdr *th,
 			__u8 *hash_location)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 83e069d0f778..71e1cbb0020b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -657,6 +657,52 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(tcp_v4_send_check);
 
+#define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))
+
+static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
+				 const struct tcp_ao_hdr *aoh,
+				 struct ip_reply_arg *arg, struct tcphdr *reply,
+				 __be32 reply_options[REPLY_OPTIONS_LEN])
+{
+#ifdef CONFIG_TCP_AO
+	int sdif = tcp_v4_sdif(skb);
+	int dif = inet_iif(skb);
+	int l3index = sdif ? dif : 0;
+	bool allocated_traffic_key;
+	struct tcp_ao_key *key;
+	char *traffic_key;
+	bool drop = true;
+	u32 ao_sne = 0;
+	u8 keyid;
+
+	rcu_read_lock();
+	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index,
+				 &key, &traffic_key, &allocated_traffic_key,
+				 &keyid, &ao_sne))
+		goto out;
+
+	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
+				 (aoh->rnext_keyid << 8) | keyid);
+	arg->iov[0].iov_len += round_up(tcp_ao_len(key), 4);
+	reply->doff = arg->iov[0].iov_len / 4;
+
+	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
+			    key, traffic_key,
+			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
+			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
+			    reply, ao_sne))
+		goto out;
+	drop = false;
+out:
+	rcu_read_unlock();
+	if (allocated_traffic_key)
+		kfree(traffic_key);
+	return drop;
+#else
+	return true;
+#endif
+}
+
 /*
  *	This routine will send an RST to the other tcp.
  *
@@ -670,28 +716,21 @@ EXPORT_SYMBOL(tcp_v4_send_check);
  *	Exception: precedence violation. We do not implement it in any case.
  */
 
-#ifdef CONFIG_TCP_AO
-#define OPTION_BYTES MAX_TCP_OPTION_SPACE
-#elif defined(CONFIG_TCP_MD5SIG)
-#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
-#else
-#define OPTION_BYTES sizeof(__be32)
-#endif
-
 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 {
 	const struct tcphdr *th = tcp_hdr(skb);
 	struct {
 		struct tcphdr th;
-		__be32 opt[OPTION_BYTES / sizeof(__be32)];
+		__be32 opt[REPLY_OPTIONS_LEN];
 	} rep;
+	const __u8 *md5_hash_location = NULL;
+	const struct tcp_ao_hdr *aoh;
 	struct ip_reply_arg arg;
 #ifdef CONFIG_TCP_MD5SIG
-	const __u8 *md5_hash_location = NULL;
 	struct tcp_md5sig_key *key = NULL;
 	unsigned char newhash[16];
-	int genhash;
 	struct sock *sk1 = NULL;
+	int genhash;
 #endif
 	u64 transmit_time = 0;
 	struct sock *ctl_sk;
@@ -728,11 +767,15 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 	arg.iov[0].iov_len  = sizeof(rep.th);
 
 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
-#ifdef CONFIG_TCP_MD5SIG
+
 	/* Invalid TCP option size or twice included auth */
-	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, NULL))
+	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
 		return;
 
+	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
+		return;
+
+#ifdef CONFIG_TCP_MD5SIG
 	rcu_read_lock();
 	if (sk && sk_fullsock(sk)) {
 		const union tcp_md5_addr *addr;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 5dd016bdf44b..45b81e230438 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -854,8 +854,8 @@ const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
 
 static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq,
 				 u32 ack, u32 win, u32 tsval, u32 tsecr,
-				 int oif, struct tcp_md5sig_key *key, int rst,
-				 u8 tclass, __be32 label, u32 priority, u32 txhash)
+				 int oif, int rst, u8 tclass, __be32 label,
+				 u32 priority, u32 txhash, struct tcp_key *key)
 {
 	const struct tcphdr *th = tcp_hdr(skb);
 	struct tcphdr *t1;
@@ -870,13 +870,13 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
 
 	if (tsecr)
 		tot_len += TCPOLEN_TSTAMP_ALIGNED;
-#ifdef CONFIG_TCP_MD5SIG
-	if (key)
+	if (tcp_key_is_md5(key))
 		tot_len += TCPOLEN_MD5SIG_ALIGNED;
-#endif
+	if (tcp_key_is_ao(key))
+		tot_len += tcp_ao_len(key->ao_key);
 
 #ifdef CONFIG_MPTCP
-	if (rst && !key) {
+	if (rst && !tcp_key_is_md5(key)) {
 		mrst = mptcp_reset_option(skb);
 
 		if (mrst)
@@ -917,14 +917,28 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
 		*topt++ = mrst;
 
 #ifdef CONFIG_TCP_MD5SIG
-	if (key) {
+	if (tcp_key_is_md5(key)) {
 		*topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 				(TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
-		tcp_v6_md5_hash_hdr((__u8 *)topt, key,
+		tcp_v6_md5_hash_hdr((__u8 *)topt, key->md5_key,
 				    &ipv6_hdr(skb)->saddr,
 				    &ipv6_hdr(skb)->daddr, t1);
 	}
 #endif
+#ifdef CONFIG_TCP_AO
+	if (tcp_key_is_ao(key)) {
+		*topt++ = htonl((TCPOPT_AO << 24) |
+				(tcp_ao_len(key->ao_key) << 16) |
+				(key->ao_key->sndid << 8) |
+				(key->rcv_next));
+
+		tcp_ao_hash_hdr(AF_INET6, (char *)topt, key->ao_key,
+				key->traffic_key,
+				(union tcp_ao_addr *)&ipv6_hdr(skb)->saddr,
+				(union tcp_ao_addr *)&ipv6_hdr(skb)->daddr,
+				t1, key->sne);
+	}
+#endif
 
 	memset(&fl6, 0, sizeof(fl6));
 	fl6.daddr = ipv6_hdr(skb)->saddr;
@@ -987,19 +1001,23 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
 {
 	const struct tcphdr *th = tcp_hdr(skb);
 	struct ipv6hdr *ipv6h = ipv6_hdr(skb);
-	u32 seq = 0, ack_seq = 0;
-	struct tcp_md5sig_key *key = NULL;
-#ifdef CONFIG_TCP_MD5SIG
 	const __u8 *md5_hash_location = NULL;
-	unsigned char newhash[16];
-	int genhash;
-	struct sock *sk1 = NULL;
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
+	bool allocated_traffic_key = false;
 #endif
+	const struct tcp_ao_hdr *aoh;
+	struct tcp_key key = {};
+	u32 seq = 0, ack_seq = 0;
 	__be32 label = 0;
 	u32 priority = 0;
 	struct net *net;
 	u32 txhash = 0;
 	int oif = 0;
+#ifdef CONFIG_TCP_MD5SIG
+	unsigned char newhash[16];
+	int genhash;
+	struct sock *sk1 = NULL;
+#endif
 
 	if (th->rst)
 		return;
@@ -1011,12 +1029,13 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
 		return;
 
 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
-#ifdef CONFIG_TCP_MD5SIG
 	/* Invalid TCP option size or twice included auth */
-	if (tcp_parse_auth_options(th, &md5_hash_location, NULL))
+	if (tcp_parse_auth_options(th, &md5_hash_location, &aoh))
 		return;
-
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
 	rcu_read_lock();
+#endif
+#ifdef CONFIG_TCP_MD5SIG
 	if (sk && sk_fullsock(sk)) {
 		int l3index;
 
@@ -1024,7 +1043,9 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
 		 * in an L3 domain and inet_iif is set to it.
 		 */
 		l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0;
-		key = tcp_v6_md5_do_lookup(sk, &ipv6h->saddr, l3index);
+		key.md5_key = tcp_v6_md5_do_lookup(sk, &ipv6h->saddr, l3index);
+		if (key.md5_key)
+			key.type = TCP_KEY_MD5;
 	} else if (md5_hash_location) {
 		int dif = tcp_v6_iif_l3_slave(skb);
 		int sdif = tcp_v6_sdif(skb);
@@ -1049,11 +1070,12 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
 		 */
 		l3index = tcp_v6_sdif(skb) ? dif : 0;
 
-		key = tcp_v6_md5_do_lookup(sk1, &ipv6h->saddr, l3index);
-		if (!key)
+		key.md5_key = tcp_v6_md5_do_lookup(sk1, &ipv6h->saddr, l3index);
+		if (!key.md5_key)
 			goto out;
+		key.type = TCP_KEY_MD5;
 
-		genhash = tcp_v6_md5_hash_skb(newhash, key, NULL, skb);
+		genhash = tcp_v6_md5_hash_skb(newhash, key.md5_key, NULL, skb);
 		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
 			goto out;
 	}
@@ -1065,6 +1087,20 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
 		ack_seq = ntohl(th->seq) + th->syn + th->fin + skb->len -
 			  (th->doff << 2);
 
+#ifdef CONFIG_TCP_AO
+	if (aoh) {
+		int l3index;
+
+		l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0;
+		if (tcp_ao_prepare_reset(sk, skb, aoh, l3index,
+					 &key.ao_key, &key.traffic_key,
+					 &allocated_traffic_key,
+					 &key.rcv_next, &key.sne))
+			goto out;
+		key.type = TCP_KEY_AO;
+	}
+#endif
+
 	if (sk) {
 		oif = sk->sk_bound_dev_if;
 		if (sk_fullsock(sk)) {
@@ -1084,22 +1120,30 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
 			label = ip6_flowlabel(ipv6h);
 	}
 
-	tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1,
-			     ipv6_get_dsfield(ipv6h), label, priority, txhash);
+	tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1,
+			     ipv6_get_dsfield(ipv6h), label, priority, txhash,
+			     &key);
 
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
 out:
+	if (allocated_traffic_key)
+		kfree(key.traffic_key);
 	rcu_read_unlock();
 #endif
 }
 
 static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq,
 			    u32 ack, u32 win, u32 tsval, u32 tsecr, int oif,
-			    struct tcp_md5sig_key *key, u8 tclass,
+			    struct tcp_md5sig_key *md5_key, u8 tclass,
 			    __be32 label, u32 priority, u32 txhash)
 {
-	tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, key, 0,
-			     tclass, label, priority, txhash);
+	struct tcp_key key = {
+				.md5_key = md5_key,
+				.type = md5_key ? TCP_KEY_MD5 : TCP_KEY_NONE,
+	};
+
+	tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, 0,
+			     tclass, label, priority, txhash, &key);
 }
 
 static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
-- 
cgit v1.2.3-70-g09d2


From decde2586b34b99684faff1eab41e5c496c27fb6 Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 23 Oct 2023 20:22:01 +0100
Subject: net/tcp: Add TCP-AO sign to twsk

Add support for sockets in time-wait state.
ao_info as well as all keys are inherited on transition to time-wait
socket. The lifetime of ao_info is now protected by ref counter, so
that tcp_ao_destroy_sock() will destruct it only when the last user is
gone.

Co-developed-by: Francesco Ruggeri <fruggeri@arista.com>
Signed-off-by: Francesco Ruggeri <fruggeri@arista.com>
Co-developed-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
Acked-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      |  3 ++
 include/net/tcp_ao.h     | 11 ++++--
 net/ipv4/tcp_ao.c        | 49 +++++++++++++++++++++-----
 net/ipv4/tcp_ipv4.c      | 92 ++++++++++++++++++++++++++++++++++++++----------
 net/ipv4/tcp_minisocks.c |  4 ++-
 net/ipv4/tcp_output.c    |  2 +-
 net/ipv6/tcp_ipv6.c      | 72 +++++++++++++++++++++++++++----------
 7 files changed, 183 insertions(+), 50 deletions(-)

(limited to 'include')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 64e7b560fa79..eec6e7e5312e 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -514,6 +514,9 @@ struct tcp_timewait_sock {
 #ifdef CONFIG_TCP_MD5SIG
 	struct tcp_md5sig_key	  *tw_md5_key;
 #endif
+#ifdef CONFIG_TCP_AO
+	struct tcp_ao_info	__rcu *ao_info;
+#endif
 };
 
 static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h
index 629ab0365b83..971d7edcda9c 100644
--- a/include/net/tcp_ao.h
+++ b/include/net/tcp_ao.h
@@ -85,6 +85,7 @@ struct tcp_ao_info {
 				__unused	:31;
 	__be32			lisn;
 	__be32			risn;
+	refcount_t		refcnt;		/* Protects twsk destruction */
 	struct rcu_head		rcu;
 };
 
@@ -124,7 +125,8 @@ struct tcp_ao_key *tcp_ao_established_key(struct tcp_ao_info *ao,
 					  int sndid, int rcvid);
 int tcp_ao_calc_traffic_key(struct tcp_ao_key *mkt, u8 *key, void *ctx,
 			    unsigned int len, struct tcp_sigpool *hp);
-void tcp_ao_destroy_sock(struct sock *sk);
+void tcp_ao_destroy_sock(struct sock *sk, bool twsk);
+void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp);
 struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk,
 				    const union tcp_ao_addr *addr,
 				    int family, int sndid, int rcvid);
@@ -182,7 +184,7 @@ static inline struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk,
 	return NULL;
 }
 
-static inline void tcp_ao_destroy_sock(struct sock *sk)
+static inline void tcp_ao_destroy_sock(struct sock *sk, bool twsk)
 {
 }
 
@@ -194,6 +196,11 @@ static inline void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb)
 {
 }
 
+static inline void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw,
+				    struct tcp_sock *tp)
+{
+}
+
 static inline void tcp_ao_connect_init(struct sock *sk)
 {
 }
diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c
index b8afe78ff057..7c4e2f42845a 100644
--- a/net/ipv4/tcp_ao.c
+++ b/net/ipv4/tcp_ao.c
@@ -159,6 +159,7 @@ static struct tcp_ao_info *tcp_ao_alloc_info(gfp_t flags)
 	if (!ao)
 		return NULL;
 	INIT_HLIST_HEAD(&ao->head);
+	refcount_set(&ao->refcnt, 1);
 
 	return ao;
 }
@@ -176,27 +177,54 @@ static void tcp_ao_key_free_rcu(struct rcu_head *head)
 	kfree_sensitive(key);
 }
 
-void tcp_ao_destroy_sock(struct sock *sk)
+void tcp_ao_destroy_sock(struct sock *sk, bool twsk)
 {
 	struct tcp_ao_info *ao;
 	struct tcp_ao_key *key;
 	struct hlist_node *n;
 
-	ao = rcu_dereference_protected(tcp_sk(sk)->ao_info, 1);
-	tcp_sk(sk)->ao_info = NULL;
+	if (twsk) {
+		ao = rcu_dereference_protected(tcp_twsk(sk)->ao_info, 1);
+		tcp_twsk(sk)->ao_info = NULL;
+	} else {
+		ao = rcu_dereference_protected(tcp_sk(sk)->ao_info, 1);
+		tcp_sk(sk)->ao_info = NULL;
+	}
 
-	if (!ao)
+	if (!ao || !refcount_dec_and_test(&ao->refcnt))
 		return;
 
 	hlist_for_each_entry_safe(key, n, &ao->head, node) {
 		hlist_del_rcu(&key->node);
-		atomic_sub(tcp_ao_sizeof_key(key), &sk->sk_omem_alloc);
+		if (!twsk)
+			atomic_sub(tcp_ao_sizeof_key(key), &sk->sk_omem_alloc);
 		call_rcu(&key->rcu, tcp_ao_key_free_rcu);
 	}
 
 	kfree_rcu(ao, rcu);
 }
 
+void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp)
+{
+	struct tcp_ao_info *ao_info = rcu_dereference_protected(tp->ao_info, 1);
+
+	if (ao_info) {
+		struct tcp_ao_key *key;
+		struct hlist_node *n;
+		int omem = 0;
+
+		hlist_for_each_entry_safe(key, n, &ao_info->head, node) {
+			omem += tcp_ao_sizeof_key(key);
+		}
+
+		refcount_inc(&ao_info->refcnt);
+		atomic_sub(omem, &(((struct sock *)tp)->sk_omem_alloc));
+		rcu_assign_pointer(tcptw->ao_info, ao_info);
+	} else {
+		tcptw->ao_info = NULL;
+	}
+}
+
 /* 4 tuple and ISNs are expected in NBO */
 static int tcp_v4_ao_calc_key(struct tcp_ao_key *mkt, u8 *key,
 			      __be32 saddr, __be32 daddr,
@@ -514,11 +542,13 @@ int tcp_ao_prepare_reset(const struct sock *sk, struct sk_buff *skb,
 	if (!sk)
 		return -ENOTCONN;
 
-	if ((1 << sk->sk_state) &
-	    (TCPF_LISTEN | TCPF_NEW_SYN_RECV | TCPF_TIME_WAIT))
+	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV)) {
 		return -1;
 
-	ao_info = rcu_dereference(tcp_sk(sk)->ao_info);
+	if (sk->sk_state == TCP_TIME_WAIT)
+		ao_info = rcu_dereference(tcp_twsk(sk)->ao_info);
+	else
+		ao_info = rcu_dereference(tcp_sk(sk)->ao_info);
 	if (!ao_info)
 		return -ENOENT;
 
@@ -910,6 +940,9 @@ static struct tcp_ao_info *setsockopt_ao_info(struct sock *sk)
 	if (sk_fullsock(sk)) {
 		return rcu_dereference_protected(tcp_sk(sk)->ao_info,
 						 lockdep_sock_is_held(sk));
+	} else if (sk->sk_state == TCP_TIME_WAIT) {
+		return rcu_dereference_protected(tcp_twsk(sk)->ao_info,
+						 lockdep_sock_is_held(sk));
 	}
 	return ERR_PTR(-ESOCKTNOSUPPORT);
 }
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 71e1cbb0020b..a78112d78d06 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -911,17 +911,13 @@ out:
 static void tcp_v4_send_ack(const struct sock *sk,
 			    struct sk_buff *skb, u32 seq, u32 ack,
 			    u32 win, u32 tsval, u32 tsecr, int oif,
-			    struct tcp_md5sig_key *key,
+			    struct tcp_key *key,
 			    int reply_flags, u8 tos, u32 txhash)
 {
 	const struct tcphdr *th = tcp_hdr(skb);
 	struct {
 		struct tcphdr th;
-		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
-#ifdef CONFIG_TCP_MD5SIG
-			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
-#endif
-			];
+		__be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
 	} rep;
 	struct net *net = sock_net(sk);
 	struct ip_reply_arg arg;
@@ -952,7 +948,7 @@ static void tcp_v4_send_ack(const struct sock *sk,
 	rep.th.window  = htons(win);
 
 #ifdef CONFIG_TCP_MD5SIG
-	if (key) {
+	if (tcp_key_is_md5(key)) {
 		int offset = (tsecr) ? 3 : 0;
 
 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
@@ -963,9 +959,27 @@ static void tcp_v4_send_ack(const struct sock *sk,
 		rep.th.doff = arg.iov[0].iov_len/4;
 
 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
-				    key, ip_hdr(skb)->saddr,
+				    key->md5_key, ip_hdr(skb)->saddr,
 				    ip_hdr(skb)->daddr, &rep.th);
 	}
+#endif
+#ifdef CONFIG_TCP_AO
+	if (tcp_key_is_ao(key)) {
+		int offset = (tsecr) ? 3 : 0;
+
+		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
+					  (tcp_ao_len(key->ao_key) << 16) |
+					  (key->ao_key->sndid << 8) |
+					  key->rcv_next);
+		arg.iov[0].iov_len += round_up(tcp_ao_len(key->ao_key), 4);
+		rep.th.doff = arg.iov[0].iov_len / 4;
+
+		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
+				key->ao_key, key->traffic_key,
+				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
+				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
+				&rep.th, key->sne);
+	}
 #endif
 	arg.flags = reply_flags;
 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
@@ -999,18 +1013,50 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 {
 	struct inet_timewait_sock *tw = inet_twsk(sk);
 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
+	struct tcp_key key = {};
+#ifdef CONFIG_TCP_AO
+	struct tcp_ao_info *ao_info;
+
+	/* FIXME: the segment to-be-acked is not verified yet */
+	ao_info = rcu_dereference(tcptw->ao_info);
+	if (ao_info) {
+		const struct tcp_ao_hdr *aoh;
+
+		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
+			inet_twsk_put(tw);
+			return;
+		}
+
+		if (aoh)
+			key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1);
+	}
+	if (key.ao_key) {
+		struct tcp_ao_key *rnext_key;
+
+		key.traffic_key = snd_other_key(key.ao_key);
+		rnext_key = READ_ONCE(ao_info->rnext_key);
+		key.rcv_next = rnext_key->rcvid;
+		key.type = TCP_KEY_AO;
+#else
+	if (0) {
+#endif
+#ifdef CONFIG_TCP_MD5SIG
+	} else if (static_branch_unlikely(&tcp_md5_needed.key)) {
+		key.md5_key = tcp_twsk_md5_key(tcptw);
+		if (key.md5_key)
+			key.type = TCP_KEY_MD5;
+#endif
+	}
 
 	tcp_v4_send_ack(sk, skb,
 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 			tcp_tw_tsval(tcptw),
 			tcptw->tw_ts_recent,
-			tw->tw_bound_dev_if,
-			tcp_twsk_md5_key(tcptw),
+			tw->tw_bound_dev_if, &key,
 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 			tw->tw_tos,
-			tw->tw_txhash
-			);
+			tw->tw_txhash);
 
 	inet_twsk_put(tw);
 }
@@ -1018,8 +1064,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 				  struct request_sock *req)
 {
-	const union tcp_md5_addr *addr;
-	int l3index;
+	struct tcp_key key = {};
 
 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
@@ -1032,15 +1077,24 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 	 * exception of <SYN> segments, MUST be right-shifted by
 	 * Rcv.Wind.Shift bits:
 	 */
-	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
-	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
+#ifdef CONFIG_TCP_MD5SIG
+	if (static_branch_unlikely(&tcp_md5_needed.key)) {
+		const union tcp_md5_addr *addr;
+		int l3index;
+
+		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
+		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
+		if (key.md5_key)
+			key.type = TCP_KEY_MD5;
+	}
+#endif
 	tcp_v4_send_ack(sk, skb, seq,
 			tcp_rsk(req)->rcv_nxt,
 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 			tcp_rsk_tsval(tcp_rsk(req)),
 			READ_ONCE(req->ts_recent),
-			0,
-			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
+			0, &key,
 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 			ip_hdr(skb)->tos,
 			READ_ONCE(tcp_rsk(req)->txhash));
@@ -2404,7 +2458,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
 		rcu_assign_pointer(tp->md5sig_info, NULL);
 	}
 #endif
-	tcp_ao_destroy_sock(sk);
+	tcp_ao_destroy_sock(sk, false);
 
 	/* Clean up a referenced TCP bind bucket. */
 	if (inet_csk(sk)->icsk_bind_hash)
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 3dcb3fc36e64..6810cf65a322 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -279,7 +279,7 @@ out_free:
 void tcp_time_wait(struct sock *sk, int state, int timeo)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
-	const struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
 	struct net *net = sock_net(sk);
 	struct inet_timewait_sock *tw;
 
@@ -316,6 +316,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 #endif
 
 		tcp_time_wait_init(sk, tcptw);
+		tcp_ao_time_wait(tcptw, tp);
 
 		/* Get the TIME_WAIT timeout firing. */
 		if (timeo < rto)
@@ -370,6 +371,7 @@ void tcp_twsk_destructor(struct sock *sk)
 			call_rcu(&twsk->tw_md5_key->rcu, tcp_md5_twsk_free_rcu);
 	}
 #endif
+	tcp_ao_destroy_sock(sk, true);
 }
 EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index fa3a630f3629..de546313a214 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3997,7 +3997,7 @@ int tcp_connect(struct sock *sk)
 		 * then free up ao_info if allocated.
 		 */
 		if (needs_md5) {
-			tcp_ao_destroy_sock(sk);
+			tcp_ao_destroy_sock(sk, false);
 		} else if (needs_ao) {
 			tcp_clear_md5_list(sk);
 			kfree(rcu_replace_pointer(tp->md5sig_info, NULL,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 45b81e230438..e6c3617976f0 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -778,13 +778,6 @@ clear_hash_nostart:
 	memset(md5_hash, 0, 16);
 	return 1;
 }
-#else /* CONFIG_TCP_MD5SIG */
-static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk,
-						   const struct in6_addr *addr,
-						   int l3index)
-{
-	return NULL;
-}
 #endif
 
 static void tcp_v6_init_req(struct request_sock *req,
@@ -1134,39 +1127,81 @@ out:
 
 static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq,
 			    u32 ack, u32 win, u32 tsval, u32 tsecr, int oif,
-			    struct tcp_md5sig_key *md5_key, u8 tclass,
+			    struct tcp_key *key, u8 tclass,
 			    __be32 label, u32 priority, u32 txhash)
 {
-	struct tcp_key key = {
-				.md5_key = md5_key,
-				.type = md5_key ? TCP_KEY_MD5 : TCP_KEY_NONE,
-	};
-
 	tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, 0,
-			     tclass, label, priority, txhash, &key);
+			     tclass, label, priority, txhash, key);
 }
 
 static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
 {
 	struct inet_timewait_sock *tw = inet_twsk(sk);
 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
+	struct tcp_key key = {};
+#ifdef CONFIG_TCP_AO
+	struct tcp_ao_info *ao_info;
+
+	/* FIXME: the segment to-be-acked is not verified yet */
+	ao_info = rcu_dereference(tcptw->ao_info);
+	if (ao_info) {
+		const struct tcp_ao_hdr *aoh;
+
+		/* Invalid TCP option size or twice included auth */
+		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
+			goto out;
+		if (aoh) {
+			key.ao_key = tcp_ao_established_key(ao_info,
+							    aoh->rnext_keyid, -1);
+		}
+	}
+	if (key.ao_key) {
+		struct tcp_ao_key *rnext_key;
+
+		key.traffic_key = snd_other_key(key.ao_key);
+		/* rcv_next switches to our rcv_next */
+		rnext_key = READ_ONCE(ao_info->rnext_key);
+		key.rcv_next = rnext_key->rcvid;
+		key.type = TCP_KEY_AO;
+#else
+	if (0) {
+#endif
+#ifdef CONFIG_TCP_MD5SIG
+	} else if (static_branch_unlikely(&tcp_md5_needed.key)) {
+		key.md5_key = tcp_twsk_md5_key(tcptw);
+		if (key.md5_key)
+			key.type = TCP_KEY_MD5;
+#endif
+	}
 
 	tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 			tcp_tw_tsval(tcptw),
-			tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw),
+			tcptw->tw_ts_recent, tw->tw_bound_dev_if, &key,
 			tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), tw->tw_priority,
 			tw->tw_txhash);
 
+#ifdef CONFIG_TCP_AO
+out:
+#endif
 	inet_twsk_put(tw);
 }
 
 static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 				  struct request_sock *req)
 {
-	int l3index;
+	struct tcp_key key = {};
+
+#ifdef CONFIG_TCP_MD5SIG
+	if (static_branch_unlikely(&tcp_md5_needed.key)) {
+		int l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0;
 
-	l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0;
+		key.md5_key = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr,
+						   l3index);
+		if (key.md5_key)
+			key.type = TCP_KEY_MD5;
+	}
+#endif
 
 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
@@ -1182,8 +1217,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 			tcp_rsk_tsval(tcp_rsk(req)),
 			READ_ONCE(req->ts_recent), sk->sk_bound_dev_if,
-			tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr, l3index),
-			ipv6_get_dsfield(ipv6_hdr(skb)), 0,
+			&key, ipv6_get_dsfield(ipv6_hdr(skb)), 0,
 			READ_ONCE(sk->sk_priority),
 			READ_ONCE(tcp_rsk(req)->txhash));
 }
-- 
cgit v1.2.3-70-g09d2


From 06b22ef29591f625ef877ae00d82192938e29e60 Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 23 Oct 2023 20:22:02 +0100
Subject: net/tcp: Wire TCP-AO to request sockets

Now when the new request socket is created from the listening socket,
it's recorded what MKT was used by the peer. tcp_rsk_used_ao() is
a new helper for checking if TCP-AO option was used to create the
request socket.
tcp_ao_copy_all_matching() will copy all keys that match the peer on the
request socket, as well as preparing them for the usage (creating
traffic keys).

Co-developed-by: Francesco Ruggeri <fruggeri@arista.com>
Signed-off-by: Francesco Ruggeri <fruggeri@arista.com>
Co-developed-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
Acked-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      |  18 ++++
 include/net/tcp.h        |   6 ++
 include/net/tcp_ao.h     |  24 +++++
 net/ipv4/syncookies.c    |   2 +
 net/ipv4/tcp_ao.c        | 264 ++++++++++++++++++++++++++++++++++++++++++++---
 net/ipv4/tcp_input.c     |  15 +++
 net/ipv4/tcp_ipv4.c      |  66 ++++++++++--
 net/ipv4/tcp_minisocks.c |  10 ++
 net/ipv4/tcp_output.c    |  37 ++++---
 net/ipv6/syncookies.c    |   2 +
 net/ipv6/tcp_ao.c        |  38 ++++++-
 net/ipv6/tcp_ipv6.c      |  75 +++++++++++---
 12 files changed, 506 insertions(+), 51 deletions(-)

(limited to 'include')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index eec6e7e5312e..ec4e9367f5b0 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -166,6 +166,11 @@ struct tcp_request_sock {
 						  * after data-in-SYN.
 						  */
 	u8				syn_tos;
+#ifdef CONFIG_TCP_AO
+	u8				ao_keyid;
+	u8				ao_rcv_next;
+	u8				maclen;
+#endif
 };
 
 static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
@@ -173,6 +178,19 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
 	return (struct tcp_request_sock *)req;
 }
 
+static inline bool tcp_rsk_used_ao(const struct request_sock *req)
+{
+	/* The real length of MAC is saved in the request socket,
+	 * signing anything with zero-length makes no sense, so here is
+	 * a little hack..
+	 */
+#ifndef CONFIG_TCP_AO
+	return false;
+#else
+	return tcp_rsk(req)->maclen != 0;
+#endif
+}
+
 #define TCP_RMEM_TO_WIN_SCALE 8
 
 struct tcp_sock {
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 17b3ecd2760f..d0bea102b523 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2216,6 +2216,12 @@ struct tcp_request_sock_ops {
 					  const struct sock *sk,
 					  const struct sk_buff *skb);
 #endif
+#ifdef CONFIG_TCP_AO
+	struct tcp_ao_key *(*ao_lookup)(const struct sock *sk,
+					struct request_sock *req,
+					int sndid, int rcvid);
+	int (*ao_calc_key)(struct tcp_ao_key *mkt, u8 *key, struct request_sock *sk);
+#endif
 #ifdef CONFIG_SYN_COOKIES
 	__u32 (*cookie_init_seq)(const struct sk_buff *skb,
 				 __u16 *mss);
diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h
index 971d7edcda9c..d2c1ee8bf7b0 100644
--- a/include/net/tcp_ao.h
+++ b/include/net/tcp_ao.h
@@ -123,6 +123,9 @@ int tcp_parse_ao(struct sock *sk, int cmd, unsigned short int family,
 		 sockptr_t optval, int optlen);
 struct tcp_ao_key *tcp_ao_established_key(struct tcp_ao_info *ao,
 					  int sndid, int rcvid);
+int tcp_ao_copy_all_matching(const struct sock *sk, struct sock *newsk,
+			     struct request_sock *req, struct sk_buff *skb,
+			     int family);
 int tcp_ao_calc_traffic_key(struct tcp_ao_key *mkt, u8 *key, void *ctx,
 			    unsigned int len, struct tcp_sigpool *hp);
 void tcp_ao_destroy_sock(struct sock *sk, bool twsk);
@@ -147,6 +150,11 @@ struct tcp_ao_key *tcp_v4_ao_lookup(const struct sock *sk, struct sock *addr_sk,
 int tcp_v4_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key,
 			  const struct sock *sk,
 			  __be32 sisn, __be32 disn, bool send);
+int tcp_v4_ao_calc_key_rsk(struct tcp_ao_key *mkt, u8 *key,
+			   struct request_sock *req);
+struct tcp_ao_key *tcp_v4_ao_lookup_rsk(const struct sock *sk,
+					struct request_sock *req,
+					int sndid, int rcvid);
 int tcp_v4_ao_hash_skb(char *ao_hash, struct tcp_ao_key *key,
 		       const struct sock *sk, const struct sk_buff *skb,
 		       const u8 *tkey, int hash_offset, u32 sne);
@@ -154,11 +162,21 @@ int tcp_v4_ao_hash_skb(char *ao_hash, struct tcp_ao_key *key,
 int tcp_v6_ao_hash_pseudoheader(struct tcp_sigpool *hp,
 				const struct in6_addr *daddr,
 				const struct in6_addr *saddr, int nbytes);
+int tcp_v6_ao_calc_key_skb(struct tcp_ao_key *mkt, u8 *key,
+			   const struct sk_buff *skb, __be32 sisn, __be32 disn);
 int tcp_v6_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key,
 			  const struct sock *sk, __be32 sisn,
 			  __be32 disn, bool send);
+int tcp_v6_ao_calc_key_rsk(struct tcp_ao_key *mkt, u8 *key,
+			   struct request_sock *req);
+struct tcp_ao_key *tcp_v6_ao_do_lookup(const struct sock *sk,
+				       const struct in6_addr *addr,
+				       int sndid, int rcvid);
 struct tcp_ao_key *tcp_v6_ao_lookup(const struct sock *sk,
 				    struct sock *addr_sk, int sndid, int rcvid);
+struct tcp_ao_key *tcp_v6_ao_lookup_rsk(const struct sock *sk,
+					struct request_sock *req,
+					int sndid, int rcvid);
 int tcp_v6_ao_hash_skb(char *ao_hash, struct tcp_ao_key *key,
 		       const struct sock *sk, const struct sk_buff *skb,
 		       const u8 *tkey, int hash_offset, u32 sne);
@@ -178,6 +196,12 @@ static inline int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb,
 	return 0;
 }
 
+static inline void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb,
+				    struct tcp_request_sock *treq,
+				    unsigned short int family)
+{
+}
+
 static inline struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk,
 		const union tcp_ao_addr *addr, int family, int sndid, int rcvid)
 {
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index c64334363230..0681d3e82b11 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -400,6 +400,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 	treq->snt_synack	= 0;
 	treq->tfo_listener	= false;
 
+	tcp_ao_syncookie(sk, skb, treq, AF_INET);
+
 	if (IS_ENABLED(CONFIG_SMC))
 		ireq->smc_ok = 0;
 
diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c
index 7c4e2f42845a..68d81704e14e 100644
--- a/net/ipv4/tcp_ao.c
+++ b/net/ipv4/tcp_ao.c
@@ -169,6 +169,23 @@ static void tcp_ao_link_mkt(struct tcp_ao_info *ao, struct tcp_ao_key *mkt)
 	hlist_add_head_rcu(&mkt->node, &ao->head);
 }
 
+static struct tcp_ao_key *tcp_ao_copy_key(struct sock *sk,
+					  struct tcp_ao_key *key)
+{
+	struct tcp_ao_key *new_key;
+
+	new_key = sock_kmalloc(sk, tcp_ao_sizeof_key(key),
+			       GFP_ATOMIC);
+	if (!new_key)
+		return NULL;
+
+	*new_key = *key;
+	INIT_HLIST_NODE(&new_key->node);
+	tcp_sigpool_get(new_key->tcp_sigpool_id);
+
+	return new_key;
+}
+
 static void tcp_ao_key_free_rcu(struct rcu_head *head)
 {
 	struct tcp_ao_key *key = container_of(head, struct tcp_ao_key, rcu);
@@ -290,6 +307,42 @@ static int tcp_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key,
 		return -EOPNOTSUPP;
 }
 
+int tcp_v4_ao_calc_key_rsk(struct tcp_ao_key *mkt, u8 *key,
+			   struct request_sock *req)
+{
+	struct inet_request_sock *ireq = inet_rsk(req);
+
+	return tcp_v4_ao_calc_key(mkt, key,
+				  ireq->ir_loc_addr, ireq->ir_rmt_addr,
+				  htons(ireq->ir_num), ireq->ir_rmt_port,
+				  htonl(tcp_rsk(req)->snt_isn),
+				  htonl(tcp_rsk(req)->rcv_isn));
+}
+
+static int tcp_v4_ao_calc_key_skb(struct tcp_ao_key *mkt, u8 *key,
+				  const struct sk_buff *skb,
+				  __be32 sisn, __be32 disn)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	const struct tcphdr *th = tcp_hdr(skb);
+
+	return tcp_v4_ao_calc_key(mkt, key, iph->saddr, iph->daddr,
+				  th->source, th->dest, sisn, disn);
+}
+
+static int tcp_ao_calc_key_skb(struct tcp_ao_key *mkt, u8 *key,
+			       const struct sk_buff *skb,
+			       __be32 sisn, __be32 disn, int family)
+{
+	if (family == AF_INET)
+		return tcp_v4_ao_calc_key_skb(mkt, key, skb, sisn, disn);
+#if IS_ENABLED(CONFIG_IPV6)
+	else if (family == AF_INET6)
+		return tcp_v6_ao_calc_key_skb(mkt, key, skb, sisn, disn);
+#endif
+	return -EAFNOSUPPORT;
+}
+
 static int tcp_v4_ao_hash_pseudoheader(struct tcp_sigpool *hp,
 				       __be32 daddr, __be32 saddr,
 				       int nbytes)
@@ -515,6 +568,16 @@ int tcp_v4_ao_hash_skb(char *ao_hash, struct tcp_ao_key *key,
 			       tkey, hash_offset, sne);
 }
 
+struct tcp_ao_key *tcp_v4_ao_lookup_rsk(const struct sock *sk,
+					struct request_sock *req,
+					int sndid, int rcvid)
+{
+	union tcp_ao_addr *addr =
+			(union tcp_ao_addr *)&inet_rsk(req)->ir_rmt_addr;
+
+	return tcp_ao_do_lookup(sk, addr, AF_INET, sndid, rcvid);
+}
+
 struct tcp_ao_key *tcp_v4_ao_lookup(const struct sock *sk, struct sock *addr_sk,
 				    int sndid, int rcvid)
 {
@@ -528,7 +591,7 @@ int tcp_ao_prepare_reset(const struct sock *sk, struct sk_buff *skb,
 			 struct tcp_ao_key **key, char **traffic_key,
 			 bool *allocated_traffic_key, u8 *keyid, u32 *sne)
 {
-	struct tcp_ao_key *rnext_key;
+	const struct tcphdr *th = tcp_hdr(skb);
 	struct tcp_ao_info *ao_info;
 
 	*allocated_traffic_key = false;
@@ -543,23 +606,62 @@ int tcp_ao_prepare_reset(const struct sock *sk, struct sk_buff *skb,
 		return -ENOTCONN;
 
 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV)) {
-		return -1;
+		unsigned int family = READ_ONCE(sk->sk_family);
+		union tcp_ao_addr *addr;
+		__be32 disn, sisn;
 
-	if (sk->sk_state == TCP_TIME_WAIT)
-		ao_info = rcu_dereference(tcp_twsk(sk)->ao_info);
-	else
+		if (sk->sk_state == TCP_NEW_SYN_RECV) {
+			struct request_sock *req = inet_reqsk(sk);
+
+			sisn = htonl(tcp_rsk(req)->rcv_isn);
+			disn = htonl(tcp_rsk(req)->snt_isn);
+			*sne = 0;
+		} else {
+			sisn = th->seq;
+			disn = 0;
+		}
+		if (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6)
+			addr = (union tcp_md5_addr *)&ipv6_hdr(skb)->saddr;
+		else
+			addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+#if IS_ENABLED(CONFIG_IPV6)
+		if (family == AF_INET6 && ipv6_addr_v4mapped(&sk->sk_v6_daddr))
+			family = AF_INET;
+#endif
+
+		sk = sk_const_to_full_sk(sk);
 		ao_info = rcu_dereference(tcp_sk(sk)->ao_info);
-	if (!ao_info)
-		return -ENOENT;
+		if (!ao_info)
+			return -ENOENT;
+		*key = tcp_ao_do_lookup(sk, addr, family, -1, aoh->rnext_keyid);
+		if (!*key)
+			return -ENOENT;
+		*traffic_key = kmalloc(tcp_ao_digest_size(*key), GFP_ATOMIC);
+		if (!*traffic_key)
+			return -ENOMEM;
+		*allocated_traffic_key = true;
+		if (tcp_ao_calc_key_skb(*key, *traffic_key, skb,
+					sisn, disn, family))
+			return -1;
+		*keyid = (*key)->rcvid;
+	} else {
+		struct tcp_ao_key *rnext_key;
 
-	*key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1);
-	if (!*key)
-		return -ENOENT;
-	*traffic_key = snd_other_key(*key);
-	rnext_key = READ_ONCE(ao_info->rnext_key);
-	*keyid = rnext_key->rcvid;
-	*sne = 0;
+		if (sk->sk_state == TCP_TIME_WAIT)
+			ao_info = rcu_dereference(tcp_twsk(sk)->ao_info);
+		else
+			ao_info = rcu_dereference(tcp_sk(sk)->ao_info);
+		if (!ao_info)
+			return -ENOENT;
 
+		*key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1);
+		if (!*key)
+			return -ENOENT;
+		*traffic_key = snd_other_key(*key);
+		rnext_key = READ_ONCE(ao_info->rnext_key);
+		*keyid = rnext_key->rcvid;
+		*sne = 0;
+	}
 	return 0;
 }
 
@@ -597,6 +699,46 @@ int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb,
 	return 0;
 }
 
+static struct tcp_ao_key *tcp_ao_inbound_lookup(unsigned short int family,
+		const struct sock *sk, const struct sk_buff *skb,
+		int sndid, int rcvid)
+{
+	if (family == AF_INET) {
+		const struct iphdr *iph = ip_hdr(skb);
+
+		return tcp_ao_do_lookup(sk, (union tcp_ao_addr *)&iph->saddr,
+				AF_INET, sndid, rcvid);
+	} else {
+		const struct ipv6hdr *iph = ipv6_hdr(skb);
+
+		return tcp_ao_do_lookup(sk, (union tcp_ao_addr *)&iph->saddr,
+				AF_INET6, sndid, rcvid);
+	}
+}
+
+void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb,
+		      struct tcp_request_sock *treq,
+		      unsigned short int family)
+{
+	const struct tcphdr *th = tcp_hdr(skb);
+	const struct tcp_ao_hdr *aoh;
+	struct tcp_ao_key *key;
+
+	treq->maclen = 0;
+
+	if (tcp_parse_auth_options(th, NULL, &aoh) || !aoh)
+		return;
+
+	key = tcp_ao_inbound_lookup(family, sk, skb, -1, aoh->keyid);
+	if (!key)
+		/* Key not found, continue without TCP-AO */
+		return;
+
+	treq->ao_rcv_next = aoh->keyid;
+	treq->ao_keyid = aoh->rnext_keyid;
+	treq->maclen = tcp_ao_maclen(key);
+}
+
 static int tcp_ao_cache_traffic_keys(const struct sock *sk,
 				     struct tcp_ao_info *ao,
 				     struct tcp_ao_key *ao_key)
@@ -704,6 +846,100 @@ void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb)
 		tcp_ao_cache_traffic_keys(sk, ao, key);
 }
 
+int tcp_ao_copy_all_matching(const struct sock *sk, struct sock *newsk,
+			     struct request_sock *req, struct sk_buff *skb,
+			     int family)
+{
+	struct tcp_ao_key *key, *new_key, *first_key;
+	struct tcp_ao_info *new_ao, *ao;
+	struct hlist_node *key_head;
+	union tcp_ao_addr *addr;
+	bool match = false;
+	int ret = -ENOMEM;
+
+	ao = rcu_dereference(tcp_sk(sk)->ao_info);
+	if (!ao)
+		return 0;
+
+	/* New socket without TCP-AO on it */
+	if (!tcp_rsk_used_ao(req))
+		return 0;
+
+	new_ao = tcp_ao_alloc_info(GFP_ATOMIC);
+	if (!new_ao)
+		return -ENOMEM;
+	new_ao->lisn = htonl(tcp_rsk(req)->snt_isn);
+	new_ao->risn = htonl(tcp_rsk(req)->rcv_isn);
+	new_ao->ao_required = ao->ao_required;
+
+	if (family == AF_INET) {
+		addr = (union tcp_ao_addr *)&newsk->sk_daddr;
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (family == AF_INET6) {
+		addr = (union tcp_ao_addr *)&newsk->sk_v6_daddr;
+#endif
+	} else {
+		ret = -EAFNOSUPPORT;
+		goto free_ao;
+	}
+
+	hlist_for_each_entry_rcu(key, &ao->head, node) {
+		if (tcp_ao_key_cmp(key, addr, key->prefixlen, family, -1, -1))
+			continue;
+
+		new_key = tcp_ao_copy_key(newsk, key);
+		if (!new_key)
+			goto free_and_exit;
+
+		tcp_ao_cache_traffic_keys(newsk, new_ao, new_key);
+		tcp_ao_link_mkt(new_ao, new_key);
+		match = true;
+	}
+
+	if (!match) {
+		/* RFC5925 (7.4.1) specifies that the TCP-AO status
+		 * of a connection is determined on the initial SYN.
+		 * At this point the connection was TCP-AO enabled, so
+		 * it can't switch to being unsigned if peer's key
+		 * disappears on the listening socket.
+		 */
+		ret = -EKEYREJECTED;
+		goto free_and_exit;
+	}
+
+	key_head = rcu_dereference(hlist_first_rcu(&new_ao->head));
+	first_key = hlist_entry_safe(key_head, struct tcp_ao_key, node);
+
+	key = tcp_ao_established_key(new_ao, tcp_rsk(req)->ao_keyid, -1);
+	if (key)
+		new_ao->current_key = key;
+	else
+		new_ao->current_key = first_key;
+
+	/* set rnext_key */
+	key = tcp_ao_established_key(new_ao, -1, tcp_rsk(req)->ao_rcv_next);
+	if (key)
+		new_ao->rnext_key = key;
+	else
+		new_ao->rnext_key = first_key;
+
+	sk_gso_disable(newsk);
+	rcu_assign_pointer(tcp_sk(newsk)->ao_info, new_ao);
+
+	return 0;
+
+free_and_exit:
+	hlist_for_each_entry_safe(key, key_head, &new_ao->head, node) {
+		hlist_del(&key->node);
+		tcp_sigpool_release(key->tcp_sigpool_id);
+		atomic_sub(tcp_ao_sizeof_key(key), &newsk->sk_omem_alloc);
+		kfree_sensitive(key);
+	}
+free_ao:
+	kfree(new_ao);
+	return ret;
+}
+
 static bool tcp_ao_can_set_current_rnext(struct sock *sk)
 {
 	/* There aren't current/rnext keys on TCP_LISTEN sockets */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index fc42b172abf6..de64c0c2fb69 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -7045,6 +7045,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	struct flowi fl;
 	u8 syncookies;
 
+#ifdef CONFIG_TCP_AO
+	const struct tcp_ao_hdr *aoh;
+#endif
+
 	syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);
 
 	/* TW buckets are converted to open requests without
@@ -7131,6 +7135,17 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 			inet_rsk(req)->ecn_ok = 0;
 	}
 
+#ifdef CONFIG_TCP_AO
+	if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
+		goto drop_and_release; /* Invalid TCP options */
+	if (aoh) {
+		tcp_rsk(req)->maclen = aoh->length - sizeof(struct tcp_ao_hdr);
+		tcp_rsk(req)->ao_rcv_next = aoh->keyid;
+		tcp_rsk(req)->ao_keyid = aoh->rnext_keyid;
+	} else {
+		tcp_rsk(req)->maclen = 0;
+	}
+#endif
 	tcp_rsk(req)->snt_isn = isn;
 	tcp_rsk(req)->txhash = net_tx_rndhash();
 	tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a78112d78d06..2bd56b57f3c9 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1072,13 +1072,47 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
 					     tcp_sk(sk)->snd_nxt;
 
-	/* RFC 7323 2.3
-	 * The window field (SEG.WND) of every outgoing segment, with the
-	 * exception of <SYN> segments, MUST be right-shifted by
-	 * Rcv.Wind.Shift bits:
-	 */
+#ifdef CONFIG_TCP_AO
+	if (tcp_rsk_used_ao(req)) {
+		const union tcp_md5_addr *addr;
+		const struct tcp_ao_hdr *aoh;
+
+		/* Invalid TCP option size or twice included auth */
+		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
+			return;
+		if (!aoh)
+			return;
+
+		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+		key.ao_key = tcp_ao_do_lookup(sk, addr, AF_INET,
+					      aoh->rnext_keyid, -1);
+		if (unlikely(!key.ao_key)) {
+			/* Send ACK with any matching MKT for the peer */
+			key.ao_key = tcp_ao_do_lookup(sk, addr, AF_INET, -1, -1);
+			/* Matching key disappeared (user removed the key?)
+			 * let the handshake timeout.
+			 */
+			if (!key.ao_key) {
+				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
+						     addr,
+						     ntohs(tcp_hdr(skb)->source),
+						     &ip_hdr(skb)->daddr,
+						     ntohs(tcp_hdr(skb)->dest));
+				return;
+			}
+		}
+		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
+		if (!key.traffic_key)
+			return;
+
+		key.type = TCP_KEY_AO;
+		key.rcv_next = aoh->keyid;
+		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
+#else
+	if (0) {
+#endif
 #ifdef CONFIG_TCP_MD5SIG
-	if (static_branch_unlikely(&tcp_md5_needed.key)) {
+	} else if (static_branch_unlikely(&tcp_md5_needed.key)) {
 		const union tcp_md5_addr *addr;
 		int l3index;
 
@@ -1087,8 +1121,14 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
 		if (key.md5_key)
 			key.type = TCP_KEY_MD5;
-	}
 #endif
+	}
+
+	/* RFC 7323 2.3
+	 * The window field (SEG.WND) of every outgoing segment, with the
+	 * exception of <SYN> segments, MUST be right-shifted by
+	 * Rcv.Wind.Shift bits:
+	 */
 	tcp_v4_send_ack(sk, skb, seq,
 			tcp_rsk(req)->rcv_nxt,
 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
@@ -1098,6 +1138,8 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 			ip_hdr(skb)->tos,
 			READ_ONCE(tcp_rsk(req)->txhash));
+	if (tcp_key_is_ao(&key))
+		kfree(key.traffic_key);
 }
 
 /*
@@ -1636,6 +1678,10 @@ const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
 	.req_md5_lookup	=	tcp_v4_md5_lookup,
 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
 #endif
+#ifdef CONFIG_TCP_AO
+	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
+	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
+#endif
 #ifdef CONFIG_SYN_COOKIES
 	.cookie_init_seq =	cookie_v4_init_sequence,
 #endif
@@ -1737,12 +1783,16 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
 	/* Copy over the MD5 key from the original socket */
 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
-	if (key) {
+	if (key && !tcp_rsk_used_ao(req)) {
 		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
 			goto put_and_exit;
 		sk_gso_disable(newsk);
 	}
 #endif
+#ifdef CONFIG_TCP_AO
+	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
+		goto put_and_exit; /* OOM, release back memory */
+#endif
 
 	if (__inet_inherit_port(sk, newsk) < 0)
 		goto put_and_exit;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 6810cf65a322..8d941a6e066b 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -506,6 +506,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 	const struct tcp_sock *oldtp;
 	struct tcp_sock *newtp;
 	u32 seq;
+#ifdef CONFIG_TCP_AO
+	struct tcp_ao_key *ao_key;
+#endif
 
 	if (!newsk)
 		return NULL;
@@ -594,6 +597,13 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 #ifdef CONFIG_TCP_MD5SIG
 	newtp->md5sig_info = NULL;	/*XXX*/
 #endif
+#ifdef CONFIG_TCP_AO
+	newtp->ao_info = NULL;
+	ao_key = treq->af_specific->ao_lookup(sk, req,
+				tcp_rsk(req)->ao_keyid, -1);
+	if (ao_key)
+		newtp->tcp_header_len += tcp_ao_len(ao_key);
+ #endif
 	if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
 		newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
 	newtp->rx_opt.mss_clamp = req->mss;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index de546313a214..d44ef88025fb 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -615,6 +615,7 @@ static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
  * (but it may well be that other scenarios fail similarly).
  */
 static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
+			      const struct tcp_request_sock *tcprsk,
 			      struct tcp_out_options *opts,
 			      struct tcp_key *key)
 {
@@ -629,20 +630,28 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
 		ptr += 4;
 	} else if (tcp_key_is_ao(key)) {
 #ifdef CONFIG_TCP_AO
-		struct tcp_ao_key *rnext_key;
-		struct tcp_ao_info *ao_info;
-		u8 maclen;
+		u8 maclen = tcp_ao_maclen(key->ao_key);
 
-		ao_info = rcu_dereference_check(tp->ao_info,
+		if (tcprsk) {
+			u8 aolen = maclen + sizeof(struct tcp_ao_hdr);
+
+			*ptr++ = htonl((TCPOPT_AO << 24) | (aolen << 16) |
+				       (tcprsk->ao_keyid << 8) |
+				       (tcprsk->ao_rcv_next));
+		} else {
+			struct tcp_ao_key *rnext_key;
+			struct tcp_ao_info *ao_info;
+
+			ao_info = rcu_dereference_check(tp->ao_info,
 				lockdep_sock_is_held(&tp->inet_conn.icsk_inet.sk));
-		rnext_key = READ_ONCE(ao_info->rnext_key);
-		if (WARN_ON_ONCE(!rnext_key))
-			goto out_ao;
-		maclen = tcp_ao_maclen(key->ao_key);
-		*ptr++ = htonl((TCPOPT_AO << 24) |
-				(tcp_ao_len(key->ao_key) << 16) |
-				(key->ao_key->sndid << 8) |
-				(rnext_key->rcvid));
+			rnext_key = READ_ONCE(ao_info->rnext_key);
+			if (WARN_ON_ONCE(!rnext_key))
+				goto out_ao;
+			*ptr++ = htonl((TCPOPT_AO << 24) |
+				       (tcp_ao_len(key->ao_key) << 16) |
+				       (key->ao_key->sndid << 8) |
+				       (rnext_key->rcvid));
+		}
 		opts->hash_location = (__u8 *)ptr;
 		ptr += maclen / sizeof(*ptr);
 		if (unlikely(maclen % sizeof(*ptr))) {
@@ -1386,7 +1395,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 		th->window	= htons(min(tp->rcv_wnd, 65535U));
 	}
 
-	tcp_options_write(th, tp, &opts, &key);
+	tcp_options_write(th, tp, NULL, &opts, &key);
 
 	if (tcp_key_is_md5(&key)) {
 #ifdef CONFIG_TCP_MD5SIG
@@ -3729,7 +3738,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 
 	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
 	th->window = htons(min(req->rsk_rcv_wnd, 65535U));
-	tcp_options_write(th, NULL, &opts, &key);
+	tcp_options_write(th, NULL, NULL, &opts, &key);
 	th->doff = (tcp_header_size >> 2);
 	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
 
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 5014aa663452..ad7a8caa7b2a 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -214,6 +214,8 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 	treq->snt_isn = cookie;
 	treq->ts_off = 0;
 	treq->txhash = net_tx_rndhash();
+	tcp_ao_syncookie(sk, skb, treq, AF_INET6);
+
 	if (IS_ENABLED(CONFIG_SMC))
 		ireq->smc_ok = 0;
 
diff --git a/net/ipv6/tcp_ao.c b/net/ipv6/tcp_ao.c
index d08735b6f3c5..c9a6fa84f6ce 100644
--- a/net/ipv6/tcp_ao.c
+++ b/net/ipv6/tcp_ao.c
@@ -49,6 +49,17 @@ static int tcp_v6_ao_calc_key(struct tcp_ao_key *mkt, u8 *key,
 	return err;
 }
 
+int tcp_v6_ao_calc_key_skb(struct tcp_ao_key *mkt, u8 *key,
+			   const struct sk_buff *skb,
+			   __be32 sisn, __be32 disn)
+{
+       const struct ipv6hdr *iph = ipv6_hdr(skb);
+       const struct tcphdr *th = tcp_hdr(skb);
+
+       return tcp_v6_ao_calc_key(mkt, key, &iph->saddr, &iph->daddr,
+				 th->source, th->dest, sisn, disn);
+}
+
 int tcp_v6_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key,
 			  const struct sock *sk, __be32 sisn,
 			  __be32 disn, bool send)
@@ -63,9 +74,21 @@ int tcp_v6_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key,
 					  htons(sk->sk_num), disn, sisn);
 }
 
-static struct tcp_ao_key *tcp_v6_ao_do_lookup(const struct sock *sk,
-					      const struct in6_addr *addr,
-					      int sndid, int rcvid)
+int tcp_v6_ao_calc_key_rsk(struct tcp_ao_key *mkt, u8 *key,
+			   struct request_sock *req)
+{
+	struct inet_request_sock *ireq = inet_rsk(req);
+
+	return tcp_v6_ao_calc_key(mkt, key,
+			&ireq->ir_v6_loc_addr, &ireq->ir_v6_rmt_addr,
+			htons(ireq->ir_num), ireq->ir_rmt_port,
+			htonl(tcp_rsk(req)->snt_isn),
+			htonl(tcp_rsk(req)->rcv_isn));
+}
+
+struct tcp_ao_key *tcp_v6_ao_do_lookup(const struct sock *sk,
+				       const struct in6_addr *addr,
+				       int sndid, int rcvid)
 {
 	return tcp_ao_do_lookup(sk, (union tcp_ao_addr *)addr, AF_INET6,
 				sndid, rcvid);
@@ -80,6 +103,15 @@ struct tcp_ao_key *tcp_v6_ao_lookup(const struct sock *sk,
 	return tcp_v6_ao_do_lookup(sk, addr, sndid, rcvid);
 }
 
+struct tcp_ao_key *tcp_v6_ao_lookup_rsk(const struct sock *sk,
+					struct request_sock *req,
+					int sndid, int rcvid)
+{
+	struct in6_addr *addr = &inet_rsk(req)->ir_v6_rmt_addr;
+
+	return tcp_v6_ao_do_lookup(sk, addr, sndid, rcvid);
+}
+
 int tcp_v6_ao_hash_pseudoheader(struct tcp_sigpool *hp,
 				const struct in6_addr *daddr,
 				const struct in6_addr *saddr, int nbytes)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index e6c3617976f0..b08b177847da 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -836,6 +836,10 @@ const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
 	.req_md5_lookup	=	tcp_v6_md5_lookup,
 	.calc_md5_hash	=	tcp_v6_md5_hash_skb,
 #endif
+#ifdef CONFIG_TCP_AO
+	.ao_lookup	=	tcp_v6_ao_lookup_rsk,
+	.ao_calc_key	=	tcp_v6_ao_calc_key_rsk,
+#endif
 #ifdef CONFIG_SYN_COOKIES
 	.cookie_init_seq =	cookie_v6_init_sequence,
 #endif
@@ -1192,16 +1196,54 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 {
 	struct tcp_key key = {};
 
+#ifdef CONFIG_TCP_AO
+	if (tcp_rsk_used_ao(req)) {
+		const struct in6_addr *addr = &ipv6_hdr(skb)->saddr;
+		const struct tcp_ao_hdr *aoh;
+		int l3index;
+
+		l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0;
+		/* Invalid TCP option size or twice included auth */
+		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
+			return;
+		if (!aoh)
+			return;
+		key.ao_key = tcp_v6_ao_do_lookup(sk, addr, aoh->rnext_keyid, -1);
+		if (unlikely(!key.ao_key)) {
+			/* Send ACK with any matching MKT for the peer */
+			key.ao_key = tcp_v6_ao_do_lookup(sk, addr, -1, -1);
+			/* Matching key disappeared (user removed the key?)
+			 * let the handshake timeout.
+			 */
+			if (!key.ao_key) {
+				net_info_ratelimited("TCP-AO key for (%pI6, %d)->(%pI6, %d) suddenly disappeared, won't ACK new connection\n",
+						     addr,
+						     ntohs(tcp_hdr(skb)->source),
+						     &ipv6_hdr(skb)->daddr,
+						     ntohs(tcp_hdr(skb)->dest));
+				return;
+			}
+		}
+		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
+		if (!key.traffic_key)
+			return;
+
+		key.type = TCP_KEY_AO;
+		key.rcv_next = aoh->keyid;
+		tcp_v6_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
+#else
+	if (0) {
+#endif
 #ifdef CONFIG_TCP_MD5SIG
-	if (static_branch_unlikely(&tcp_md5_needed.key)) {
+	} else if (static_branch_unlikely(&tcp_md5_needed.key)) {
 		int l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0;
 
 		key.md5_key = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr,
 						   l3index);
 		if (key.md5_key)
 			key.type = TCP_KEY_MD5;
-	}
 #endif
+	}
 
 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
@@ -1220,6 +1262,8 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 			&key, ipv6_get_dsfield(ipv6_hdr(skb)), 0,
 			READ_ONCE(sk->sk_priority),
 			READ_ONCE(tcp_rsk(req)->txhash));
+	if (tcp_key_is_ao(&key))
+		kfree(key.traffic_key);
 }
 
 
@@ -1449,19 +1493,26 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
 #ifdef CONFIG_TCP_MD5SIG
 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
 
-	/* Copy over the MD5 key from the original socket */
-	key = tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr, l3index);
-	if (key) {
-		const union tcp_md5_addr *addr;
-
-		addr = (union tcp_md5_addr *)&newsk->sk_v6_daddr;
-		if (tcp_md5_key_copy(newsk, addr, AF_INET6, 128, l3index, key)) {
-			inet_csk_prepare_forced_close(newsk);
-			tcp_done(newsk);
-			goto out;
+	if (!tcp_rsk_used_ao(req)) {
+		/* Copy over the MD5 key from the original socket */
+		key = tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr, l3index);
+		if (key) {
+			const union tcp_md5_addr *addr;
+
+			addr = (union tcp_md5_addr *)&newsk->sk_v6_daddr;
+			if (tcp_md5_key_copy(newsk, addr, AF_INET6, 128, l3index, key)) {
+				inet_csk_prepare_forced_close(newsk);
+				tcp_done(newsk);
+				goto out;
+			}
 		}
 	}
 #endif
+#ifdef CONFIG_TCP_AO
+	/* Copy over tcp_ao_info if any */
+	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET6))
+		goto out; /* OOM */
+#endif
 
 	if (__inet_inherit_port(sk, newsk) < 0) {
 		inet_csk_prepare_forced_close(newsk);
-- 
cgit v1.2.3-70-g09d2


From 9427c6aa3ec92f66b3d38f5d5f7af6b94b648a66 Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 23 Oct 2023 20:22:03 +0100
Subject: net/tcp: Sign SYN-ACK segments with TCP-AO

Similarly to RST segments, wire SYN-ACKs to TCP-AO.
tcp_rsk_used_ao() is handy here to check if the request socket used AO
and needs a signature on the outgoing segments.

Co-developed-by: Francesco Ruggeri <fruggeri@arista.com>
Signed-off-by: Francesco Ruggeri <fruggeri@arista.com>
Co-developed-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
Acked-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     |  3 +++
 include/net/tcp_ao.h  |  6 +++++
 net/ipv4/tcp_ao.c     | 22 ++++++++++++++++
 net/ipv4/tcp_ipv4.c   |  1 +
 net/ipv4/tcp_output.c | 72 +++++++++++++++++++++++++++++++++++++++------------
 net/ipv6/tcp_ao.c     | 22 ++++++++++++++++
 net/ipv6/tcp_ipv6.c   |  1 +
 7 files changed, 111 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index d0bea102b523..c7f80cc94d71 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2221,6 +2221,9 @@ struct tcp_request_sock_ops {
 					struct request_sock *req,
 					int sndid, int rcvid);
 	int (*ao_calc_key)(struct tcp_ao_key *mkt, u8 *key, struct request_sock *sk);
+	int (*ao_synack_hash)(char *ao_hash, struct tcp_ao_key *mkt,
+			      struct request_sock *req, const struct sk_buff *skb,
+			      int hash_offset, u32 sne);
 #endif
 #ifdef CONFIG_SYN_COOKIES
 	__u32 (*cookie_init_seq)(const struct sk_buff *skb,
diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h
index d2c1ee8bf7b0..1d69978e349a 100644
--- a/include/net/tcp_ao.h
+++ b/include/net/tcp_ao.h
@@ -147,6 +147,9 @@ int tcp_ao_prepare_reset(const struct sock *sk, struct sk_buff *skb,
 int tcp_v4_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen);
 struct tcp_ao_key *tcp_v4_ao_lookup(const struct sock *sk, struct sock *addr_sk,
 				    int sndid, int rcvid);
+int tcp_v4_ao_synack_hash(char *ao_hash, struct tcp_ao_key *mkt,
+			  struct request_sock *req, const struct sk_buff *skb,
+			  int hash_offset, u32 sne);
 int tcp_v4_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key,
 			  const struct sock *sk,
 			  __be32 sisn, __be32 disn, bool send);
@@ -181,6 +184,9 @@ int tcp_v6_ao_hash_skb(char *ao_hash, struct tcp_ao_key *key,
 		       const struct sock *sk, const struct sk_buff *skb,
 		       const u8 *tkey, int hash_offset, u32 sne);
 int tcp_v6_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen);
+int tcp_v6_ao_synack_hash(char *ao_hash, struct tcp_ao_key *ao_key,
+			  struct request_sock *req, const struct sk_buff *skb,
+			  int hash_offset, u32 sne);
 void tcp_ao_established(struct sock *sk);
 void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb);
 void tcp_ao_connect_init(struct sock *sk);
diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c
index 68d81704e14e..de3710758d55 100644
--- a/net/ipv4/tcp_ao.c
+++ b/net/ipv4/tcp_ao.c
@@ -568,6 +568,28 @@ int tcp_v4_ao_hash_skb(char *ao_hash, struct tcp_ao_key *key,
 			       tkey, hash_offset, sne);
 }
 
+int tcp_v4_ao_synack_hash(char *ao_hash, struct tcp_ao_key *ao_key,
+			  struct request_sock *req, const struct sk_buff *skb,
+			  int hash_offset, u32 sne)
+{
+	void *hash_buf = NULL;
+	int err;
+
+	hash_buf = kmalloc(tcp_ao_digest_size(ao_key), GFP_ATOMIC);
+	if (!hash_buf)
+		return -ENOMEM;
+
+	err = tcp_v4_ao_calc_key_rsk(ao_key, hash_buf, req);
+	if (err)
+		goto out;
+
+	err = tcp_ao_hash_skb(AF_INET, ao_hash, ao_key, req_to_sk(req), skb,
+			      hash_buf, hash_offset, sne);
+out:
+	kfree(hash_buf);
+	return err;
+}
+
 struct tcp_ao_key *tcp_v4_ao_lookup_rsk(const struct sock *sk,
 					struct request_sock *req,
 					int sndid, int rcvid)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2bd56b57f3c9..bdf0224ae827 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1681,6 +1681,7 @@ const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
 #ifdef CONFIG_TCP_AO
 	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
 	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
+	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
 #endif
 #ifdef CONFIG_SYN_COOKIES
 	.cookie_init_seq =	cookie_v4_init_sequence,
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d44ef88025fb..f558c054cf6e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -886,7 +886,7 @@ static unsigned int tcp_synack_options(const struct sock *sk,
 				       struct request_sock *req,
 				       unsigned int mss, struct sk_buff *skb,
 				       struct tcp_out_options *opts,
-				       const struct tcp_md5sig_key *md5,
+				       const struct tcp_key *key,
 				       struct tcp_fastopen_cookie *foc,
 				       enum tcp_synack_type synack_type,
 				       struct sk_buff *syn_skb)
@@ -894,8 +894,7 @@ static unsigned int tcp_synack_options(const struct sock *sk,
 	struct inet_request_sock *ireq = inet_rsk(req);
 	unsigned int remaining = MAX_TCP_OPTION_SPACE;
 
-#ifdef CONFIG_TCP_MD5SIG
-	if (md5) {
+	if (tcp_key_is_md5(key)) {
 		opts->options |= OPTION_MD5;
 		remaining -= TCPOLEN_MD5SIG_ALIGNED;
 
@@ -906,8 +905,11 @@ static unsigned int tcp_synack_options(const struct sock *sk,
 		 */
 		if (synack_type != TCP_SYNACK_COOKIE)
 			ireq->tstamp_ok &= !ireq->sack_ok;
+	} else if (tcp_key_is_ao(key)) {
+		opts->options |= OPTION_AO;
+		remaining -= tcp_ao_len(key->ao_key);
+		ireq->tstamp_ok &= !ireq->sack_ok;
 	}
-#endif
 
 	/* We always send an MSS option. */
 	opts->mss = mss;
@@ -3653,7 +3655,6 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 {
 	struct inet_request_sock *ireq = inet_rsk(req);
 	const struct tcp_sock *tp = tcp_sk(sk);
-	struct tcp_md5sig_key *md5 = NULL;
 	struct tcp_out_options opts;
 	struct tcp_key key = {};
 	struct sk_buff *skb;
@@ -3707,18 +3708,48 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 			tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
 	}
 
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
 	rcu_read_lock();
-	md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
-	if (md5)
-		key.type = TCP_KEY_MD5;
 #endif
+	if (tcp_rsk_used_ao(req)) {
+#ifdef CONFIG_TCP_AO
+		struct tcp_ao_key *ao_key = NULL;
+		u8 maclen = tcp_rsk(req)->maclen;
+		u8 keyid = tcp_rsk(req)->ao_keyid;
+
+		ao_key = tcp_sk(sk)->af_specific->ao_lookup(sk, req_to_sk(req),
+							    keyid, -1);
+		/* If there is no matching key - avoid sending anything,
+		 * especially usigned segments. It could try harder and lookup
+		 * for another peer-matching key, but the peer has requested
+		 * ao_keyid (RFC5925 RNextKeyID), so let's keep it simple here.
+		 */
+		if (unlikely(!ao_key || tcp_ao_maclen(ao_key) != maclen)) {
+			u8 key_maclen = ao_key ? tcp_ao_maclen(ao_key) : 0;
+
+			rcu_read_unlock();
+			kfree_skb(skb);
+			net_warn_ratelimited("TCP-AO: the keyid %u with maclen %u|%u from SYN packet is not present - not sending SYNACK\n",
+					     keyid, maclen, key_maclen);
+			return NULL;
+		}
+		key.ao_key = ao_key;
+		key.type = TCP_KEY_AO;
+#endif
+	} else {
+#ifdef CONFIG_TCP_MD5SIG
+		key.md5_key = tcp_rsk(req)->af_specific->req_md5_lookup(sk,
+					req_to_sk(req));
+		if (key.md5_key)
+			key.type = TCP_KEY_MD5;
+#endif
+	}
 	skb_set_hash(skb, READ_ONCE(tcp_rsk(req)->txhash), PKT_HASH_TYPE_L4);
 	/* bpf program will be interested in the tcp_flags */
 	TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK;
-	tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
-					     foc, synack_type,
-					     syn_skb) + sizeof(*th);
+	tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts,
+					     &key, foc, synack_type, syn_skb)
+					+ sizeof(*th);
 
 	skb_push(skb, tcp_header_size);
 	skb_reset_transport_header(skb);
@@ -3738,15 +3769,24 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 
 	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
 	th->window = htons(min(req->rsk_rcv_wnd, 65535U));
-	tcp_options_write(th, NULL, NULL, &opts, &key);
+	tcp_options_write(th, NULL, tcp_rsk(req), &opts, &key);
 	th->doff = (tcp_header_size >> 2);
 	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
 
-#ifdef CONFIG_TCP_MD5SIG
 	/* Okay, we have all we need - do the md5 hash if needed */
-	if (md5)
+	if (tcp_key_is_md5(&key)) {
+#ifdef CONFIG_TCP_MD5SIG
 		tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
-					       md5, req_to_sk(req), skb);
+					key.md5_key, req_to_sk(req), skb);
+#endif
+	} else if (tcp_key_is_ao(&key)) {
+#ifdef CONFIG_TCP_AO
+		tcp_rsk(req)->af_specific->ao_synack_hash(opts.hash_location,
+					key.ao_key, req, skb,
+					opts.hash_location - (u8 *)th, 0);
+#endif
+	}
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
 	rcu_read_unlock();
 #endif
 
diff --git a/net/ipv6/tcp_ao.c b/net/ipv6/tcp_ao.c
index c9a6fa84f6ce..99753e12c08c 100644
--- a/net/ipv6/tcp_ao.c
+++ b/net/ipv6/tcp_ao.c
@@ -144,3 +144,25 @@ int tcp_v6_parse_ao(struct sock *sk, int cmd,
 {
 	return tcp_parse_ao(sk, cmd, AF_INET6, optval, optlen);
 }
+
+int tcp_v6_ao_synack_hash(char *ao_hash, struct tcp_ao_key *ao_key,
+			  struct request_sock *req, const struct sk_buff *skb,
+			  int hash_offset, u32 sne)
+{
+	void *hash_buf = NULL;
+	int err;
+
+	hash_buf = kmalloc(tcp_ao_digest_size(ao_key), GFP_ATOMIC);
+	if (!hash_buf)
+		return -ENOMEM;
+
+	err = tcp_v6_ao_calc_key_rsk(ao_key, hash_buf, req);
+	if (err)
+		goto out;
+
+	err = tcp_ao_hash_skb(AF_INET6, ao_hash, ao_key, req_to_sk(req), skb,
+			      hash_buf, hash_offset, sne);
+out:
+	kfree(hash_buf);
+	return err;
+}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index b08b177847da..8c5c96187a72 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -839,6 +839,7 @@ const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
 #ifdef CONFIG_TCP_AO
 	.ao_lookup	=	tcp_v6_ao_lookup_rsk,
 	.ao_calc_key	=	tcp_v6_ao_calc_key_rsk,
+	.ao_synack_hash =	tcp_v6_ao_synack_hash,
 #endif
 #ifdef CONFIG_SYN_COOKIES
 	.cookie_init_seq =	cookie_v6_init_sequence,
-- 
cgit v1.2.3-70-g09d2


From 0a3a809089eb1d4a0a2fd0c16b520d603988c859 Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 23 Oct 2023 20:22:04 +0100
Subject: net/tcp: Verify inbound TCP-AO signed segments

Now there is a common function to verify signature on TCP segments:
tcp_inbound_hash(). It has checks for all possible cross-interactions
with MD5 signs as well as with unsigned segments.

The rules from RFC5925 are:
(1) Any TCP segment can have at max only one signature.
(2) TCP connections can't switch between using TCP-MD5 and TCP-AO.
(3) TCP-AO connections can't stop using AO, as well as unsigned
    connections can't suddenly start using AO.

Co-developed-by: Francesco Ruggeri <fruggeri@arista.com>
Signed-off-by: Francesco Ruggeri <fruggeri@arista.com>
Co-developed-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
Acked-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dropreason-core.h |  17 +++++
 include/net/tcp.h             |  53 +++++++++++++++-
 include/net/tcp_ao.h          |  14 +++++
 net/ipv4/tcp.c                |  39 +++---------
 net/ipv4/tcp_ao.c             | 142 ++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_ipv4.c           |  10 +--
 net/ipv6/tcp_ao.c             |   9 +--
 net/ipv6/tcp_ipv6.c           |  11 ++--
 8 files changed, 248 insertions(+), 47 deletions(-)

(limited to 'include')

diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
index 3af4464a9c5b..7637137ae33e 100644
--- a/include/net/dropreason-core.h
+++ b/include/net/dropreason-core.h
@@ -24,6 +24,10 @@
 	FN(TCP_MD5NOTFOUND)		\
 	FN(TCP_MD5UNEXPECTED)		\
 	FN(TCP_MD5FAILURE)		\
+	FN(TCP_AONOTFOUND)		\
+	FN(TCP_AOUNEXPECTED)		\
+	FN(TCP_AOKEYNOTFOUND)		\
+	FN(TCP_AOFAILURE)		\
 	FN(SOCKET_BACKLOG)		\
 	FN(TCP_FLAGS)			\
 	FN(TCP_ZEROWINDOW)		\
@@ -163,6 +167,19 @@ enum skb_drop_reason {
 	 * to LINUX_MIB_TCPMD5FAILURE
 	 */
 	SKB_DROP_REASON_TCP_MD5FAILURE,
+	/**
+	 * @SKB_DROP_REASON_TCP_AONOTFOUND: no TCP-AO hash and one was expected
+	 */
+	SKB_DROP_REASON_TCP_AONOTFOUND,
+	/**
+	 * @SKB_DROP_REASON_TCP_AOUNEXPECTED: TCP-AO hash is present and it
+	 * was not expected.
+	 */
+	SKB_DROP_REASON_TCP_AOUNEXPECTED,
+	/** @SKB_DROP_REASON_TCP_AOKEYNOTFOUND: TCP-AO key is unknown */
+	SKB_DROP_REASON_TCP_AOKEYNOTFOUND,
+	/** @SKB_DROP_REASON_TCP_AOFAILURE: TCP-AO hash is wrong */
+	SKB_DROP_REASON_TCP_AOFAILURE,
 	/**
 	 * @SKB_DROP_REASON_SOCKET_BACKLOG: failed to add skb to socket backlog (
 	 * see LINUX_MIB_TCPBACKLOGDROP)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index c7f80cc94d71..8e1f835bad22 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1809,7 +1809,7 @@ tcp_md5_do_lookup_any_l3index(const struct sock *sk,
 enum skb_drop_reason
 tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
 		     const void *saddr, const void *daddr,
-		     int family, int dif, int sdif);
+		     int family, int l3index, const __u8 *hash_location);
 
 
 #define tcp_twsk_md5_key(twsk)	((twsk)->tw_md5_key)
@@ -1831,7 +1831,7 @@ tcp_md5_do_lookup_any_l3index(const struct sock *sk,
 static inline enum skb_drop_reason
 tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
 		     const void *saddr, const void *daddr,
-		     int family, int dif, int sdif)
+		     int family, int l3index, const __u8 *hash_location)
 {
 	return SKB_NOT_DROPPED_YET;
 }
@@ -2730,4 +2730,53 @@ static inline bool tcp_ao_required(struct sock *sk, const void *saddr,
 	return false;
 }
 
+/* Called with rcu_read_lock() */
+static inline enum skb_drop_reason
+tcp_inbound_hash(struct sock *sk, const struct request_sock *req,
+		 const struct sk_buff *skb,
+		 const void *saddr, const void *daddr,
+		 int family, int dif, int sdif)
+{
+	const struct tcphdr *th = tcp_hdr(skb);
+	const struct tcp_ao_hdr *aoh;
+	const __u8 *md5_location;
+	int l3index;
+
+	/* Invalid option or two times meet any of auth options */
+	if (tcp_parse_auth_options(th, &md5_location, &aoh))
+		return SKB_DROP_REASON_TCP_AUTH_HDR;
+
+	if (req) {
+		if (tcp_rsk_used_ao(req) != !!aoh)
+			return SKB_DROP_REASON_TCP_AOFAILURE;
+	}
+
+	/* sdif set, means packet ingressed via a device
+	 * in an L3 domain and dif is set to the l3mdev
+	 */
+	l3index = sdif ? dif : 0;
+
+	/* Fast path: unsigned segments */
+	if (likely(!md5_location && !aoh)) {
+		/* Drop if there's TCP-MD5 or TCP-AO key with any rcvid/sndid
+		 * for the remote peer. On TCP-AO established connection
+		 * the last key is impossible to remove, so there's
+		 * always at least one current_key.
+		 */
+		if (tcp_ao_required(sk, saddr, family))
+			return SKB_DROP_REASON_TCP_AONOTFOUND;
+		if (unlikely(tcp_md5_do_lookup(sk, l3index, saddr, family))) {
+			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
+			return SKB_DROP_REASON_TCP_MD5NOTFOUND;
+		}
+		return SKB_NOT_DROPPED_YET;
+	}
+
+	if (aoh)
+		return tcp_inbound_ao_hash(sk, skb, family, req, aoh);
+
+	return tcp_inbound_md5_hash(sk, skb, saddr, daddr, family,
+				    l3index, md5_location);
+}
+
 #endif	/* _TCP_H */
diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h
index 1d69978e349a..1c7c0a5d1877 100644
--- a/include/net/tcp_ao.h
+++ b/include/net/tcp_ao.h
@@ -111,6 +111,9 @@ struct tcp6_ao_context {
 };
 
 struct tcp_sigpool;
+#define TCP_AO_ESTABLISHED (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | \
+			    TCPF_CLOSE | TCPF_CLOSE_WAIT | \
+			    TCPF_LAST_ACK | TCPF_CLOSING)
 
 int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb,
 			struct tcp_ao_key *key, struct tcphdr *th,
@@ -130,6 +133,10 @@ int tcp_ao_calc_traffic_key(struct tcp_ao_key *mkt, u8 *key, void *ctx,
 			    unsigned int len, struct tcp_sigpool *hp);
 void tcp_ao_destroy_sock(struct sock *sk, bool twsk);
 void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp);
+enum skb_drop_reason tcp_inbound_ao_hash(struct sock *sk,
+			const struct sk_buff *skb, unsigned short int family,
+			const struct request_sock *req,
+			const struct tcp_ao_hdr *aoh);
 struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk,
 				    const union tcp_ao_addr *addr,
 				    int family, int sndid, int rcvid);
@@ -208,6 +215,13 @@ static inline void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb,
 {
 }
 
+static inline enum skb_drop_reason tcp_inbound_ao_hash(struct sock *sk,
+		const struct sk_buff *skb, unsigned short int family,
+		const struct request_sock *req, const struct tcp_ao_hdr *aoh)
+{
+	return SKB_NOT_DROPPED_YET;
+}
+
 static inline struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk,
 		const union tcp_ao_addr *addr, int family, int sndid, int rcvid)
 {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 369e2a41bc1b..eb71212a09d8 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4375,42 +4375,23 @@ EXPORT_SYMBOL(tcp_md5_hash_key);
 enum skb_drop_reason
 tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
 		     const void *saddr, const void *daddr,
-		     int family, int dif, int sdif)
+		     int family, int l3index, const __u8 *hash_location)
 {
-	/*
-	 * This gets called for each TCP segment that arrives
-	 * so we want to be efficient.
+	/* This gets called for each TCP segment that has TCP-MD5 option.
 	 * We have 3 drop cases:
 	 * o No MD5 hash and one expected.
 	 * o MD5 hash and we're not expecting one.
 	 * o MD5 hash and its wrong.
 	 */
-	const __u8 *hash_location = NULL;
-	struct tcp_md5sig_key *hash_expected;
 	const struct tcphdr *th = tcp_hdr(skb);
 	const struct tcp_sock *tp = tcp_sk(sk);
-	int genhash, l3index;
+	struct tcp_md5sig_key *key;
 	u8 newhash[16];
+	int genhash;
 
-	/* sdif set, means packet ingressed via a device
-	 * in an L3 domain and dif is set to the l3mdev
-	 */
-	l3index = sdif ? dif : 0;
-
-	hash_expected = tcp_md5_do_lookup(sk, l3index, saddr, family);
-	if (tcp_parse_auth_options(th, &hash_location, NULL))
-		return SKB_DROP_REASON_TCP_AUTH_HDR;
-
-	/* We've parsed the options - do we have a hash? */
-	if (!hash_expected && !hash_location)
-		return SKB_NOT_DROPPED_YET;
+	key = tcp_md5_do_lookup(sk, l3index, saddr, family);
 
-	if (hash_expected && !hash_location) {
-		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
-		return SKB_DROP_REASON_TCP_MD5NOTFOUND;
-	}
-
-	if (!hash_expected && hash_location) {
+	if (!key && hash_location) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
 		return SKB_DROP_REASON_TCP_MD5UNEXPECTED;
 	}
@@ -4420,14 +4401,10 @@ tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
 	 * IPv4-mapped case.
 	 */
 	if (family == AF_INET)
-		genhash = tcp_v4_md5_hash_skb(newhash,
-					      hash_expected,
-					      NULL, skb);
+		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
 	else
-		genhash = tp->af_specific->calc_md5_hash(newhash,
-							 hash_expected,
+		genhash = tp->af_specific->calc_md5_hash(newhash, key,
 							 NULL, skb);
-
 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
 		if (family == AF_INET) {
diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c
index de3710758d55..6c5815713b73 100644
--- a/net/ipv4/tcp_ao.c
+++ b/net/ipv4/tcp_ao.c
@@ -761,6 +761,148 @@ void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb,
 	treq->maclen = tcp_ao_maclen(key);
 }
 
+static enum skb_drop_reason
+tcp_ao_verify_hash(const struct sock *sk, const struct sk_buff *skb,
+		   unsigned short int family, struct tcp_ao_info *info,
+		   const struct tcp_ao_hdr *aoh, struct tcp_ao_key *key,
+		   u8 *traffic_key, u8 *phash, u32 sne)
+{
+	u8 maclen = aoh->length - sizeof(struct tcp_ao_hdr);
+	const struct tcphdr *th = tcp_hdr(skb);
+	void *hash_buf = NULL;
+
+	if (maclen != tcp_ao_maclen(key))
+		return SKB_DROP_REASON_TCP_AOFAILURE;
+
+	hash_buf = kmalloc(tcp_ao_digest_size(key), GFP_ATOMIC);
+	if (!hash_buf)
+		return SKB_DROP_REASON_NOT_SPECIFIED;
+
+	/* XXX: make it per-AF callback? */
+	tcp_ao_hash_skb(family, hash_buf, key, sk, skb, traffic_key,
+			(phash - (u8 *)th), sne);
+	if (memcmp(phash, hash_buf, maclen)) {
+		kfree(hash_buf);
+		return SKB_DROP_REASON_TCP_AOFAILURE;
+	}
+	kfree(hash_buf);
+	return SKB_NOT_DROPPED_YET;
+}
+
+enum skb_drop_reason
+tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb,
+		    unsigned short int family, const struct request_sock *req,
+		    const struct tcp_ao_hdr *aoh)
+{
+	const struct tcphdr *th = tcp_hdr(skb);
+	u8 *phash = (u8 *)(aoh + 1); /* hash goes just after the header */
+	struct tcp_ao_info *info;
+	enum skb_drop_reason ret;
+	struct tcp_ao_key *key;
+	__be32 sisn, disn;
+	u8 *traffic_key;
+	u32 sne = 0;
+
+	info = rcu_dereference(tcp_sk(sk)->ao_info);
+	if (!info)
+		return SKB_DROP_REASON_TCP_AOUNEXPECTED;
+
+	if (unlikely(th->syn)) {
+		sisn = th->seq;
+		disn = 0;
+	}
+
+	/* Fast-path */
+	if (likely((1 << sk->sk_state) & TCP_AO_ESTABLISHED)) {
+		enum skb_drop_reason err;
+		struct tcp_ao_key *current_key;
+
+		/* Check if this socket's rnext_key matches the keyid in the
+		 * packet. If not we lookup the key based on the keyid
+		 * matching the rcvid in the mkt.
+		 */
+		key = READ_ONCE(info->rnext_key);
+		if (key->rcvid != aoh->keyid) {
+			key = tcp_ao_established_key(info, -1, aoh->keyid);
+			if (!key)
+				goto key_not_found;
+		}
+
+		/* Delayed retransmitted SYN */
+		if (unlikely(th->syn && !th->ack))
+			goto verify_hash;
+
+		sne = 0;
+		/* Established socket, traffic key are cached */
+		traffic_key = rcv_other_key(key);
+		err = tcp_ao_verify_hash(sk, skb, family, info, aoh, key,
+					 traffic_key, phash, sne);
+		if (err)
+			return err;
+		current_key = READ_ONCE(info->current_key);
+		/* Key rotation: the peer asks us to use new key (RNext) */
+		if (unlikely(aoh->rnext_keyid != current_key->sndid)) {
+			/* If the key is not found we do nothing. */
+			key = tcp_ao_established_key(info, aoh->rnext_keyid, -1);
+			if (key)
+				/* pairs with tcp_ao_del_cmd */
+				WRITE_ONCE(info->current_key, key);
+		}
+		return SKB_NOT_DROPPED_YET;
+	}
+
+	/* Lookup key based on peer address and keyid.
+	 * current_key and rnext_key must not be used on tcp listen
+	 * sockets as otherwise:
+	 * - request sockets would race on those key pointers
+	 * - tcp_ao_del_cmd() allows async key removal
+	 */
+	key = tcp_ao_inbound_lookup(family, sk, skb, -1, aoh->keyid);
+	if (!key)
+		goto key_not_found;
+
+	if (th->syn && !th->ack)
+		goto verify_hash;
+
+	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV)) {
+		/* Make the initial syn the likely case here */
+		if (unlikely(req)) {
+			sne = 0;
+			sisn = htonl(tcp_rsk(req)->rcv_isn);
+			disn = htonl(tcp_rsk(req)->snt_isn);
+		} else if (unlikely(th->ack && !th->syn)) {
+			/* Possible syncookie packet */
+			sisn = htonl(ntohl(th->seq) - 1);
+			disn = htonl(ntohl(th->ack_seq) - 1);
+			sne = 0;
+		} else if (unlikely(!th->syn)) {
+			/* no way to figure out initial sisn/disn - drop */
+			return SKB_DROP_REASON_TCP_FLAGS;
+		}
+	} else if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+		disn = info->lisn;
+		if (th->syn || th->rst)
+			sisn = th->seq;
+		else
+			sisn = info->risn;
+	} else {
+		WARN_ONCE(1, "TCP-AO: Unexpected sk_state %d", sk->sk_state);
+		return SKB_DROP_REASON_TCP_AOFAILURE;
+	}
+verify_hash:
+	traffic_key = kmalloc(tcp_ao_digest_size(key), GFP_ATOMIC);
+	if (!traffic_key)
+		return SKB_DROP_REASON_NOT_SPECIFIED;
+	tcp_ao_calc_key_skb(key, traffic_key, skb, sisn, disn, family);
+	ret = tcp_ao_verify_hash(sk, skb, family, info, aoh, key,
+				 traffic_key, phash, sne);
+	kfree(traffic_key);
+	return ret;
+
+key_not_found:
+	return SKB_DROP_REASON_TCP_AOKEYNOTFOUND;
+}
+
 static int tcp_ao_cache_traffic_keys(const struct sock *sk,
 				     struct tcp_ao_info *ao,
 				     struct tcp_ao_key *ao_key)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index bdf0224ae827..f39ccefa78dc 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2204,9 +2204,9 @@ process:
 		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
 			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
 		else
-			drop_reason = tcp_inbound_md5_hash(sk, skb,
-						   &iph->saddr, &iph->daddr,
-						   AF_INET, dif, sdif);
+			drop_reason = tcp_inbound_hash(sk, req, skb,
+						       &iph->saddr, &iph->daddr,
+						       AF_INET, dif, sdif);
 		if (unlikely(drop_reason)) {
 			sk_drops_add(sk, skb);
 			reqsk_put(req);
@@ -2283,8 +2283,8 @@ process:
 		goto discard_and_relse;
 	}
 
-	drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
-					   &iph->daddr, AF_INET, dif, sdif);
+	drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
+				       AF_INET, dif, sdif);
 	if (drop_reason)
 		goto discard_and_relse;
 
diff --git a/net/ipv6/tcp_ao.c b/net/ipv6/tcp_ao.c
index 99753e12c08c..8b04611c9078 100644
--- a/net/ipv6/tcp_ao.c
+++ b/net/ipv6/tcp_ao.c
@@ -53,11 +53,12 @@ int tcp_v6_ao_calc_key_skb(struct tcp_ao_key *mkt, u8 *key,
 			   const struct sk_buff *skb,
 			   __be32 sisn, __be32 disn)
 {
-       const struct ipv6hdr *iph = ipv6_hdr(skb);
-       const struct tcphdr *th = tcp_hdr(skb);
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	const struct tcphdr *th = tcp_hdr(skb);
 
-       return tcp_v6_ao_calc_key(mkt, key, &iph->saddr, &iph->daddr,
-				 th->source, th->dest, sisn, disn);
+	return tcp_v6_ao_calc_key(mkt, key, &iph->saddr,
+				  &iph->daddr, th->source,
+				  th->dest, sisn, disn);
 }
 
 int tcp_v6_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 8c5c96187a72..d2724383d263 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1785,9 +1785,9 @@ process:
 		if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
 			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
 		else
-			drop_reason = tcp_inbound_md5_hash(sk, skb,
-							   &hdr->saddr, &hdr->daddr,
-							   AF_INET6, dif, sdif);
+			drop_reason = tcp_inbound_hash(sk, req, skb,
+						       &hdr->saddr, &hdr->daddr,
+						       AF_INET6, dif, sdif);
 		if (drop_reason) {
 			sk_drops_add(sk, skb);
 			reqsk_put(req);
@@ -1861,8 +1861,8 @@ process:
 		goto discard_and_relse;
 	}
 
-	drop_reason = tcp_inbound_md5_hash(sk, skb, &hdr->saddr, &hdr->daddr,
-					   AF_INET6, dif, sdif);
+	drop_reason = tcp_inbound_hash(sk, NULL, skb, &hdr->saddr, &hdr->daddr,
+				       AF_INET6, dif, sdif);
 	if (drop_reason)
 		goto discard_and_relse;
 
@@ -2089,6 +2089,7 @@ static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = {
 	.ao_lookup	=	tcp_v6_ao_lookup,
 	.calc_ao_hash	=	tcp_v4_ao_hash_skb,
 	.ao_parse	=	tcp_v6_parse_ao,
+	.ao_calc_key_sk	=	tcp_v4_ao_calc_key_sk,
 #endif
 };
 #endif
-- 
cgit v1.2.3-70-g09d2


From af09a341dcf63b34ce742295ad1ce876246c5de2 Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 23 Oct 2023 20:22:05 +0100
Subject: net/tcp: Add TCP-AO segments counters

Introduce segment counters that are useful for troubleshooting/debugging
as well as for writing tests.
Now there are global snmp counters as well as per-socket and per-key.

Co-developed-by: Francesco Ruggeri <fruggeri@arista.com>
Signed-off-by: Francesco Ruggeri <fruggeri@arista.com>
Co-developed-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
Acked-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dropreason-core.h | 15 +++++++++++----
 include/net/tcp.h             | 15 +++++++++++----
 include/net/tcp_ao.h          | 10 ++++++++++
 include/uapi/linux/snmp.h     |  4 ++++
 include/uapi/linux/tcp.h      |  8 +++++++-
 net/ipv4/proc.c               |  4 ++++
 net/ipv4/tcp_ao.c             | 30 +++++++++++++++++++++++++++---
 net/ipv4/tcp_ipv4.c           |  2 +-
 net/ipv6/tcp_ipv6.c           |  4 ++--
 9 files changed, 77 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
index 7637137ae33e..3c70ad53a49c 100644
--- a/include/net/dropreason-core.h
+++ b/include/net/dropreason-core.h
@@ -168,17 +168,24 @@ enum skb_drop_reason {
 	 */
 	SKB_DROP_REASON_TCP_MD5FAILURE,
 	/**
-	 * @SKB_DROP_REASON_TCP_AONOTFOUND: no TCP-AO hash and one was expected
+	 * @SKB_DROP_REASON_TCP_AONOTFOUND: no TCP-AO hash and one was expected,
+	 * corresponding to LINUX_MIB_TCPAOREQUIRED
 	 */
 	SKB_DROP_REASON_TCP_AONOTFOUND,
 	/**
 	 * @SKB_DROP_REASON_TCP_AOUNEXPECTED: TCP-AO hash is present and it
-	 * was not expected.
+	 * was not expected, corresponding to LINUX_MIB_TCPAOKEYNOTFOUND
 	 */
 	SKB_DROP_REASON_TCP_AOUNEXPECTED,
-	/** @SKB_DROP_REASON_TCP_AOKEYNOTFOUND: TCP-AO key is unknown */
+	/**
+	 * @SKB_DROP_REASON_TCP_AOKEYNOTFOUND: TCP-AO key is unknown,
+	 * corresponding to LINUX_MIB_TCPAOKEYNOTFOUND
+	 */
 	SKB_DROP_REASON_TCP_AOKEYNOTFOUND,
-	/** @SKB_DROP_REASON_TCP_AOFAILURE: TCP-AO hash is wrong */
+	/**
+	 * @SKB_DROP_REASON_TCP_AOFAILURE: TCP-AO hash is wrong,
+	 * corresponding to LINUX_MIB_TCPAOBAD
+	 */
 	SKB_DROP_REASON_TCP_AOFAILURE,
 	/**
 	 * @SKB_DROP_REASON_SOCKET_BACKLOG: failed to add skb to socket backlog (
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 8e1f835bad22..50ae1ed244e5 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2712,7 +2712,7 @@ static inline int tcp_parse_auth_options(const struct tcphdr *th,
 }
 
 static inline bool tcp_ao_required(struct sock *sk, const void *saddr,
-				   int family)
+				   int family, bool stat_inc)
 {
 #ifdef CONFIG_TCP_AO
 	struct tcp_ao_info *ao_info;
@@ -2724,8 +2724,13 @@ static inline bool tcp_ao_required(struct sock *sk, const void *saddr,
 		return false;
 
 	ao_key = tcp_ao_do_lookup(sk, saddr, family, -1, -1);
-	if (ao_info->ao_required || ao_key)
+	if (ao_info->ao_required || ao_key) {
+		if (stat_inc) {
+			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOREQUIRED);
+			atomic64_inc(&ao_info->counters.ao_required);
+		}
 		return true;
+	}
 #endif
 	return false;
 }
@@ -2747,8 +2752,10 @@ tcp_inbound_hash(struct sock *sk, const struct request_sock *req,
 		return SKB_DROP_REASON_TCP_AUTH_HDR;
 
 	if (req) {
-		if (tcp_rsk_used_ao(req) != !!aoh)
+		if (tcp_rsk_used_ao(req) != !!aoh) {
+			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD);
 			return SKB_DROP_REASON_TCP_AOFAILURE;
+		}
 	}
 
 	/* sdif set, means packet ingressed via a device
@@ -2763,7 +2770,7 @@ tcp_inbound_hash(struct sock *sk, const struct request_sock *req,
 		 * the last key is impossible to remove, so there's
 		 * always at least one current_key.
 		 */
-		if (tcp_ao_required(sk, saddr, family))
+		if (tcp_ao_required(sk, saddr, family, true))
 			return SKB_DROP_REASON_TCP_AONOTFOUND;
 		if (unlikely(tcp_md5_do_lookup(sk, l3index, saddr, family))) {
 			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h
index 1c7c0a5d1877..cfb55bd9411b 100644
--- a/include/net/tcp_ao.h
+++ b/include/net/tcp_ao.h
@@ -19,6 +19,13 @@ struct tcp_ao_hdr {
 	u8	rnext_keyid;
 };
 
+struct tcp_ao_counters {
+	atomic64_t	pkt_good;
+	atomic64_t	pkt_bad;
+	atomic64_t	key_not_found;
+	atomic64_t	ao_required;
+};
+
 struct tcp_ao_key {
 	struct hlist_node	node;
 	union tcp_ao_addr	addr;
@@ -33,6 +40,8 @@ struct tcp_ao_key {
 	u8			rcvid;
 	u8			maclen;
 	struct rcu_head		rcu;
+	atomic64_t		pkt_good;
+	atomic64_t		pkt_bad;
 	u8			traffic_keys[];
 };
 
@@ -81,6 +90,7 @@ struct tcp_ao_info {
 	 */
 	struct tcp_ao_key	*current_key;
 	struct tcp_ao_key	*rnext_key;
+	struct tcp_ao_counters	counters;
 	u32			ao_required	:1,
 				__unused	:31;
 	__be32			lisn;
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index b2b72886cb6d..3d5ea841bffe 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -297,6 +297,10 @@ enum
 	LINUX_MIB_TCPMIGRATEREQSUCCESS,		/* TCPMigrateReqSuccess */
 	LINUX_MIB_TCPMIGRATEREQFAILURE,		/* TCPMigrateReqFailure */
 	LINUX_MIB_TCPPLBREHASH,			/* TCPPLBRehash */
+	LINUX_MIB_TCPAOREQUIRED,		/* TCPAORequired */
+	LINUX_MIB_TCPAOBAD,			/* TCPAOBad */
+	LINUX_MIB_TCPAOKEYNOTFOUND,		/* TCPAOKeyNotFound */
+	LINUX_MIB_TCPAOGOOD,			/* TCPAOGood */
 	__LINUX_MIB_MAX
 };
 
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index fa49f03e62fe..9c48964849d1 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -404,9 +404,15 @@ struct tcp_ao_info_opt { /* setsockopt(TCP_AO_INFO) */
 	__u32   set_current	:1,	/* corresponding ::current_key */
 		set_rnext	:1,	/* corresponding ::rnext */
 		ao_required	:1,	/* don't accept non-AO connects */
-		reserved	:29;	/* must be 0 */
+		set_counters	:1,	/* set/clear ::pkt_* counters */
+		reserved	:28;	/* must be 0 */
+	__u16	reserved2;		/* padding, must be 0 */
 	__u8	current_key;		/* KeyID to set as Current_key */
 	__u8	rnext;			/* KeyID to set as Rnext_key */
+	__u64	pkt_good;		/* verified segments */
+	__u64	pkt_bad;		/* failed verification */
+	__u64	pkt_key_not_found;	/* could not find a key to verify */
+	__u64	pkt_ao_required;	/* segments missing TCP-AO sign */
 } __attribute__((aligned(8)));
 
 /* setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index a85b0aba3646..f5b37ebc18c0 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -299,6 +299,10 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPMigrateReqSuccess", LINUX_MIB_TCPMIGRATEREQSUCCESS),
 	SNMP_MIB_ITEM("TCPMigrateReqFailure", LINUX_MIB_TCPMIGRATEREQFAILURE),
 	SNMP_MIB_ITEM("TCPPLBRehash", LINUX_MIB_TCPPLBREHASH),
+	SNMP_MIB_ITEM("TCPAORequired", LINUX_MIB_TCPAOREQUIRED),
+	SNMP_MIB_ITEM("TCPAOBad", LINUX_MIB_TCPAOBAD),
+	SNMP_MIB_ITEM("TCPAOKeyNotFound", LINUX_MIB_TCPAOKEYNOTFOUND),
+	SNMP_MIB_ITEM("TCPAOGood", LINUX_MIB_TCPAOGOOD),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c
index 6c5815713b73..1097e99a9ad6 100644
--- a/net/ipv4/tcp_ao.c
+++ b/net/ipv4/tcp_ao.c
@@ -182,6 +182,8 @@ static struct tcp_ao_key *tcp_ao_copy_key(struct sock *sk,
 	*new_key = *key;
 	INIT_HLIST_NODE(&new_key->node);
 	tcp_sigpool_get(new_key->tcp_sigpool_id);
+	atomic64_set(&new_key->pkt_good, 0);
+	atomic64_set(&new_key->pkt_bad, 0);
 
 	return new_key;
 }
@@ -771,8 +773,12 @@ tcp_ao_verify_hash(const struct sock *sk, const struct sk_buff *skb,
 	const struct tcphdr *th = tcp_hdr(skb);
 	void *hash_buf = NULL;
 
-	if (maclen != tcp_ao_maclen(key))
+	if (maclen != tcp_ao_maclen(key)) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD);
+		atomic64_inc(&info->counters.pkt_bad);
+		atomic64_inc(&key->pkt_bad);
 		return SKB_DROP_REASON_TCP_AOFAILURE;
+	}
 
 	hash_buf = kmalloc(tcp_ao_digest_size(key), GFP_ATOMIC);
 	if (!hash_buf)
@@ -782,9 +788,15 @@ tcp_ao_verify_hash(const struct sock *sk, const struct sk_buff *skb,
 	tcp_ao_hash_skb(family, hash_buf, key, sk, skb, traffic_key,
 			(phash - (u8 *)th), sne);
 	if (memcmp(phash, hash_buf, maclen)) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD);
+		atomic64_inc(&info->counters.pkt_bad);
+		atomic64_inc(&key->pkt_bad);
 		kfree(hash_buf);
 		return SKB_DROP_REASON_TCP_AOFAILURE;
 	}
+	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOGOOD);
+	atomic64_inc(&info->counters.pkt_good);
+	atomic64_inc(&key->pkt_good);
 	kfree(hash_buf);
 	return SKB_NOT_DROPPED_YET;
 }
@@ -804,8 +816,10 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb,
 	u32 sne = 0;
 
 	info = rcu_dereference(tcp_sk(sk)->ao_info);
-	if (!info)
+	if (!info) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOKEYNOTFOUND);
 		return SKB_DROP_REASON_TCP_AOUNEXPECTED;
+	}
 
 	if (unlikely(th->syn)) {
 		sisn = th->seq;
@@ -900,6 +914,8 @@ verify_hash:
 	return ret;
 
 key_not_found:
+	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOKEYNOTFOUND);
+	atomic64_inc(&info->counters.key_not_found);
 	return SKB_DROP_REASON_TCP_AOKEYNOTFOUND;
 }
 
@@ -1483,6 +1499,8 @@ static int tcp_ao_add_cmd(struct sock *sk, unsigned short int family,
 	key->keyflags	= cmd.keyflags;
 	key->sndid	= cmd.sndid;
 	key->rcvid	= cmd.rcvid;
+	atomic64_set(&key->pkt_good, 0);
+	atomic64_set(&key->pkt_bad, 0);
 
 	ret = tcp_ao_parse_crypto(&cmd, key);
 	if (ret < 0)
@@ -1699,7 +1717,7 @@ static int tcp_ao_info_cmd(struct sock *sk, unsigned short int family,
 			return -EINVAL;
 	}
 
-	if (cmd.reserved != 0)
+	if (cmd.reserved != 0 || cmd.reserved2 != 0)
 		return -EINVAL;
 
 	ao_info = setsockopt_ao_info(sk);
@@ -1734,6 +1752,12 @@ static int tcp_ao_info_cmd(struct sock *sk, unsigned short int family,
 			goto out;
 		}
 	}
+	if (cmd.set_counters) {
+		atomic64_set(&ao_info->counters.pkt_good, cmd.pkt_good);
+		atomic64_set(&ao_info->counters.pkt_bad, cmd.pkt_bad);
+		atomic64_set(&ao_info->counters.key_not_found, cmd.pkt_key_not_found);
+		atomic64_set(&ao_info->counters.ao_required, cmd.pkt_ao_required);
+	}
 
 	ao_info->ao_required = cmd.ao_required;
 	if (new_current)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f39ccefa78dc..ece95d5138e1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1531,7 +1531,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
 	/* Don't allow keys for peers that have a matching TCP-AO key.
 	 * See the comment in tcp_ao_add_cmd()
 	 */
-	if (tcp_ao_required(sk, addr, AF_INET))
+	if (tcp_ao_required(sk, addr, AF_INET, false))
 		return -EKEYREJECTED;
 
 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d2724383d263..cc899caf348e 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -661,7 +661,7 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname,
 		/* Don't allow keys for peers that have a matching TCP-AO key.
 		 * See the comment in tcp_ao_add_cmd()
 		 */
-		if (tcp_ao_required(sk, addr, AF_INET))
+		if (tcp_ao_required(sk, addr, AF_INET, false))
 			return -EKEYREJECTED;
 		return tcp_md5_do_add(sk, addr,
 				      AF_INET, prefixlen, l3index, flags,
@@ -673,7 +673,7 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname,
 	/* Don't allow keys for peers that have a matching TCP-AO key.
 	 * See the comment in tcp_ao_add_cmd()
 	 */
-	if (tcp_ao_required(sk, addr, AF_INET6))
+	if (tcp_ao_required(sk, addr, AF_INET6, false))
 		return -EKEYREJECTED;
 
 	return tcp_md5_do_add(sk, addr, AF_INET6, prefixlen, l3index, flags,
-- 
cgit v1.2.3-70-g09d2


From 64382c71a5575741933dfdb0cf7162c6e9b8854e Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 23 Oct 2023 20:22:06 +0100
Subject: net/tcp: Add TCP-AO SNE support

Add Sequence Number Extension (SNE) for TCP-AO.
This is needed to protect long-living TCP-AO connections from replaying
attacks after sequence number roll-over, see RFC5925 (6.2).

Co-developed-by: Francesco Ruggeri <fruggeri@arista.com>
Signed-off-by: Francesco Ruggeri <fruggeri@arista.com>
Co-developed-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
Acked-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp_ao.h     | 22 +++++++++++++++++++++-
 net/ipv4/tcp_ao.c        | 46 +++++++++++++++++++++++++++++++++++++---------
 net/ipv4/tcp_input.c     | 28 ++++++++++++++++++++++++++++
 net/ipv4/tcp_ipv4.c      |  3 ++-
 net/ipv4/tcp_minisocks.c | 15 ++++++++++++++-
 net/ipv6/tcp_ipv6.c      |  3 ++-
 6 files changed, 104 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h
index cfb55bd9411b..0c3516d1b968 100644
--- a/include/net/tcp_ao.h
+++ b/include/net/tcp_ao.h
@@ -95,6 +95,25 @@ struct tcp_ao_info {
 				__unused	:31;
 	__be32			lisn;
 	__be32			risn;
+	/* Sequence Number Extension (SNE) are upper 4 bytes for SEQ,
+	 * that protect TCP-AO connection from replayed old TCP segments.
+	 * See RFC5925 (6.2).
+	 * In order to get correct SNE, there's a helper tcp_ao_compute_sne().
+	 * It needs SEQ basis to understand whereabouts are lower SEQ numbers.
+	 * According to that basis vector, it can provide incremented SNE
+	 * when SEQ rolls over or provide decremented SNE when there's
+	 * a retransmitted segment from before-rolling over.
+	 * - for request sockets such basis is rcv_isn/snt_isn, which seems
+	 *   good enough as it's unexpected to receive 4 Gbytes on reqsk.
+	 * - for full sockets the basis is rcv_nxt/snd_una. snd_una is
+	 *   taken instead of snd_nxt as currently it's easier to track
+	 *   in tcp_snd_una_update(), rather than updating SNE in all
+	 *   WRITE_ONCE(tp->snd_nxt, ...)
+	 * - for time-wait sockets the basis is tw_rcv_nxt/tw_snd_nxt.
+	 *   tw_snd_nxt is not expected to change, while tw_rcv_nxt may.
+	 */
+	u32			snd_sne;
+	u32			rcv_sne;
 	refcount_t		refcnt;		/* Protects twsk destruction */
 	struct rcu_head		rcu;
 };
@@ -147,6 +166,7 @@ enum skb_drop_reason tcp_inbound_ao_hash(struct sock *sk,
 			const struct sk_buff *skb, unsigned short int family,
 			const struct request_sock *req,
 			const struct tcp_ao_hdr *aoh);
+u32 tcp_ao_compute_sne(u32 next_sne, u32 next_seq, u32 seq);
 struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk,
 				    const union tcp_ao_addr *addr,
 				    int family, int sndid, int rcvid);
@@ -156,7 +176,7 @@ int tcp_ao_hash_hdr(unsigned short family, char *ao_hash,
 		    const union tcp_ao_addr *saddr,
 		    const struct tcphdr *th, u32 sne);
 int tcp_ao_prepare_reset(const struct sock *sk, struct sk_buff *skb,
-			 const struct tcp_ao_hdr *aoh, int l3index,
+			 const struct tcp_ao_hdr *aoh, int l3index, u32 seq,
 			 struct tcp_ao_key **key, char **traffic_key,
 			 bool *allocated_traffic_key, u8 *keyid, u32 *sne);
 
diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c
index 1097e99a9ad6..7e14bcd4dfd4 100644
--- a/net/ipv4/tcp_ao.c
+++ b/net/ipv4/tcp_ao.c
@@ -401,6 +401,21 @@ static int tcp_ao_hash_pseudoheader(unsigned short int family,
 	return -EAFNOSUPPORT;
 }
 
+u32 tcp_ao_compute_sne(u32 next_sne, u32 next_seq, u32 seq)
+{
+	u32 sne = next_sne;
+
+	if (before(seq, next_seq)) {
+		if (seq > next_seq)
+			sne--;
+	} else {
+		if (seq < next_seq)
+			sne++;
+	}
+
+	return sne;
+}
+
 /* tcp_ao_hash_sne(struct tcp_sigpool *hp)
  * @hp	- used for hashing
  * @sne - sne value
@@ -611,7 +626,7 @@ struct tcp_ao_key *tcp_v4_ao_lookup(const struct sock *sk, struct sock *addr_sk,
 }
 
 int tcp_ao_prepare_reset(const struct sock *sk, struct sk_buff *skb,
-			 const struct tcp_ao_hdr *aoh, int l3index,
+			 const struct tcp_ao_hdr *aoh, int l3index, u32 seq,
 			 struct tcp_ao_key **key, char **traffic_key,
 			 bool *allocated_traffic_key, u8 *keyid, u32 *sne)
 {
@@ -639,7 +654,7 @@ int tcp_ao_prepare_reset(const struct sock *sk, struct sk_buff *skb,
 
 			sisn = htonl(tcp_rsk(req)->rcv_isn);
 			disn = htonl(tcp_rsk(req)->snt_isn);
-			*sne = 0;
+			*sne = tcp_ao_compute_sne(0, tcp_rsk(req)->snt_isn, seq);
 		} else {
 			sisn = th->seq;
 			disn = 0;
@@ -670,11 +685,15 @@ int tcp_ao_prepare_reset(const struct sock *sk, struct sk_buff *skb,
 		*keyid = (*key)->rcvid;
 	} else {
 		struct tcp_ao_key *rnext_key;
+		u32 snd_basis;
 
-		if (sk->sk_state == TCP_TIME_WAIT)
+		if (sk->sk_state == TCP_TIME_WAIT) {
 			ao_info = rcu_dereference(tcp_twsk(sk)->ao_info);
-		else
+			snd_basis = tcp_twsk(sk)->tw_snd_nxt;
+		} else {
 			ao_info = rcu_dereference(tcp_sk(sk)->ao_info);
+			snd_basis = tcp_sk(sk)->snd_una;
+		}
 		if (!ao_info)
 			return -ENOENT;
 
@@ -684,7 +703,8 @@ int tcp_ao_prepare_reset(const struct sock *sk, struct sk_buff *skb,
 		*traffic_key = snd_other_key(*key);
 		rnext_key = READ_ONCE(ao_info->rnext_key);
 		*keyid = rnext_key->rcvid;
-		*sne = 0;
+		*sne = tcp_ao_compute_sne(READ_ONCE(ao_info->snd_sne),
+					  snd_basis, seq);
 	}
 	return 0;
 }
@@ -698,6 +718,7 @@ int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb,
 	struct tcp_ao_info *ao;
 	void *tkey_buf = NULL;
 	u8 *traffic_key;
+	u32 sne;
 
 	ao = rcu_dereference_protected(tcp_sk(sk)->ao_info,
 				       lockdep_sock_is_held(sk));
@@ -717,8 +738,10 @@ int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb,
 		tp->af_specific->ao_calc_key_sk(key, traffic_key,
 						sk, ao->lisn, disn, true);
 	}
+	sne = tcp_ao_compute_sne(READ_ONCE(ao->snd_sne), READ_ONCE(tp->snd_una),
+				 ntohl(th->seq));
 	tp->af_specific->calc_ao_hash(hash_location, key, sk, skb, traffic_key,
-				      hash_location - (u8 *)th, 0);
+				      hash_location - (u8 *)th, sne);
 	kfree(tkey_buf);
 	return 0;
 }
@@ -846,7 +869,8 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb,
 		if (unlikely(th->syn && !th->ack))
 			goto verify_hash;
 
-		sne = 0;
+		sne = tcp_ao_compute_sne(info->rcv_sne, tcp_sk(sk)->rcv_nxt,
+					 ntohl(th->seq));
 		/* Established socket, traffic key are cached */
 		traffic_key = rcv_other_key(key);
 		err = tcp_ao_verify_hash(sk, skb, family, info, aoh, key,
@@ -881,14 +905,16 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb,
 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV)) {
 		/* Make the initial syn the likely case here */
 		if (unlikely(req)) {
-			sne = 0;
+			sne = tcp_ao_compute_sne(0, tcp_rsk(req)->rcv_isn,
+						 ntohl(th->seq));
 			sisn = htonl(tcp_rsk(req)->rcv_isn);
 			disn = htonl(tcp_rsk(req)->snt_isn);
 		} else if (unlikely(th->ack && !th->syn)) {
 			/* Possible syncookie packet */
 			sisn = htonl(ntohl(th->seq) - 1);
 			disn = htonl(ntohl(th->ack_seq) - 1);
-			sne = 0;
+			sne = tcp_ao_compute_sne(0, ntohl(sisn),
+						 ntohl(th->seq));
 		} else if (unlikely(!th->syn)) {
 			/* no way to figure out initial sisn/disn - drop */
 			return SKB_DROP_REASON_TCP_FLAGS;
@@ -986,6 +1012,7 @@ void tcp_ao_connect_init(struct sock *sk)
 		tp->tcp_header_len += tcp_ao_len(key);
 
 		ao_info->lisn = htonl(tp->write_seq);
+		ao_info->snd_sne = 0;
 	} else {
 		/* Can't happen: tcp_connect() verifies that there's
 		 * at least one tcp-ao key that matches the remote peer.
@@ -1021,6 +1048,7 @@ void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb)
 		return;
 
 	WRITE_ONCE(ao->risn, tcp_hdr(skb)->seq);
+	ao->rcv_sne = 0;
 
 	hlist_for_each_entry_rcu(key, &ao->head, node)
 		tcp_ao_cache_traffic_keys(sk, ao, key);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index de64c0c2fb69..67ca72185f14 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3576,9 +3576,18 @@ static inline bool tcp_may_update_window(const struct tcp_sock *tp,
 static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
 {
 	u32 delta = ack - tp->snd_una;
+#ifdef CONFIG_TCP_AO
+	struct tcp_ao_info *ao;
+#endif
 
 	sock_owned_by_me((struct sock *)tp);
 	tp->bytes_acked += delta;
+#ifdef CONFIG_TCP_AO
+	ao = rcu_dereference_protected(tp->ao_info,
+				       lockdep_sock_is_held((struct sock *)tp));
+	if (ao && ack < tp->snd_una)
+		ao->snd_sne++;
+#endif
 	tp->snd_una = ack;
 }
 
@@ -3586,9 +3595,18 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
 static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
 {
 	u32 delta = seq - tp->rcv_nxt;
+#ifdef CONFIG_TCP_AO
+	struct tcp_ao_info *ao;
+#endif
 
 	sock_owned_by_me((struct sock *)tp);
 	tp->bytes_received += delta;
+#ifdef CONFIG_TCP_AO
+	ao = rcu_dereference_protected(tp->ao_info,
+				       lockdep_sock_is_held((struct sock *)tp));
+	if (ao && seq < tp->rcv_nxt)
+		ao->rcv_sne++;
+#endif
 	WRITE_ONCE(tp->rcv_nxt, seq);
 }
 
@@ -6456,6 +6474,16 @@ consume:
 		 * simultaneous connect with crossed SYNs.
 		 * Particularly, it can be connect to self.
 		 */
+#ifdef CONFIG_TCP_AO
+		struct tcp_ao_info *ao;
+
+		ao = rcu_dereference_protected(tp->ao_info,
+					       lockdep_sock_is_held(sk));
+		if (ao) {
+			WRITE_ONCE(ao->risn, th->seq);
+			ao->rcv_sne = 0;
+		}
+#endif
 		tcp_set_state(sk, TCP_SYN_RECV);
 
 		if (tp->rx_opt.saw_tstamp) {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ece95d5138e1..bdec99707028 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -676,7 +676,7 @@ static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
 	u8 keyid;
 
 	rcu_read_lock();
-	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index,
+	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
 				 &key, &traffic_key, &allocated_traffic_key,
 				 &keyid, &ao_sne))
 		goto out;
@@ -1034,6 +1034,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 		struct tcp_ao_key *rnext_key;
 
 		key.traffic_key = snd_other_key(key.ao_key);
+		key.sne = READ_ONCE(ao_info->snd_sne);
 		rnext_key = READ_ONCE(ao_info->rnext_key);
 		key.rcv_next = rnext_key->rcvid;
 		key.type = TCP_KEY_AO;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 8d941a6e066b..a9807eeb311c 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -51,6 +51,18 @@ tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw,
 	return TCP_TW_SUCCESS;
 }
 
+static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq)
+{
+#ifdef CONFIG_TCP_AO
+	struct tcp_ao_info *ao;
+
+	ao = rcu_dereference(tcptw->ao_info);
+	if (unlikely(ao && seq < tcptw->tw_rcv_nxt))
+		WRITE_ONCE(ao->rcv_sne, ao->rcv_sne + 1);
+#endif
+	tcptw->tw_rcv_nxt = seq;
+}
+
 /*
  * * Main purpose of TIME-WAIT state is to close connection gracefully,
  *   when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
@@ -136,7 +148,8 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
 
 		/* FIN arrived, enter true time-wait state. */
 		tw->tw_substate	  = TCP_TIME_WAIT;
-		tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+		twsk_rcv_nxt_update(tcptw, TCP_SKB_CB(skb)->end_seq);
+
 		if (tmp_opt.saw_tstamp) {
 			tcptw->tw_ts_recent_stamp = ktime_get_seconds();
 			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index cc899caf348e..4dcbc13e9ec8 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1090,7 +1090,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
 		int l3index;
 
 		l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0;
-		if (tcp_ao_prepare_reset(sk, skb, aoh, l3index,
+		if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, seq,
 					 &key.ao_key, &key.traffic_key,
 					 &allocated_traffic_key,
 					 &key.rcv_next, &key.sne))
@@ -1167,6 +1167,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
 		/* rcv_next switches to our rcv_next */
 		rnext_key = READ_ONCE(ao_info->rnext_key);
 		key.rcv_next = rnext_key->rcvid;
+		key.sne = READ_ONCE(ao_info->snd_sne);
 		key.type = TCP_KEY_AO;
 #else
 	if (0) {
-- 
cgit v1.2.3-70-g09d2


From 2717b5adea9e2558798c30eb0e93c01722edbb0a Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 23 Oct 2023 20:22:07 +0100
Subject: net/tcp: Add tcp_hash_fail() ratelimited logs

Add a helper for logging connection-detailed messages for failed TCP
hash verification (both MD5 and AO).

Co-developed-by: Francesco Ruggeri <fruggeri@arista.com>
Signed-off-by: Francesco Ruggeri <fruggeri@arista.com>
Co-developed-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
Acked-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h    | 14 ++++++++++++--
 include/net/tcp_ao.h | 29 +++++++++++++++++++++++++++++
 net/ipv4/tcp.c       | 23 +++++++++++++----------
 net/ipv4/tcp_ao.c    |  7 +++++++
 4 files changed, 61 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 50ae1ed244e5..54226d85feb8 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2748,12 +2748,18 @@ tcp_inbound_hash(struct sock *sk, const struct request_sock *req,
 	int l3index;
 
 	/* Invalid option or two times meet any of auth options */
-	if (tcp_parse_auth_options(th, &md5_location, &aoh))
+	if (tcp_parse_auth_options(th, &md5_location, &aoh)) {
+		tcp_hash_fail("TCP segment has incorrect auth options set",
+			      family, skb, "");
 		return SKB_DROP_REASON_TCP_AUTH_HDR;
+	}
 
 	if (req) {
 		if (tcp_rsk_used_ao(req) != !!aoh) {
 			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD);
+			tcp_hash_fail("TCP connection can't start/end using TCP-AO",
+				      family, skb, "%s",
+				      !aoh ? "missing AO" : "AO signed");
 			return SKB_DROP_REASON_TCP_AOFAILURE;
 		}
 	}
@@ -2770,10 +2776,14 @@ tcp_inbound_hash(struct sock *sk, const struct request_sock *req,
 		 * the last key is impossible to remove, so there's
 		 * always at least one current_key.
 		 */
-		if (tcp_ao_required(sk, saddr, family, true))
+		if (tcp_ao_required(sk, saddr, family, true)) {
+			tcp_hash_fail("AO hash is required, but not found",
+					family, skb, "L3 index %d", l3index);
 			return SKB_DROP_REASON_TCP_AONOTFOUND;
+		}
 		if (unlikely(tcp_md5_do_lookup(sk, l3index, saddr, family))) {
 			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
+			tcp_hash_fail("MD5 Hash not found", family, skb, "");
 			return SKB_DROP_REASON_TCP_MD5NOTFOUND;
 		}
 		return SKB_NOT_DROPPED_YET;
diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h
index 0c3516d1b968..4da6e3657913 100644
--- a/include/net/tcp_ao.h
+++ b/include/net/tcp_ao.h
@@ -118,6 +118,35 @@ struct tcp_ao_info {
 	struct rcu_head		rcu;
 };
 
+#define tcp_hash_fail(msg, family, skb, fmt, ...)			\
+do {									\
+	const struct tcphdr *th = tcp_hdr(skb);				\
+	char hdr_flags[5] = {};						\
+	char *f = hdr_flags;						\
+									\
+	if (th->fin)							\
+		*f++ = 'F';						\
+	if (th->syn)							\
+		*f++ = 'S';						\
+	if (th->rst)							\
+		*f++ = 'R';						\
+	if (th->ack)							\
+		*f++ = 'A';						\
+	if (f != hdr_flags)						\
+		*f = ' ';						\
+	if ((family) == AF_INET) {					\
+		net_info_ratelimited("%s for (%pI4, %d)->(%pI4, %d) %s" fmt "\n", \
+				msg, &ip_hdr(skb)->saddr, ntohs(th->source), \
+				&ip_hdr(skb)->daddr, ntohs(th->dest),	\
+				hdr_flags, ##__VA_ARGS__);		\
+	} else {							\
+		net_info_ratelimited("%s for [%pI6c]:%u->[%pI6c]:%u %s" fmt "\n", \
+				msg, &ipv6_hdr(skb)->saddr, ntohs(th->source), \
+				&ipv6_hdr(skb)->daddr, ntohs(th->dest),	\
+				hdr_flags, ##__VA_ARGS__);		\
+	}								\
+} while (0)
+
 #ifdef CONFIG_TCP_AO
 /* TCP-AO structures and functions */
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index eb71212a09d8..1be6467a059a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4383,7 +4383,6 @@ tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
 	 * o MD5 hash and we're not expecting one.
 	 * o MD5 hash and its wrong.
 	 */
-	const struct tcphdr *th = tcp_hdr(skb);
 	const struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_md5sig_key *key;
 	u8 newhash[16];
@@ -4393,6 +4392,7 @@ tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
 
 	if (!key && hash_location) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
+		tcp_hash_fail("Unexpected MD5 Hash found", family, skb, "");
 		return SKB_DROP_REASON_TCP_MD5UNEXPECTED;
 	}
 
@@ -4408,16 +4408,19 @@ tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
 		if (family == AF_INET) {
-			net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
-					saddr, ntohs(th->source),
-					daddr, ntohs(th->dest),
-					genhash ? " tcp_v4_calc_md5_hash failed"
-					: "", l3index);
+			tcp_hash_fail("MD5 Hash failed", AF_INET, skb, "%s L3 index %d",
+				      genhash ? "tcp_v4_calc_md5_hash failed"
+				      : "", l3index);
 		} else {
-			net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u L3 index %d\n",
-					genhash ? "failed" : "mismatch",
-					saddr, ntohs(th->source),
-					daddr, ntohs(th->dest), l3index);
+			if (genhash) {
+				tcp_hash_fail("MD5 Hash failed",
+					      AF_INET6, skb, "L3 index %d",
+					      l3index);
+			} else {
+				tcp_hash_fail("MD5 Hash mismatch",
+					      AF_INET6, skb, "L3 index %d",
+					      l3index);
+			}
 		}
 		return SKB_DROP_REASON_TCP_MD5FAILURE;
 	}
diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c
index 7e14bcd4dfd4..f76fcb93499d 100644
--- a/net/ipv4/tcp_ao.c
+++ b/net/ipv4/tcp_ao.c
@@ -800,6 +800,8 @@ tcp_ao_verify_hash(const struct sock *sk, const struct sk_buff *skb,
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD);
 		atomic64_inc(&info->counters.pkt_bad);
 		atomic64_inc(&key->pkt_bad);
+		tcp_hash_fail("AO hash wrong length", family, skb,
+			      "%u != %d", maclen, tcp_ao_maclen(key));
 		return SKB_DROP_REASON_TCP_AOFAILURE;
 	}
 
@@ -814,6 +816,7 @@ tcp_ao_verify_hash(const struct sock *sk, const struct sk_buff *skb,
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD);
 		atomic64_inc(&info->counters.pkt_bad);
 		atomic64_inc(&key->pkt_bad);
+		tcp_hash_fail("AO hash mismatch", family, skb, "");
 		kfree(hash_buf);
 		return SKB_DROP_REASON_TCP_AOFAILURE;
 	}
@@ -841,6 +844,8 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb,
 	info = rcu_dereference(tcp_sk(sk)->ao_info);
 	if (!info) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOKEYNOTFOUND);
+		tcp_hash_fail("AO key not found", family, skb,
+			      "keyid: %u", aoh->keyid);
 		return SKB_DROP_REASON_TCP_AOUNEXPECTED;
 	}
 
@@ -942,6 +947,8 @@ verify_hash:
 key_not_found:
 	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOKEYNOTFOUND);
 	atomic64_inc(&info->counters.key_not_found);
+	tcp_hash_fail("Requested by the peer AO key id not found",
+		      family, skb, "");
 	return SKB_DROP_REASON_TCP_AOKEYNOTFOUND;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 953af8e3acb68d2db11937cec3bc5da31de5c12e Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 23 Oct 2023 20:22:08 +0100
Subject: net/tcp: Ignore specific ICMPs for TCP-AO connections
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Similarly to IPsec, RFC5925 prescribes:
  ">> A TCP-AO implementation MUST default to ignore incoming ICMPv4
  messages of Type 3 (destination unreachable), Codes 2-4 (protocol
  unreachable, port unreachable, and fragmentation needed -- ’hard
  errors’), and ICMPv6 Type 1 (destination unreachable), Code 1
  (administratively prohibited) and Code 4 (port unreachable) intended
  for connections in synchronized states (ESTABLISHED, FIN-WAIT-1, FIN-
  WAIT-2, CLOSE-WAIT, CLOSING, LAST-ACK, TIME-WAIT) that match MKTs."

A selftest (later in patch series) verifies that this attack is not
possible in this TCP-AO implementation.

Co-developed-by: Francesco Ruggeri <fruggeri@arista.com>
Signed-off-by: Francesco Ruggeri <fruggeri@arista.com>
Co-developed-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
Acked-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp_ao.h      | 11 ++++++++-
 include/uapi/linux/snmp.h |  1 +
 include/uapi/linux/tcp.h  |  4 +++-
 net/ipv4/proc.c           |  1 +
 net/ipv4/tcp_ao.c         | 58 +++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_ipv4.c       |  7 ++++++
 net/ipv6/tcp_ipv6.c       |  7 ++++++
 7 files changed, 87 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h
index 4da6e3657913..a9d38b9e8bcb 100644
--- a/include/net/tcp_ao.h
+++ b/include/net/tcp_ao.h
@@ -24,6 +24,7 @@ struct tcp_ao_counters {
 	atomic64_t	pkt_bad;
 	atomic64_t	key_not_found;
 	atomic64_t	ao_required;
+	atomic64_t	dropped_icmp;
 };
 
 struct tcp_ao_key {
@@ -92,7 +93,8 @@ struct tcp_ao_info {
 	struct tcp_ao_key	*rnext_key;
 	struct tcp_ao_counters	counters;
 	u32			ao_required	:1,
-				__unused	:31;
+				accept_icmps	:1,
+				__unused	:30;
 	__be32			lisn;
 	__be32			risn;
 	/* Sequence Number Extension (SNE) are upper 4 bytes for SEQ,
@@ -191,6 +193,7 @@ int tcp_ao_calc_traffic_key(struct tcp_ao_key *mkt, u8 *key, void *ctx,
 			    unsigned int len, struct tcp_sigpool *hp);
 void tcp_ao_destroy_sock(struct sock *sk, bool twsk);
 void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp);
+bool tcp_ao_ignore_icmp(const struct sock *sk, int family, int type, int code);
 enum skb_drop_reason tcp_inbound_ao_hash(struct sock *sk,
 			const struct sk_buff *skb, unsigned short int family,
 			const struct request_sock *req,
@@ -274,6 +277,12 @@ static inline void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb,
 {
 }
 
+static inline bool tcp_ao_ignore_icmp(const struct sock *sk, int family,
+				      int type, int code)
+{
+	return false;
+}
+
 static inline enum skb_drop_reason tcp_inbound_ao_hash(struct sock *sk,
 		const struct sk_buff *skb, unsigned short int family,
 		const struct request_sock *req, const struct tcp_ao_hdr *aoh)
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index 3d5ea841bffe..a0819c6a5988 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -301,6 +301,7 @@ enum
 	LINUX_MIB_TCPAOBAD,			/* TCPAOBad */
 	LINUX_MIB_TCPAOKEYNOTFOUND,		/* TCPAOKeyNotFound */
 	LINUX_MIB_TCPAOGOOD,			/* TCPAOGood */
+	LINUX_MIB_TCPAODROPPEDICMPS,		/* TCPAODroppedIcmps */
 	__LINUX_MIB_MAX
 };
 
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 9c48964849d1..d8b2ea23f12a 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -405,7 +405,8 @@ struct tcp_ao_info_opt { /* setsockopt(TCP_AO_INFO) */
 		set_rnext	:1,	/* corresponding ::rnext */
 		ao_required	:1,	/* don't accept non-AO connects */
 		set_counters	:1,	/* set/clear ::pkt_* counters */
-		reserved	:28;	/* must be 0 */
+		accept_icmps	:1,	/* accept incoming ICMPs */
+		reserved	:27;	/* must be 0 */
 	__u16	reserved2;		/* padding, must be 0 */
 	__u8	current_key;		/* KeyID to set as Current_key */
 	__u8	rnext;			/* KeyID to set as Rnext_key */
@@ -413,6 +414,7 @@ struct tcp_ao_info_opt { /* setsockopt(TCP_AO_INFO) */
 	__u64	pkt_bad;		/* failed verification */
 	__u64	pkt_key_not_found;	/* could not find a key to verify */
 	__u64	pkt_ao_required;	/* segments missing TCP-AO sign */
+	__u64	pkt_dropped_icmp;	/* ICMPs that were ignored */
 } __attribute__((aligned(8)));
 
 /* setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index f5b37ebc18c0..5f4654ebff48 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -303,6 +303,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPAOBad", LINUX_MIB_TCPAOBAD),
 	SNMP_MIB_ITEM("TCPAOKeyNotFound", LINUX_MIB_TCPAOKEYNOTFOUND),
 	SNMP_MIB_ITEM("TCPAOGood", LINUX_MIB_TCPAOGOOD),
+	SNMP_MIB_ITEM("TCPAODroppedIcmps", LINUX_MIB_TCPAODROPPEDICMPS),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c
index f76fcb93499d..223af5c9eaf3 100644
--- a/net/ipv4/tcp_ao.c
+++ b/net/ipv4/tcp_ao.c
@@ -15,6 +15,7 @@
 
 #include <net/tcp.h>
 #include <net/ipv6.h>
+#include <net/icmp.h>
 
 int tcp_ao_calc_traffic_key(struct tcp_ao_key *mkt, u8 *key, void *ctx,
 			    unsigned int len, struct tcp_sigpool *hp)
@@ -44,6 +45,60 @@ clear_hash:
 	return 1;
 }
 
+bool tcp_ao_ignore_icmp(const struct sock *sk, int family, int type, int code)
+{
+	bool ignore_icmp = false;
+	struct tcp_ao_info *ao;
+
+	/* RFC5925, 7.8:
+	 * >> A TCP-AO implementation MUST default to ignore incoming ICMPv4
+	 * messages of Type 3 (destination unreachable), Codes 2-4 (protocol
+	 * unreachable, port unreachable, and fragmentation needed -- ’hard
+	 * errors’), and ICMPv6 Type 1 (destination unreachable), Code 1
+	 * (administratively prohibited) and Code 4 (port unreachable) intended
+	 * for connections in synchronized states (ESTABLISHED, FIN-WAIT-1, FIN-
+	 * WAIT-2, CLOSE-WAIT, CLOSING, LAST-ACK, TIME-WAIT) that match MKTs.
+	 */
+	if (family == AF_INET) {
+		if (type != ICMP_DEST_UNREACH)
+			return false;
+		if (code < ICMP_PROT_UNREACH || code > ICMP_FRAG_NEEDED)
+			return false;
+	} else {
+		if (type != ICMPV6_DEST_UNREACH)
+			return false;
+		if (code != ICMPV6_ADM_PROHIBITED && code != ICMPV6_PORT_UNREACH)
+			return false;
+	}
+
+	rcu_read_lock();
+	switch (sk->sk_state) {
+	case TCP_TIME_WAIT:
+		ao = rcu_dereference(tcp_twsk(sk)->ao_info);
+		break;
+	case TCP_SYN_SENT:
+	case TCP_SYN_RECV:
+	case TCP_LISTEN:
+	case TCP_NEW_SYN_RECV:
+		/* RFC5925 specifies to ignore ICMPs *only* on connections
+		 * in synchronized states.
+		 */
+		rcu_read_unlock();
+		return false;
+	default:
+		ao = rcu_dereference(tcp_sk(sk)->ao_info);
+	}
+
+	if (ao && !ao->accept_icmps) {
+		ignore_icmp = true;
+		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAODROPPEDICMPS);
+		atomic64_inc(&ao->counters.dropped_icmp);
+	}
+	rcu_read_unlock();
+
+	return ignore_icmp;
+}
+
 /* Optimized version of tcp_ao_do_lookup(): only for sockets for which
  * it's known that the keys in ao_info are matching peer's
  * family/address/VRF/etc.
@@ -1086,6 +1141,7 @@ int tcp_ao_copy_all_matching(const struct sock *sk, struct sock *newsk,
 	new_ao->lisn = htonl(tcp_rsk(req)->snt_isn);
 	new_ao->risn = htonl(tcp_rsk(req)->rcv_isn);
 	new_ao->ao_required = ao->ao_required;
+	new_ao->accept_icmps = ao->accept_icmps;
 
 	if (family == AF_INET) {
 		addr = (union tcp_ao_addr *)&newsk->sk_daddr;
@@ -1792,9 +1848,11 @@ static int tcp_ao_info_cmd(struct sock *sk, unsigned short int family,
 		atomic64_set(&ao_info->counters.pkt_bad, cmd.pkt_bad);
 		atomic64_set(&ao_info->counters.key_not_found, cmd.pkt_key_not_found);
 		atomic64_set(&ao_info->counters.ao_required, cmd.pkt_ao_required);
+		atomic64_set(&ao_info->counters.dropped_icmp, cmd.pkt_dropped_icmp);
 	}
 
 	ao_info->ao_required = cmd.ao_required;
+	ao_info->accept_icmps = cmd.accept_icmps;
 	if (new_current)
 		WRITE_ONCE(ao_info->current_key, new_current);
 	if (new_rnext)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index bdec99707028..8f98c58e2689 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -494,6 +494,8 @@ int tcp_v4_err(struct sk_buff *skb, u32 info)
 		return -ENOENT;
 	}
 	if (sk->sk_state == TCP_TIME_WAIT) {
+		/* To increase the counter of ignored icmps for TCP-AO */
+		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
 		inet_twsk_put(inet_twsk(sk));
 		return 0;
 	}
@@ -507,6 +509,11 @@ int tcp_v4_err(struct sk_buff *skb, u32 info)
 		return 0;
 	}
 
+	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
+		sock_put(sk);
+		return 0;
+	}
+
 	bh_lock_sock(sk);
 	/* If too many ICMPs get dropped on busy
 	 * servers this needs to be solved differently.
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 4dcbc13e9ec8..fa7050579e9a 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -396,6 +396,8 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	}
 
 	if (sk->sk_state == TCP_TIME_WAIT) {
+		/* To increase the counter of ignored icmps for TCP-AO */
+		tcp_ao_ignore_icmp(sk, AF_INET6, type, code);
 		inet_twsk_put(inet_twsk(sk));
 		return 0;
 	}
@@ -406,6 +408,11 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		return 0;
 	}
 
+	if (tcp_ao_ignore_icmp(sk, AF_INET6, type, code)) {
+		sock_put(sk);
+		return 0;
+	}
+
 	bh_lock_sock(sk);
 	if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG)
 		__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
-- 
cgit v1.2.3-70-g09d2


From 7753c2f0a857bfa6501e67deee03988dd0bcaae7 Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 23 Oct 2023 20:22:09 +0100
Subject: net/tcp: Add option for TCP-AO to (not) hash header

Provide setsockopt() key flag that makes TCP-AO exclude hashing TCP
header for peers that match the key. This is needed for interraction
with middleboxes that may change TCP options, see RFC5925 (9.2).

Co-developed-by: Francesco Ruggeri <fruggeri@arista.com>
Signed-off-by: Francesco Ruggeri <fruggeri@arista.com>
Co-developed-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
Acked-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tcp.h | 5 +++++
 net/ipv4/tcp_ao.c        | 8 +++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index d8b2ea23f12a..320aab010f9a 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -367,6 +367,11 @@ struct tcp_diag_md5sig {
 #define TCP_AO_MAXKEYLEN	80
 
 #define TCP_AO_KEYF_IFINDEX	(1 << 0)	/* L3 ifindex for VRF */
+#define TCP_AO_KEYF_EXCLUDE_OPT	(1 << 1)	/* "Indicates whether TCP
+						 *  options other than TCP-AO
+						 *  are included in the MAC
+						 *  calculation"
+						 */
 
 struct tcp_ao_add { /* setsockopt(TCP_AO_ADD_KEY) */
 	struct __kernel_sockaddr_storage addr;	/* peer's address for the key */
diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c
index 223af5c9eaf3..10cc6be4d537 100644
--- a/net/ipv4/tcp_ao.c
+++ b/net/ipv4/tcp_ao.c
@@ -562,7 +562,8 @@ int tcp_ao_hash_hdr(unsigned short int family, char *ao_hash,
 		WARN_ON_ONCE(1);
 		goto clear_hash;
 	}
-	if (tcp_ao_hash_header(&hp, th, false,
+	if (tcp_ao_hash_header(&hp, th,
+			       !!(key->keyflags & TCP_AO_KEYF_EXCLUDE_OPT),
 			       ao_hash, hash_offset, tcp_ao_maclen(key)))
 		goto clear_hash;
 	ahash_request_set_crypt(hp.req, NULL, hash_buf, 0);
@@ -610,7 +611,8 @@ int tcp_ao_hash_skb(unsigned short int family,
 		goto clear_hash;
 	if (tcp_ao_hash_pseudoheader(family, sk, skb, &hp, skb->len))
 		goto clear_hash;
-	if (tcp_ao_hash_header(&hp, th, false,
+	if (tcp_ao_hash_header(&hp, th,
+			       !!(key->keyflags & TCP_AO_KEYF_EXCLUDE_OPT),
 			       ao_hash, hash_offset, tcp_ao_maclen(key)))
 		goto clear_hash;
 	if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
@@ -1454,7 +1456,7 @@ static struct tcp_ao_info *setsockopt_ao_info(struct sock *sk)
 	return ERR_PTR(-ESOCKTNOSUPPORT);
 }
 
-#define TCP_AO_KEYF_ALL		(0)
+#define TCP_AO_KEYF_ALL		(TCP_AO_KEYF_EXCLUDE_OPT)
 
 static struct tcp_ao_key *tcp_ao_key_alloc(struct sock *sk,
 					   struct tcp_ao_add *cmd)
-- 
cgit v1.2.3-70-g09d2


From ef84703a911f4ee52ca585e8308b7084093941f4 Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 23 Oct 2023 20:22:10 +0100
Subject: net/tcp: Add TCP-AO getsockopt()s

Introduce getsockopt(TCP_AO_GET_KEYS) that lets a user get TCP-AO keys
and their properties from a socket. The user can provide a filter
to match the specific key to be dumped or ::get_all = 1 may be
used to dump all keys in one syscall.

Add another getsockopt(TCP_AO_INFO) for providing per-socket/per-ao_info
stats: packet counters, Current_key/RNext_key and flags like
::ao_required and ::accept_icmps.

Co-developed-by: Francesco Ruggeri <fruggeri@arista.com>
Signed-off-by: Francesco Ruggeri <fruggeri@arista.com>
Co-developed-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
Acked-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp_ao.h     |  12 ++
 include/uapi/linux/tcp.h |  63 +++++++---
 net/ipv4/tcp.c           |  13 +++
 net/ipv4/tcp_ao.c        | 295 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 369 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h
index a9d38b9e8bcb..061c358a3c8a 100644
--- a/include/net/tcp_ao.h
+++ b/include/net/tcp_ao.h
@@ -194,6 +194,8 @@ int tcp_ao_calc_traffic_key(struct tcp_ao_key *mkt, u8 *key, void *ctx,
 void tcp_ao_destroy_sock(struct sock *sk, bool twsk);
 void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp);
 bool tcp_ao_ignore_icmp(const struct sock *sk, int family, int type, int code);
+int tcp_ao_get_mkts(struct sock *sk, sockptr_t optval, sockptr_t optlen);
+int tcp_ao_get_sock_info(struct sock *sk, sockptr_t optval, sockptr_t optlen);
 enum skb_drop_reason tcp_inbound_ao_hash(struct sock *sk,
 			const struct sk_buff *skb, unsigned short int family,
 			const struct request_sock *req,
@@ -316,6 +318,16 @@ static inline void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw,
 static inline void tcp_ao_connect_init(struct sock *sk)
 {
 }
+
+static inline int tcp_ao_get_mkts(struct sock *sk, sockptr_t optval, sockptr_t optlen)
+{
+	return -ENOPROTOOPT;
+}
+
+static inline int tcp_ao_get_sock_info(struct sock *sk, sockptr_t optval, sockptr_t optlen)
+{
+	return -ENOPROTOOPT;
+}
 #endif
 
 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 320aab010f9a..201b3cbd6020 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -131,7 +131,8 @@ enum {
 
 #define TCP_AO_ADD_KEY		38	/* Add/Set MKT */
 #define TCP_AO_DEL_KEY		39	/* Delete MKT */
-#define TCP_AO_INFO		40	/* Modify TCP-AO per-socket options */
+#define TCP_AO_INFO		40	/* Set/list TCP-AO per-socket options */
+#define TCP_AO_GET_KEYS		41	/* List MKT(s) */
 
 #define TCP_REPAIR_ON		1
 #define TCP_REPAIR_OFF		0
@@ -405,21 +406,55 @@ struct tcp_ao_del { /* setsockopt(TCP_AO_DEL_KEY) */
 	__u8	keyflags;		/* see TCP_AO_KEYF_ */
 } __attribute__((aligned(8)));
 
-struct tcp_ao_info_opt { /* setsockopt(TCP_AO_INFO) */
-	__u32   set_current	:1,	/* corresponding ::current_key */
-		set_rnext	:1,	/* corresponding ::rnext */
-		ao_required	:1,	/* don't accept non-AO connects */
-		set_counters	:1,	/* set/clear ::pkt_* counters */
-		accept_icmps	:1,	/* accept incoming ICMPs */
+struct tcp_ao_info_opt { /* setsockopt(TCP_AO_INFO), getsockopt(TCP_AO_INFO) */
+	/* Here 'in' is for setsockopt(), 'out' is for getsockopt() */
+	__u32   set_current	:1,	/* in/out: corresponding ::current_key */
+		set_rnext	:1,	/* in/out: corresponding ::rnext */
+		ao_required	:1,	/* in/out: don't accept non-AO connects */
+		set_counters	:1,	/* in: set/clear ::pkt_* counters */
+		accept_icmps	:1,	/* in/out: accept incoming ICMPs */
 		reserved	:27;	/* must be 0 */
 	__u16	reserved2;		/* padding, must be 0 */
-	__u8	current_key;		/* KeyID to set as Current_key */
-	__u8	rnext;			/* KeyID to set as Rnext_key */
-	__u64	pkt_good;		/* verified segments */
-	__u64	pkt_bad;		/* failed verification */
-	__u64	pkt_key_not_found;	/* could not find a key to verify */
-	__u64	pkt_ao_required;	/* segments missing TCP-AO sign */
-	__u64	pkt_dropped_icmp;	/* ICMPs that were ignored */
+	__u8	current_key;		/* in/out: KeyID of Current_key */
+	__u8	rnext;			/* in/out: keyid of RNext_key */
+	__u64	pkt_good;		/* in/out: verified segments */
+	__u64	pkt_bad;		/* in/out: failed verification */
+	__u64	pkt_key_not_found;	/* in/out: could not find a key to verify */
+	__u64	pkt_ao_required;	/* in/out: segments missing TCP-AO sign */
+	__u64	pkt_dropped_icmp;	/* in/out: ICMPs that were ignored */
+} __attribute__((aligned(8)));
+
+struct tcp_ao_getsockopt { /* getsockopt(TCP_AO_GET_KEYS) */
+	struct __kernel_sockaddr_storage addr;	/* in/out: dump keys for peer
+						 * with this address/prefix
+						 */
+	char	alg_name[64];		/* out: crypto hash algorithm */
+	__u8	key[TCP_AO_MAXKEYLEN];
+	__u32	nkeys;			/* in: size of the userspace buffer
+					 * @optval, measured in @optlen - the
+					 * sizeof(struct tcp_ao_getsockopt)
+					 * out: number of keys that matched
+					 */
+	__u16   is_current	:1,	/* in: match and dump Current_key,
+					 * out: the dumped key is Current_key
+					 */
+
+		is_rnext	:1,	/* in: match and dump RNext_key,
+					 * out: the dumped key is RNext_key
+					 */
+		get_all		:1,	/* in: dump all keys */
+		reserved	:13;	/* padding, must be 0 */
+	__u8	sndid;			/* in/out: dump keys with SendID */
+	__u8	rcvid;			/* in/out: dump keys with RecvID */
+	__u8	prefix;			/* in/out: dump keys with address/prefix */
+	__u8	maclen;			/* out: key's length of authentication
+					 * code (hash)
+					 */
+	__u8	keyflags;		/* in/out: see TCP_AO_KEYF_ */
+	__u8	keylen;			/* out: length of ::key */
+	__s32	ifindex;		/* in/out: L3 dev index for VRF */
+	__u64	pkt_good;		/* out: verified segments */
+	__u64	pkt_bad;		/* out: segments that failed verification */
 } __attribute__((aligned(8)));
 
 /* setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1be6467a059a..ce33eee9d0f2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4284,6 +4284,19 @@ zerocopy_rcv_out:
 		return err;
 	}
 #endif
+	case TCP_AO_GET_KEYS:
+	case TCP_AO_INFO: {
+		int err;
+
+		sockopt_lock_sock(sk);
+		if (optname == TCP_AO_GET_KEYS)
+			err = tcp_ao_get_mkts(sk, optval, optlen);
+		else
+			err = tcp_ao_get_sock_info(sk, optval, optlen);
+		sockopt_release_sock(sk);
+
+		return err;
+	}
 	default:
 		return -ENOPROTOOPT;
 	}
diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c
index 10cc6be4d537..cbc1ee0f5b9a 100644
--- a/net/ipv4/tcp_ao.c
+++ b/net/ipv4/tcp_ao.c
@@ -1894,3 +1894,298 @@ int tcp_v4_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen)
 	return tcp_parse_ao(sk, cmd, AF_INET, optval, optlen);
 }
 
+/* tcp_ao_copy_mkts_to_user(ao_info, optval, optlen)
+ *
+ * @ao_info:	struct tcp_ao_info on the socket that
+ *		socket getsockopt(TCP_AO_GET_KEYS) is executed on
+ * @optval:	pointer to array of tcp_ao_getsockopt structures in user space.
+ *		Must be != NULL.
+ * @optlen:	pointer to size of tcp_ao_getsockopt structure.
+ *		Must be != NULL.
+ *
+ * Return value: 0 on success, a negative error number otherwise.
+ *
+ * optval points to an array of tcp_ao_getsockopt structures in user space.
+ * optval[0] is used as both input and output to getsockopt. It determines
+ * which keys are returned by the kernel.
+ * optval[0].nkeys is the size of the array in user space. On return it contains
+ * the number of keys matching the search criteria.
+ * If tcp_ao_getsockopt::get_all is set, then all keys in the socket are
+ * returned, otherwise only keys matching <addr, prefix, sndid, rcvid>
+ * in optval[0] are returned.
+ * optlen is also used as both input and output. The user provides the size
+ * of struct tcp_ao_getsockopt in user space, and the kernel returns the size
+ * of the structure in kernel space.
+ * The size of struct tcp_ao_getsockopt may differ between user and kernel.
+ * There are three cases to consider:
+ *  * If usize == ksize, then keys are copied verbatim.
+ *  * If usize < ksize, then the userspace has passed an old struct to a
+ *    newer kernel. The rest of the trailing bytes in optval[0]
+ *    (ksize - usize) are interpreted as 0 by the kernel.
+ *  * If usize > ksize, then the userspace has passed a new struct to an
+ *    older kernel. The trailing bytes unknown to the kernel (usize - ksize)
+ *    are checked to ensure they are zeroed, otherwise -E2BIG is returned.
+ * On return the kernel fills in min(usize, ksize) in each entry of the array.
+ * The layout of the fields in the user and kernel structures is expected to
+ * be the same (including in the 32bit vs 64bit case).
+ */
+static int tcp_ao_copy_mkts_to_user(struct tcp_ao_info *ao_info,
+				    sockptr_t optval, sockptr_t optlen)
+{
+	struct tcp_ao_getsockopt opt_in, opt_out;
+	struct tcp_ao_key *key, *current_key;
+	bool do_address_matching = true;
+	union tcp_ao_addr *addr = NULL;
+	unsigned int max_keys;	/* maximum number of keys to copy to user */
+	size_t out_offset = 0;
+	size_t bytes_to_write;	/* number of bytes to write to user level */
+	int err, user_len;
+	u32 matched_keys;	/* keys from ao_info matched so far */
+	int optlen_out;
+	__be16 port = 0;
+
+	if (copy_from_sockptr(&user_len, optlen, sizeof(int)))
+		return -EFAULT;
+
+	if (user_len <= 0)
+		return -EINVAL;
+
+	memset(&opt_in, 0, sizeof(struct tcp_ao_getsockopt));
+	err = copy_struct_from_sockptr(&opt_in, sizeof(opt_in),
+				       optval, user_len);
+	if (err < 0)
+		return err;
+
+	if (opt_in.pkt_good || opt_in.pkt_bad)
+		return -EINVAL;
+
+	if (opt_in.reserved != 0)
+		return -EINVAL;
+
+	max_keys = opt_in.nkeys;
+
+	if (opt_in.get_all || opt_in.is_current || opt_in.is_rnext) {
+		if (opt_in.get_all && (opt_in.is_current || opt_in.is_rnext))
+			return -EINVAL;
+		do_address_matching = false;
+	}
+
+	switch (opt_in.addr.ss_family) {
+	case AF_INET: {
+		struct sockaddr_in *sin;
+		__be32 mask;
+
+		sin = (struct sockaddr_in *)&opt_in.addr;
+		port = sin->sin_port;
+		addr = (union tcp_ao_addr *)&sin->sin_addr;
+
+		if (opt_in.prefix > 32)
+			return -EINVAL;
+
+		if (ntohl(sin->sin_addr.s_addr) == INADDR_ANY &&
+		    opt_in.prefix != 0)
+			return -EINVAL;
+
+		mask = inet_make_mask(opt_in.prefix);
+		if (sin->sin_addr.s_addr & ~mask)
+			return -EINVAL;
+
+		break;
+	}
+	case AF_INET6: {
+		struct sockaddr_in6 *sin6;
+		struct in6_addr *addr6;
+
+		sin6 = (struct sockaddr_in6 *)&opt_in.addr;
+		addr = (union tcp_ao_addr *)&sin6->sin6_addr;
+		addr6 = &sin6->sin6_addr;
+		port = sin6->sin6_port;
+
+		/* We don't have to change family and @addr here if
+		 * ipv6_addr_v4mapped() like in key adding:
+		 * tcp_ao_key_cmp() does it. Do the sanity checks though.
+		 */
+		if (opt_in.prefix != 0) {
+			if (ipv6_addr_v4mapped(addr6)) {
+				__be32 mask, addr4 = addr6->s6_addr32[3];
+
+				if (opt_in.prefix > 32 ||
+				    ntohl(addr4) == INADDR_ANY)
+					return -EINVAL;
+				mask = inet_make_mask(opt_in.prefix);
+				if (addr4 & ~mask)
+					return -EINVAL;
+			} else {
+				struct in6_addr pfx;
+
+				if (ipv6_addr_any(addr6) ||
+				    opt_in.prefix > 128)
+					return -EINVAL;
+
+				ipv6_addr_prefix(&pfx, addr6, opt_in.prefix);
+				if (ipv6_addr_cmp(&pfx, addr6))
+					return -EINVAL;
+			}
+		} else if (!ipv6_addr_any(addr6)) {
+			return -EINVAL;
+		}
+		break;
+	}
+	case 0:
+		if (!do_address_matching)
+			break;
+		fallthrough;
+	default:
+		return -EAFNOSUPPORT;
+	}
+
+	if (!do_address_matching) {
+		/* We could just ignore those, but let's do stricter checks */
+		if (addr || port)
+			return -EINVAL;
+		if (opt_in.prefix || opt_in.sndid || opt_in.rcvid)
+			return -EINVAL;
+	}
+
+	bytes_to_write = min_t(int, user_len, sizeof(struct tcp_ao_getsockopt));
+	matched_keys = 0;
+	/* May change in RX, while we're dumping, pre-fetch it */
+	current_key = READ_ONCE(ao_info->current_key);
+
+	hlist_for_each_entry_rcu(key, &ao_info->head, node) {
+		if (opt_in.get_all)
+			goto match;
+
+		if (opt_in.is_current || opt_in.is_rnext) {
+			if (opt_in.is_current && key == current_key)
+				goto match;
+			if (opt_in.is_rnext && key == ao_info->rnext_key)
+				goto match;
+			continue;
+		}
+
+		if (tcp_ao_key_cmp(key, addr, opt_in.prefix,
+				   opt_in.addr.ss_family,
+				   opt_in.sndid, opt_in.rcvid) != 0)
+			continue;
+match:
+		matched_keys++;
+		if (matched_keys > max_keys)
+			continue;
+
+		memset(&opt_out, 0, sizeof(struct tcp_ao_getsockopt));
+
+		if (key->family == AF_INET) {
+			struct sockaddr_in *sin_out = (struct sockaddr_in *)&opt_out.addr;
+
+			sin_out->sin_family = key->family;
+			sin_out->sin_port = 0;
+			memcpy(&sin_out->sin_addr, &key->addr, sizeof(struct in_addr));
+		} else {
+			struct sockaddr_in6 *sin6_out = (struct sockaddr_in6 *)&opt_out.addr;
+
+			sin6_out->sin6_family = key->family;
+			sin6_out->sin6_port = 0;
+			memcpy(&sin6_out->sin6_addr, &key->addr, sizeof(struct in6_addr));
+		}
+		opt_out.sndid = key->sndid;
+		opt_out.rcvid = key->rcvid;
+		opt_out.prefix = key->prefixlen;
+		opt_out.keyflags = key->keyflags;
+		opt_out.is_current = (key == current_key);
+		opt_out.is_rnext = (key == ao_info->rnext_key);
+		opt_out.nkeys = 0;
+		opt_out.maclen = key->maclen;
+		opt_out.keylen = key->keylen;
+		opt_out.pkt_good = atomic64_read(&key->pkt_good);
+		opt_out.pkt_bad = atomic64_read(&key->pkt_bad);
+		memcpy(&opt_out.key, key->key, key->keylen);
+		tcp_sigpool_algo(key->tcp_sigpool_id, opt_out.alg_name, 64);
+
+		/* Copy key to user */
+		if (copy_to_sockptr_offset(optval, out_offset,
+					   &opt_out, bytes_to_write))
+			return -EFAULT;
+		out_offset += user_len;
+	}
+
+	optlen_out = (int)sizeof(struct tcp_ao_getsockopt);
+	if (copy_to_sockptr(optlen, &optlen_out, sizeof(int)))
+		return -EFAULT;
+
+	out_offset = offsetof(struct tcp_ao_getsockopt, nkeys);
+	if (copy_to_sockptr_offset(optval, out_offset,
+				   &matched_keys, sizeof(u32)))
+		return -EFAULT;
+
+	return 0;
+}
+
+int tcp_ao_get_mkts(struct sock *sk, sockptr_t optval, sockptr_t optlen)
+{
+	struct tcp_ao_info *ao_info;
+
+	ao_info = setsockopt_ao_info(sk);
+	if (IS_ERR(ao_info))
+		return PTR_ERR(ao_info);
+	if (!ao_info)
+		return -ENOENT;
+
+	return tcp_ao_copy_mkts_to_user(ao_info, optval, optlen);
+}
+
+int tcp_ao_get_sock_info(struct sock *sk, sockptr_t optval, sockptr_t optlen)
+{
+	struct tcp_ao_info_opt out, in = {};
+	struct tcp_ao_key *current_key;
+	struct tcp_ao_info *ao;
+	int err, len;
+
+	if (copy_from_sockptr(&len, optlen, sizeof(int)))
+		return -EFAULT;
+
+	if (len <= 0)
+		return -EINVAL;
+
+	/* Copying this "in" only to check ::reserved, ::reserved2,
+	 * that may be needed to extend (struct tcp_ao_info_opt) and
+	 * what getsockopt() provides in future.
+	 */
+	err = copy_struct_from_sockptr(&in, sizeof(in), optval, len);
+	if (err)
+		return err;
+
+	if (in.reserved != 0 || in.reserved2 != 0)
+		return -EINVAL;
+
+	ao = setsockopt_ao_info(sk);
+	if (IS_ERR(ao))
+		return PTR_ERR(ao);
+	if (!ao)
+		return -ENOENT;
+
+	memset(&out, 0, sizeof(out));
+	out.ao_required		= ao->ao_required;
+	out.accept_icmps	= ao->accept_icmps;
+	out.pkt_good		= atomic64_read(&ao->counters.pkt_good);
+	out.pkt_bad		= atomic64_read(&ao->counters.pkt_bad);
+	out.pkt_key_not_found	= atomic64_read(&ao->counters.key_not_found);
+	out.pkt_ao_required	= atomic64_read(&ao->counters.ao_required);
+	out.pkt_dropped_icmp	= atomic64_read(&ao->counters.dropped_icmp);
+
+	current_key = READ_ONCE(ao->current_key);
+	if (current_key) {
+		out.set_current = 1;
+		out.current_key = current_key->sndid;
+	}
+	if (ao->rnext_key) {
+		out.set_rnext = 1;
+		out.rnext = ao->rnext_key->rcvid;
+	}
+
+	if (copy_to_sockptr(optval, &out, min_t(int, len, sizeof(out))))
+		return -EFAULT;
+
+	return 0;
+}
+
-- 
cgit v1.2.3-70-g09d2


From d6732b95b6fbbc6d5bb9d2f809e275763640c4a2 Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 23 Oct 2023 20:22:11 +0100
Subject: net/tcp: Allow asynchronous delete for TCP-AO keys (MKTs)

Delete becomes very, very fast - almost free, but after setsockopt()
syscall returns, the key is still alive until next RCU grace period.
Which is fine for listen sockets as userspace needs to be aware of
setsockopt(TCP_AO) and accept() race and resolve it with verification
by getsockopt() after TCP connection was accepted.

The benchmark results (on non-loaded box, worse with more RCU work pending):
> ok 33    Worst case delete    16384 keys: min=5ms max=10ms mean=6.93904ms stddev=0.263421
> ok 34        Add a new key    16384 keys: min=1ms max=4ms mean=2.17751ms stddev=0.147564
> ok 35 Remove random-search    16384 keys: min=5ms max=10ms mean=6.50243ms stddev=0.254999
> ok 36         Remove async    16384 keys: min=0ms max=0ms mean=0.0296107ms stddev=0.0172078

Co-developed-by: Francesco Ruggeri <fruggeri@arista.com>
Signed-off-by: Francesco Ruggeri <fruggeri@arista.com>
Co-developed-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
Acked-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tcp.h |  3 ++-
 net/ipv4/tcp_ao.c        | 21 ++++++++++++++++++---
 2 files changed, 20 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 201b3cbd6020..be34d7c5c531 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -396,7 +396,8 @@ struct tcp_ao_del { /* setsockopt(TCP_AO_DEL_KEY) */
 	__s32	ifindex;		/* L3 dev index for VRF */
 	__u32   set_current	:1,	/* corresponding ::current_key */
 		set_rnext	:1,	/* corresponding ::rnext */
-		reserved	:30;	/* must be 0 */
+		del_async	:1,	/* only valid for listen sockets */
+		reserved	:29;	/* must be 0 */
 	__u16	reserved2;		/* padding, must be 0 */
 	__u8	prefix;			/* peer's address prefix */
 	__u8	sndid;			/* SendID for outgoing segments */
diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c
index cbc1ee0f5b9a..acbeb635fe29 100644
--- a/net/ipv4/tcp_ao.c
+++ b/net/ipv4/tcp_ao.c
@@ -1628,7 +1628,7 @@ err_free_ao:
 }
 
 static int tcp_ao_delete_key(struct sock *sk, struct tcp_ao_info *ao_info,
-			     struct tcp_ao_key *key,
+			     bool del_async, struct tcp_ao_key *key,
 			     struct tcp_ao_key *new_current,
 			     struct tcp_ao_key *new_rnext)
 {
@@ -1636,11 +1636,24 @@ static int tcp_ao_delete_key(struct sock *sk, struct tcp_ao_info *ao_info,
 
 	hlist_del_rcu(&key->node);
 
+	/* Support for async delete on listening sockets: as they don't
+	 * need current_key/rnext_key maintaining, we don't need to check
+	 * them and we can just free all resources in RCU fashion.
+	 */
+	if (del_async) {
+		atomic_sub(tcp_ao_sizeof_key(key), &sk->sk_omem_alloc);
+		call_rcu(&key->rcu, tcp_ao_key_free_rcu);
+		return 0;
+	}
+
 	/* At this moment another CPU could have looked this key up
 	 * while it was unlinked from the list. Wait for RCU grace period,
 	 * after which the key is off-list and can't be looked up again;
 	 * the rx path [just before RCU came] might have used it and set it
 	 * as current_key (very unlikely).
+	 * Free the key with next RCU grace period (in case it was
+	 * current_key before tcp_ao_current_rnext() might have
+	 * changed it in forced-delete).
 	 */
 	synchronize_rcu();
 	if (new_current)
@@ -1711,6 +1724,8 @@ static int tcp_ao_del_cmd(struct sock *sk, unsigned short int family,
 		if (!new_rnext)
 			return -ENOENT;
 	}
+	if (cmd.del_async && sk->sk_state != TCP_LISTEN)
+		return -EINVAL;
 
 	if (family == AF_INET) {
 		struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.addr;
@@ -1758,8 +1773,8 @@ static int tcp_ao_del_cmd(struct sock *sk, unsigned short int family,
 		if (key == new_current || key == new_rnext)
 			continue;
 
-		return tcp_ao_delete_key(sk, ao_info, key,
-					  new_current, new_rnext);
+		return tcp_ao_delete_key(sk, ao_info, cmd.del_async, key,
+					 new_current, new_rnext);
 	}
 	return -ENOENT;
 }
-- 
cgit v1.2.3-70-g09d2


From 67fa83f7c86a86913ab9cd5a13b4bebd8d2ebb43 Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 23 Oct 2023 20:22:12 +0100
Subject: net/tcp: Add static_key for TCP-AO

Similarly to TCP-MD5, add a static key to TCP-AO that is patched out
when there are no keys on a machine and dynamically enabled with the
first setsockopt(TCP_AO) adds a key on any socket. The static key is as
well dynamically disabled later when the socket is destructed.

The lifetime of enabled static key here is the same as ao_info: it is
enabled on allocation, passed over from full socket to twsk and
destructed when ao_info is scheduled for destruction.

Signed-off-by: Dmitry Safonov <dima@arista.com>
Acked-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h    | 24 ++++++++++++++++--------
 include/net/tcp_ao.h |  2 ++
 net/ipv4/tcp_ao.c    | 22 ++++++++++++++++++++++
 net/ipv4/tcp_input.c | 42 ++++++++++++++++++++++++++++--------------
 net/ipv4/tcp_ipv4.c  | 25 ++++++++++++++-----------
 net/ipv6/tcp_ipv6.c  | 25 ++++++++++++++-----------
 6 files changed, 96 insertions(+), 44 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 54226d85feb8..e3617c433cf1 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2288,14 +2288,18 @@ static inline void tcp_get_current_key(const struct sock *sk,
 #if defined(CONFIG_TCP_AO) || defined(CONFIG_TCP_MD5SIG)
 	const struct tcp_sock *tp = tcp_sk(sk);
 #endif
-#ifdef CONFIG_TCP_AO
-	struct tcp_ao_info *ao;
 
-	ao = rcu_dereference_protected(tp->ao_info, lockdep_sock_is_held(sk));
-	if (ao) {
-		out->ao_key = READ_ONCE(ao->current_key);
-		out->type = TCP_KEY_AO;
-		return;
+#ifdef CONFIG_TCP_AO
+	if (static_branch_unlikely(&tcp_ao_needed.key)) {
+		struct tcp_ao_info *ao;
+
+		ao = rcu_dereference_protected(tp->ao_info,
+					       lockdep_sock_is_held(sk));
+		if (ao) {
+			out->ao_key = READ_ONCE(ao->current_key);
+			out->type = TCP_KEY_AO;
+			return;
+		}
 	}
 #endif
 #ifdef CONFIG_TCP_MD5SIG
@@ -2324,7 +2328,8 @@ static inline bool tcp_key_is_md5(const struct tcp_key *key)
 static inline bool tcp_key_is_ao(const struct tcp_key *key)
 {
 #ifdef CONFIG_TCP_AO
-	if (key->type == TCP_KEY_AO)
+	if (static_branch_unlikely(&tcp_ao_needed.key) &&
+	    key->type == TCP_KEY_AO)
 		return true;
 #endif
 	return false;
@@ -2718,6 +2723,9 @@ static inline bool tcp_ao_required(struct sock *sk, const void *saddr,
 	struct tcp_ao_info *ao_info;
 	struct tcp_ao_key *ao_key;
 
+	if (!static_branch_unlikely(&tcp_ao_needed.key))
+		return false;
+
 	ao_info = rcu_dereference_check(tcp_sk(sk)->ao_info,
 					lockdep_sock_is_held(sk));
 	if (!ao_info)
diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h
index 061c358a3c8a..a38408072ea8 100644
--- a/include/net/tcp_ao.h
+++ b/include/net/tcp_ao.h
@@ -151,6 +151,8 @@ do {									\
 
 #ifdef CONFIG_TCP_AO
 /* TCP-AO structures and functions */
+#include <linux/jump_label.h>
+extern struct static_key_false_deferred tcp_ao_needed;
 
 struct tcp4_ao_context {
 	__be32		saddr;
diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c
index acbeb635fe29..ffce8ca60ff2 100644
--- a/net/ipv4/tcp_ao.c
+++ b/net/ipv4/tcp_ao.c
@@ -17,6 +17,8 @@
 #include <net/ipv6.h>
 #include <net/icmp.h>
 
+DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_ao_needed, HZ);
+
 int tcp_ao_calc_traffic_key(struct tcp_ao_key *mkt, u8 *key, void *ctx,
 			    unsigned int len, struct tcp_sigpool *hp)
 {
@@ -50,6 +52,9 @@ bool tcp_ao_ignore_icmp(const struct sock *sk, int family, int type, int code)
 	bool ignore_icmp = false;
 	struct tcp_ao_info *ao;
 
+	if (!static_branch_unlikely(&tcp_ao_needed.key))
+		return false;
+
 	/* RFC5925, 7.8:
 	 * >> A TCP-AO implementation MUST default to ignore incoming ICMPv4
 	 * messages of Type 3 (destination unreachable), Codes 2-4 (protocol
@@ -185,6 +190,9 @@ static struct tcp_ao_key *__tcp_ao_do_lookup(const struct sock *sk,
 	struct tcp_ao_key *key;
 	struct tcp_ao_info *ao;
 
+	if (!static_branch_unlikely(&tcp_ao_needed.key))
+		return NULL;
+
 	ao = rcu_dereference_check(tcp_sk(sk)->ao_info,
 				   lockdep_sock_is_held(sk));
 	if (!ao)
@@ -276,6 +284,7 @@ void tcp_ao_destroy_sock(struct sock *sk, bool twsk)
 	}
 
 	kfree_rcu(ao, rcu);
+	static_branch_slow_dec_deferred(&tcp_ao_needed);
 }
 
 void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp)
@@ -1180,6 +1189,11 @@ int tcp_ao_copy_all_matching(const struct sock *sk, struct sock *newsk,
 		goto free_and_exit;
 	}
 
+	if (!static_key_fast_inc_not_disabled(&tcp_ao_needed.key.key)) {
+		ret = -EUSERS;
+		goto free_and_exit;
+	}
+
 	key_head = rcu_dereference(hlist_first_rcu(&new_ao->head));
 	first_key = hlist_entry_safe(key_head, struct tcp_ao_key, node);
 
@@ -1607,6 +1621,10 @@ static int tcp_ao_add_cmd(struct sock *sk, unsigned short int family,
 
 	tcp_ao_link_mkt(ao_info, key);
 	if (first) {
+		if (!static_branch_inc(&tcp_ao_needed.key)) {
+			ret = -EUSERS;
+			goto err_free_sock;
+		}
 		sk_gso_disable(sk);
 		rcu_assign_pointer(tcp_sk(sk)->ao_info, ao_info);
 	}
@@ -1875,6 +1893,10 @@ static int tcp_ao_info_cmd(struct sock *sk, unsigned short int family,
 	if (new_rnext)
 		WRITE_ONCE(ao_info->rnext_key, new_rnext);
 	if (first) {
+		if (!static_branch_inc(&tcp_ao_needed.key)) {
+			err = -EUSERS;
+			goto out;
+		}
 		sk_gso_disable(sk);
 		rcu_assign_pointer(tcp_sk(sk)->ao_info, ao_info);
 	}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 67ca72185f14..50aaa1527150 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3572,41 +3572,55 @@ static inline bool tcp_may_update_window(const struct tcp_sock *tp,
 		(ack_seq == tp->snd_wl1 && (nwin > tp->snd_wnd || !nwin));
 }
 
-/* If we update tp->snd_una, also update tp->bytes_acked */
-static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
+static void tcp_snd_sne_update(struct tcp_sock *tp, u32 ack)
 {
-	u32 delta = ack - tp->snd_una;
 #ifdef CONFIG_TCP_AO
 	struct tcp_ao_info *ao;
-#endif
 
-	sock_owned_by_me((struct sock *)tp);
-	tp->bytes_acked += delta;
-#ifdef CONFIG_TCP_AO
+	if (!static_branch_unlikely(&tcp_ao_needed.key))
+		return;
+
 	ao = rcu_dereference_protected(tp->ao_info,
 				       lockdep_sock_is_held((struct sock *)tp));
 	if (ao && ack < tp->snd_una)
 		ao->snd_sne++;
 #endif
+}
+
+/* If we update tp->snd_una, also update tp->bytes_acked */
+static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
+{
+	u32 delta = ack - tp->snd_una;
+
+	sock_owned_by_me((struct sock *)tp);
+	tp->bytes_acked += delta;
+	tcp_snd_sne_update(tp, ack);
 	tp->snd_una = ack;
 }
 
-/* If we update tp->rcv_nxt, also update tp->bytes_received */
-static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
+static void tcp_rcv_sne_update(struct tcp_sock *tp, u32 seq)
 {
-	u32 delta = seq - tp->rcv_nxt;
 #ifdef CONFIG_TCP_AO
 	struct tcp_ao_info *ao;
-#endif
 
-	sock_owned_by_me((struct sock *)tp);
-	tp->bytes_received += delta;
-#ifdef CONFIG_TCP_AO
+	if (!static_branch_unlikely(&tcp_ao_needed.key))
+		return;
+
 	ao = rcu_dereference_protected(tp->ao_info,
 				       lockdep_sock_is_held((struct sock *)tp));
 	if (ao && seq < tp->rcv_nxt)
 		ao->rcv_sne++;
 #endif
+}
+
+/* If we update tp->rcv_nxt, also update tp->bytes_received */
+static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
+{
+	u32 delta = seq - tp->rcv_nxt;
+
+	sock_owned_by_me((struct sock *)tp);
+	tp->bytes_received += delta;
+	tcp_rcv_sne_update(tp, seq);
 	WRITE_ONCE(tp->rcv_nxt, seq);
 }
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 8f98c58e2689..18c5595e3814 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1024,18 +1024,20 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 #ifdef CONFIG_TCP_AO
 	struct tcp_ao_info *ao_info;
 
-	/* FIXME: the segment to-be-acked is not verified yet */
-	ao_info = rcu_dereference(tcptw->ao_info);
-	if (ao_info) {
-		const struct tcp_ao_hdr *aoh;
+	if (static_branch_unlikely(&tcp_ao_needed.key)) {
+		/* FIXME: the segment to-be-acked is not verified yet */
+		ao_info = rcu_dereference(tcptw->ao_info);
+		if (ao_info) {
+			const struct tcp_ao_hdr *aoh;
+
+			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
+				inet_twsk_put(tw);
+				return;
+			}
 
-		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
-			inet_twsk_put(tw);
-			return;
+			if (aoh)
+				key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1);
 		}
-
-		if (aoh)
-			key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1);
 	}
 	if (key.ao_key) {
 		struct tcp_ao_key *rnext_key;
@@ -1081,7 +1083,8 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 					     tcp_sk(sk)->snd_nxt;
 
 #ifdef CONFIG_TCP_AO
-	if (tcp_rsk_used_ao(req)) {
+	if (static_branch_unlikely(&tcp_ao_needed.key) &&
+	    tcp_rsk_used_ao(req)) {
 		const union tcp_md5_addr *addr;
 		const struct tcp_ao_hdr *aoh;
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index fa7050579e9a..b5936294dba2 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1154,17 +1154,19 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
 #ifdef CONFIG_TCP_AO
 	struct tcp_ao_info *ao_info;
 
-	/* FIXME: the segment to-be-acked is not verified yet */
-	ao_info = rcu_dereference(tcptw->ao_info);
-	if (ao_info) {
-		const struct tcp_ao_hdr *aoh;
+	if (static_branch_unlikely(&tcp_ao_needed.key)) {
 
-		/* Invalid TCP option size or twice included auth */
-		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
-			goto out;
-		if (aoh) {
-			key.ao_key = tcp_ao_established_key(ao_info,
-							    aoh->rnext_keyid, -1);
+		/* FIXME: the segment to-be-acked is not verified yet */
+		ao_info = rcu_dereference(tcptw->ao_info);
+		if (ao_info) {
+			const struct tcp_ao_hdr *aoh;
+
+			/* Invalid TCP option size or twice included auth */
+			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
+				goto out;
+			if (aoh)
+				key.ao_key = tcp_ao_established_key(ao_info,
+						aoh->rnext_keyid, -1);
 		}
 	}
 	if (key.ao_key) {
@@ -1206,7 +1208,8 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 	struct tcp_key key = {};
 
 #ifdef CONFIG_TCP_AO
-	if (tcp_rsk_used_ao(req)) {
+	if (static_branch_unlikely(&tcp_ao_needed.key) &&
+	    tcp_rsk_used_ao(req)) {
 		const struct in6_addr *addr = &ipv6_hdr(skb)->saddr;
 		const struct tcp_ao_hdr *aoh;
 		int l3index;
-- 
cgit v1.2.3-70-g09d2


From 248411b8cb8974a1e1c8e43123c1e682fbd64969 Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 23 Oct 2023 20:22:13 +0100
Subject: net/tcp: Wire up l3index to TCP-AO

Similarly how TCP_MD5SIG_FLAG_IFINDEX works for TCP-MD5,
TCP_AO_KEYF_IFINDEX is an AO-key flag that binds that MKT to a specified
by L3 ifinndex. Similarly, without this flag the key will work in
the default VRF l3index = 0 for connections.
To prevent AO-keys from overlapping, it's restricted to add key B for a
socket that has key A, which have the same sndid/rcvid and one of
the following is true:
- !(A.keyflags & TCP_AO_KEYF_IFINDEX) or !(B.keyflags & TCP_AO_KEYF_IFINDEX)
  so that any key is non-bound to a VRF
- A.l3index == B.l3index
  both want to work for the same VRF

Additionally, it's restricted to match TCP-MD5 keys for the same peer
the following way:
|--------------|--------------------|----------------|---------------|
|              | MD5 key without    |     MD5 key    |    MD5 key    |
|              |     l3index        |    l3index=0   |   l3index=N   |
|--------------|--------------------|----------------|---------------|
|  TCP-AO key  |                    |                |               |
|  without     |       reject       |    reject      |   reject      |
|  l3index     |                    |                |               |
|--------------|--------------------|----------------|---------------|
|  TCP-AO key  |                    |                |               |
|  l3index=0   |       reject       |    reject      |   allow       |
|--------------|--------------------|----------------|---------------|
|  TCP-AO key  |                    |                |               |
|  l3index=N   |       reject       |    allow       |   reject      |
|--------------|--------------------|----------------|---------------|

This is done with the help of tcp_md5_do_lookup_any_l3index() to reject
adding AO key without TCP_AO_KEYF_IFINDEX if there's TCP-MD5 in any VRF.
This is important for case where sysctl_tcp_l3mdev_accept = 1
Similarly, for TCP-AO lookups tcp_ao_do_lookup() may be used with
l3index < 0, so that __tcp_ao_key_cmp() will match TCP-AO key in any VRF.

Signed-off-by: Dmitry Safonov <dima@arista.com>
Acked-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     |  11 ++--
 include/net/tcp_ao.h  |  18 +++---
 net/ipv4/syncookies.c |   6 +-
 net/ipv4/tcp_ao.c     | 170 +++++++++++++++++++++++++++++++++++++-------------
 net/ipv4/tcp_ipv4.c   |  10 ++-
 net/ipv6/syncookies.c |   5 +-
 net/ipv6/tcp_ao.c     |  21 +++----
 net/ipv6/tcp_ipv6.c   |  15 +++--
 8 files changed, 177 insertions(+), 79 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index e3617c433cf1..d2f0736b76b8 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2717,7 +2717,7 @@ static inline int tcp_parse_auth_options(const struct tcphdr *th,
 }
 
 static inline bool tcp_ao_required(struct sock *sk, const void *saddr,
-				   int family, bool stat_inc)
+				   int family, int l3index, bool stat_inc)
 {
 #ifdef CONFIG_TCP_AO
 	struct tcp_ao_info *ao_info;
@@ -2731,7 +2731,7 @@ static inline bool tcp_ao_required(struct sock *sk, const void *saddr,
 	if (!ao_info)
 		return false;
 
-	ao_key = tcp_ao_do_lookup(sk, saddr, family, -1, -1);
+	ao_key = tcp_ao_do_lookup(sk, l3index, saddr, family, -1, -1);
 	if (ao_info->ao_required || ao_key) {
 		if (stat_inc) {
 			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOREQUIRED);
@@ -2784,21 +2784,22 @@ tcp_inbound_hash(struct sock *sk, const struct request_sock *req,
 		 * the last key is impossible to remove, so there's
 		 * always at least one current_key.
 		 */
-		if (tcp_ao_required(sk, saddr, family, true)) {
+		if (tcp_ao_required(sk, saddr, family, l3index, true)) {
 			tcp_hash_fail("AO hash is required, but not found",
 					family, skb, "L3 index %d", l3index);
 			return SKB_DROP_REASON_TCP_AONOTFOUND;
 		}
 		if (unlikely(tcp_md5_do_lookup(sk, l3index, saddr, family))) {
 			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
-			tcp_hash_fail("MD5 Hash not found", family, skb, "");
+			tcp_hash_fail("MD5 Hash not found",
+				      family, skb, "L3 index %d", l3index);
 			return SKB_DROP_REASON_TCP_MD5NOTFOUND;
 		}
 		return SKB_NOT_DROPPED_YET;
 	}
 
 	if (aoh)
-		return tcp_inbound_ao_hash(sk, skb, family, req, aoh);
+		return tcp_inbound_ao_hash(sk, skb, family, req, l3index, aoh);
 
 	return tcp_inbound_md5_hash(sk, skb, saddr, daddr, family,
 				    l3index, md5_location);
diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h
index a38408072ea8..edd6748b2cfa 100644
--- a/include/net/tcp_ao.h
+++ b/include/net/tcp_ao.h
@@ -33,6 +33,7 @@ struct tcp_ao_key {
 	u8			key[TCP_AO_MAXKEYLEN] __tcp_ao_key_align;
 	unsigned int		tcp_sigpool_id;
 	unsigned int		digest_size;
+	int			l3index;
 	u8			prefixlen;
 	u8			family;
 	u8			keylen;
@@ -200,10 +201,10 @@ int tcp_ao_get_mkts(struct sock *sk, sockptr_t optval, sockptr_t optlen);
 int tcp_ao_get_sock_info(struct sock *sk, sockptr_t optval, sockptr_t optlen);
 enum skb_drop_reason tcp_inbound_ao_hash(struct sock *sk,
 			const struct sk_buff *skb, unsigned short int family,
-			const struct request_sock *req,
+			const struct request_sock *req, int l3index,
 			const struct tcp_ao_hdr *aoh);
 u32 tcp_ao_compute_sne(u32 next_sne, u32 next_seq, u32 seq);
-struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk,
+struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk, int l3index,
 				    const union tcp_ao_addr *addr,
 				    int family, int sndid, int rcvid);
 int tcp_ao_hash_hdr(unsigned short family, char *ao_hash,
@@ -245,9 +246,6 @@ int tcp_v6_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key,
 			  __be32 disn, bool send);
 int tcp_v6_ao_calc_key_rsk(struct tcp_ao_key *mkt, u8 *key,
 			   struct request_sock *req);
-struct tcp_ao_key *tcp_v6_ao_do_lookup(const struct sock *sk,
-				       const struct in6_addr *addr,
-				       int sndid, int rcvid);
 struct tcp_ao_key *tcp_v6_ao_lookup(const struct sock *sk,
 				    struct sock *addr_sk, int sndid, int rcvid);
 struct tcp_ao_key *tcp_v6_ao_lookup_rsk(const struct sock *sk,
@@ -265,7 +263,7 @@ void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb);
 void tcp_ao_connect_init(struct sock *sk);
 void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb,
 		      struct tcp_request_sock *treq,
-		      unsigned short int family);
+		      unsigned short int family, int l3index);
 #else /* CONFIG_TCP_AO */
 
 static inline int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb,
@@ -277,7 +275,7 @@ static inline int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb,
 
 static inline void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb,
 				    struct tcp_request_sock *treq,
-				    unsigned short int family)
+				    unsigned short int family, int l3index)
 {
 }
 
@@ -289,13 +287,15 @@ static inline bool tcp_ao_ignore_icmp(const struct sock *sk, int family,
 
 static inline enum skb_drop_reason tcp_inbound_ao_hash(struct sock *sk,
 		const struct sk_buff *skb, unsigned short int family,
-		const struct request_sock *req, const struct tcp_ao_hdr *aoh)
+		const struct request_sock *req, int l3index,
+		const struct tcp_ao_hdr *aoh)
 {
 	return SKB_NOT_DROPPED_YET;
 }
 
 static inline struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk,
-		const union tcp_ao_addr *addr, int family, int sndid, int rcvid)
+		int l3index, const union tcp_ao_addr *addr,
+		int family, int sndid, int rcvid)
 {
 	return NULL;
 }
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 0681d3e82b11..98b25e5d147b 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -344,6 +344,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 	__u8 rcv_wscale;
 	struct flowi4 fl4;
 	u32 tsoff = 0;
+	int l3index;
 
 	if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies) ||
 	    !th->ack || th->rst)
@@ -400,13 +401,14 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 	treq->snt_synack	= 0;
 	treq->tfo_listener	= false;
 
-	tcp_ao_syncookie(sk, skb, treq, AF_INET);
-
 	if (IS_ENABLED(CONFIG_SMC))
 		ireq->smc_ok = 0;
 
 	ireq->ir_iif = inet_request_bound_dev_if(sk, skb);
 
+	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
+	tcp_ao_syncookie(sk, skb, treq, AF_INET, l3index);
+
 	/* We throwed the options of the initial SYN away, so we hope
 	 * the ACK carries the same options again (see RFC1122 4.2.3.8)
 	 */
diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c
index ffce8ca60ff2..b5ac3e73e1da 100644
--- a/net/ipv4/tcp_ao.c
+++ b/net/ipv4/tcp_ao.c
@@ -136,7 +136,7 @@ static int ipv4_prefix_cmp(const struct in_addr *addr1,
 	return memcmp(&a1, &a2, sizeof(a1));
 }
 
-static int __tcp_ao_key_cmp(const struct tcp_ao_key *key,
+static int __tcp_ao_key_cmp(const struct tcp_ao_key *key, int l3index,
 			    const union tcp_ao_addr *addr, u8 prefixlen,
 			    int family, int sndid, int rcvid)
 {
@@ -144,6 +144,10 @@ static int __tcp_ao_key_cmp(const struct tcp_ao_key *key,
 		return (key->sndid > sndid) ? 1 : -1;
 	if (rcvid >= 0 && key->rcvid != rcvid)
 		return (key->rcvid > rcvid) ? 1 : -1;
+	if (l3index >= 0 && (key->keyflags & TCP_AO_KEYF_IFINDEX)) {
+		if (key->l3index != l3index)
+			return (key->l3index > l3index) ? 1 : -1;
+	}
 
 	if (family == AF_UNSPEC)
 		return 0;
@@ -168,7 +172,7 @@ static int __tcp_ao_key_cmp(const struct tcp_ao_key *key,
 	return -1;
 }
 
-static int tcp_ao_key_cmp(const struct tcp_ao_key *key,
+static int tcp_ao_key_cmp(const struct tcp_ao_key *key, int l3index,
 			  const union tcp_ao_addr *addr, u8 prefixlen,
 			  int family, int sndid, int rcvid)
 {
@@ -176,14 +180,16 @@ static int tcp_ao_key_cmp(const struct tcp_ao_key *key,
 	if (family == AF_INET6 && ipv6_addr_v4mapped(&addr->a6)) {
 		__be32 addr4 = addr->a6.s6_addr32[3];
 
-		return __tcp_ao_key_cmp(key, (union tcp_ao_addr *)&addr4,
+		return __tcp_ao_key_cmp(key, l3index,
+					(union tcp_ao_addr *)&addr4,
 					prefixlen, AF_INET, sndid, rcvid);
 	}
 #endif
-	return __tcp_ao_key_cmp(key, addr, prefixlen, family, sndid, rcvid);
+	return __tcp_ao_key_cmp(key, l3index, addr,
+				prefixlen, family, sndid, rcvid);
 }
 
-static struct tcp_ao_key *__tcp_ao_do_lookup(const struct sock *sk,
+static struct tcp_ao_key *__tcp_ao_do_lookup(const struct sock *sk, int l3index,
 		const union tcp_ao_addr *addr, int family, u8 prefix,
 		int sndid, int rcvid)
 {
@@ -201,17 +207,18 @@ static struct tcp_ao_key *__tcp_ao_do_lookup(const struct sock *sk,
 	hlist_for_each_entry_rcu(key, &ao->head, node) {
 		u8 prefixlen = min(prefix, key->prefixlen);
 
-		if (!tcp_ao_key_cmp(key, addr, prefixlen, family, sndid, rcvid))
+		if (!tcp_ao_key_cmp(key, l3index, addr, prefixlen,
+				    family, sndid, rcvid))
 			return key;
 	}
 	return NULL;
 }
 
-struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk,
+struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk, int l3index,
 				    const union tcp_ao_addr *addr,
 				    int family, int sndid, int rcvid)
 {
-	return __tcp_ao_do_lookup(sk, addr, family, U8_MAX, sndid, rcvid);
+	return __tcp_ao_do_lookup(sk, l3index, addr, family, U8_MAX, sndid, rcvid);
 }
 
 static struct tcp_ao_info *tcp_ao_alloc_info(gfp_t flags)
@@ -677,18 +684,22 @@ struct tcp_ao_key *tcp_v4_ao_lookup_rsk(const struct sock *sk,
 					struct request_sock *req,
 					int sndid, int rcvid)
 {
-	union tcp_ao_addr *addr =
-			(union tcp_ao_addr *)&inet_rsk(req)->ir_rmt_addr;
+	struct inet_request_sock *ireq = inet_rsk(req);
+	union tcp_ao_addr *addr = (union tcp_ao_addr *)&ireq->ir_rmt_addr;
+	int l3index;
 
-	return tcp_ao_do_lookup(sk, addr, AF_INET, sndid, rcvid);
+	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
+	return tcp_ao_do_lookup(sk, l3index, addr, AF_INET, sndid, rcvid);
 }
 
 struct tcp_ao_key *tcp_v4_ao_lookup(const struct sock *sk, struct sock *addr_sk,
 				    int sndid, int rcvid)
 {
+	int l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
+						     addr_sk->sk_bound_dev_if);
 	union tcp_ao_addr *addr = (union tcp_ao_addr *)&addr_sk->sk_daddr;
 
-	return tcp_ao_do_lookup(sk, addr, AF_INET, sndid, rcvid);
+	return tcp_ao_do_lookup(sk, l3index, addr, AF_INET, sndid, rcvid);
 }
 
 int tcp_ao_prepare_reset(const struct sock *sk, struct sk_buff *skb,
@@ -738,7 +749,8 @@ int tcp_ao_prepare_reset(const struct sock *sk, struct sk_buff *skb,
 		ao_info = rcu_dereference(tcp_sk(sk)->ao_info);
 		if (!ao_info)
 			return -ENOENT;
-		*key = tcp_ao_do_lookup(sk, addr, family, -1, aoh->rnext_keyid);
+		*key = tcp_ao_do_lookup(sk, l3index, addr, family,
+					-1, aoh->rnext_keyid);
 		if (!*key)
 			return -ENOENT;
 		*traffic_key = kmalloc(tcp_ao_digest_size(*key), GFP_ATOMIC);
@@ -814,24 +826,26 @@ int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb,
 
 static struct tcp_ao_key *tcp_ao_inbound_lookup(unsigned short int family,
 		const struct sock *sk, const struct sk_buff *skb,
-		int sndid, int rcvid)
+		int sndid, int rcvid, int l3index)
 {
 	if (family == AF_INET) {
 		const struct iphdr *iph = ip_hdr(skb);
 
-		return tcp_ao_do_lookup(sk, (union tcp_ao_addr *)&iph->saddr,
-				AF_INET, sndid, rcvid);
+		return tcp_ao_do_lookup(sk, l3index,
+					(union tcp_ao_addr *)&iph->saddr,
+					AF_INET, sndid, rcvid);
 	} else {
 		const struct ipv6hdr *iph = ipv6_hdr(skb);
 
-		return tcp_ao_do_lookup(sk, (union tcp_ao_addr *)&iph->saddr,
-				AF_INET6, sndid, rcvid);
+		return tcp_ao_do_lookup(sk, l3index,
+					(union tcp_ao_addr *)&iph->saddr,
+					AF_INET6, sndid, rcvid);
 	}
 }
 
 void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb,
 		      struct tcp_request_sock *treq,
-		      unsigned short int family)
+		      unsigned short int family, int l3index)
 {
 	const struct tcphdr *th = tcp_hdr(skb);
 	const struct tcp_ao_hdr *aoh;
@@ -842,7 +856,7 @@ void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb,
 	if (tcp_parse_auth_options(th, NULL, &aoh) || !aoh)
 		return;
 
-	key = tcp_ao_inbound_lookup(family, sk, skb, -1, aoh->keyid);
+	key = tcp_ao_inbound_lookup(family, sk, skb, -1, aoh->keyid, l3index);
 	if (!key)
 		/* Key not found, continue without TCP-AO */
 		return;
@@ -856,7 +870,7 @@ static enum skb_drop_reason
 tcp_ao_verify_hash(const struct sock *sk, const struct sk_buff *skb,
 		   unsigned short int family, struct tcp_ao_info *info,
 		   const struct tcp_ao_hdr *aoh, struct tcp_ao_key *key,
-		   u8 *traffic_key, u8 *phash, u32 sne)
+		   u8 *traffic_key, u8 *phash, u32 sne, int l3index)
 {
 	u8 maclen = aoh->length - sizeof(struct tcp_ao_hdr);
 	const struct tcphdr *th = tcp_hdr(skb);
@@ -867,7 +881,8 @@ tcp_ao_verify_hash(const struct sock *sk, const struct sk_buff *skb,
 		atomic64_inc(&info->counters.pkt_bad);
 		atomic64_inc(&key->pkt_bad);
 		tcp_hash_fail("AO hash wrong length", family, skb,
-			      "%u != %d", maclen, tcp_ao_maclen(key));
+			      "%u != %d L3index: %d", maclen,
+			      tcp_ao_maclen(key), l3index);
 		return SKB_DROP_REASON_TCP_AOFAILURE;
 	}
 
@@ -882,7 +897,8 @@ tcp_ao_verify_hash(const struct sock *sk, const struct sk_buff *skb,
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD);
 		atomic64_inc(&info->counters.pkt_bad);
 		atomic64_inc(&key->pkt_bad);
-		tcp_hash_fail("AO hash mismatch", family, skb, "");
+		tcp_hash_fail("AO hash mismatch", family, skb,
+			      "L3index: %d", l3index);
 		kfree(hash_buf);
 		return SKB_DROP_REASON_TCP_AOFAILURE;
 	}
@@ -896,7 +912,7 @@ tcp_ao_verify_hash(const struct sock *sk, const struct sk_buff *skb,
 enum skb_drop_reason
 tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb,
 		    unsigned short int family, const struct request_sock *req,
-		    const struct tcp_ao_hdr *aoh)
+		    int l3index, const struct tcp_ao_hdr *aoh)
 {
 	const struct tcphdr *th = tcp_hdr(skb);
 	u8 *phash = (u8 *)(aoh + 1); /* hash goes just after the header */
@@ -911,7 +927,7 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb,
 	if (!info) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOKEYNOTFOUND);
 		tcp_hash_fail("AO key not found", family, skb,
-			      "keyid: %u", aoh->keyid);
+			      "keyid: %u L3index: %d", aoh->keyid, l3index);
 		return SKB_DROP_REASON_TCP_AOUNEXPECTED;
 	}
 
@@ -945,7 +961,7 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb,
 		/* Established socket, traffic key are cached */
 		traffic_key = rcv_other_key(key);
 		err = tcp_ao_verify_hash(sk, skb, family, info, aoh, key,
-					 traffic_key, phash, sne);
+					 traffic_key, phash, sne, l3index);
 		if (err)
 			return err;
 		current_key = READ_ONCE(info->current_key);
@@ -966,7 +982,7 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb,
 	 * - request sockets would race on those key pointers
 	 * - tcp_ao_del_cmd() allows async key removal
 	 */
-	key = tcp_ao_inbound_lookup(family, sk, skb, -1, aoh->keyid);
+	key = tcp_ao_inbound_lookup(family, sk, skb, -1, aoh->keyid, l3index);
 	if (!key)
 		goto key_not_found;
 
@@ -1006,7 +1022,7 @@ verify_hash:
 		return SKB_DROP_REASON_NOT_SPECIFIED;
 	tcp_ao_calc_key_skb(key, traffic_key, skb, sisn, disn, family);
 	ret = tcp_ao_verify_hash(sk, skb, family, info, aoh, key,
-				 traffic_key, phash, sne);
+				 traffic_key, phash, sne, l3index);
 	kfree(traffic_key);
 	return ret;
 
@@ -1014,7 +1030,7 @@ key_not_found:
 	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOKEYNOTFOUND);
 	atomic64_inc(&info->counters.key_not_found);
 	tcp_hash_fail("Requested by the peer AO key id not found",
-		      family, skb, "");
+		      family, skb, "L3index: %d", l3index);
 	return SKB_DROP_REASON_TCP_AOKEYNOTFOUND;
 }
 
@@ -1042,7 +1058,7 @@ void tcp_ao_connect_init(struct sock *sk)
 	struct tcp_ao_info *ao_info;
 	union tcp_ao_addr *addr;
 	struct tcp_ao_key *key;
-	int family;
+	int family, l3index;
 
 	ao_info = rcu_dereference_protected(tp->ao_info,
 					    lockdep_sock_is_held(sk));
@@ -1059,9 +1075,11 @@ void tcp_ao_connect_init(struct sock *sk)
 #endif
 	else
 		return;
+	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
+						 sk->sk_bound_dev_if);
 
 	hlist_for_each_entry_rcu(key, &ao_info->head, node) {
-		if (!tcp_ao_key_cmp(key, addr, key->prefixlen, family, -1, -1))
+		if (!tcp_ao_key_cmp(key, l3index, addr, key->prefixlen, family, -1, -1))
 			continue;
 
 		if (key == ao_info->current_key)
@@ -1134,9 +1152,9 @@ int tcp_ao_copy_all_matching(const struct sock *sk, struct sock *newsk,
 	struct tcp_ao_key *key, *new_key, *first_key;
 	struct tcp_ao_info *new_ao, *ao;
 	struct hlist_node *key_head;
+	int l3index, ret = -ENOMEM;
 	union tcp_ao_addr *addr;
 	bool match = false;
-	int ret = -ENOMEM;
 
 	ao = rcu_dereference(tcp_sk(sk)->ao_info);
 	if (!ao)
@@ -1164,9 +1182,11 @@ int tcp_ao_copy_all_matching(const struct sock *sk, struct sock *newsk,
 		ret = -EAFNOSUPPORT;
 		goto free_ao;
 	}
+	l3index = l3mdev_master_ifindex_by_index(sock_net(newsk),
+						 newsk->sk_bound_dev_if);
 
 	hlist_for_each_entry_rcu(key, &ao->head, node) {
-		if (tcp_ao_key_cmp(key, addr, key->prefixlen, family, -1, -1))
+		if (tcp_ao_key_cmp(key, l3index, addr, key->prefixlen, family, -1, -1))
 			continue;
 
 		new_key = tcp_ao_copy_key(newsk, key);
@@ -1470,7 +1490,8 @@ static struct tcp_ao_info *setsockopt_ao_info(struct sock *sk)
 	return ERR_PTR(-ESOCKTNOSUPPORT);
 }
 
-#define TCP_AO_KEYF_ALL		(TCP_AO_KEYF_EXCLUDE_OPT)
+#define TCP_AO_KEYF_ALL (TCP_AO_KEYF_IFINDEX | TCP_AO_KEYF_EXCLUDE_OPT)
+#define TCP_AO_GET_KEYF_VALID	(TCP_AO_KEYF_IFINDEX)
 
 static struct tcp_ao_key *tcp_ao_key_alloc(struct sock *sk,
 					   struct tcp_ao_add *cmd)
@@ -1534,8 +1555,8 @@ static int tcp_ao_add_cmd(struct sock *sk, unsigned short int family,
 	union tcp_ao_addr *addr;
 	struct tcp_ao_key *key;
 	struct tcp_ao_add cmd;
+	int ret, l3index = 0;
 	bool first = false;
-	int ret;
 
 	if (optlen < sizeof(cmd))
 		return -EINVAL;
@@ -1565,9 +1586,46 @@ static int tcp_ao_add_cmd(struct sock *sk, unsigned short int family,
 			return -EINVAL;
 	}
 
+	if (cmd.ifindex && !(cmd.keyflags & TCP_AO_KEYF_IFINDEX))
+		return -EINVAL;
+
+	/* For cmd.tcp_ifindex = 0 the key will apply to the default VRF */
+	if (cmd.keyflags & TCP_AO_KEYF_IFINDEX && cmd.ifindex) {
+		int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
+		struct net_device *dev;
+
+		rcu_read_lock();
+		dev = dev_get_by_index_rcu(sock_net(sk), cmd.ifindex);
+		if (dev && netif_is_l3_master(dev))
+			l3index = dev->ifindex;
+		rcu_read_unlock();
+
+		if (!dev || !l3index)
+			return -EINVAL;
+
+		/* It's still possible to bind after adding keys or even
+		 * re-bind to a different dev (with CAP_NET_RAW).
+		 * So, no reason to return error here, rather try to be
+		 * nice and warn the user.
+		 */
+		if (bound_dev_if && bound_dev_if != cmd.ifindex)
+			net_warn_ratelimited("AO key ifindex %d != sk bound ifindex %d\n",
+					     cmd.ifindex, bound_dev_if);
+	}
+
 	/* Don't allow keys for peers that have a matching TCP-MD5 key */
-	if (tcp_md5_do_lookup_any_l3index(sk, addr, family))
-		return -EKEYREJECTED;
+	if (cmd.keyflags & TCP_AO_KEYF_IFINDEX) {
+		/* Non-_exact version of tcp_md5_do_lookup() will
+		 * as well match keys that aren't bound to a specific VRF
+		 * (that will make them match AO key with
+		 * sysctl_tcp_l3dev_accept = 1
+		 */
+		if (tcp_md5_do_lookup(sk, l3index, addr, family))
+			return -EKEYREJECTED;
+	} else {
+		if (tcp_md5_do_lookup_any_l3index(sk, addr, family))
+			return -EKEYREJECTED;
+	}
 
 	ao_info = setsockopt_ao_info(sk);
 	if (IS_ERR(ao_info))
@@ -1584,10 +1642,9 @@ static int tcp_ao_add_cmd(struct sock *sk, unsigned short int family,
 		 * > The IDs of MKTs MUST NOT overlap where their
 		 * > TCP connection identifiers overlap.
 		 */
-		if (__tcp_ao_do_lookup(sk, addr, family,
-				       cmd.prefix, -1, cmd.rcvid))
+		if (__tcp_ao_do_lookup(sk, l3index, addr, family, cmd.prefix, -1, cmd.rcvid))
 			return -EEXIST;
-		if (__tcp_ao_do_lookup(sk, addr, family,
+		if (__tcp_ao_do_lookup(sk, l3index, addr, family,
 				       cmd.prefix, cmd.sndid, -1))
 			return -EEXIST;
 	}
@@ -1606,6 +1663,7 @@ static int tcp_ao_add_cmd(struct sock *sk, unsigned short int family,
 	key->keyflags	= cmd.keyflags;
 	key->sndid	= cmd.sndid;
 	key->rcvid	= cmd.rcvid;
+	key->l3index	= l3index;
 	atomic64_set(&key->pkt_good, 0);
 	atomic64_set(&key->pkt_bad, 0);
 
@@ -1694,17 +1752,17 @@ add_key:
 	return err;
 }
 
+#define TCP_AO_DEL_KEYF_ALL (TCP_AO_KEYF_IFINDEX)
 static int tcp_ao_del_cmd(struct sock *sk, unsigned short int family,
 			  sockptr_t optval, int optlen)
 {
 	struct tcp_ao_key *key, *new_current = NULL, *new_rnext = NULL;
+	int err, addr_len, l3index = 0;
 	struct tcp_ao_info *ao_info;
 	union tcp_ao_addr *addr;
 	struct tcp_ao_del cmd;
-	int addr_len;
 	__u8 prefix;
 	u16 port;
-	int err;
 
 	if (optlen < sizeof(cmd))
 		return -EINVAL;
@@ -1721,6 +1779,17 @@ static int tcp_ao_del_cmd(struct sock *sk, unsigned short int family,
 			return -EINVAL;
 	}
 
+	if (cmd.keyflags & ~TCP_AO_DEL_KEYF_ALL)
+		return -EINVAL;
+
+	/* No sanity check for TCP_AO_KEYF_IFINDEX as if a VRF
+	 * was destroyed, there still should be a way to delete keys,
+	 * that were bound to that l3intf. So, fail late at lookup stage
+	 * if there is no key for that ifindex.
+	 */
+	if (cmd.ifindex && !(cmd.keyflags & TCP_AO_KEYF_IFINDEX))
+		return -EINVAL;
+
 	ao_info = setsockopt_ao_info(sk);
 	if (IS_ERR(ao_info))
 		return PTR_ERR(ao_info);
@@ -1788,6 +1857,13 @@ static int tcp_ao_del_cmd(struct sock *sk, unsigned short int family,
 		    memcmp(addr, &key->addr, addr_len))
 			continue;
 
+		if ((cmd.keyflags & TCP_AO_KEYF_IFINDEX) !=
+		    (key->keyflags & TCP_AO_KEYF_IFINDEX))
+			continue;
+
+		if (key->l3index != l3index)
+			continue;
+
 		if (key == new_current || key == new_rnext)
 			continue;
 
@@ -1973,10 +2049,10 @@ static int tcp_ao_copy_mkts_to_user(struct tcp_ao_info *ao_info,
 	struct tcp_ao_key *key, *current_key;
 	bool do_address_matching = true;
 	union tcp_ao_addr *addr = NULL;
+	int err, l3index, user_len;
 	unsigned int max_keys;	/* maximum number of keys to copy to user */
 	size_t out_offset = 0;
 	size_t bytes_to_write;	/* number of bytes to write to user level */
-	int err, user_len;
 	u32 matched_keys;	/* keys from ao_info matched so far */
 	int optlen_out;
 	__be16 port = 0;
@@ -1995,11 +2071,16 @@ static int tcp_ao_copy_mkts_to_user(struct tcp_ao_info *ao_info,
 
 	if (opt_in.pkt_good || opt_in.pkt_bad)
 		return -EINVAL;
+	if (opt_in.keyflags & ~TCP_AO_GET_KEYF_VALID)
+		return -EINVAL;
+	if (opt_in.ifindex && !(opt_in.keyflags & TCP_AO_KEYF_IFINDEX))
+		return -EINVAL;
 
 	if (opt_in.reserved != 0)
 		return -EINVAL;
 
 	max_keys = opt_in.nkeys;
+	l3index = (opt_in.keyflags & TCP_AO_KEYF_IFINDEX) ? opt_in.ifindex : -1;
 
 	if (opt_in.get_all || opt_in.is_current || opt_in.is_rnext) {
 		if (opt_in.get_all && (opt_in.is_current || opt_in.is_rnext))
@@ -2101,7 +2182,7 @@ static int tcp_ao_copy_mkts_to_user(struct tcp_ao_info *ao_info,
 			continue;
 		}
 
-		if (tcp_ao_key_cmp(key, addr, opt_in.prefix,
+		if (tcp_ao_key_cmp(key, l3index, addr, opt_in.prefix,
 				   opt_in.addr.ss_family,
 				   opt_in.sndid, opt_in.rcvid) != 0)
 			continue;
@@ -2134,6 +2215,7 @@ match:
 		opt_out.nkeys = 0;
 		opt_out.maclen = key->maclen;
 		opt_out.keylen = key->keylen;
+		opt_out.ifindex = key->l3index;
 		opt_out.pkt_good = atomic64_read(&key->pkt_good);
 		opt_out.pkt_bad = atomic64_read(&key->pkt_bad);
 		memcpy(&opt_out.key, key->key, key->keylen);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 18c5595e3814..5f693bbd578d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1087,6 +1087,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 	    tcp_rsk_used_ao(req)) {
 		const union tcp_md5_addr *addr;
 		const struct tcp_ao_hdr *aoh;
+		int l3index;
 
 		/* Invalid TCP option size or twice included auth */
 		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
@@ -1095,11 +1096,12 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 			return;
 
 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
-		key.ao_key = tcp_ao_do_lookup(sk, addr, AF_INET,
+		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
+		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
 					      aoh->rnext_keyid, -1);
 		if (unlikely(!key.ao_key)) {
 			/* Send ACK with any matching MKT for the peer */
-			key.ao_key = tcp_ao_do_lookup(sk, addr, AF_INET, -1, -1);
+			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
 			/* Matching key disappeared (user removed the key?)
 			 * let the handshake timeout.
 			 */
@@ -1493,6 +1495,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
 	const union tcp_md5_addr *addr;
 	u8 prefixlen = 32;
 	int l3index = 0;
+	bool l3flag;
 	u8 flags;
 
 	if (optlen < sizeof(cmd))
@@ -1505,6 +1508,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
 		return -EINVAL;
 
 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
+	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
 
 	if (optname == TCP_MD5SIG_EXT &&
 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
@@ -1542,7 +1546,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
 	/* Don't allow keys for peers that have a matching TCP-AO key.
 	 * See the comment in tcp_ao_add_cmd()
 	 */
-	if (tcp_ao_required(sk, addr, AF_INET, false))
+	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
 		return -EKEYREJECTED;
 
 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index ad7a8caa7b2a..500f6ed3b8cf 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -140,6 +140,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 	struct dst_entry *dst;
 	__u8 rcv_wscale;
 	u32 tsoff = 0;
+	int l3index;
 
 	if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies) ||
 	    !th->ack || th->rst)
@@ -214,7 +215,9 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 	treq->snt_isn = cookie;
 	treq->ts_off = 0;
 	treq->txhash = net_tx_rndhash();
-	tcp_ao_syncookie(sk, skb, treq, AF_INET6);
+
+	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
+	tcp_ao_syncookie(sk, skb, treq, AF_INET6, l3index);
 
 	if (IS_ENABLED(CONFIG_SMC))
 		ireq->smc_ok = 0;
diff --git a/net/ipv6/tcp_ao.c b/net/ipv6/tcp_ao.c
index 8b04611c9078..3c09ac26206e 100644
--- a/net/ipv6/tcp_ao.c
+++ b/net/ipv6/tcp_ao.c
@@ -87,30 +87,29 @@ int tcp_v6_ao_calc_key_rsk(struct tcp_ao_key *mkt, u8 *key,
 			htonl(tcp_rsk(req)->rcv_isn));
 }
 
-struct tcp_ao_key *tcp_v6_ao_do_lookup(const struct sock *sk,
-				       const struct in6_addr *addr,
-				       int sndid, int rcvid)
-{
-	return tcp_ao_do_lookup(sk, (union tcp_ao_addr *)addr, AF_INET6,
-				sndid, rcvid);
-}
-
 struct tcp_ao_key *tcp_v6_ao_lookup(const struct sock *sk,
 				    struct sock *addr_sk,
 				    int sndid, int rcvid)
 {
+	int l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
+						     addr_sk->sk_bound_dev_if);
 	struct in6_addr *addr = &addr_sk->sk_v6_daddr;
 
-	return tcp_v6_ao_do_lookup(sk, addr, sndid, rcvid);
+	return tcp_ao_do_lookup(sk, l3index, (union tcp_ao_addr *)addr,
+				AF_INET6, sndid, rcvid);
 }
 
 struct tcp_ao_key *tcp_v6_ao_lookup_rsk(const struct sock *sk,
 					struct request_sock *req,
 					int sndid, int rcvid)
 {
-	struct in6_addr *addr = &inet_rsk(req)->ir_v6_rmt_addr;
+	struct inet_request_sock *ireq = inet_rsk(req);
+	struct in6_addr *addr = &ireq->ir_v6_rmt_addr;
+	int l3index;
 
-	return tcp_v6_ao_do_lookup(sk, addr, sndid, rcvid);
+	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
+	return tcp_ao_do_lookup(sk, l3index, (union tcp_ao_addr *)addr,
+				AF_INET6, sndid, rcvid);
 }
 
 int tcp_v6_ao_hash_pseudoheader(struct tcp_sigpool *hp,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index b5936294dba2..937a02c2e534 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -610,6 +610,7 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname,
 	union tcp_ao_addr *addr;
 	int l3index = 0;
 	u8 prefixlen;
+	bool l3flag;
 	u8 flags;
 
 	if (optlen < sizeof(cmd))
@@ -622,6 +623,7 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname,
 		return -EINVAL;
 
 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
+	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
 
 	if (optname == TCP_MD5SIG_EXT &&
 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
@@ -668,7 +670,8 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname,
 		/* Don't allow keys for peers that have a matching TCP-AO key.
 		 * See the comment in tcp_ao_add_cmd()
 		 */
-		if (tcp_ao_required(sk, addr, AF_INET, false))
+		if (tcp_ao_required(sk, addr, AF_INET,
+				    l3flag ? l3index : -1, false))
 			return -EKEYREJECTED;
 		return tcp_md5_do_add(sk, addr,
 				      AF_INET, prefixlen, l3index, flags,
@@ -680,7 +683,7 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname,
 	/* Don't allow keys for peers that have a matching TCP-AO key.
 	 * See the comment in tcp_ao_add_cmd()
 	 */
-	if (tcp_ao_required(sk, addr, AF_INET6, false))
+	if (tcp_ao_required(sk, addr, AF_INET6, l3flag ? l3index : -1, false))
 		return -EKEYREJECTED;
 
 	return tcp_md5_do_add(sk, addr, AF_INET6, prefixlen, l3index, flags,
@@ -1220,10 +1223,14 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 			return;
 		if (!aoh)
 			return;
-		key.ao_key = tcp_v6_ao_do_lookup(sk, addr, aoh->rnext_keyid, -1);
+		key.ao_key = tcp_ao_do_lookup(sk, l3index,
+					      (union tcp_ao_addr *)addr,
+					      AF_INET6, aoh->rnext_keyid, -1);
 		if (unlikely(!key.ao_key)) {
 			/* Send ACK with any matching MKT for the peer */
-			key.ao_key = tcp_v6_ao_do_lookup(sk, addr, -1, -1);
+			key.ao_key = tcp_ao_do_lookup(sk, l3index,
+						      (union tcp_ao_addr *)addr,
+						      AF_INET6, -1, -1);
 			/* Matching key disappeared (user removed the key?)
 			 * let the handshake timeout.
 			 */
-- 
cgit v1.2.3-70-g09d2


From faadfaba5e018ca0f9595f17115ff48416b7b85e Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 23 Oct 2023 20:22:14 +0100
Subject: net/tcp: Add TCP_AO_REPAIR

Add TCP_AO_REPAIR setsockopt(), getsockopt(). They let a user to repair
TCP-AO ISNs/SNEs. Also let the user hack around when (tp->repair) is on
and add ao_info on a socket in any supported state.
As SNEs now can be read/written at any moment, use
WRITE_ONCE()/READ_ONCE() to set/read them.

Signed-off-by: Dmitry Safonov <dima@arista.com>
Acked-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp_ao.h     | 14 ++++++++
 include/uapi/linux/tcp.h |  8 +++++
 net/ipv4/tcp.c           | 24 +++++++++----
 net/ipv4/tcp_ao.c        | 90 +++++++++++++++++++++++++++++++++++++++++++++---
 4 files changed, 125 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h
index edd6748b2cfa..a375a171ef3c 100644
--- a/include/net/tcp_ao.h
+++ b/include/net/tcp_ao.h
@@ -199,6 +199,8 @@ void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp);
 bool tcp_ao_ignore_icmp(const struct sock *sk, int family, int type, int code);
 int tcp_ao_get_mkts(struct sock *sk, sockptr_t optval, sockptr_t optlen);
 int tcp_ao_get_sock_info(struct sock *sk, sockptr_t optval, sockptr_t optlen);
+int tcp_ao_get_repair(struct sock *sk, sockptr_t optval, sockptr_t optlen);
+int tcp_ao_set_repair(struct sock *sk, sockptr_t optval, unsigned int optlen);
 enum skb_drop_reason tcp_inbound_ao_hash(struct sock *sk,
 			const struct sk_buff *skb, unsigned short int family,
 			const struct request_sock *req, int l3index,
@@ -330,6 +332,18 @@ static inline int tcp_ao_get_sock_info(struct sock *sk, sockptr_t optval, sockpt
 {
 	return -ENOPROTOOPT;
 }
+
+static inline int tcp_ao_get_repair(struct sock *sk,
+				    sockptr_t optval, sockptr_t optlen)
+{
+	return -ENOPROTOOPT;
+}
+
+static inline int tcp_ao_set_repair(struct sock *sk,
+				    sockptr_t optval, unsigned int optlen)
+{
+	return -ENOPROTOOPT;
+}
 #endif
 
 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index be34d7c5c531..c07e9f90c084 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -133,6 +133,7 @@ enum {
 #define TCP_AO_DEL_KEY		39	/* Delete MKT */
 #define TCP_AO_INFO		40	/* Set/list TCP-AO per-socket options */
 #define TCP_AO_GET_KEYS		41	/* List MKT(s) */
+#define TCP_AO_REPAIR		42	/* Get/Set SNEs and ISNs */
 
 #define TCP_REPAIR_ON		1
 #define TCP_REPAIR_OFF		0
@@ -458,6 +459,13 @@ struct tcp_ao_getsockopt { /* getsockopt(TCP_AO_GET_KEYS) */
 	__u64	pkt_bad;		/* out: segments that failed verification */
 } __attribute__((aligned(8)));
 
+struct tcp_ao_repair { /* {s,g}etsockopt(TCP_AO_REPAIR) */
+	__be32			snt_isn;
+	__be32			rcv_isn;
+	__u32			snd_sne;
+	__u32			rcv_sne;
+} __attribute__((aligned(8)));
+
 /* setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */
 
 #define TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT 0x1
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ce33eee9d0f2..53bcc17c91e4 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3593,20 +3593,28 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
 		__tcp_sock_set_quickack(sk, val);
 		break;
 
+	case TCP_AO_REPAIR:
+		err = tcp_ao_set_repair(sk, optval, optlen);
+		break;
 #ifdef CONFIG_TCP_AO
 	case TCP_AO_ADD_KEY:
 	case TCP_AO_DEL_KEY:
 	case TCP_AO_INFO: {
 		/* If this is the first TCP-AO setsockopt() on the socket,
-		 * sk_state has to be LISTEN or CLOSE
+		 * sk_state has to be LISTEN or CLOSE. Allow TCP_REPAIR
+		 * in any state.
 		 */
-		if (((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) ||
-		    rcu_dereference_protected(tcp_sk(sk)->ao_info,
+		if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
+			goto ao_parse;
+		if (rcu_dereference_protected(tcp_sk(sk)->ao_info,
 					      lockdep_sock_is_held(sk)))
-			err = tp->af_specific->ao_parse(sk, optname, optval,
-							optlen);
-		else
-			err = -EISCONN;
+			goto ao_parse;
+		if (tp->repair)
+			goto ao_parse;
+		err = -EISCONN;
+		break;
+ao_parse:
+		err = tp->af_specific->ao_parse(sk, optname, optval, optlen);
 		break;
 	}
 #endif
@@ -4284,6 +4292,8 @@ zerocopy_rcv_out:
 		return err;
 	}
 #endif
+	case TCP_AO_REPAIR:
+		return tcp_ao_get_repair(sk, optval, optlen);
 	case TCP_AO_GET_KEYS:
 	case TCP_AO_INFO: {
 		int err;
diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c
index b5ac3e73e1da..6a845e906a1d 100644
--- a/net/ipv4/tcp_ao.c
+++ b/net/ipv4/tcp_ao.c
@@ -1490,6 +1490,16 @@ static struct tcp_ao_info *setsockopt_ao_info(struct sock *sk)
 	return ERR_PTR(-ESOCKTNOSUPPORT);
 }
 
+static struct tcp_ao_info *getsockopt_ao_info(struct sock *sk)
+{
+	if (sk_fullsock(sk))
+		return rcu_dereference(tcp_sk(sk)->ao_info);
+	else if (sk->sk_state == TCP_TIME_WAIT)
+		return rcu_dereference(tcp_twsk(sk)->ao_info);
+
+	return ERR_PTR(-ESOCKTNOSUPPORT);
+}
+
 #define TCP_AO_KEYF_ALL (TCP_AO_KEYF_IFINDEX | TCP_AO_KEYF_EXCLUDE_OPT)
 #define TCP_AO_GET_KEYF_VALID	(TCP_AO_KEYF_IFINDEX)
 
@@ -1671,11 +1681,13 @@ static int tcp_ao_add_cmd(struct sock *sk, unsigned short int family,
 	if (ret < 0)
 		goto err_free_sock;
 
-	/* Change this condition if we allow adding keys in states
-	 * like close_wait, syn_sent or fin_wait...
-	 */
-	if (sk->sk_state == TCP_ESTABLISHED)
+	if (!((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))) {
 		tcp_ao_cache_traffic_keys(sk, ao_info, key);
+		if (first) {
+			ao_info->current_key = key;
+			ao_info->rnext_key = key;
+		}
+	}
 
 	tcp_ao_link_mkt(ao_info, key);
 	if (first) {
@@ -1926,6 +1938,8 @@ static int tcp_ao_info_cmd(struct sock *sk, unsigned short int family,
 	if (IS_ERR(ao_info))
 		return PTR_ERR(ao_info);
 	if (!ao_info) {
+		if (!((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)))
+			return -EINVAL;
 		ao_info = tcp_ao_alloc_info(GFP_KERNEL);
 		if (!ao_info)
 			return -ENOMEM;
@@ -2308,3 +2322,71 @@ int tcp_ao_get_sock_info(struct sock *sk, sockptr_t optval, sockptr_t optlen)
 	return 0;
 }
 
+int tcp_ao_set_repair(struct sock *sk, sockptr_t optval, unsigned int optlen)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_ao_repair cmd;
+	struct tcp_ao_key *key;
+	struct tcp_ao_info *ao;
+	int err;
+
+	if (optlen < sizeof(cmd))
+		return -EINVAL;
+
+	err = copy_struct_from_sockptr(&cmd, sizeof(cmd), optval, optlen);
+	if (err)
+		return err;
+
+	if (!tp->repair)
+		return -EPERM;
+
+	ao = setsockopt_ao_info(sk);
+	if (IS_ERR(ao))
+		return PTR_ERR(ao);
+	if (!ao)
+		return -ENOENT;
+
+	WRITE_ONCE(ao->lisn, cmd.snt_isn);
+	WRITE_ONCE(ao->risn, cmd.rcv_isn);
+	WRITE_ONCE(ao->snd_sne, cmd.snd_sne);
+	WRITE_ONCE(ao->rcv_sne, cmd.rcv_sne);
+
+	hlist_for_each_entry_rcu(key, &ao->head, node)
+		tcp_ao_cache_traffic_keys(sk, ao, key);
+
+	return 0;
+}
+
+int tcp_ao_get_repair(struct sock *sk, sockptr_t optval, sockptr_t optlen)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_ao_repair opt;
+	struct tcp_ao_info *ao;
+	int len;
+
+	if (copy_from_sockptr(&len, optlen, sizeof(int)))
+		return -EFAULT;
+
+	if (len <= 0)
+		return -EINVAL;
+
+	if (!tp->repair)
+		return -EPERM;
+
+	rcu_read_lock();
+	ao = getsockopt_ao_info(sk);
+	if (IS_ERR_OR_NULL(ao)) {
+		rcu_read_unlock();
+		return ao ? PTR_ERR(ao) : -ENOENT;
+	}
+
+	opt.snt_isn	= ao->lisn;
+	opt.rcv_isn	= ao->risn;
+	opt.snd_sne	= READ_ONCE(ao->snd_sne);
+	opt.rcv_sne	= READ_ONCE(ao->rcv_sne);
+	rcu_read_unlock();
+
+	if (copy_to_sockptr(optval, &opt, min_t(int, len, sizeof(opt))))
+		return -EFAULT;
+	return 0;
+}
-- 
cgit v1.2.3-70-g09d2


From 83c1bbeb864f2a197603b91b3e0f748cca64543d Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Wed, 25 Oct 2023 15:30:14 +0300
Subject: bridge: add MDB get uAPI attributes

Add MDB get attributes that correspond to the MDB set attributes used in
RTM_NEWMDB messages. Specifically, add 'MDBA_GET_ENTRY' which will hold
a 'struct br_mdb_entry' and 'MDBA_GET_ENTRY_ATTRS' which will hold
'MDBE_ATTR_*' attributes that are used as indexes (source IP and source
VNI).

An example request will look as follows:

[ struct nlmsghdr ]
[ struct br_port_msg ]
[ MDBA_GET_ENTRY ]
	struct br_mdb_entry
[ MDBA_GET_ENTRY_ATTRS ]
	[ MDBE_ATTR_SOURCE ]
		struct in_addr / struct in6_addr
	[ MDBE_ATTR_SRC_VNI ]
		u32

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index f95326fce6bb..2e23f99dc0f1 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -723,6 +723,24 @@ enum {
 };
 #define MDBA_SET_ENTRY_MAX (__MDBA_SET_ENTRY_MAX - 1)
 
+/* [MDBA_GET_ENTRY] = {
+ *    struct br_mdb_entry
+ *    [MDBA_GET_ENTRY_ATTRS] = {
+ *       [MDBE_ATTR_SOURCE]
+ *          struct in_addr / struct in6_addr
+ *       [MDBE_ATTR_SRC_VNI]
+ *          u32
+ *    }
+ * }
+ */
+enum {
+	MDBA_GET_ENTRY_UNSPEC,
+	MDBA_GET_ENTRY,
+	MDBA_GET_ENTRY_ATTRS,
+	__MDBA_GET_ENTRY_MAX,
+};
+#define MDBA_GET_ENTRY_MAX (__MDBA_GET_ENTRY_MAX - 1)
+
 /* [MDBA_SET_ENTRY_ATTRS] = {
  *    [MDBE_ATTR_xxx]
  *    ...
-- 
cgit v1.2.3-70-g09d2


From 62f47bf9e2c00eea457cad8fa43c24ed0282b37a Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Wed, 25 Oct 2023 15:30:15 +0300
Subject: net: Add MDB get device operation

Add MDB net device operation that will be invoked by rtnetlink code in
response to received RTM_GETMDB messages. Subsequent patches will
implement the operation in the bridge and VXLAN drivers.

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b8bf669212cc..a16c9cc063fe 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1609,6 +1609,10 @@ struct net_device_ops {
 	int			(*ndo_mdb_dump)(struct net_device *dev,
 						struct sk_buff *skb,
 						struct netlink_callback *cb);
+	int			(*ndo_mdb_get)(struct net_device *dev,
+					       struct nlattr *tb[], u32 portid,
+					       u32 seq,
+					       struct netlink_ext_ack *extack);
 	int			(*ndo_bridge_setlink)(struct net_device *dev,
 						      struct nlmsghdr *nlh,
 						      u16 flags,
-- 
cgit v1.2.3-70-g09d2


From 6479c975b20a7fe6ecacec3a60dc6f838ecee9d6 Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Fri, 27 Oct 2023 16:04:54 +0200
Subject: doc/netlink: Update schema to support cmd-cnt-name and cmd-max-name

allow specifying cmd-cnt-name and cmd-max-name in netlink specs, in
accordance with Documentation/userspace-api/netlink/c-code-gen.rst.

Use cmd-cnt-name and attr-cnt-name in the mptcp yaml spec and in the
corresponding uAPI headers, to preserve the #defines we had in the past
and avoid adding new ones.

v2:
 - squash modification in mptcp.yaml and MPTCP uAPI headers

Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Link: https://lore.kernel.org/r/12d4ed0116d8883cf4b533b856f3125a34e56749.1698415310.git.dcaratti@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/genetlink-c.yaml      | 6 ++++++
 Documentation/netlink/genetlink-legacy.yaml | 6 ++++++
 Documentation/netlink/netlink-raw.yaml      | 6 ++++++
 Documentation/netlink/specs/mptcp.yaml      | 2 ++
 include/uapi/linux/mptcp.h                  | 4 ----
 include/uapi/linux/mptcp_pm.h               | 8 ++++----
 6 files changed, 24 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/Documentation/netlink/genetlink-c.yaml b/Documentation/netlink/genetlink-c.yaml
index 9d13bbb7ae47..c58f7153fcf8 100644
--- a/Documentation/netlink/genetlink-c.yaml
+++ b/Documentation/netlink/genetlink-c.yaml
@@ -47,6 +47,12 @@ properties:
   max-by-define:
     description: Makes the number of attributes and commands be specified by a define, not an enum value.
     type: boolean
+  cmd-max-name:
+    description: Name of the define for the last operation in the list.
+    type: string
+  cmd-cnt-name:
+    description: The explicit name for constant holding the count of operations (last operation + 1).
+    type: string
   # End genetlink-c
 
   definitions:
diff --git a/Documentation/netlink/genetlink-legacy.yaml b/Documentation/netlink/genetlink-legacy.yaml
index 0daf40402a29..938703088306 100644
--- a/Documentation/netlink/genetlink-legacy.yaml
+++ b/Documentation/netlink/genetlink-legacy.yaml
@@ -47,6 +47,12 @@ properties:
   max-by-define:
     description: Makes the number of attributes and commands be specified by a define, not an enum value.
     type: boolean
+  cmd-max-name:
+    description: Name of the define for the last operation in the list.
+    type: string
+  cmd-cnt-name:
+    description: The explicit name for constant holding the count of operations (last operation + 1).
+    type: string
   # End genetlink-c
   # Start genetlink-legacy
   kernel-policy:
diff --git a/Documentation/netlink/netlink-raw.yaml b/Documentation/netlink/netlink-raw.yaml
index 48db31f1d059..775cce8c548a 100644
--- a/Documentation/netlink/netlink-raw.yaml
+++ b/Documentation/netlink/netlink-raw.yaml
@@ -47,6 +47,12 @@ properties:
   max-by-define:
     description: Makes the number of attributes and commands be specified by a define, not an enum value.
     type: boolean
+  cmd-max-name:
+    description: Name of the define for the last operation in the list.
+    type: string
+  cmd-cnt-name:
+    description: The explicit name for constant holding the count of operations (last operation + 1).
+    type: string
   # End genetlink-c
   # Start genetlink-legacy
   kernel-policy:
diff --git a/Documentation/netlink/specs/mptcp.yaml b/Documentation/netlink/specs/mptcp.yaml
index ec5c454a87ea..49f90cfb4698 100644
--- a/Documentation/netlink/specs/mptcp.yaml
+++ b/Documentation/netlink/specs/mptcp.yaml
@@ -8,6 +8,7 @@ c-family-name: mptcp-pm-name
 c-version-name: mptcp-pm-ver
 max-by-define: true
 kernel-policy: per-op
+cmd-cnt-name: --mptcp-pm-cmd-after-last
 
 definitions:
   -
@@ -167,6 +168,7 @@ attribute-sets:
   -
     name: attr
     name-prefix: mptcp-pm-attr-
+    attr-cnt-name: --mptcp-attr-after-last
     attributes:
       -
         name: unspec
diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h
index 64ecc8a3f9f2..a6451561f3f8 100644
--- a/include/uapi/linux/mptcp.h
+++ b/include/uapi/linux/mptcp.h
@@ -28,10 +28,6 @@
 
 #include <linux/mptcp_pm.h>
 
-/* for backward compatibility */
-#define	__MPTCP_PM_CMD_AFTER_LAST	__MPTCP_PM_CMD_MAX
-#define	__MPTCP_ATTR_AFTER_LAST		__MPTCP_ATTR_MAX
-
 #define MPTCP_INFO_FLAG_FALLBACK		_BITUL(0)
 #define MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED	_BITUL(1)
 
diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h
index 0ad598fe940b..b5d11aece408 100644
--- a/include/uapi/linux/mptcp_pm.h
+++ b/include/uapi/linux/mptcp_pm.h
@@ -100,9 +100,9 @@ enum {
 	MPTCP_PM_ATTR_LOC_ID,
 	MPTCP_PM_ATTR_ADDR_REMOTE,
 
-	__MPTCP_PM_ATTR_MAX
+	__MPTCP_ATTR_AFTER_LAST
 };
-#define MPTCP_PM_ATTR_MAX (__MPTCP_PM_ATTR_MAX - 1)
+#define MPTCP_PM_ATTR_MAX (__MPTCP_ATTR_AFTER_LAST - 1)
 
 enum mptcp_event_attr {
 	MPTCP_ATTR_UNSPEC,
@@ -143,8 +143,8 @@ enum {
 	MPTCP_PM_CMD_SUBFLOW_CREATE,
 	MPTCP_PM_CMD_SUBFLOW_DESTROY,
 
-	__MPTCP_PM_CMD_MAX
+	__MPTCP_PM_CMD_AFTER_LAST
 };
-#define MPTCP_PM_CMD_MAX (__MPTCP_PM_CMD_MAX - 1)
+#define MPTCP_PM_CMD_MAX (__MPTCP_PM_CMD_AFTER_LAST - 1)
 
 #endif /* _UAPI_LINUX_MPTCP_PM_H */
-- 
cgit v1.2.3-70-g09d2