From f9bceaa59c5c47a8a08f48e19cbe887e500a1978 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Thu, 20 May 2021 20:06:53 -0700
Subject: libbpf: Skip bpf_object__probe_loading for light skeleton

I'm getting the following error when running 'gen skeleton -L' as
regular user:

libbpf: Error in bpf_object__probe_loading():Operation not permitted(1).
Couldn't load trivial BPF program. Make sure your kernel supports BPF
(CONFIG_BPF_SYSCALL=y) and/or that RLIMIT_MEMLOCK is set to big enough
value.

Fixes: 67234743736a ("libbpf: Generate loader program out of BPF ELF file.")
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210521030653.2626513-1-sdf@google.com
---
 tools/lib/bpf/libbpf.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index dc4d5fe6d9d2..b396e45b17ea 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -3971,6 +3971,9 @@ bpf_object__probe_loading(struct bpf_object *obj)
 	};
 	int ret;
 
+	if (obj->gen_loader)
+		return 0;
+
 	/* make sure basic loading works */
 
 	memset(&attr, 0, sizeof(attr));
-- 
cgit v1.2.3-70-g09d2


From 3e87f192b405960c0fe83e0925bd0dadf4f8cf43 Mon Sep 17 00:00:00 2001
From: Denis Salopek <denis.salopek@sartura.hr>
Date: Tue, 11 May 2021 23:00:04 +0200
Subject: bpf: Add lookup_and_delete_elem support to hashtab

Extend the existing bpf_map_lookup_and_delete_elem() functionality to
hashtab map types, in addition to stacks and queues.
Create a new hashtab bpf_map_ops function that does lookup and deletion
of the element under the same bucket lock and add the created map_ops to
bpf.h.

Signed-off-by: Denis Salopek <denis.salopek@sartura.hr>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/4d18480a3e990ffbf14751ddef0325eed3be2966.1620763117.git.denis.salopek@sartura.hr
---
 include/linux/bpf.h            |  2 +
 include/uapi/linux/bpf.h       | 13 ++++++
 kernel/bpf/hashtab.c           | 98 ++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c           | 34 +++++++++++++--
 tools/include/uapi/linux/bpf.h | 13 ++++++
 5 files changed, 156 insertions(+), 4 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9dc44ba97584..1e9a0ff3217b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -70,6 +70,8 @@ struct bpf_map_ops {
 	void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
 	int (*map_lookup_batch)(struct bpf_map *map, const union bpf_attr *attr,
 				union bpf_attr __user *uattr);
+	int (*map_lookup_and_delete_elem)(struct bpf_map *map, void *key,
+					  void *value, u64 flags);
 	int (*map_lookup_and_delete_batch)(struct bpf_map *map,
 					   const union bpf_attr *attr,
 					   union bpf_attr __user *uattr);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 418b9b813d65..562adeac1d67 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -527,6 +527,15 @@ union bpf_iter_link_info {
  *		Look up an element with the given *key* in the map referred to
  *		by the file descriptor *fd*, and if found, delete the element.
  *
+ *		For **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map
+ *		types, the *flags* argument needs to be set to 0, but for other
+ *		map types, it may be specified as:
+ *
+ *		**BPF_F_LOCK**
+ *			Look up and delete the value of a spin-locked map
+ *			without returning the lock. This must be specified if
+ *			the elements contain a spinlock.
+ *
  *		The **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map types
  *		implement this command as a "pop" operation, deleting the top
  *		element rather than one corresponding to *key*.
@@ -536,6 +545,10 @@ union bpf_iter_link_info {
  *		This command is only valid for the following map types:
  *		* **BPF_MAP_TYPE_QUEUE**
  *		* **BPF_MAP_TYPE_STACK**
+ *		* **BPF_MAP_TYPE_HASH**
+ *		* **BPF_MAP_TYPE_PERCPU_HASH**
+ *		* **BPF_MAP_TYPE_LRU_HASH**
+ *		* **BPF_MAP_TYPE_LRU_PERCPU_HASH**
  *
  *	Return
  *		Returns zero on success. On error, -1 is returned and *errno*
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index d7ebb12ffffc..9da0a0413a53 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1401,6 +1401,100 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
 	rcu_read_unlock();
 }
 
+static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
+					     void *value, bool is_lru_map,
+					     bool is_percpu, u64 flags)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct hlist_nulls_head *head;
+	unsigned long bflags;
+	struct htab_elem *l;
+	u32 hash, key_size;
+	struct bucket *b;
+	int ret;
+
+	key_size = map->key_size;
+
+	hash = htab_map_hash(key, key_size, htab->hashrnd);
+	b = __select_bucket(htab, hash);
+	head = &b->head;
+
+	ret = htab_lock_bucket(htab, b, hash, &bflags);
+	if (ret)
+		return ret;
+
+	l = lookup_elem_raw(head, hash, key, key_size);
+	if (!l) {
+		ret = -ENOENT;
+	} else {
+		if (is_percpu) {
+			u32 roundup_value_size = round_up(map->value_size, 8);
+			void __percpu *pptr;
+			int off = 0, cpu;
+
+			pptr = htab_elem_get_ptr(l, key_size);
+			for_each_possible_cpu(cpu) {
+				bpf_long_memcpy(value + off,
+						per_cpu_ptr(pptr, cpu),
+						roundup_value_size);
+				off += roundup_value_size;
+			}
+		} else {
+			u32 roundup_key_size = round_up(map->key_size, 8);
+
+			if (flags & BPF_F_LOCK)
+				copy_map_value_locked(map, value, l->key +
+						      roundup_key_size,
+						      true);
+			else
+				copy_map_value(map, value, l->key +
+					       roundup_key_size);
+			check_and_init_map_lock(map, value);
+		}
+
+		hlist_nulls_del_rcu(&l->hash_node);
+		if (!is_lru_map)
+			free_htab_elem(htab, l);
+	}
+
+	htab_unlock_bucket(htab, b, hash, bflags);
+
+	if (is_lru_map && l)
+		bpf_lru_push_free(&htab->lru, &l->lru_node);
+
+	return ret;
+}
+
+static int htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
+					   void *value, u64 flags)
+{
+	return __htab_map_lookup_and_delete_elem(map, key, value, false, false,
+						 flags);
+}
+
+static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
+						  void *key, void *value,
+						  u64 flags)
+{
+	return __htab_map_lookup_and_delete_elem(map, key, value, false, true,
+						 flags);
+}
+
+static int htab_lru_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
+					       void *value, u64 flags)
+{
+	return __htab_map_lookup_and_delete_elem(map, key, value, true, false,
+						 flags);
+}
+
+static int htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
+						      void *key, void *value,
+						      u64 flags)
+{
+	return __htab_map_lookup_and_delete_elem(map, key, value, true, true,
+						 flags);
+}
+
 static int
 __htab_map_lookup_and_delete_batch(struct bpf_map *map,
 				   const union bpf_attr *attr,
@@ -1934,6 +2028,7 @@ const struct bpf_map_ops htab_map_ops = {
 	.map_free = htab_map_free,
 	.map_get_next_key = htab_map_get_next_key,
 	.map_lookup_elem = htab_map_lookup_elem,
+	.map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
 	.map_update_elem = htab_map_update_elem,
 	.map_delete_elem = htab_map_delete_elem,
 	.map_gen_lookup = htab_map_gen_lookup,
@@ -1954,6 +2049,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
 	.map_free = htab_map_free,
 	.map_get_next_key = htab_map_get_next_key,
 	.map_lookup_elem = htab_lru_map_lookup_elem,
+	.map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
 	.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
 	.map_update_elem = htab_lru_map_update_elem,
 	.map_delete_elem = htab_lru_map_delete_elem,
@@ -2077,6 +2173,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
 	.map_free = htab_map_free,
 	.map_get_next_key = htab_map_get_next_key,
 	.map_lookup_elem = htab_percpu_map_lookup_elem,
+	.map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem,
 	.map_update_elem = htab_percpu_map_update_elem,
 	.map_delete_elem = htab_map_delete_elem,
 	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
@@ -2096,6 +2193,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
 	.map_free = htab_map_free,
 	.map_get_next_key = htab_map_get_next_key,
 	.map_lookup_elem = htab_lru_percpu_map_lookup_elem,
+	.map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem,
 	.map_update_elem = htab_lru_percpu_map_update_elem,
 	.map_delete_elem = htab_lru_map_delete_elem,
 	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 1d1cd80a6e67..50457019da27 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1483,7 +1483,7 @@ free_buf:
 	return err;
 }
 
-#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
+#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD flags
 
 static int map_lookup_and_delete_elem(union bpf_attr *attr)
 {
@@ -1499,6 +1499,9 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
 	if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
 		return -EINVAL;
 
+	if (attr->flags & ~BPF_F_LOCK)
+		return -EINVAL;
+
 	f = fdget(ufd);
 	map = __bpf_map_get(f);
 	if (IS_ERR(map))
@@ -1509,24 +1512,47 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
 		goto err_put;
 	}
 
+	if (attr->flags &&
+	    (map->map_type == BPF_MAP_TYPE_QUEUE ||
+	     map->map_type == BPF_MAP_TYPE_STACK)) {
+		err = -EINVAL;
+		goto err_put;
+	}
+
+	if ((attr->flags & BPF_F_LOCK) &&
+	    !map_value_has_spin_lock(map)) {
+		err = -EINVAL;
+		goto err_put;
+	}
+
 	key = __bpf_copy_key(ukey, map->key_size);
 	if (IS_ERR(key)) {
 		err = PTR_ERR(key);
 		goto err_put;
 	}
 
-	value_size = map->value_size;
+	value_size = bpf_map_value_size(map);
 
 	err = -ENOMEM;
 	value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
 	if (!value)
 		goto free_key;
 
+	err = -ENOTSUPP;
 	if (map->map_type == BPF_MAP_TYPE_QUEUE ||
 	    map->map_type == BPF_MAP_TYPE_STACK) {
 		err = map->ops->map_pop_elem(map, value);
-	} else {
-		err = -ENOTSUPP;
+	} else if (map->map_type == BPF_MAP_TYPE_HASH ||
+		   map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+		   map->map_type == BPF_MAP_TYPE_LRU_HASH ||
+		   map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+		if (!bpf_map_is_dev_bound(map)) {
+			bpf_disable_instrumentation();
+			rcu_read_lock();
+			err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags);
+			rcu_read_unlock();
+			bpf_enable_instrumentation();
+		}
 	}
 
 	if (err)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 418b9b813d65..562adeac1d67 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -527,6 +527,15 @@ union bpf_iter_link_info {
  *		Look up an element with the given *key* in the map referred to
  *		by the file descriptor *fd*, and if found, delete the element.
  *
+ *		For **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map
+ *		types, the *flags* argument needs to be set to 0, but for other
+ *		map types, it may be specified as:
+ *
+ *		**BPF_F_LOCK**
+ *			Look up and delete the value of a spin-locked map
+ *			without returning the lock. This must be specified if
+ *			the elements contain a spinlock.
+ *
  *		The **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map types
  *		implement this command as a "pop" operation, deleting the top
  *		element rather than one corresponding to *key*.
@@ -536,6 +545,10 @@ union bpf_iter_link_info {
  *		This command is only valid for the following map types:
  *		* **BPF_MAP_TYPE_QUEUE**
  *		* **BPF_MAP_TYPE_STACK**
+ *		* **BPF_MAP_TYPE_HASH**
+ *		* **BPF_MAP_TYPE_PERCPU_HASH**
+ *		* **BPF_MAP_TYPE_LRU_HASH**
+ *		* **BPF_MAP_TYPE_LRU_PERCPU_HASH**
  *
  *	Return
  *		Returns zero on success. On error, -1 is returned and *errno*
-- 
cgit v1.2.3-70-g09d2


From d59b9f2d1b2211e948044a099fd0a65941d06570 Mon Sep 17 00:00:00 2001
From: Denis Salopek <denis.salopek@sartura.hr>
Date: Tue, 11 May 2021 23:00:05 +0200
Subject: bpf: Extend libbpf with bpf_map_lookup_and_delete_elem_flags

Add bpf_map_lookup_and_delete_elem_flags() libbpf API in order to use
the BPF_F_LOCK flag with the map_lookup_and_delete_elem() function.

Signed-off-by: Denis Salopek <denis.salopek@sartura.hr>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/15b05dafe46c7e0750d110f233977372029d1f62.1620763117.git.denis.salopek@sartura.hr
---
 tools/lib/bpf/bpf.c      | 13 +++++++++++++
 tools/lib/bpf/bpf.h      |  2 ++
 tools/lib/bpf/libbpf.map |  1 +
 3 files changed, 16 insertions(+)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index bba48ff4c5c0..b7c2cc12034c 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -458,6 +458,19 @@ int bpf_map_lookup_and_delete_elem(int fd, const void *key, void *value)
 	return sys_bpf(BPF_MAP_LOOKUP_AND_DELETE_ELEM, &attr, sizeof(attr));
 }
 
+int bpf_map_lookup_and_delete_elem_flags(int fd, const void *key, void *value, __u64 flags)
+{
+	union bpf_attr attr;
+
+	memset(&attr, 0, sizeof(attr));
+	attr.map_fd = fd;
+	attr.key = ptr_to_u64(key);
+	attr.value = ptr_to_u64(value);
+	attr.flags = flags;
+
+	return sys_bpf(BPF_MAP_LOOKUP_AND_DELETE_ELEM, &attr, sizeof(attr));
+}
+
 int bpf_map_delete_elem(int fd, const void *key)
 {
 	union bpf_attr attr;
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 875dde20d56e..4f758f8f50cd 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -124,6 +124,8 @@ LIBBPF_API int bpf_map_lookup_elem_flags(int fd, const void *key, void *value,
 					 __u64 flags);
 LIBBPF_API int bpf_map_lookup_and_delete_elem(int fd, const void *key,
 					      void *value);
+LIBBPF_API int bpf_map_lookup_and_delete_elem_flags(int fd, const void *key,
+						    void *value, __u64 flags);
 LIBBPF_API int bpf_map_delete_elem(int fd, const void *key);
 LIBBPF_API int bpf_map_get_next_key(int fd, const void *key, void *next_key);
 LIBBPF_API int bpf_map_freeze(int fd);
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 2abef6f17c06..0229e01e8ccc 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -361,6 +361,7 @@ LIBBPF_0.4.0 {
 		bpf_linker__new;
 		bpf_map__initial_value;
 		bpf_map__inner_map;
+		bpf_map_lookup_and_delete_elem_flags;
 		bpf_object__gen_loader;
 		bpf_object__set_kversion;
 		bpf_tc_attach;
-- 
cgit v1.2.3-70-g09d2


From 49c299b69444d58a1d234769a13a3697841deb54 Mon Sep 17 00:00:00 2001
From: Denis Salopek <denis.salopek@sartura.hr>
Date: Tue, 11 May 2021 23:00:06 +0200
Subject: selftests/bpf: Add bpf_lookup_and_delete_elem tests

Add bpf selftests and extend existing ones for a new function
bpf_lookup_and_delete_elem() for (percpu) hash and (percpu) LRU hash map
types.
In test_lru_map and test_maps we add an element, lookup_and_delete it,
then check whether it's deleted.
The newly added lookup_and_delete prog tests practically do the same
thing but additionally use a BPF program to change the value of the
element for LRU maps.

Signed-off-by: Denis Salopek <denis.salopek@sartura.hr>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/d30d3e0060c1f750e133579623cf1c60ff58f3d9.1620763117.git.denis.salopek@sartura.hr
---
 .../selftests/bpf/prog_tests/lookup_and_delete.c   | 288 +++++++++++++++++++++
 .../selftests/bpf/progs/test_lookup_and_delete.c   |  26 ++
 tools/testing/selftests/bpf/test_lru_map.c         |   8 +
 tools/testing/selftests/bpf/test_maps.c            |  17 ++
 4 files changed, 339 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/lookup_and_delete.c
 create mode 100644 tools/testing/selftests/bpf/progs/test_lookup_and_delete.c

diff --git a/tools/testing/selftests/bpf/prog_tests/lookup_and_delete.c b/tools/testing/selftests/bpf/prog_tests/lookup_and_delete.c
new file mode 100644
index 000000000000..beebfa9730e1
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/lookup_and_delete.c
@@ -0,0 +1,288 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <test_progs.h>
+#include "test_lookup_and_delete.skel.h"
+
+#define START_VALUE 1234
+#define NEW_VALUE 4321
+#define MAX_ENTRIES 2
+
+static int duration;
+static int nr_cpus;
+
+static int fill_values(int map_fd)
+{
+	__u64 key, value = START_VALUE;
+	int err;
+
+	for (key = 1; key < MAX_ENTRIES + 1; key++) {
+		err = bpf_map_update_elem(map_fd, &key, &value, BPF_NOEXIST);
+		if (!ASSERT_OK(err, "bpf_map_update_elem"))
+			return -1;
+	}
+
+	return 0;
+}
+
+static int fill_values_percpu(int map_fd)
+{
+	__u64 key, value[nr_cpus];
+	int i, err;
+
+	for (i = 0; i < nr_cpus; i++)
+		value[i] = START_VALUE;
+
+	for (key = 1; key < MAX_ENTRIES + 1; key++) {
+		err = bpf_map_update_elem(map_fd, &key, value, BPF_NOEXIST);
+		if (!ASSERT_OK(err, "bpf_map_update_elem"))
+			return -1;
+	}
+
+	return 0;
+}
+
+static struct test_lookup_and_delete *setup_prog(enum bpf_map_type map_type,
+						 int *map_fd)
+{
+	struct test_lookup_and_delete *skel;
+	int err;
+
+	skel = test_lookup_and_delete__open();
+	if (!ASSERT_OK_PTR(skel, "test_lookup_and_delete__open"))
+		return NULL;
+
+	err = bpf_map__set_type(skel->maps.hash_map, map_type);
+	if (!ASSERT_OK(err, "bpf_map__set_type"))
+		goto cleanup;
+
+	err = bpf_map__set_max_entries(skel->maps.hash_map, MAX_ENTRIES);
+	if (!ASSERT_OK(err, "bpf_map__set_max_entries"))
+		goto cleanup;
+
+	err = test_lookup_and_delete__load(skel);
+	if (!ASSERT_OK(err, "test_lookup_and_delete__load"))
+		goto cleanup;
+
+	*map_fd = bpf_map__fd(skel->maps.hash_map);
+	if (!ASSERT_GE(*map_fd, 0, "bpf_map__fd"))
+		goto cleanup;
+
+	return skel;
+
+cleanup:
+	test_lookup_and_delete__destroy(skel);
+	return NULL;
+}
+
+/* Triggers BPF program that updates map with given key and value */
+static int trigger_tp(struct test_lookup_and_delete *skel, __u64 key,
+		      __u64 value)
+{
+	int err;
+
+	skel->bss->set_pid = getpid();
+	skel->bss->set_key = key;
+	skel->bss->set_value = value;
+
+	err = test_lookup_and_delete__attach(skel);
+	if (!ASSERT_OK(err, "test_lookup_and_delete__attach"))
+		return -1;
+
+	syscall(__NR_getpgid);
+
+	test_lookup_and_delete__detach(skel);
+
+	return 0;
+}
+
+static void test_lookup_and_delete_hash(void)
+{
+	struct test_lookup_and_delete *skel;
+	__u64 key, value;
+	int map_fd, err;
+
+	/* Setup program and fill the map. */
+	skel = setup_prog(BPF_MAP_TYPE_HASH, &map_fd);
+	if (!ASSERT_OK_PTR(skel, "setup_prog"))
+		return;
+
+	err = fill_values(map_fd);
+	if (!ASSERT_OK(err, "fill_values"))
+		goto cleanup;
+
+	/* Lookup and delete element. */
+	key = 1;
+	err = bpf_map_lookup_and_delete_elem(map_fd, &key, &value);
+	if (!ASSERT_OK(err, "bpf_map_lookup_and_delete_elem"))
+		goto cleanup;
+
+	/* Fetched value should match the initially set value. */
+	if (CHECK(value != START_VALUE, "bpf_map_lookup_and_delete_elem",
+		  "unexpected value=%lld\n", value))
+		goto cleanup;
+
+	/* Check that the entry is non existent. */
+	err = bpf_map_lookup_elem(map_fd, &key, &value);
+	if (!ASSERT_ERR(err, "bpf_map_lookup_elem"))
+		goto cleanup;
+
+cleanup:
+	test_lookup_and_delete__destroy(skel);
+}
+
+static void test_lookup_and_delete_percpu_hash(void)
+{
+	struct test_lookup_and_delete *skel;
+	__u64 key, val, value[nr_cpus];
+	int map_fd, err, i;
+
+	/* Setup program and fill the map. */
+	skel = setup_prog(BPF_MAP_TYPE_PERCPU_HASH, &map_fd);
+	if (!ASSERT_OK_PTR(skel, "setup_prog"))
+		return;
+
+	err = fill_values_percpu(map_fd);
+	if (!ASSERT_OK(err, "fill_values_percpu"))
+		goto cleanup;
+
+	/* Lookup and delete element. */
+	key = 1;
+	err = bpf_map_lookup_and_delete_elem(map_fd, &key, value);
+	if (!ASSERT_OK(err, "bpf_map_lookup_and_delete_elem"))
+		goto cleanup;
+
+	for (i = 0; i < nr_cpus; i++) {
+		val = value[i];
+
+		/* Fetched value should match the initially set value. */
+		if (CHECK(val != START_VALUE, "map value",
+			  "unexpected for cpu %d: %lld\n", i, val))
+			goto cleanup;
+	}
+
+	/* Check that the entry is non existent. */
+	err = bpf_map_lookup_elem(map_fd, &key, value);
+	if (!ASSERT_ERR(err, "bpf_map_lookup_elem"))
+		goto cleanup;
+
+cleanup:
+	test_lookup_and_delete__destroy(skel);
+}
+
+static void test_lookup_and_delete_lru_hash(void)
+{
+	struct test_lookup_and_delete *skel;
+	__u64 key, value;
+	int map_fd, err;
+
+	/* Setup program and fill the LRU map. */
+	skel = setup_prog(BPF_MAP_TYPE_LRU_HASH, &map_fd);
+	if (!ASSERT_OK_PTR(skel, "setup_prog"))
+		return;
+
+	err = fill_values(map_fd);
+	if (!ASSERT_OK(err, "fill_values"))
+		goto cleanup;
+
+	/* Insert new element at key=3, should reuse LRU element. */
+	key = 3;
+	err = trigger_tp(skel, key, NEW_VALUE);
+	if (!ASSERT_OK(err, "trigger_tp"))
+		goto cleanup;
+
+	/* Lookup and delete element 3. */
+	err = bpf_map_lookup_and_delete_elem(map_fd, &key, &value);
+	if (!ASSERT_OK(err, "bpf_map_lookup_and_delete_elem"))
+		goto cleanup;
+
+	/* Value should match the new value. */
+	if (CHECK(value != NEW_VALUE, "bpf_map_lookup_and_delete_elem",
+		  "unexpected value=%lld\n", value))
+		goto cleanup;
+
+	/* Check that entries 3 and 1 are non existent. */
+	err = bpf_map_lookup_elem(map_fd, &key, &value);
+	if (!ASSERT_ERR(err, "bpf_map_lookup_elem"))
+		goto cleanup;
+
+	key = 1;
+	err = bpf_map_lookup_elem(map_fd, &key, &value);
+	if (!ASSERT_ERR(err, "bpf_map_lookup_elem"))
+		goto cleanup;
+
+cleanup:
+	test_lookup_and_delete__destroy(skel);
+}
+
+static void test_lookup_and_delete_lru_percpu_hash(void)
+{
+	struct test_lookup_and_delete *skel;
+	__u64 key, val, value[nr_cpus];
+	int map_fd, err, i, cpucnt = 0;
+
+	/* Setup program and fill the LRU map. */
+	skel = setup_prog(BPF_MAP_TYPE_LRU_PERCPU_HASH, &map_fd);
+	if (!ASSERT_OK_PTR(skel, "setup_prog"))
+		return;
+
+	err = fill_values_percpu(map_fd);
+	if (!ASSERT_OK(err, "fill_values_percpu"))
+		goto cleanup;
+
+	/* Insert new element at key=3, should reuse LRU element 1. */
+	key = 3;
+	err = trigger_tp(skel, key, NEW_VALUE);
+	if (!ASSERT_OK(err, "trigger_tp"))
+		goto cleanup;
+
+	/* Clean value. */
+	for (i = 0; i < nr_cpus; i++)
+		value[i] = 0;
+
+	/* Lookup and delete element 3. */
+	err = bpf_map_lookup_and_delete_elem(map_fd, &key, value);
+	if (!ASSERT_OK(err, "bpf_map_lookup_and_delete_elem")) {
+		goto cleanup;
+	}
+
+	/* Check if only one CPU has set the value. */
+	for (i = 0; i < nr_cpus; i++) {
+		val = value[i];
+		if (val) {
+			if (CHECK(val != NEW_VALUE, "map value",
+				  "unexpected for cpu %d: %lld\n", i, val))
+				goto cleanup;
+			cpucnt++;
+		}
+	}
+	if (CHECK(cpucnt != 1, "map value", "set for %d CPUs instead of 1!\n",
+		  cpucnt))
+		goto cleanup;
+
+	/* Check that entries 3 and 1 are non existent. */
+	err = bpf_map_lookup_elem(map_fd, &key, &value);
+	if (!ASSERT_ERR(err, "bpf_map_lookup_elem"))
+		goto cleanup;
+
+	key = 1;
+	err = bpf_map_lookup_elem(map_fd, &key, &value);
+	if (!ASSERT_ERR(err, "bpf_map_lookup_elem"))
+		goto cleanup;
+
+cleanup:
+	test_lookup_and_delete__destroy(skel);
+}
+
+void test_lookup_and_delete(void)
+{
+	nr_cpus = bpf_num_possible_cpus();
+
+	if (test__start_subtest("lookup_and_delete"))
+		test_lookup_and_delete_hash();
+	if (test__start_subtest("lookup_and_delete_percpu"))
+		test_lookup_and_delete_percpu_hash();
+	if (test__start_subtest("lookup_and_delete_lru"))
+		test_lookup_and_delete_lru_hash();
+	if (test__start_subtest("lookup_and_delete_lru_percpu"))
+		test_lookup_and_delete_lru_percpu_hash();
+}
diff --git a/tools/testing/selftests/bpf/progs/test_lookup_and_delete.c b/tools/testing/selftests/bpf/progs/test_lookup_and_delete.c
new file mode 100644
index 000000000000..3a193f42c7e7
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_lookup_and_delete.c
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+
+__u32 set_pid = 0;
+__u64 set_key = 0;
+__u64 set_value = 0;
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(max_entries, 2);
+	__type(key, __u64);
+	__type(value, __u64);
+} hash_map SEC(".maps");
+
+SEC("tp/syscalls/sys_enter_getpgid")
+int bpf_lookup_and_delete_test(const void *ctx)
+{
+	if (set_pid == bpf_get_current_pid_tgid() >> 32)
+		bpf_map_update_elem(&hash_map, &set_key, &set_value, BPF_NOEXIST);
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_lru_map.c b/tools/testing/selftests/bpf/test_lru_map.c
index 6a5349f9eb14..7e9049fa3edf 100644
--- a/tools/testing/selftests/bpf/test_lru_map.c
+++ b/tools/testing/selftests/bpf/test_lru_map.c
@@ -231,6 +231,14 @@ static void test_lru_sanity0(int map_type, int map_flags)
 	assert(bpf_map_lookup_elem(lru_map_fd, &key, value) == -1 &&
 	       errno == ENOENT);
 
+	/* lookup elem key=1 and delete it, then check it doesn't exist */
+	key = 1;
+	assert(!bpf_map_lookup_and_delete_elem(lru_map_fd, &key, &value));
+	assert(value[0] == 1234);
+
+	/* remove the same element from the expected map */
+	assert(!bpf_map_delete_elem(expected_map_fd, &key));
+
 	assert(map_equal(lru_map_fd, expected_map_fd));
 
 	close(expected_map_fd);
diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c
index 51adc42b2b40..8410a730c82f 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -65,6 +65,13 @@ static void test_hashmap(unsigned int task, void *data)
 	assert(bpf_map_lookup_elem(fd, &key, &value) == 0 && value == 1234);
 
 	key = 2;
+	value = 1234;
+	/* Insert key=2 element. */
+	assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) == 0);
+
+	/* Check that key=2 matches the value and delete it */
+	assert(bpf_map_lookup_and_delete_elem(fd, &key, &value) == 0 && value == 1234);
+
 	/* Check that key=2 is not found. */
 	assert(bpf_map_lookup_elem(fd, &key, &value) == -1 && errno == ENOENT);
 
@@ -166,6 +173,16 @@ static void test_hashmap_percpu(unsigned int task, void *data)
 	/* Insert key=1 element. */
 	assert(!(expected_key_mask & key));
 	assert(bpf_map_update_elem(fd, &key, value, BPF_ANY) == 0);
+
+	/* Lookup and delete elem key=1 and check value. */
+	assert(bpf_map_lookup_and_delete_elem(fd, &key, value) == 0 &&
+	       bpf_percpu(value,0) == 100);
+
+	for (i = 0; i < nr_cpus; i++)
+		bpf_percpu(value,i) = i + 100;
+
+	/* Insert key=1 element which should not exist. */
+	assert(bpf_map_update_elem(fd, &key, value, BPF_NOEXIST) == 0);
 	expected_key_mask |= key;
 
 	/* BPF_NOEXIST means add new element if it doesn't exist. */
-- 
cgit v1.2.3-70-g09d2


From 9f0c317f6aa12b160103ee3946d79276c14b95e2 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Sat, 22 May 2021 09:23:41 -0700
Subject: libbpf: Add support for new llvm bpf relocations

LLVM patch https://reviews.llvm.org/D102712
narrowed the scope of existing R_BPF_64_64
and R_BPF_64_32 relocations, and added three
new relocations, R_BPF_64_ABS64, R_BPF_64_ABS32
and R_BPF_64_NODYLD32. The main motivation is
to make relocations linker friendly.

This change, unfortunately, breaks libbpf build,
and we will see errors like below:
  libbpf: ELF relo #0 in section #6 has unexpected type 2 in
     /home/yhs/work/bpf-next/tools/testing/selftests/bpf/bpf_tcp_nogpl.o
  Error: failed to link
     '/home/yhs/work/bpf-next/tools/testing/selftests/bpf/bpf_tcp_nogpl.o':
     Unknown error -22 (-22)
The new relocation R_BPF_64_ABS64 is generated
and libbpf linker sanity check doesn't understand it.
Relocation section '.rel.struct_ops' at offset 0x1410 contains 1 entries:
    Offset             Info             Type               Symbol's Value  Symbol's Name
0000000000000018  0000000700000002 R_BPF_64_ABS64         0000000000000000 nogpltcp_init

Look at the selftests/bpf/bpf_tcp_nogpl.c,
  void BPF_STRUCT_OPS(nogpltcp_init, struct sock *sk)
  {
  }

  SEC(".struct_ops")
  struct tcp_congestion_ops bpf_nogpltcp = {
          .init           = (void *)nogpltcp_init,
          .name           = "bpf_nogpltcp",
  };
The new llvm relocation scheme categorizes 'nogpltcp_init' reference
as R_BPF_64_ABS64 instead of R_BPF_64_64 which is used to specify
ld_imm64 relocation in the new scheme.

Let us fix the linker sanity checking by including
R_BPF_64_ABS64 and R_BPF_64_ABS32. There is no need to
check R_BPF_64_NODYLD32 which is used for .BTF and .BTF.ext.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20210522162341.3687617-1-yhs@fb.com
---
 tools/lib/bpf/libbpf_internal.h | 6 ++++++
 tools/lib/bpf/linker.c          | 3 ++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h
index 55d9b4dca64f..e2db08573bf0 100644
--- a/tools/lib/bpf/libbpf_internal.h
+++ b/tools/lib/bpf/libbpf_internal.h
@@ -28,6 +28,12 @@
 #ifndef R_BPF_64_64
 #define R_BPF_64_64 1
 #endif
+#ifndef R_BPF_64_ABS64
+#define R_BPF_64_ABS64 2
+#endif
+#ifndef R_BPF_64_ABS32
+#define R_BPF_64_ABS32 3
+#endif
 #ifndef R_BPF_64_32
 #define R_BPF_64_32 10
 #endif
diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c
index b594a88620ce..1dca41a24f75 100644
--- a/tools/lib/bpf/linker.c
+++ b/tools/lib/bpf/linker.c
@@ -892,7 +892,8 @@ static int linker_sanity_check_elf_relos(struct src_obj *obj, struct src_sec *se
 		size_t sym_idx = ELF64_R_SYM(relo->r_info);
 		size_t sym_type = ELF64_R_TYPE(relo->r_info);
 
-		if (sym_type != R_BPF_64_64 && sym_type != R_BPF_64_32) {
+		if (sym_type != R_BPF_64_64 && sym_type != R_BPF_64_32 &&
+		    sym_type != R_BPF_64_ABS64 && sym_type != R_BPF_64_ABS32) {
 			pr_warn("ELF relo #%d in section #%zu has unexpected type %zu in %s\n",
 				i, sec->sec_idx, sym_type, obj->filename);
 			return -EINVAL;
-- 
cgit v1.2.3-70-g09d2


From 4ce7d68beb9e63d0a0a427cc2b89ec0c68f24b3d Mon Sep 17 00:00:00 2001
From: Aditya Srivastava <yashsri421@gmail.com>
Date: Sun, 23 May 2021 20:44:08 +0530
Subject: samples: bpf: Ix kernel-doc syntax in file header

The opening comment mark '/**' is used for highlighting the beginning of
kernel-doc comments.
The header for samples/bpf/ibumad_kern.c follows this syntax, but
the content inside does not comply with kernel-doc.

This line was probably not meant for kernel-doc parsing, but is parsed
due to the presence of kernel-doc like comment syntax(i.e, '/**'), which
causes unexpected warnings from kernel-doc:
warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst
 * ibumad BPF sample kernel side

Provide a simple fix by replacing this occurrence with general comment
format, i.e. '/*', to prevent kernel-doc from parsing it.

Signed-off-by: Aditya Srivastava <yashsri421@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://lore.kernel.org/bpf/20210523151408.22280-1-yashsri421@gmail.com
---
 samples/bpf/ibumad_kern.c | 2 +-
 samples/bpf/ibumad_user.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/samples/bpf/ibumad_kern.c b/samples/bpf/ibumad_kern.c
index 26dcd4dde946..9b193231024a 100644
--- a/samples/bpf/ibumad_kern.c
+++ b/samples/bpf/ibumad_kern.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
 
-/**
+/*
  * ibumad BPF sample kernel side
  *
  * This program is free software; you can redistribute it and/or
diff --git a/samples/bpf/ibumad_user.c b/samples/bpf/ibumad_user.c
index d83d8102f489..0746ca516097 100644
--- a/samples/bpf/ibumad_user.c
+++ b/samples/bpf/ibumad_user.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
 
-/**
+/*
  * ibumad BPF sample user side
  *
  * This program is free software; you can redistribute it and/or
-- 
cgit v1.2.3-70-g09d2


From 8fb33b6055300a23f26868680c22a5726834785e Mon Sep 17 00:00:00 2001
From: Zhen Lei <thunder.leizhen@huawei.com>
Date: Tue, 25 May 2021 10:56:59 +0800
Subject: bpf: Fix spelling mistakes

Fix some spelling mistakes in comments:
aother ==> another
Netiher ==> Neither
desribe ==> describe
intializing ==> initializing
funciton ==> function
wont ==> won't and move the word 'the' at the end to the next line
accross ==> across
pathes ==> paths
triggerred ==> triggered
excute ==> execute
ether ==> either
conervative ==> conservative
convetion ==> convention
markes ==> marks
interpeter ==> interpreter

Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210525025659.8898-2-thunder.leizhen@huawei.com
---
 include/linux/bpf_local_storage.h |  4 ++--
 kernel/bpf/bpf_inode_storage.c    |  2 +-
 kernel/bpf/btf.c                  |  6 +++---
 kernel/bpf/devmap.c               |  4 ++--
 kernel/bpf/hashtab.c              |  4 ++--
 kernel/bpf/reuseport_array.c      |  2 +-
 kernel/bpf/trampoline.c           |  2 +-
 kernel/bpf/verifier.c             | 12 ++++++------
 8 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index b902c580c48d..24496bc28e7b 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -58,7 +58,7 @@ struct bpf_local_storage_data {
 	 * from the object's bpf_local_storage.
 	 *
 	 * Put it in the same cacheline as the data to minimize
-	 * the number of cachelines access during the cache hit case.
+	 * the number of cachelines accessed during the cache hit case.
 	 */
 	struct bpf_local_storage_map __rcu *smap;
 	u8 data[] __aligned(8);
@@ -71,7 +71,7 @@ struct bpf_local_storage_elem {
 	struct bpf_local_storage __rcu *local_storage;
 	struct rcu_head rcu;
 	/* 8 bytes hole */
-	/* The data is stored in aother cacheline to minimize
+	/* The data is stored in another cacheline to minimize
 	 * the number of cachelines access during a cache hit.
 	 */
 	struct bpf_local_storage_data sdata ____cacheline_aligned;
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 2921ca39a93e..96ceed0e0fb5 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -72,7 +72,7 @@ void bpf_inode_storage_free(struct inode *inode)
 		return;
 	}
 
-	/* Netiher the bpf_prog nor the bpf-map's syscall
+	/* Neither the bpf_prog nor the bpf-map's syscall
 	 * could be modifying the local_storage->list now.
 	 * Thus, no elem can be added-to or deleted-from the
 	 * local_storage->list by the bpf_prog or by the bpf-map's syscall.
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 85716327c375..a6e39c5ea0bf 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -51,7 +51,7 @@
  * The BTF type section contains a list of 'struct btf_type' objects.
  * Each one describes a C type.  Recall from the above section
  * that a 'struct btf_type' object could be immediately followed by extra
- * data in order to desribe some particular C types.
+ * data in order to describe some particular C types.
  *
  * type_id:
  * ~~~~~~~
@@ -1143,7 +1143,7 @@ static void *btf_show_obj_safe(struct btf_show *show,
 
 	/*
 	 * We need a new copy to our safe object, either because we haven't
-	 * yet copied and are intializing safe data, or because the data
+	 * yet copied and are initializing safe data, or because the data
 	 * we want falls outside the boundaries of the safe object.
 	 */
 	if (!safe) {
@@ -3417,7 +3417,7 @@ static struct btf_kind_operations func_proto_ops = {
 	 * BTF_KIND_FUNC_PROTO cannot be directly referred by
 	 * a struct's member.
 	 *
-	 * It should be a funciton pointer instead.
+	 * It should be a function pointer instead.
 	 * (i.e. struct's member -> BTF_KIND_PTR -> BTF_KIND_FUNC_PROTO)
 	 *
 	 * Hence, there is no btf_func_check_member().
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index aa516472ce46..d60d617ec0d7 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -382,8 +382,8 @@ void __dev_flush(void)
 }
 
 /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
- * update happens in parallel here a dev_put wont happen until after reading the
- * ifindex.
+ * update happens in parallel here a dev_put won't happen until after reading
+ * the ifindex.
  */
 static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 {
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 9da0a0413a53..6f6681b07364 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -46,12 +46,12 @@
  * events, kprobes and tracing to be invoked before the prior invocation
  * from one of these contexts completed. sys_bpf() uses the same mechanism
  * by pinning the task to the current CPU and incrementing the recursion
- * protection accross the map operation.
+ * protection across the map operation.
  *
  * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
  * operations like memory allocations (even with GFP_ATOMIC) from atomic
  * contexts. This is required because even with GFP_ATOMIC the memory
- * allocator calls into code pathes which acquire locks with long held lock
+ * allocator calls into code paths which acquire locks with long held lock
  * sections. To ensure the deterministic behaviour these locks are regular
  * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
  * true atomic contexts on an RT kernel are the low level hardware
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 4838922f723d..93a55391791a 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -102,7 +102,7 @@ static void reuseport_array_free(struct bpf_map *map)
 	/*
 	 * ops->map_*_elem() will not be able to access this
 	 * array now. Hence, this function only races with
-	 * bpf_sk_reuseport_detach() which was triggerred by
+	 * bpf_sk_reuseport_detach() which was triggered by
 	 * close() or disconnect().
 	 *
 	 * This function and bpf_sk_reuseport_detach() are
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 2d44b5aa0057..28a3630c48ee 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -552,7 +552,7 @@ static void notrace inc_misses_counter(struct bpf_prog *prog)
  * __bpf_prog_enter returns:
  * 0 - skip execution of the bpf prog
  * 1 - execute bpf prog
- * [2..MAX_U64] - excute bpf prog and record execution time.
+ * [2..MAX_U64] - execute bpf prog and record execution time.
  *     This is start time.
  */
 u64 notrace __bpf_prog_enter(struct bpf_prog *prog)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9189eecb26dd..1de4b8c6ee42 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -47,7 +47,7 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
  * - unreachable insns exist (shouldn't be a forest. program = one function)
  * - out of bounds or malformed jumps
  * The second pass is all possible path descent from the 1st insn.
- * Since it's analyzing all pathes through the program, the length of the
+ * Since it's analyzing all paths through the program, the length of the
  * analysis is limited to 64k insn, which may be hit even if total number of
  * insn is less then 4K, but there are too many branches that change stack/regs.
  * Number of 'branches to be analyzed' is limited to 1k
@@ -132,7 +132,7 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
  * If it's ok, then verifier allows this BPF_CALL insn and looks at
  * .ret_type which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets
  * R0->type = PTR_TO_MAP_VALUE_OR_NULL which means bpf_map_lookup_elem() function
- * returns ether pointer to map value or NULL.
+ * returns either pointer to map value or NULL.
  *
  * When type PTR_TO_MAP_VALUE_OR_NULL passes through 'if (reg != 0) goto +off'
  * insn, the register holding that pointer in the true branch changes state to
@@ -2616,7 +2616,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 		if (dst_reg != BPF_REG_FP) {
 			/* The backtracking logic can only recognize explicit
 			 * stack slot address like [fp - 8]. Other spill of
-			 * scalar via different register has to be conervative.
+			 * scalar via different register has to be conservative.
 			 * Backtrack from here and mark all registers as precise
 			 * that contributed into 'reg' being a constant.
 			 */
@@ -9053,7 +9053,7 @@ static int check_return_code(struct bpf_verifier_env *env)
 	    !prog->aux->attach_func_proto->type)
 		return 0;
 
-	/* eBPF calling convetion is such that R0 is used
+	/* eBPF calling convention is such that R0 is used
 	 * to return the value from eBPF program.
 	 * Make sure that it's readable at this time
 	 * of bpf_exit, which means that program wrote
@@ -9844,7 +9844,7 @@ static void clean_verifier_state(struct bpf_verifier_env *env,
  * Since the verifier pushes the branch states as it sees them while exploring
  * the program the condition of walking the branch instruction for the second
  * time means that all states below this branch were already explored and
- * their final liveness markes are already propagated.
+ * their final liveness marks are already propagated.
  * Hence when the verifier completes the search of state list in is_state_visited()
  * we can call this clean_live_states() function to mark all liveness states
  * as REG_LIVE_DONE to indicate that 'parent' pointers of 'struct bpf_reg_state'
@@ -12464,7 +12464,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			prog->aux->max_pkt_offset = MAX_PACKET_OFF;
 
 			/* mark bpf_tail_call as different opcode to avoid
-			 * conditional branch in the interpeter for every normal
+			 * conditional branch in the interpreter for every normal
 			 * call and to prevent accidental JITing by JIT compiler
 			 * that doesn't support bpf_tail_call yet
 			 */
-- 
cgit v1.2.3-70-g09d2


From a720a2a0ad6cb6f769b6c7cbc3c54287a7d54ff8 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Fri, 21 May 2021 10:33:01 +0200
Subject: xsk: Use kvcalloc to support large umems
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use kvcalloc() instead of kcalloc() to support large umems with, on my
server, one million pages or more in the umem.

Reported-by: Dan Siemon <dan@coverfire.com>
Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Björn Töpel <bjorn@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20210521083301.26921-1-magnus.karlsson@gmail.com
---
 net/xdp/xdp_umem.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 56a28a686988..f01ef6bda390 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -27,7 +27,7 @@ static void xdp_umem_unpin_pages(struct xdp_umem *umem)
 {
 	unpin_user_pages_dirty_lock(umem->pgs, umem->npgs, true);
 
-	kfree(umem->pgs);
+	kvfree(umem->pgs);
 	umem->pgs = NULL;
 }
 
@@ -99,8 +99,7 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address)
 	long npgs;
 	int err;
 
-	umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs),
-			    GFP_KERNEL | __GFP_NOWARN);
+	umem->pgs = kvcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL | __GFP_NOWARN);
 	if (!umem->pgs)
 		return -ENOMEM;
 
@@ -123,7 +122,7 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address)
 out_pin:
 	xdp_umem_unpin_pages(umem);
 out_pgs:
-	kfree(umem->pgs);
+	kvfree(umem->pgs);
 	umem->pgs = NULL;
 	return err;
 }
-- 
cgit v1.2.3-70-g09d2


From 5981881d21dff612abf8fce484f8efa67f49aae4 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 24 May 2021 20:59:31 -0700
Subject: libbpf: Add libbpf_set_strict_mode() API to turn on libbpf 1.0
 behaviors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add libbpf_set_strict_mode() API that allows application to simulate libbpf
1.0 breaking changes before libbpf 1.0 is released. This will help users
migrate gradually and with confidence.

For now only ALL or NONE options are available, subsequent patches will add
more flags. This patch is preliminary for selftests/bpf changes.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/bpf/20210525035935.1461796-2-andrii@kernel.org
---
 tools/lib/bpf/Makefile        |  1 +
 tools/lib/bpf/libbpf.c        | 17 ++++++++++++++++
 tools/lib/bpf/libbpf.h        |  1 +
 tools/lib/bpf/libbpf.map      |  5 +++++
 tools/lib/bpf/libbpf_legacy.h | 47 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 71 insertions(+)
 create mode 100644 tools/lib/bpf/libbpf_legacy.h

diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
index e43e1896cb4b..15420303cf06 100644
--- a/tools/lib/bpf/Makefile
+++ b/tools/lib/bpf/Makefile
@@ -229,6 +229,7 @@ install_headers: $(BPF_HELPER_DEFS)
 		$(call do_install,libbpf.h,$(prefix)/include/bpf,644); \
 		$(call do_install,btf.h,$(prefix)/include/bpf,644); \
 		$(call do_install,libbpf_common.h,$(prefix)/include/bpf,644); \
+		$(call do_install,libbpf_legacy.h,$(prefix)/include/bpf,644); \
 		$(call do_install,xsk.h,$(prefix)/include/bpf,644); \
 		$(call do_install,bpf_helpers.h,$(prefix)/include/bpf,644); \
 		$(call do_install,$(BPF_HELPER_DEFS),$(prefix)/include/bpf,644); \
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index b396e45b17ea..1425d7ed0f2f 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -151,6 +151,23 @@ static inline __u64 ptr_to_u64(const void *ptr)
 	return (__u64) (unsigned long) ptr;
 }
 
+/* this goes away in libbpf 1.0 */
+enum libbpf_strict_mode libbpf_mode = LIBBPF_STRICT_NONE;
+
+int libbpf_set_strict_mode(enum libbpf_strict_mode mode)
+{
+	/* __LIBBPF_STRICT_LAST is the last power-of-2 value used + 1, so to
+	 * get all possible values we compensate last +1, and then (2*x - 1)
+	 * to get the bit mask
+	 */
+	if (mode != LIBBPF_STRICT_ALL
+	    && (mode & ~((__LIBBPF_STRICT_LAST - 1) * 2 - 1)))
+		return errno = EINVAL, -EINVAL;
+
+	libbpf_mode = mode;
+	return 0;
+}
+
 enum kern_feature_id {
 	/* v4.14: kernel support for program & map names. */
 	FEAT_PROG_NAME,
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index d98523558f39..6e61342ba56c 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -18,6 +18,7 @@
 #include <linux/bpf.h>
 
 #include "libbpf_common.h"
+#include "libbpf_legacy.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 0229e01e8ccc..bbe99b1db1a9 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -370,3 +370,8 @@ LIBBPF_0.4.0 {
 		bpf_tc_hook_destroy;
 		bpf_tc_query;
 } LIBBPF_0.3.0;
+
+LIBBPF_0.5.0 {
+	global:
+		libbpf_set_strict_mode;
+} LIBBPF_0.4.0;
diff --git a/tools/lib/bpf/libbpf_legacy.h b/tools/lib/bpf/libbpf_legacy.h
new file mode 100644
index 000000000000..7482cfe22ab2
--- /dev/null
+++ b/tools/lib/bpf/libbpf_legacy.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+
+/*
+ * Libbpf legacy APIs (either discouraged or deprecated, as mentioned in [0])
+ *
+ *   [0] https://docs.google.com/document/d/1UyjTZuPFWiPFyKk1tV5an11_iaRuec6U-ZESZ54nNTY
+ *
+ * Copyright (C) 2021 Facebook
+ */
+#ifndef __LIBBPF_LEGACY_BPF_H
+#define __LIBBPF_LEGACY_BPF_H
+
+#include <linux/bpf.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include "libbpf_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum libbpf_strict_mode {
+	/* Turn on all supported strict features of libbpf to simulate libbpf
+	 * v1.0 behavior.
+	 * This will be the default behavior in libbpf v1.0.
+	 */
+	LIBBPF_STRICT_ALL = 0xffffffff,
+
+	/*
+	 * Disable any libbpf 1.0 behaviors. This is the default before libbpf
+	 * v1.0. It won't be supported anymore in v1.0, please update your
+	 * code so that it handles LIBBPF_STRICT_ALL mode before libbpf v1.0.
+	 */
+	LIBBPF_STRICT_NONE = 0x00,
+
+	__LIBBPF_STRICT_LAST,
+};
+
+LIBBPF_API int libbpf_set_strict_mode(enum libbpf_strict_mode mode);
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* __LIBBPF_LEGACY_BPF_H */
-- 
cgit v1.2.3-70-g09d2


From bad2e478af3b4df9fd84b4db7779ea91bd618c16 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 24 May 2021 20:59:32 -0700
Subject: selftests/bpf: Turn on libbpf 1.0 mode and fix all IS_ERR checks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Turn ony libbpf 1.0 mode. Fix all the explicit IS_ERR checks that now will be
broken because libbpf returns NULL on error (and sets errno). Fix
ASSERT_OK_PTR and ASSERT_ERR_PTR to work for both old mode and new modes and
use them throughout selftests. This is trivial to do by using
libbpf_get_error() API that all libbpf users are supposed to use, instead of
IS_ERR checks.

A bunch of checks also did explicit -1 comparison for various fd-returning
APIs. Such checks are replaced with >= 0 or < 0 cases.

There were also few misuses of bpf_object__find_map_by_name() in test_maps.
Those are fixed in this patch as well.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/bpf/20210525035935.1461796-3-andrii@kernel.org
---
 tools/testing/selftests/bpf/bench.c                |   1 +
 tools/testing/selftests/bpf/benchs/bench_rename.c  |   2 +-
 .../testing/selftests/bpf/benchs/bench_ringbufs.c  |   6 +-
 tools/testing/selftests/bpf/benchs/bench_trigger.c |   2 +-
 .../selftests/bpf/prog_tests/attach_probe.c        |  12 +-
 tools/testing/selftests/bpf/prog_tests/bpf_iter.c  |  31 ++--
 .../testing/selftests/bpf/prog_tests/bpf_tcp_ca.c  |   8 +-
 tools/testing/selftests/bpf/prog_tests/btf.c       |  93 ++++++------
 tools/testing/selftests/bpf/prog_tests/btf_dump.c  |   8 +-
 tools/testing/selftests/bpf/prog_tests/btf_write.c |   4 +-
 .../selftests/bpf/prog_tests/cg_storage_multi.c    |  84 ++++-------
 .../selftests/bpf/prog_tests/cgroup_attach_multi.c |   2 +-
 .../testing/selftests/bpf/prog_tests/cgroup_link.c |  14 +-
 .../bpf/prog_tests/cgroup_skb_sk_lookup.c          |   2 +-
 tools/testing/selftests/bpf/prog_tests/check_mtu.c |   2 +-
 .../testing/selftests/bpf/prog_tests/core_reloc.c  |  15 +-
 .../selftests/bpf/prog_tests/fexit_bpf2bpf.c       |  25 +--
 .../selftests/bpf/prog_tests/flow_dissector.c      |   2 +-
 .../bpf/prog_tests/flow_dissector_reattach.c       |  10 +-
 .../selftests/bpf/prog_tests/get_stack_raw_tp.c    |  10 +-
 .../bpf/prog_tests/get_stackid_cannot_attach.c     |   9 +-
 tools/testing/selftests/bpf/prog_tests/hashmap.c   |   9 +-
 tools/testing/selftests/bpf/prog_tests/kfree_skb.c |  19 +--
 tools/testing/selftests/bpf/prog_tests/ksyms_btf.c |   3 +-
 .../selftests/bpf/prog_tests/link_pinning.c        |   7 +-
 tools/testing/selftests/bpf/prog_tests/obj_name.c  |   8 +-
 .../selftests/bpf/prog_tests/perf_branches.c       |   4 +-
 .../testing/selftests/bpf/prog_tests/perf_buffer.c |   2 +-
 .../selftests/bpf/prog_tests/perf_event_stackmap.c |   3 +-
 .../testing/selftests/bpf/prog_tests/probe_user.c  |   7 +-
 .../selftests/bpf/prog_tests/prog_run_xattr.c      |   4 +-
 .../selftests/bpf/prog_tests/raw_tp_test_run.c     |   4 +-
 .../testing/selftests/bpf/prog_tests/rdonly_maps.c |   7 +-
 .../selftests/bpf/prog_tests/reference_tracking.c  |   2 +-
 .../selftests/bpf/prog_tests/resolve_btfids.c      |   2 +-
 .../selftests/bpf/prog_tests/ringbuf_multi.c       |   2 +-
 .../selftests/bpf/prog_tests/select_reuseport.c    |  53 +++----
 .../testing/selftests/bpf/prog_tests/send_signal.c |   3 +-
 tools/testing/selftests/bpf/prog_tests/sk_lookup.c |   2 +-
 .../testing/selftests/bpf/prog_tests/sock_fields.c |  14 +-
 .../selftests/bpf/prog_tests/sockmap_basic.c       |   8 +-
 .../selftests/bpf/prog_tests/sockmap_ktls.c        |   2 +-
 .../selftests/bpf/prog_tests/sockmap_listen.c      |  10 +-
 .../bpf/prog_tests/stacktrace_build_id_nmi.c       |   3 +-
 .../selftests/bpf/prog_tests/stacktrace_map.c      |   2 +-
 .../bpf/prog_tests/stacktrace_map_raw_tp.c         |   5 +-
 .../selftests/bpf/prog_tests/tcp_hdr_options.c     |  15 +-
 .../selftests/bpf/prog_tests/test_overhead.c       |  12 +-
 .../selftests/bpf/prog_tests/trampoline_count.c    |  14 +-
 tools/testing/selftests/bpf/prog_tests/udp_limit.c |   7 +-
 .../testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c |   2 +-
 tools/testing/selftests/bpf/prog_tests/xdp_link.c  |   8 +-
 tools/testing/selftests/bpf/test_maps.c            | 168 +++++++++++----------
 tools/testing/selftests/bpf/test_progs.c           |   3 +
 tools/testing/selftests/bpf/test_progs.h           |   9 +-
 tools/testing/selftests/bpf/test_tcpnotify_user.c  |   7 +-
 56 files changed, 347 insertions(+), 425 deletions(-)

diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
index 332ed2f7b402..6ea15b93a2f8 100644
--- a/tools/testing/selftests/bpf/bench.c
+++ b/tools/testing/selftests/bpf/bench.c
@@ -43,6 +43,7 @@ void setup_libbpf()
 {
 	int err;
 
+	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
 	libbpf_set_print(libbpf_print_fn);
 
 	err = bump_memlock_rlimit();
diff --git a/tools/testing/selftests/bpf/benchs/bench_rename.c b/tools/testing/selftests/bpf/benchs/bench_rename.c
index a967674098ad..c7ec114eca56 100644
--- a/tools/testing/selftests/bpf/benchs/bench_rename.c
+++ b/tools/testing/selftests/bpf/benchs/bench_rename.c
@@ -65,7 +65,7 @@ static void attach_bpf(struct bpf_program *prog)
 	struct bpf_link *link;
 
 	link = bpf_program__attach(prog);
-	if (IS_ERR(link)) {
+	if (!link) {
 		fprintf(stderr, "failed to attach program!\n");
 		exit(1);
 	}
diff --git a/tools/testing/selftests/bpf/benchs/bench_ringbufs.c b/tools/testing/selftests/bpf/benchs/bench_ringbufs.c
index bde6c9d4cbd4..d167bffac679 100644
--- a/tools/testing/selftests/bpf/benchs/bench_ringbufs.c
+++ b/tools/testing/selftests/bpf/benchs/bench_ringbufs.c
@@ -181,7 +181,7 @@ static void ringbuf_libbpf_setup()
 	}
 
 	link = bpf_program__attach(ctx->skel->progs.bench_ringbuf);
-	if (IS_ERR(link)) {
+	if (!link) {
 		fprintf(stderr, "failed to attach program!\n");
 		exit(1);
 	}
@@ -271,7 +271,7 @@ static void ringbuf_custom_setup()
 	}
 
 	link = bpf_program__attach(ctx->skel->progs.bench_ringbuf);
-	if (IS_ERR(link)) {
+	if (!link) {
 		fprintf(stderr, "failed to attach program\n");
 		exit(1);
 	}
@@ -430,7 +430,7 @@ static void perfbuf_libbpf_setup()
 	}
 
 	link = bpf_program__attach(ctx->skel->progs.bench_perfbuf);
-	if (IS_ERR(link)) {
+	if (!link) {
 		fprintf(stderr, "failed to attach program\n");
 		exit(1);
 	}
diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c
index 2a0b6c9885a4..f41a491a8cc0 100644
--- a/tools/testing/selftests/bpf/benchs/bench_trigger.c
+++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c
@@ -60,7 +60,7 @@ static void attach_bpf(struct bpf_program *prog)
 	struct bpf_link *link;
 
 	link = bpf_program__attach(prog);
-	if (IS_ERR(link)) {
+	if (!link) {
 		fprintf(stderr, "failed to attach program!\n");
 		exit(1);
 	}
diff --git a/tools/testing/selftests/bpf/prog_tests/attach_probe.c b/tools/testing/selftests/bpf/prog_tests/attach_probe.c
index 9dc4e3dfbcf3..ec11e20d2b92 100644
--- a/tools/testing/selftests/bpf/prog_tests/attach_probe.c
+++ b/tools/testing/selftests/bpf/prog_tests/attach_probe.c
@@ -85,16 +85,14 @@ void test_attach_probe(void)
 	kprobe_link = bpf_program__attach_kprobe(skel->progs.handle_kprobe,
 						 false /* retprobe */,
 						 SYS_NANOSLEEP_KPROBE_NAME);
-	if (CHECK(IS_ERR(kprobe_link), "attach_kprobe",
-		  "err %ld\n", PTR_ERR(kprobe_link)))
+	if (!ASSERT_OK_PTR(kprobe_link, "attach_kprobe"))
 		goto cleanup;
 	skel->links.handle_kprobe = kprobe_link;
 
 	kretprobe_link = bpf_program__attach_kprobe(skel->progs.handle_kretprobe,
 						    true /* retprobe */,
 						    SYS_NANOSLEEP_KPROBE_NAME);
-	if (CHECK(IS_ERR(kretprobe_link), "attach_kretprobe",
-		  "err %ld\n", PTR_ERR(kretprobe_link)))
+	if (!ASSERT_OK_PTR(kretprobe_link, "attach_kretprobe"))
 		goto cleanup;
 	skel->links.handle_kretprobe = kretprobe_link;
 
@@ -103,8 +101,7 @@ void test_attach_probe(void)
 						 0 /* self pid */,
 						 "/proc/self/exe",
 						 uprobe_offset);
-	if (CHECK(IS_ERR(uprobe_link), "attach_uprobe",
-		  "err %ld\n", PTR_ERR(uprobe_link)))
+	if (!ASSERT_OK_PTR(uprobe_link, "attach_uprobe"))
 		goto cleanup;
 	skel->links.handle_uprobe = uprobe_link;
 
@@ -113,8 +110,7 @@ void test_attach_probe(void)
 						    -1 /* any pid */,
 						    "/proc/self/exe",
 						    uprobe_offset);
-	if (CHECK(IS_ERR(uretprobe_link), "attach_uretprobe",
-		  "err %ld\n", PTR_ERR(uretprobe_link)))
+	if (!ASSERT_OK_PTR(uretprobe_link, "attach_uretprobe"))
 		goto cleanup;
 	skel->links.handle_uretprobe = uretprobe_link;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
index 2d3590cfb5e1..1f1aade56504 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
@@ -47,7 +47,7 @@ static void do_dummy_read(struct bpf_program *prog)
 	int iter_fd, len;
 
 	link = bpf_program__attach_iter(prog, NULL);
-	if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n"))
+	if (!ASSERT_OK_PTR(link, "attach_iter"))
 		return;
 
 	iter_fd = bpf_iter_create(bpf_link__fd(link));
@@ -201,7 +201,7 @@ static int do_btf_read(struct bpf_iter_task_btf *skel)
 	int ret = 0;
 
 	link = bpf_program__attach_iter(prog, NULL);
-	if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n"))
+	if (!ASSERT_OK_PTR(link, "attach_iter"))
 		return ret;
 
 	iter_fd = bpf_iter_create(bpf_link__fd(link));
@@ -396,7 +396,7 @@ static void test_file_iter(void)
 		return;
 
 	link = bpf_program__attach_iter(skel1->progs.dump_task, NULL);
-	if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n"))
+	if (!ASSERT_OK_PTR(link, "attach_iter"))
 		goto out;
 
 	/* unlink this path if it exists. */
@@ -502,7 +502,7 @@ static void test_overflow(bool test_e2big_overflow, bool ret1)
 	skel->bss->map2_id = map_info.id;
 
 	link = bpf_program__attach_iter(skel->progs.dump_bpf_map, NULL);
-	if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n"))
+	if (!ASSERT_OK_PTR(link, "attach_iter"))
 		goto free_map2;
 
 	iter_fd = bpf_iter_create(bpf_link__fd(link));
@@ -607,14 +607,12 @@ static void test_bpf_hash_map(void)
 	opts.link_info = &linfo;
 	opts.link_info_len = sizeof(linfo);
 	link = bpf_program__attach_iter(skel->progs.dump_bpf_hash_map, &opts);
-	if (CHECK(!IS_ERR(link), "attach_iter",
-		  "attach_iter for hashmap2 unexpected succeeded\n"))
+	if (!ASSERT_ERR_PTR(link, "attach_iter"))
 		goto out;
 
 	linfo.map.map_fd = bpf_map__fd(skel->maps.hashmap3);
 	link = bpf_program__attach_iter(skel->progs.dump_bpf_hash_map, &opts);
-	if (CHECK(!IS_ERR(link), "attach_iter",
-		  "attach_iter for hashmap3 unexpected succeeded\n"))
+	if (!ASSERT_ERR_PTR(link, "attach_iter"))
 		goto out;
 
 	/* hashmap1 should be good, update map values here */
@@ -636,7 +634,7 @@ static void test_bpf_hash_map(void)
 
 	linfo.map.map_fd = map_fd;
 	link = bpf_program__attach_iter(skel->progs.dump_bpf_hash_map, &opts);
-	if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n"))
+	if (!ASSERT_OK_PTR(link, "attach_iter"))
 		goto out;
 
 	iter_fd = bpf_iter_create(bpf_link__fd(link));
@@ -727,7 +725,7 @@ static void test_bpf_percpu_hash_map(void)
 	opts.link_info = &linfo;
 	opts.link_info_len = sizeof(linfo);
 	link = bpf_program__attach_iter(skel->progs.dump_bpf_percpu_hash_map, &opts);
-	if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n"))
+	if (!ASSERT_OK_PTR(link, "attach_iter"))
 		goto out;
 
 	iter_fd = bpf_iter_create(bpf_link__fd(link));
@@ -798,7 +796,7 @@ static void test_bpf_array_map(void)
 	opts.link_info = &linfo;
 	opts.link_info_len = sizeof(linfo);
 	link = bpf_program__attach_iter(skel->progs.dump_bpf_array_map, &opts);
-	if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n"))
+	if (!ASSERT_OK_PTR(link, "attach_iter"))
 		goto out;
 
 	iter_fd = bpf_iter_create(bpf_link__fd(link));
@@ -894,7 +892,7 @@ static void test_bpf_percpu_array_map(void)
 	opts.link_info = &linfo;
 	opts.link_info_len = sizeof(linfo);
 	link = bpf_program__attach_iter(skel->progs.dump_bpf_percpu_array_map, &opts);
-	if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n"))
+	if (!ASSERT_OK_PTR(link, "attach_iter"))
 		goto out;
 
 	iter_fd = bpf_iter_create(bpf_link__fd(link));
@@ -957,7 +955,7 @@ static void test_bpf_sk_storage_delete(void)
 	opts.link_info_len = sizeof(linfo);
 	link = bpf_program__attach_iter(skel->progs.delete_bpf_sk_storage_map,
 					&opts);
-	if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n"))
+	if (!ASSERT_OK_PTR(link, "attach_iter"))
 		goto out;
 
 	iter_fd = bpf_iter_create(bpf_link__fd(link));
@@ -1075,7 +1073,7 @@ static void test_bpf_sk_storage_map(void)
 	opts.link_info = &linfo;
 	opts.link_info_len = sizeof(linfo);
 	link = bpf_program__attach_iter(skel->progs.dump_bpf_sk_storage_map, &opts);
-	if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n"))
+	if (!ASSERT_OK_PTR(link, "attach_iter"))
 		goto out;
 
 	iter_fd = bpf_iter_create(bpf_link__fd(link));
@@ -1128,7 +1126,7 @@ static void test_rdonly_buf_out_of_bound(void)
 	opts.link_info = &linfo;
 	opts.link_info_len = sizeof(linfo);
 	link = bpf_program__attach_iter(skel->progs.dump_bpf_hash_map, &opts);
-	if (CHECK(!IS_ERR(link), "attach_iter", "unexpected success\n"))
+	if (!ASSERT_ERR_PTR(link, "attach_iter"))
 		bpf_link__destroy(link);
 
 	bpf_iter_test_kern5__destroy(skel);
@@ -1186,8 +1184,7 @@ static void test_task_vma(void)
 	skel->links.proc_maps = bpf_program__attach_iter(
 		skel->progs.proc_maps, NULL);
 
-	if (CHECK(IS_ERR(skel->links.proc_maps), "bpf_program__attach_iter",
-		  "attach iterator failed\n")) {
+	if (!ASSERT_OK_PTR(skel->links.proc_maps, "bpf_program__attach_iter")) {
 		skel->links.proc_maps = NULL;
 		goto out;
 	}
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
index e25917f04602..efe1e979affb 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
@@ -82,7 +82,7 @@ static void *server(void *arg)
 	      bytes, total_bytes, nr_sent, errno);
 
 done:
-	if (fd != -1)
+	if (fd >= 0)
 		close(fd);
 	if (err) {
 		WRITE_ONCE(stop, 1);
@@ -191,8 +191,7 @@ static void test_cubic(void)
 		return;
 
 	link = bpf_map__attach_struct_ops(cubic_skel->maps.cubic);
-	if (CHECK(IS_ERR(link), "bpf_map__attach_struct_ops", "err:%ld\n",
-		  PTR_ERR(link))) {
+	if (!ASSERT_OK_PTR(link, "bpf_map__attach_struct_ops")) {
 		bpf_cubic__destroy(cubic_skel);
 		return;
 	}
@@ -213,8 +212,7 @@ static void test_dctcp(void)
 		return;
 
 	link = bpf_map__attach_struct_ops(dctcp_skel->maps.dctcp);
-	if (CHECK(IS_ERR(link), "bpf_map__attach_struct_ops", "err:%ld\n",
-		  PTR_ERR(link))) {
+	if (!ASSERT_OK_PTR(link, "bpf_map__attach_struct_ops")) {
 		bpf_dctcp__destroy(dctcp_skel);
 		return;
 	}
diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c
index 0457ae32b270..857e3f26086f 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf.c
@@ -3811,7 +3811,7 @@ static void do_test_raw(unsigned int test_num)
 			      always_log);
 	free(raw_btf);
 
-	err = ((btf_fd == -1) != test->btf_load_err);
+	err = ((btf_fd < 0) != test->btf_load_err);
 	if (CHECK(err, "btf_fd:%d test->btf_load_err:%u",
 		  btf_fd, test->btf_load_err) ||
 	    CHECK(test->err_str && !strstr(btf_log_buf, test->err_str),
@@ -3820,7 +3820,7 @@ static void do_test_raw(unsigned int test_num)
 		goto done;
 	}
 
-	if (err || btf_fd == -1)
+	if (err || btf_fd < 0)
 		goto done;
 
 	create_attr.name = test->map_name;
@@ -3834,16 +3834,16 @@ static void do_test_raw(unsigned int test_num)
 
 	map_fd = bpf_create_map_xattr(&create_attr);
 
-	err = ((map_fd == -1) != test->map_create_err);
+	err = ((map_fd < 0) != test->map_create_err);
 	CHECK(err, "map_fd:%d test->map_create_err:%u",
 	      map_fd, test->map_create_err);
 
 done:
 	if (*btf_log_buf && (err || always_log))
 		fprintf(stderr, "\n%s", btf_log_buf);
-	if (btf_fd != -1)
+	if (btf_fd >= 0)
 		close(btf_fd);
-	if (map_fd != -1)
+	if (map_fd >= 0)
 		close(map_fd);
 }
 
@@ -3941,7 +3941,7 @@ static int test_big_btf_info(unsigned int test_num)
 	btf_fd = bpf_load_btf(raw_btf, raw_btf_size,
 			      btf_log_buf, BTF_LOG_BUF_SIZE,
 			      always_log);
-	if (CHECK(btf_fd == -1, "errno:%d", errno)) {
+	if (CHECK(btf_fd < 0, "errno:%d", errno)) {
 		err = -1;
 		goto done;
 	}
@@ -3987,7 +3987,7 @@ done:
 	free(raw_btf);
 	free(user_btf);
 
-	if (btf_fd != -1)
+	if (btf_fd >= 0)
 		close(btf_fd);
 
 	return err;
@@ -4029,7 +4029,7 @@ static int test_btf_id(unsigned int test_num)
 	btf_fd[0] = bpf_load_btf(raw_btf, raw_btf_size,
 				 btf_log_buf, BTF_LOG_BUF_SIZE,
 				 always_log);
-	if (CHECK(btf_fd[0] == -1, "errno:%d", errno)) {
+	if (CHECK(btf_fd[0] < 0, "errno:%d", errno)) {
 		err = -1;
 		goto done;
 	}
@@ -4043,7 +4043,7 @@ static int test_btf_id(unsigned int test_num)
 	}
 
 	btf_fd[1] = bpf_btf_get_fd_by_id(info[0].id);
-	if (CHECK(btf_fd[1] == -1, "errno:%d", errno)) {
+	if (CHECK(btf_fd[1] < 0, "errno:%d", errno)) {
 		err = -1;
 		goto done;
 	}
@@ -4071,7 +4071,7 @@ static int test_btf_id(unsigned int test_num)
 	create_attr.btf_value_type_id = 2;
 
 	map_fd = bpf_create_map_xattr(&create_attr);
-	if (CHECK(map_fd == -1, "errno:%d", errno)) {
+	if (CHECK(map_fd < 0, "errno:%d", errno)) {
 		err = -1;
 		goto done;
 	}
@@ -4094,7 +4094,7 @@ static int test_btf_id(unsigned int test_num)
 
 	/* Test BTF ID is removed from the kernel */
 	btf_fd[0] = bpf_btf_get_fd_by_id(map_info.btf_id);
-	if (CHECK(btf_fd[0] == -1, "errno:%d", errno)) {
+	if (CHECK(btf_fd[0] < 0, "errno:%d", errno)) {
 		err = -1;
 		goto done;
 	}
@@ -4105,7 +4105,7 @@ static int test_btf_id(unsigned int test_num)
 	close(map_fd);
 	map_fd = -1;
 	btf_fd[0] = bpf_btf_get_fd_by_id(map_info.btf_id);
-	if (CHECK(btf_fd[0] != -1, "BTF lingers")) {
+	if (CHECK(btf_fd[0] >= 0, "BTF lingers")) {
 		err = -1;
 		goto done;
 	}
@@ -4117,11 +4117,11 @@ done:
 		fprintf(stderr, "\n%s", btf_log_buf);
 
 	free(raw_btf);
-	if (map_fd != -1)
+	if (map_fd >= 0)
 		close(map_fd);
 	for (i = 0; i < 2; i++) {
 		free(user_btf[i]);
-		if (btf_fd[i] != -1)
+		if (btf_fd[i] >= 0)
 			close(btf_fd[i]);
 	}
 
@@ -4166,7 +4166,7 @@ static void do_test_get_info(unsigned int test_num)
 	btf_fd = bpf_load_btf(raw_btf, raw_btf_size,
 			      btf_log_buf, BTF_LOG_BUF_SIZE,
 			      always_log);
-	if (CHECK(btf_fd == -1, "errno:%d", errno)) {
+	if (CHECK(btf_fd <= 0, "errno:%d", errno)) {
 		err = -1;
 		goto done;
 	}
@@ -4212,7 +4212,7 @@ done:
 	free(raw_btf);
 	free(user_btf);
 
-	if (btf_fd != -1)
+	if (btf_fd >= 0)
 		close(btf_fd);
 }
 
@@ -4249,8 +4249,9 @@ static void do_test_file(unsigned int test_num)
 		return;
 
 	btf = btf__parse_elf(test->file, &btf_ext);
-	if (IS_ERR(btf)) {
-		if (PTR_ERR(btf) == -ENOENT) {
+	err = libbpf_get_error(btf);
+	if (err) {
+		if (err == -ENOENT) {
 			printf("%s:SKIP: No ELF %s found", __func__, BTF_ELF_SEC);
 			test__skip();
 			return;
@@ -4263,7 +4264,8 @@ static void do_test_file(unsigned int test_num)
 	btf_ext__free(btf_ext);
 
 	obj = bpf_object__open(test->file);
-	if (CHECK(IS_ERR(obj), "obj: %ld", PTR_ERR(obj)))
+	err = libbpf_get_error(obj);
+	if (CHECK(err, "obj: %d", err))
 		return;
 
 	prog = bpf_program__next(NULL, obj);
@@ -4298,7 +4300,7 @@ static void do_test_file(unsigned int test_num)
 	info_len = sizeof(struct bpf_prog_info);
 	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
 
-	if (CHECK(err == -1, "invalid get info (1st) errno:%d", errno)) {
+	if (CHECK(err < 0, "invalid get info (1st) errno:%d", errno)) {
 		fprintf(stderr, "%s\n", btf_log_buf);
 		err = -1;
 		goto done;
@@ -4330,7 +4332,7 @@ static void do_test_file(unsigned int test_num)
 
 	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
 
-	if (CHECK(err == -1, "invalid get info (2nd) errno:%d", errno)) {
+	if (CHECK(err < 0, "invalid get info (2nd) errno:%d", errno)) {
 		fprintf(stderr, "%s\n", btf_log_buf);
 		err = -1;
 		goto done;
@@ -4886,7 +4888,7 @@ static void do_test_pprint(int test_num)
 			      always_log);
 	free(raw_btf);
 
-	if (CHECK(btf_fd == -1, "errno:%d", errno)) {
+	if (CHECK(btf_fd < 0, "errno:%d", errno)) {
 		err = -1;
 		goto done;
 	}
@@ -4901,7 +4903,7 @@ static void do_test_pprint(int test_num)
 	create_attr.btf_value_type_id = test->value_type_id;
 
 	map_fd = bpf_create_map_xattr(&create_attr);
-	if (CHECK(map_fd == -1, "errno:%d", errno)) {
+	if (CHECK(map_fd < 0, "errno:%d", errno)) {
 		err = -1;
 		goto done;
 	}
@@ -4982,7 +4984,7 @@ static void do_test_pprint(int test_num)
 
 					err = check_line(expected_line, nexpected_line,
 							 sizeof(expected_line), line);
-					if (err == -1)
+					if (err < 0)
 						goto done;
 				}
 
@@ -4998,7 +5000,7 @@ static void do_test_pprint(int test_num)
 								  cpu, cmapv);
 			err = check_line(expected_line, nexpected_line,
 					 sizeof(expected_line), line);
-			if (err == -1)
+			if (err < 0)
 				goto done;
 
 			cmapv = cmapv + rounded_value_size;
@@ -5036,9 +5038,9 @@ done:
 		fprintf(stderr, "OK");
 	if (*btf_log_buf && (err || always_log))
 		fprintf(stderr, "\n%s", btf_log_buf);
-	if (btf_fd != -1)
+	if (btf_fd >= 0)
 		close(btf_fd);
-	if (map_fd != -1)
+	if (map_fd >= 0)
 		close(map_fd);
 	if (pin_file)
 		fclose(pin_file);
@@ -5950,7 +5952,7 @@ static int test_get_finfo(const struct prog_info_raw_test *test,
 	/* get necessary lens */
 	info_len = sizeof(struct bpf_prog_info);
 	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
-	if (CHECK(err == -1, "invalid get info (1st) errno:%d", errno)) {
+	if (CHECK(err < 0, "invalid get info (1st) errno:%d", errno)) {
 		fprintf(stderr, "%s\n", btf_log_buf);
 		return -1;
 	}
@@ -5980,7 +5982,7 @@ static int test_get_finfo(const struct prog_info_raw_test *test,
 	info.func_info_rec_size = rec_size;
 	info.func_info = ptr_to_u64(func_info);
 	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
-	if (CHECK(err == -1, "invalid get info (2nd) errno:%d", errno)) {
+	if (CHECK(err < 0, "invalid get info (2nd) errno:%d", errno)) {
 		fprintf(stderr, "%s\n", btf_log_buf);
 		err = -1;
 		goto done;
@@ -6044,7 +6046,7 @@ static int test_get_linfo(const struct prog_info_raw_test *test,
 
 	info_len = sizeof(struct bpf_prog_info);
 	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
-	if (CHECK(err == -1, "err:%d errno:%d", err, errno)) {
+	if (CHECK(err < 0, "err:%d errno:%d", err, errno)) {
 		err = -1;
 		goto done;
 	}
@@ -6123,7 +6125,7 @@ static int test_get_linfo(const struct prog_info_raw_test *test,
 	 * Only recheck the info.*line_info* fields.
 	 * Other fields are not the concern of this test.
 	 */
-	if (CHECK(err == -1 ||
+	if (CHECK(err < 0 ||
 		  info.nr_line_info != cnt ||
 		  (jited_cnt && !info.jited_line_info) ||
 		  info.nr_jited_line_info != jited_cnt ||
@@ -6260,7 +6262,7 @@ static void do_test_info_raw(unsigned int test_num)
 			      always_log);
 	free(raw_btf);
 
-	if (CHECK(btf_fd == -1, "invalid btf_fd errno:%d", errno)) {
+	if (CHECK(btf_fd < 0, "invalid btf_fd errno:%d", errno)) {
 		err = -1;
 		goto done;
 	}
@@ -6273,7 +6275,8 @@ static void do_test_info_raw(unsigned int test_num)
 	patched_linfo = patch_name_tbd(test->line_info,
 				       test->str_sec, linfo_str_off,
 				       test->str_sec_size, &linfo_size);
-	if (IS_ERR(patched_linfo)) {
+	err = libbpf_get_error(patched_linfo);
+	if (err) {
 		fprintf(stderr, "error in creating raw bpf_line_info");
 		err = -1;
 		goto done;
@@ -6297,7 +6300,7 @@ static void do_test_info_raw(unsigned int test_num)
 	}
 
 	prog_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
-	err = ((prog_fd == -1) != test->expected_prog_load_failure);
+	err = ((prog_fd < 0) != test->expected_prog_load_failure);
 	if (CHECK(err, "prog_fd:%d expected_prog_load_failure:%u errno:%d",
 		  prog_fd, test->expected_prog_load_failure, errno) ||
 	    CHECK(test->err_str && !strstr(btf_log_buf, test->err_str),
@@ -6306,7 +6309,7 @@ static void do_test_info_raw(unsigned int test_num)
 		goto done;
 	}
 
-	if (prog_fd == -1)
+	if (prog_fd < 0)
 		goto done;
 
 	err = test_get_finfo(test, prog_fd);
@@ -6323,12 +6326,12 @@ done:
 	if (*btf_log_buf && (err || always_log))
 		fprintf(stderr, "\n%s", btf_log_buf);
 
-	if (btf_fd != -1)
+	if (btf_fd >= 0)
 		close(btf_fd);
-	if (prog_fd != -1)
+	if (prog_fd >= 0)
 		close(prog_fd);
 
-	if (!IS_ERR(patched_linfo))
+	if (!libbpf_get_error(patched_linfo))
 		free(patched_linfo);
 }
 
@@ -6839,9 +6842,9 @@ static void do_test_dedup(unsigned int test_num)
 		return;
 
 	test_btf = btf__new((__u8 *)raw_btf, raw_btf_size);
+	err = libbpf_get_error(test_btf);
 	free(raw_btf);
-	if (CHECK(IS_ERR(test_btf), "invalid test_btf errno:%ld",
-		  PTR_ERR(test_btf))) {
+	if (CHECK(err, "invalid test_btf errno:%d", err)) {
 		err = -1;
 		goto done;
 	}
@@ -6853,9 +6856,9 @@ static void do_test_dedup(unsigned int test_num)
 	if (!raw_btf)
 		return;
 	expect_btf = btf__new((__u8 *)raw_btf, raw_btf_size);
+	err = libbpf_get_error(expect_btf);
 	free(raw_btf);
-	if (CHECK(IS_ERR(expect_btf), "invalid expect_btf errno:%ld",
-		  PTR_ERR(expect_btf))) {
+	if (CHECK(err, "invalid expect_btf errno:%d", err)) {
 		err = -1;
 		goto done;
 	}
@@ -6966,10 +6969,8 @@ static void do_test_dedup(unsigned int test_num)
 	}
 
 done:
-	if (!IS_ERR(test_btf))
-		btf__free(test_btf);
-	if (!IS_ERR(expect_btf))
-		btf__free(expect_btf);
+	btf__free(test_btf);
+	btf__free(expect_btf);
 }
 
 void test_btf(void)
diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c
index 5e129dc2073c..1b90e684ff13 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c
@@ -32,8 +32,9 @@ static int btf_dump_all_types(const struct btf *btf,
 	int err = 0, id;
 
 	d = btf_dump__new(btf, NULL, opts, btf_dump_printf);
-	if (IS_ERR(d))
-		return PTR_ERR(d);
+	err = libbpf_get_error(d);
+	if (err)
+		return err;
 
 	for (id = 1; id <= type_cnt; id++) {
 		err = btf_dump__dump_type(d, id);
@@ -56,8 +57,7 @@ static int test_btf_dump_case(int n, struct btf_dump_test_case *t)
 	snprintf(test_file, sizeof(test_file), "%s.o", t->file);
 
 	btf = btf__parse_elf(test_file, NULL);
-	if (CHECK(IS_ERR(btf), "btf_parse_elf",
-	    "failed to load test BTF: %ld\n", PTR_ERR(btf))) {
+	if (!ASSERT_OK_PTR(btf, "btf_parse_elf")) {
 		err = -PTR_ERR(btf);
 		btf = NULL;
 		goto done;
diff --git a/tools/testing/selftests/bpf/prog_tests/btf_write.c b/tools/testing/selftests/bpf/prog_tests/btf_write.c
index f36da15b134f..022c7d89d6f4 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_write.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_write.c
@@ -4,8 +4,6 @@
 #include <bpf/btf.h>
 #include "btf_helpers.h"
 
-static int duration = 0;
-
 void test_btf_write() {
 	const struct btf_var_secinfo *vi;
 	const struct btf_type *t;
@@ -16,7 +14,7 @@ void test_btf_write() {
 	int id, err, str_off;
 
 	btf = btf__new_empty();
-	if (CHECK(IS_ERR(btf), "new_empty", "failed: %ld\n", PTR_ERR(btf)))
+	if (!ASSERT_OK_PTR(btf, "new_empty"))
 		return;
 
 	str_off = btf__find_str(btf, "int");
diff --git a/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c b/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c
index 643dfa35419c..876be0ecb654 100644
--- a/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c
+++ b/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c
@@ -102,8 +102,7 @@ static void test_egress_only(int parent_cgroup_fd, int child_cgroup_fd)
 	 */
 	parent_link = bpf_program__attach_cgroup(obj->progs.egress,
 						 parent_cgroup_fd);
-	if (CHECK(IS_ERR(parent_link), "parent-cg-attach",
-		  "err %ld", PTR_ERR(parent_link)))
+	if (!ASSERT_OK_PTR(parent_link, "parent-cg-attach"))
 		goto close_bpf_object;
 	err = connect_send(CHILD_CGROUP);
 	if (CHECK(err, "first-connect-send", "errno %d", errno))
@@ -126,8 +125,7 @@ static void test_egress_only(int parent_cgroup_fd, int child_cgroup_fd)
 	 */
 	child_link = bpf_program__attach_cgroup(obj->progs.egress,
 						child_cgroup_fd);
-	if (CHECK(IS_ERR(child_link), "child-cg-attach",
-		  "err %ld", PTR_ERR(child_link)))
+	if (!ASSERT_OK_PTR(child_link, "child-cg-attach"))
 		goto close_bpf_object;
 	err = connect_send(CHILD_CGROUP);
 	if (CHECK(err, "second-connect-send", "errno %d", errno))
@@ -147,10 +145,8 @@ static void test_egress_only(int parent_cgroup_fd, int child_cgroup_fd)
 		goto close_bpf_object;
 
 close_bpf_object:
-	if (!IS_ERR(parent_link))
-		bpf_link__destroy(parent_link);
-	if (!IS_ERR(child_link))
-		bpf_link__destroy(child_link);
+	bpf_link__destroy(parent_link);
+	bpf_link__destroy(child_link);
 
 	cg_storage_multi_egress_only__destroy(obj);
 }
@@ -176,18 +172,15 @@ static void test_isolated(int parent_cgroup_fd, int child_cgroup_fd)
 	 */
 	parent_egress1_link = bpf_program__attach_cgroup(obj->progs.egress1,
 							 parent_cgroup_fd);
-	if (CHECK(IS_ERR(parent_egress1_link), "parent-egress1-cg-attach",
-		  "err %ld", PTR_ERR(parent_egress1_link)))
+	if (!ASSERT_OK_PTR(parent_egress1_link, "parent-egress1-cg-attach"))
 		goto close_bpf_object;
 	parent_egress2_link = bpf_program__attach_cgroup(obj->progs.egress2,
 							 parent_cgroup_fd);
-	if (CHECK(IS_ERR(parent_egress2_link), "parent-egress2-cg-attach",
-		  "err %ld", PTR_ERR(parent_egress2_link)))
+	if (!ASSERT_OK_PTR(parent_egress2_link, "parent-egress2-cg-attach"))
 		goto close_bpf_object;
 	parent_ingress_link = bpf_program__attach_cgroup(obj->progs.ingress,
 							 parent_cgroup_fd);
-	if (CHECK(IS_ERR(parent_ingress_link), "parent-ingress-cg-attach",
-		  "err %ld", PTR_ERR(parent_ingress_link)))
+	if (!ASSERT_OK_PTR(parent_ingress_link, "parent-ingress-cg-attach"))
 		goto close_bpf_object;
 	err = connect_send(CHILD_CGROUP);
 	if (CHECK(err, "first-connect-send", "errno %d", errno))
@@ -221,18 +214,15 @@ static void test_isolated(int parent_cgroup_fd, int child_cgroup_fd)
 	 */
 	child_egress1_link = bpf_program__attach_cgroup(obj->progs.egress1,
 							child_cgroup_fd);
-	if (CHECK(IS_ERR(child_egress1_link), "child-egress1-cg-attach",
-		  "err %ld", PTR_ERR(child_egress1_link)))
+	if (!ASSERT_OK_PTR(child_egress1_link, "child-egress1-cg-attach"))
 		goto close_bpf_object;
 	child_egress2_link = bpf_program__attach_cgroup(obj->progs.egress2,
 							child_cgroup_fd);
-	if (CHECK(IS_ERR(child_egress2_link), "child-egress2-cg-attach",
-		  "err %ld", PTR_ERR(child_egress2_link)))
+	if (!ASSERT_OK_PTR(child_egress2_link, "child-egress2-cg-attach"))
 		goto close_bpf_object;
 	child_ingress_link = bpf_program__attach_cgroup(obj->progs.ingress,
 							child_cgroup_fd);
-	if (CHECK(IS_ERR(child_ingress_link), "child-ingress-cg-attach",
-		  "err %ld", PTR_ERR(child_ingress_link)))
+	if (!ASSERT_OK_PTR(child_ingress_link, "child-ingress-cg-attach"))
 		goto close_bpf_object;
 	err = connect_send(CHILD_CGROUP);
 	if (CHECK(err, "second-connect-send", "errno %d", errno))
@@ -264,18 +254,12 @@ static void test_isolated(int parent_cgroup_fd, int child_cgroup_fd)
 		goto close_bpf_object;
 
 close_bpf_object:
-	if (!IS_ERR(parent_egress1_link))
-		bpf_link__destroy(parent_egress1_link);
-	if (!IS_ERR(parent_egress2_link))
-		bpf_link__destroy(parent_egress2_link);
-	if (!IS_ERR(parent_ingress_link))
-		bpf_link__destroy(parent_ingress_link);
-	if (!IS_ERR(child_egress1_link))
-		bpf_link__destroy(child_egress1_link);
-	if (!IS_ERR(child_egress2_link))
-		bpf_link__destroy(child_egress2_link);
-	if (!IS_ERR(child_ingress_link))
-		bpf_link__destroy(child_ingress_link);
+	bpf_link__destroy(parent_egress1_link);
+	bpf_link__destroy(parent_egress2_link);
+	bpf_link__destroy(parent_ingress_link);
+	bpf_link__destroy(child_egress1_link);
+	bpf_link__destroy(child_egress2_link);
+	bpf_link__destroy(child_ingress_link);
 
 	cg_storage_multi_isolated__destroy(obj);
 }
@@ -301,18 +285,15 @@ static void test_shared(int parent_cgroup_fd, int child_cgroup_fd)
 	 */
 	parent_egress1_link = bpf_program__attach_cgroup(obj->progs.egress1,
 							 parent_cgroup_fd);
-	if (CHECK(IS_ERR(parent_egress1_link), "parent-egress1-cg-attach",
-		  "err %ld", PTR_ERR(parent_egress1_link)))
+	if (!ASSERT_OK_PTR(parent_egress1_link, "parent-egress1-cg-attach"))
 		goto close_bpf_object;
 	parent_egress2_link = bpf_program__attach_cgroup(obj->progs.egress2,
 							 parent_cgroup_fd);
-	if (CHECK(IS_ERR(parent_egress2_link), "parent-egress2-cg-attach",
-		  "err %ld", PTR_ERR(parent_egress2_link)))
+	if (!ASSERT_OK_PTR(parent_egress2_link, "parent-egress2-cg-attach"))
 		goto close_bpf_object;
 	parent_ingress_link = bpf_program__attach_cgroup(obj->progs.ingress,
 							 parent_cgroup_fd);
-	if (CHECK(IS_ERR(parent_ingress_link), "parent-ingress-cg-attach",
-		  "err %ld", PTR_ERR(parent_ingress_link)))
+	if (!ASSERT_OK_PTR(parent_ingress_link, "parent-ingress-cg-attach"))
 		goto close_bpf_object;
 	err = connect_send(CHILD_CGROUP);
 	if (CHECK(err, "first-connect-send", "errno %d", errno))
@@ -338,18 +319,15 @@ static void test_shared(int parent_cgroup_fd, int child_cgroup_fd)
 	 */
 	child_egress1_link = bpf_program__attach_cgroup(obj->progs.egress1,
 							child_cgroup_fd);
-	if (CHECK(IS_ERR(child_egress1_link), "child-egress1-cg-attach",
-		  "err %ld", PTR_ERR(child_egress1_link)))
+	if (!ASSERT_OK_PTR(child_egress1_link, "child-egress1-cg-attach"))
 		goto close_bpf_object;
 	child_egress2_link = bpf_program__attach_cgroup(obj->progs.egress2,
 							child_cgroup_fd);
-	if (CHECK(IS_ERR(child_egress2_link), "child-egress2-cg-attach",
-		  "err %ld", PTR_ERR(child_egress2_link)))
+	if (!ASSERT_OK_PTR(child_egress2_link, "child-egress2-cg-attach"))
 		goto close_bpf_object;
 	child_ingress_link = bpf_program__attach_cgroup(obj->progs.ingress,
 							child_cgroup_fd);
-	if (CHECK(IS_ERR(child_ingress_link), "child-ingress-cg-attach",
-		  "err %ld", PTR_ERR(child_ingress_link)))
+	if (!ASSERT_OK_PTR(child_ingress_link, "child-ingress-cg-attach"))
 		goto close_bpf_object;
 	err = connect_send(CHILD_CGROUP);
 	if (CHECK(err, "second-connect-send", "errno %d", errno))
@@ -375,18 +353,12 @@ static void test_shared(int parent_cgroup_fd, int child_cgroup_fd)
 		goto close_bpf_object;
 
 close_bpf_object:
-	if (!IS_ERR(parent_egress1_link))
-		bpf_link__destroy(parent_egress1_link);
-	if (!IS_ERR(parent_egress2_link))
-		bpf_link__destroy(parent_egress2_link);
-	if (!IS_ERR(parent_ingress_link))
-		bpf_link__destroy(parent_ingress_link);
-	if (!IS_ERR(child_egress1_link))
-		bpf_link__destroy(child_egress1_link);
-	if (!IS_ERR(child_egress2_link))
-		bpf_link__destroy(child_egress2_link);
-	if (!IS_ERR(child_ingress_link))
-		bpf_link__destroy(child_ingress_link);
+	bpf_link__destroy(parent_egress1_link);
+	bpf_link__destroy(parent_egress2_link);
+	bpf_link__destroy(parent_ingress_link);
+	bpf_link__destroy(child_egress1_link);
+	bpf_link__destroy(child_egress2_link);
+	bpf_link__destroy(child_ingress_link);
 
 	cg_storage_multi_shared__destroy(obj);
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c
index 0a1fc9816cef..20bb8831dda6 100644
--- a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c
+++ b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c
@@ -167,7 +167,7 @@ void test_cgroup_attach_multi(void)
 	prog_cnt = 2;
 	CHECK_FAIL(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS,
 				  BPF_F_QUERY_EFFECTIVE, &attach_flags,
-				  prog_ids, &prog_cnt) != -1);
+				  prog_ids, &prog_cnt) >= 0);
 	CHECK_FAIL(errno != ENOSPC);
 	CHECK_FAIL(prog_cnt != 4);
 	/* check that prog_ids are returned even when buffer is too small */
diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_link.c b/tools/testing/selftests/bpf/prog_tests/cgroup_link.c
index 736796e56ed1..9091524131d6 100644
--- a/tools/testing/selftests/bpf/prog_tests/cgroup_link.c
+++ b/tools/testing/selftests/bpf/prog_tests/cgroup_link.c
@@ -65,8 +65,7 @@ void test_cgroup_link(void)
 	for (i = 0; i < cg_nr; i++) {
 		links[i] = bpf_program__attach_cgroup(skel->progs.egress,
 						      cgs[i].fd);
-		if (CHECK(IS_ERR(links[i]), "cg_attach", "i: %d, err: %ld\n",
-				 i, PTR_ERR(links[i])))
+		if (!ASSERT_OK_PTR(links[i], "cg_attach"))
 			goto cleanup;
 	}
 
@@ -121,8 +120,7 @@ void test_cgroup_link(void)
 
 	links[last_cg] = bpf_program__attach_cgroup(skel->progs.egress,
 						    cgs[last_cg].fd);
-	if (CHECK(IS_ERR(links[last_cg]), "cg_attach", "err: %ld\n",
-		  PTR_ERR(links[last_cg])))
+	if (!ASSERT_OK_PTR(links[last_cg], "cg_attach"))
 		goto cleanup;
 
 	ping_and_check(cg_nr + 1, 0);
@@ -147,7 +145,7 @@ void test_cgroup_link(void)
 	/* attempt to mix in with multi-attach bpf_link */
 	tmp_link = bpf_program__attach_cgroup(skel->progs.egress,
 					      cgs[last_cg].fd);
-	if (CHECK(!IS_ERR(tmp_link), "cg_attach_fail", "unexpected success!\n")) {
+	if (!ASSERT_ERR_PTR(tmp_link, "cg_attach_fail")) {
 		bpf_link__destroy(tmp_link);
 		goto cleanup;
 	}
@@ -165,8 +163,7 @@ void test_cgroup_link(void)
 	/* attach back link-based one */
 	links[last_cg] = bpf_program__attach_cgroup(skel->progs.egress,
 						    cgs[last_cg].fd);
-	if (CHECK(IS_ERR(links[last_cg]), "cg_attach", "err: %ld\n",
-		  PTR_ERR(links[last_cg])))
+	if (!ASSERT_OK_PTR(links[last_cg], "cg_attach"))
 		goto cleanup;
 
 	ping_and_check(cg_nr, 0);
@@ -249,8 +246,7 @@ cleanup:
 				 BPF_CGROUP_INET_EGRESS);
 
 	for (i = 0; i < cg_nr; i++) {
-		if (!IS_ERR(links[i]))
-			bpf_link__destroy(links[i]);
+		bpf_link__destroy(links[i]);
 	}
 	test_cgroup_link__destroy(skel);
 
diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_skb_sk_lookup.c b/tools/testing/selftests/bpf/prog_tests/cgroup_skb_sk_lookup.c
index 464edc1c1708..b9dc4ec655b5 100644
--- a/tools/testing/selftests/bpf/prog_tests/cgroup_skb_sk_lookup.c
+++ b/tools/testing/selftests/bpf/prog_tests/cgroup_skb_sk_lookup.c
@@ -60,7 +60,7 @@ static void run_cgroup_bpf_test(const char *cg_path, int out_sk)
 		goto cleanup;
 
 	link = bpf_program__attach_cgroup(skel->progs.ingress_lookup, cgfd);
-	if (CHECK(IS_ERR(link), "cgroup_attach", "err: %ld\n", PTR_ERR(link)))
+	if (!ASSERT_OK_PTR(link, "cgroup_attach"))
 		goto cleanup;
 
 	run_lookup_test(&skel->bss->g_serv_port, out_sk);
diff --git a/tools/testing/selftests/bpf/prog_tests/check_mtu.c b/tools/testing/selftests/bpf/prog_tests/check_mtu.c
index b62a39315336..012068f33a0a 100644
--- a/tools/testing/selftests/bpf/prog_tests/check_mtu.c
+++ b/tools/testing/selftests/bpf/prog_tests/check_mtu.c
@@ -53,7 +53,7 @@ static void test_check_mtu_xdp_attach(void)
 	prog = skel->progs.xdp_use_helper_basic;
 
 	link = bpf_program__attach_xdp(prog, IFINDEX_LO);
-	if (CHECK(IS_ERR(link), "link_attach", "failed: %ld\n", PTR_ERR(link)))
+	if (!ASSERT_OK_PTR(link, "link_attach"))
 		goto out;
 	skel->links.xdp_use_helper_basic = link;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c
index 607710826dca..d02e064c535f 100644
--- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c
+++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c
@@ -369,8 +369,7 @@ static int setup_type_id_case_local(struct core_reloc_test_case *test)
 	const char *name;
 	int i;
 
-	if (CHECK(IS_ERR(local_btf), "local_btf", "failed: %ld\n", PTR_ERR(local_btf)) ||
-	    CHECK(IS_ERR(targ_btf), "targ_btf", "failed: %ld\n", PTR_ERR(targ_btf))) {
+	if (!ASSERT_OK_PTR(local_btf, "local_btf") || !ASSERT_OK_PTR(targ_btf, "targ_btf")) {
 		btf__free(local_btf);
 		btf__free(targ_btf);
 		return -EINVAL;
@@ -848,8 +847,7 @@ void test_core_reloc(void)
 		}
 
 		obj = bpf_object__open_file(test_case->bpf_obj_file, NULL);
-		if (CHECK(IS_ERR(obj), "obj_open", "failed to open '%s': %ld\n",
-			  test_case->bpf_obj_file, PTR_ERR(obj)))
+		if (!ASSERT_OK_PTR(obj, "obj_open"))
 			continue;
 
 		probe_name = "raw_tracepoint/sys_enter";
@@ -899,8 +897,7 @@ void test_core_reloc(void)
 		data->my_pid_tgid = my_pid_tgid;
 
 		link = bpf_program__attach_raw_tracepoint(prog, tp_name);
-		if (CHECK(IS_ERR(link), "attach_raw_tp", "err %ld\n",
-			  PTR_ERR(link)))
+		if (!ASSERT_OK_PTR(link, "attach_raw_tp"))
 			goto cleanup;
 
 		/* trigger test run */
@@ -941,10 +938,8 @@ cleanup:
 			CHECK_FAIL(munmap(mmap_data, mmap_sz));
 			mmap_data = NULL;
 		}
-		if (!IS_ERR_OR_NULL(link)) {
-			bpf_link__destroy(link);
-			link = NULL;
-		}
+		bpf_link__destroy(link);
+		link = NULL;
 		bpf_object__close(obj);
 	}
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c
index 63990842d20f..73b4c76e6b86 100644
--- a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c
+++ b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c
@@ -146,10 +146,8 @@ static void test_fexit_bpf2bpf_common(const char *obj_file,
 
 close_prog:
 	for (i = 0; i < prog_cnt; i++)
-		if (!IS_ERR_OR_NULL(link[i]))
-			bpf_link__destroy(link[i]);
-	if (!IS_ERR_OR_NULL(obj))
-		bpf_object__close(obj);
+		bpf_link__destroy(link[i]);
+	bpf_object__close(obj);
 	bpf_object__close(tgt_obj);
 	free(link);
 	free(prog);
@@ -231,7 +229,7 @@ static int test_second_attach(struct bpf_object *obj)
 		return err;
 
 	link = bpf_program__attach_freplace(prog, tgt_fd, tgt_name);
-	if (CHECK(IS_ERR(link), "second_link", "failed to attach second link prog_fd %d tgt_fd %d\n", bpf_program__fd(prog), tgt_fd))
+	if (!ASSERT_OK_PTR(link, "second_link"))
 		goto out;
 
 	err = bpf_prog_test_run(tgt_fd, 1, &pkt_v6, sizeof(pkt_v6),
@@ -283,9 +281,7 @@ static void test_fmod_ret_freplace(void)
 	opts.attach_prog_fd = pkt_fd;
 
 	freplace_obj = bpf_object__open_file(freplace_name, &opts);
-	if (CHECK(IS_ERR_OR_NULL(freplace_obj), "freplace_obj_open",
-		  "failed to open %s: %ld\n", freplace_name,
-		  PTR_ERR(freplace_obj)))
+	if (!ASSERT_OK_PTR(freplace_obj, "freplace_obj_open"))
 		goto out;
 
 	err = bpf_object__load(freplace_obj);
@@ -294,14 +290,12 @@ static void test_fmod_ret_freplace(void)
 
 	prog = bpf_program__next(NULL, freplace_obj);
 	freplace_link = bpf_program__attach_trace(prog);
-	if (CHECK(IS_ERR(freplace_link), "freplace_attach_trace", "failed to link\n"))
+	if (!ASSERT_OK_PTR(freplace_link, "freplace_attach_trace"))
 		goto out;
 
 	opts.attach_prog_fd = bpf_program__fd(prog);
 	fmod_obj = bpf_object__open_file(fmod_ret_name, &opts);
-	if (CHECK(IS_ERR_OR_NULL(fmod_obj), "fmod_obj_open",
-		  "failed to open %s: %ld\n", fmod_ret_name,
-		  PTR_ERR(fmod_obj)))
+	if (!ASSERT_OK_PTR(fmod_obj, "fmod_obj_open"))
 		goto out;
 
 	err = bpf_object__load(fmod_obj);
@@ -350,9 +344,7 @@ static void test_obj_load_failure_common(const char *obj_file,
 			   );
 
 	obj = bpf_object__open_file(obj_file, &opts);
-	if (CHECK(IS_ERR_OR_NULL(obj), "obj_open",
-		  "failed to open %s: %ld\n", obj_file,
-		  PTR_ERR(obj)))
+	if (!ASSERT_OK_PTR(obj, "obj_open"))
 		goto close_prog;
 
 	/* It should fail to load the program */
@@ -361,8 +353,7 @@ static void test_obj_load_failure_common(const char *obj_file,
 		goto close_prog;
 
 close_prog:
-	if (!IS_ERR_OR_NULL(obj))
-		bpf_object__close(obj);
+	bpf_object__close(obj);
 	bpf_object__close(pkt_obj);
 }
 
diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c
index cd6dc80edf18..225714f71ac6 100644
--- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c
+++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c
@@ -541,7 +541,7 @@ static void test_skb_less_link_create(struct bpf_flow *skel, int tap_fd)
 		return;
 
 	link = bpf_program__attach_netns(skel->progs._dissect, net_fd);
-	if (CHECK(IS_ERR(link), "attach_netns", "err %ld\n", PTR_ERR(link)))
+	if (!ASSERT_OK_PTR(link, "attach_netns"))
 		goto out_close;
 
 	run_tests_skb_less(tap_fd, skel->maps.last_dissection);
diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c
index 172c586b6996..3931ede5c534 100644
--- a/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c
+++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c
@@ -134,9 +134,9 @@ static void test_link_create_link_create(int netns, int prog1, int prog2)
 	/* Expect failure creating link when another link exists */
 	errno = 0;
 	link2 = bpf_link_create(prog2, netns, BPF_FLOW_DISSECTOR, &opts);
-	if (CHECK_FAIL(link2 != -1 || errno != E2BIG))
+	if (CHECK_FAIL(link2 >= 0 || errno != E2BIG))
 		perror("bpf_prog_attach(prog2) expected E2BIG");
-	if (link2 != -1)
+	if (link2 >= 0)
 		close(link2);
 	CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1));
 
@@ -159,9 +159,9 @@ static void test_prog_attach_link_create(int netns, int prog1, int prog2)
 	/* Expect failure creating link when prog attached */
 	errno = 0;
 	link = bpf_link_create(prog2, netns, BPF_FLOW_DISSECTOR, &opts);
-	if (CHECK_FAIL(link != -1 || errno != EEXIST))
+	if (CHECK_FAIL(link >= 0 || errno != EEXIST))
 		perror("bpf_link_create(prog2) expected EEXIST");
-	if (link != -1)
+	if (link >= 0)
 		close(link);
 	CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1));
 
@@ -623,7 +623,7 @@ static void run_tests(int netns)
 	}
 out_close:
 	for (i = 0; i < ARRAY_SIZE(progs); i++) {
-		if (progs[i] != -1)
+		if (progs[i] >= 0)
 			CHECK_FAIL(close(progs[i]));
 	}
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c b/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c
index 925722217edf..522237aa4470 100644
--- a/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c
+++ b/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c
@@ -121,12 +121,12 @@ void test_get_stack_raw_tp(void)
 		goto close_prog;
 
 	link = bpf_program__attach_raw_tracepoint(prog, "sys_enter");
-	if (CHECK(IS_ERR(link), "attach_raw_tp", "err %ld\n", PTR_ERR(link)))
+	if (!ASSERT_OK_PTR(link, "attach_raw_tp"))
 		goto close_prog;
 
 	pb_opts.sample_cb = get_stack_print_output;
 	pb = perf_buffer__new(bpf_map__fd(map), 8, &pb_opts);
-	if (CHECK(IS_ERR(pb), "perf_buf__new", "err %ld\n", PTR_ERR(pb)))
+	if (!ASSERT_OK_PTR(pb, "perf_buf__new"))
 		goto close_prog;
 
 	/* trigger some syscall action */
@@ -141,9 +141,7 @@ void test_get_stack_raw_tp(void)
 	}
 
 close_prog:
-	if (!IS_ERR_OR_NULL(link))
-		bpf_link__destroy(link);
-	if (!IS_ERR_OR_NULL(pb))
-		perf_buffer__free(pb);
+	bpf_link__destroy(link);
+	perf_buffer__free(pb);
 	bpf_object__close(obj);
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/get_stackid_cannot_attach.c b/tools/testing/selftests/bpf/prog_tests/get_stackid_cannot_attach.c
index d884b2ed5bc5..8d5a6023a1bb 100644
--- a/tools/testing/selftests/bpf/prog_tests/get_stackid_cannot_attach.c
+++ b/tools/testing/selftests/bpf/prog_tests/get_stackid_cannot_attach.c
@@ -48,8 +48,7 @@ void test_get_stackid_cannot_attach(void)
 
 	skel->links.oncpu = bpf_program__attach_perf_event(skel->progs.oncpu,
 							   pmu_fd);
-	CHECK(!IS_ERR(skel->links.oncpu), "attach_perf_event_no_callchain",
-	      "should have failed\n");
+	ASSERT_ERR_PTR(skel->links.oncpu, "attach_perf_event_no_callchain");
 	close(pmu_fd);
 
 	/* add PERF_SAMPLE_CALLCHAIN, attach should succeed */
@@ -65,8 +64,7 @@ void test_get_stackid_cannot_attach(void)
 
 	skel->links.oncpu = bpf_program__attach_perf_event(skel->progs.oncpu,
 							   pmu_fd);
-	CHECK(IS_ERR(skel->links.oncpu), "attach_perf_event_callchain",
-	      "err: %ld\n", PTR_ERR(skel->links.oncpu));
+	ASSERT_OK_PTR(skel->links.oncpu, "attach_perf_event_callchain");
 	close(pmu_fd);
 
 	/* add exclude_callchain_kernel, attach should fail */
@@ -82,8 +80,7 @@ void test_get_stackid_cannot_attach(void)
 
 	skel->links.oncpu = bpf_program__attach_perf_event(skel->progs.oncpu,
 							   pmu_fd);
-	CHECK(!IS_ERR(skel->links.oncpu), "attach_perf_event_exclude_callchain_kernel",
-	      "should have failed\n");
+	ASSERT_ERR_PTR(skel->links.oncpu, "attach_perf_event_exclude_callchain_kernel");
 	close(pmu_fd);
 
 cleanup:
diff --git a/tools/testing/selftests/bpf/prog_tests/hashmap.c b/tools/testing/selftests/bpf/prog_tests/hashmap.c
index 428d488830c6..4747ab18f97f 100644
--- a/tools/testing/selftests/bpf/prog_tests/hashmap.c
+++ b/tools/testing/selftests/bpf/prog_tests/hashmap.c
@@ -48,8 +48,7 @@ static void test_hashmap_generic(void)
 	struct hashmap *map;
 
 	map = hashmap__new(hash_fn, equal_fn, NULL);
-	if (CHECK(IS_ERR(map), "hashmap__new",
-		  "failed to create map: %ld\n", PTR_ERR(map)))
+	if (!ASSERT_OK_PTR(map, "hashmap__new"))
 		return;
 
 	for (i = 0; i < ELEM_CNT; i++) {
@@ -267,8 +266,7 @@ static void test_hashmap_multimap(void)
 
 	/* force collisions */
 	map = hashmap__new(collision_hash_fn, equal_fn, NULL);
-	if (CHECK(IS_ERR(map), "hashmap__new",
-		  "failed to create map: %ld\n", PTR_ERR(map)))
+	if (!ASSERT_OK_PTR(map, "hashmap__new"))
 		return;
 
 	/* set up multimap:
@@ -339,8 +337,7 @@ static void test_hashmap_empty()
 
 	/* force collisions */
 	map = hashmap__new(hash_fn, equal_fn, NULL);
-	if (CHECK(IS_ERR(map), "hashmap__new",
-		  "failed to create map: %ld\n", PTR_ERR(map)))
+	if (!ASSERT_OK_PTR(map, "hashmap__new"))
 		goto cleanup;
 
 	if (CHECK(hashmap__size(map) != 0, "hashmap__size",
diff --git a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c
index d65107919998..ddfb6bf97152 100644
--- a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c
+++ b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c
@@ -97,15 +97,13 @@ void test_kfree_skb(void)
 		goto close_prog;
 
 	link = bpf_program__attach_raw_tracepoint(prog, NULL);
-	if (CHECK(IS_ERR(link), "attach_raw_tp", "err %ld\n", PTR_ERR(link)))
+	if (!ASSERT_OK_PTR(link, "attach_raw_tp"))
 		goto close_prog;
 	link_fentry = bpf_program__attach_trace(fentry);
-	if (CHECK(IS_ERR(link_fentry), "attach fentry", "err %ld\n",
-		  PTR_ERR(link_fentry)))
+	if (!ASSERT_OK_PTR(link_fentry, "attach fentry"))
 		goto close_prog;
 	link_fexit = bpf_program__attach_trace(fexit);
-	if (CHECK(IS_ERR(link_fexit), "attach fexit", "err %ld\n",
-		  PTR_ERR(link_fexit)))
+	if (!ASSERT_OK_PTR(link_fexit, "attach fexit"))
 		goto close_prog;
 
 	perf_buf_map = bpf_object__find_map_by_name(obj2, "perf_buf_map");
@@ -116,7 +114,7 @@ void test_kfree_skb(void)
 	pb_opts.sample_cb = on_sample;
 	pb_opts.ctx = &passed;
 	pb = perf_buffer__new(bpf_map__fd(perf_buf_map), 1, &pb_opts);
-	if (CHECK(IS_ERR(pb), "perf_buf__new", "err %ld\n", PTR_ERR(pb)))
+	if (!ASSERT_OK_PTR(pb, "perf_buf__new"))
 		goto close_prog;
 
 	memcpy(skb.cb, &cb, sizeof(cb));
@@ -144,12 +142,9 @@ void test_kfree_skb(void)
 	CHECK_FAIL(!test_ok[0] || !test_ok[1]);
 close_prog:
 	perf_buffer__free(pb);
-	if (!IS_ERR_OR_NULL(link))
-		bpf_link__destroy(link);
-	if (!IS_ERR_OR_NULL(link_fentry))
-		bpf_link__destroy(link_fentry);
-	if (!IS_ERR_OR_NULL(link_fexit))
-		bpf_link__destroy(link_fexit);
+	bpf_link__destroy(link);
+	bpf_link__destroy(link_fentry);
+	bpf_link__destroy(link_fexit);
 	bpf_object__close(obj);
 	bpf_object__close(obj2);
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c
index b58b775d19f3..67bebd324147 100644
--- a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c
+++ b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c
@@ -87,8 +87,7 @@ void test_ksyms_btf(void)
 	struct btf *btf;
 
 	btf = libbpf_find_kernel_btf();
-	if (CHECK(IS_ERR(btf), "btf_exists", "failed to load kernel BTF: %ld\n",
-		  PTR_ERR(btf)))
+	if (!ASSERT_OK_PTR(btf, "btf_exists"))
 		return;
 
 	percpu_datasec = btf__find_by_name_kind(btf, ".data..percpu",
diff --git a/tools/testing/selftests/bpf/prog_tests/link_pinning.c b/tools/testing/selftests/bpf/prog_tests/link_pinning.c
index a743288cf384..6fc97c45f71e 100644
--- a/tools/testing/selftests/bpf/prog_tests/link_pinning.c
+++ b/tools/testing/selftests/bpf/prog_tests/link_pinning.c
@@ -17,7 +17,7 @@ void test_link_pinning_subtest(struct bpf_program *prog,
 	int err, i;
 
 	link = bpf_program__attach(prog);
-	if (CHECK(IS_ERR(link), "link_attach", "err: %ld\n", PTR_ERR(link)))
+	if (!ASSERT_OK_PTR(link, "link_attach"))
 		goto cleanup;
 
 	bss->in = 1;
@@ -51,7 +51,7 @@ void test_link_pinning_subtest(struct bpf_program *prog,
 
 	/* re-open link from BPFFS */
 	link = bpf_link__open(link_pin_path);
-	if (CHECK(IS_ERR(link), "link_open", "err: %ld\n", PTR_ERR(link)))
+	if (!ASSERT_OK_PTR(link, "link_open"))
 		goto cleanup;
 
 	CHECK(strcmp(link_pin_path, bpf_link__pin_path(link)), "pin_path2",
@@ -84,8 +84,7 @@ void test_link_pinning_subtest(struct bpf_program *prog,
 	CHECK(i == 10000, "link_attached", "got to iteration #%d\n", i);
 
 cleanup:
-	if (!IS_ERR(link))
-		bpf_link__destroy(link);
+	bpf_link__destroy(link);
 }
 
 void test_link_pinning(void)
diff --git a/tools/testing/selftests/bpf/prog_tests/obj_name.c b/tools/testing/selftests/bpf/prog_tests/obj_name.c
index e178416bddad..6194b776a28b 100644
--- a/tools/testing/selftests/bpf/prog_tests/obj_name.c
+++ b/tools/testing/selftests/bpf/prog_tests/obj_name.c
@@ -38,13 +38,13 @@ void test_obj_name(void)
 
 		fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
 		CHECK((tests[i].success && fd < 0) ||
-		      (!tests[i].success && fd != -1) ||
+		      (!tests[i].success && fd >= 0) ||
 		      (!tests[i].success && errno != tests[i].expected_errno),
 		      "check-bpf-prog-name",
 		      "fd %d(%d) errno %d(%d)\n",
 		       fd, tests[i].success, errno, tests[i].expected_errno);
 
-		if (fd != -1)
+		if (fd >= 0)
 			close(fd);
 
 		/* test different attr.map_name during BPF_MAP_CREATE */
@@ -59,13 +59,13 @@ void test_obj_name(void)
 		memcpy(attr.map_name, tests[i].name, ncopy);
 		fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
 		CHECK((tests[i].success && fd < 0) ||
-		      (!tests[i].success && fd != -1) ||
+		      (!tests[i].success && fd >= 0) ||
 		      (!tests[i].success && errno != tests[i].expected_errno),
 		      "check-bpf-map-name",
 		      "fd %d(%d) errno %d(%d)\n",
 		      fd, tests[i].success, errno, tests[i].expected_errno);
 
-		if (fd != -1)
+		if (fd >= 0)
 			close(fd);
 	}
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/perf_branches.c b/tools/testing/selftests/bpf/prog_tests/perf_branches.c
index e35c444902a7..12c4f45cee1a 100644
--- a/tools/testing/selftests/bpf/prog_tests/perf_branches.c
+++ b/tools/testing/selftests/bpf/prog_tests/perf_branches.c
@@ -74,7 +74,7 @@ static void test_perf_branches_common(int perf_fd,
 
 	/* attach perf_event */
 	link = bpf_program__attach_perf_event(skel->progs.perf_branches, perf_fd);
-	if (CHECK(IS_ERR(link), "attach_perf_event", "err %ld\n", PTR_ERR(link)))
+	if (!ASSERT_OK_PTR(link, "attach_perf_event"))
 		goto out_destroy_skel;
 
 	/* generate some branches on cpu 0 */
@@ -119,7 +119,7 @@ static void test_perf_branches_hw(void)
 	 * Some setups don't support branch records (virtual machines, !x86),
 	 * so skip test in this case.
 	 */
-	if (pfd == -1) {
+	if (pfd < 0) {
 		if (errno == ENOENT || errno == EOPNOTSUPP) {
 			printf("%s:SKIP:no PERF_SAMPLE_BRANCH_STACK\n",
 			       __func__);
diff --git a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c
index ca9f0895ec84..6490e9673002 100644
--- a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c
+++ b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c
@@ -80,7 +80,7 @@ void test_perf_buffer(void)
 	pb_opts.sample_cb = on_sample;
 	pb_opts.ctx = &cpu_seen;
 	pb = perf_buffer__new(bpf_map__fd(skel->maps.perf_buf_map), 1, &pb_opts);
-	if (CHECK(IS_ERR(pb), "perf_buf__new", "err %ld\n", PTR_ERR(pb)))
+	if (!ASSERT_OK_PTR(pb, "perf_buf__new"))
 		goto out_close;
 
 	CHECK(perf_buffer__epoll_fd(pb) < 0, "epoll_fd",
diff --git a/tools/testing/selftests/bpf/prog_tests/perf_event_stackmap.c b/tools/testing/selftests/bpf/prog_tests/perf_event_stackmap.c
index 72c3690844fb..33144c9432ae 100644
--- a/tools/testing/selftests/bpf/prog_tests/perf_event_stackmap.c
+++ b/tools/testing/selftests/bpf/prog_tests/perf_event_stackmap.c
@@ -97,8 +97,7 @@ void test_perf_event_stackmap(void)
 
 	skel->links.oncpu = bpf_program__attach_perf_event(skel->progs.oncpu,
 							   pmu_fd);
-	if (CHECK(IS_ERR(skel->links.oncpu), "attach_perf_event",
-		  "err %ld\n", PTR_ERR(skel->links.oncpu))) {
+	if (!ASSERT_OK_PTR(skel->links.oncpu, "attach_perf_event")) {
 		close(pmu_fd);
 		goto cleanup;
 	}
diff --git a/tools/testing/selftests/bpf/prog_tests/probe_user.c b/tools/testing/selftests/bpf/prog_tests/probe_user.c
index 7aecfd9e87d1..95bd12097358 100644
--- a/tools/testing/selftests/bpf/prog_tests/probe_user.c
+++ b/tools/testing/selftests/bpf/prog_tests/probe_user.c
@@ -15,7 +15,7 @@ void test_probe_user(void)
 	static const int zero = 0;
 
 	obj = bpf_object__open_file(obj_file, &opts);
-	if (CHECK(IS_ERR(obj), "obj_open_file", "err %ld\n", PTR_ERR(obj)))
+	if (!ASSERT_OK_PTR(obj, "obj_open_file"))
 		return;
 
 	kprobe_prog = bpf_object__find_program_by_title(obj, prog_name);
@@ -33,11 +33,8 @@ void test_probe_user(void)
 		goto cleanup;
 
 	kprobe_link = bpf_program__attach(kprobe_prog);
-	if (CHECK(IS_ERR(kprobe_link), "attach_kprobe",
-		  "err %ld\n", PTR_ERR(kprobe_link))) {
-		kprobe_link = NULL;
+	if (!ASSERT_OK_PTR(kprobe_link, "attach_kprobe"))
 		goto cleanup;
-	}
 
 	memset(&curr, 0, sizeof(curr));
 	in->sin_family = AF_INET;
diff --git a/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c b/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c
index 131d7f7eeb42..89fc98faf19e 100644
--- a/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c
+++ b/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c
@@ -46,7 +46,7 @@ void test_prog_run_xattr(void)
 	tattr.prog_fd = bpf_program__fd(skel->progs.test_pkt_access);
 
 	err = bpf_prog_test_run_xattr(&tattr);
-	CHECK_ATTR(err != -1 || errno != ENOSPC || tattr.retval, "run",
+	CHECK_ATTR(err >= 0 || errno != ENOSPC || tattr.retval, "run",
 	      "err %d errno %d retval %d\n", err, errno, tattr.retval);
 
 	CHECK_ATTR(tattr.data_size_out != sizeof(pkt_v4), "data_size_out",
@@ -78,6 +78,6 @@ void test_prog_run_xattr(void)
 cleanup:
 	if (skel)
 		test_pkt_access__destroy(skel);
-	if (stats_fd != -1)
+	if (stats_fd >= 0)
 		close(stats_fd);
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/raw_tp_test_run.c b/tools/testing/selftests/bpf/prog_tests/raw_tp_test_run.c
index c5fb191874ac..41720a62c4fa 100644
--- a/tools/testing/selftests/bpf/prog_tests/raw_tp_test_run.c
+++ b/tools/testing/selftests/bpf/prog_tests/raw_tp_test_run.c
@@ -77,7 +77,7 @@ void test_raw_tp_test_run(void)
 	/* invalid cpu ID should fail with ENXIO */
 	opts.cpu = 0xffffffff;
 	err = bpf_prog_test_run_opts(prog_fd, &opts);
-	CHECK(err != -1 || errno != ENXIO,
+	CHECK(err >= 0 || errno != ENXIO,
 	      "test_run_opts_fail",
 	      "should failed with ENXIO\n");
 
@@ -85,7 +85,7 @@ void test_raw_tp_test_run(void)
 	opts.cpu = 1;
 	opts.flags = 0;
 	err = bpf_prog_test_run_opts(prog_fd, &opts);
-	CHECK(err != -1 || errno != EINVAL,
+	CHECK(err >= 0 || errno != EINVAL,
 	      "test_run_opts_fail",
 	      "should failed with EINVAL\n");
 
diff --git a/tools/testing/selftests/bpf/prog_tests/rdonly_maps.c b/tools/testing/selftests/bpf/prog_tests/rdonly_maps.c
index 563e12120e77..5f9eaa3ab584 100644
--- a/tools/testing/selftests/bpf/prog_tests/rdonly_maps.c
+++ b/tools/testing/selftests/bpf/prog_tests/rdonly_maps.c
@@ -30,7 +30,7 @@ void test_rdonly_maps(void)
 	struct bss bss;
 
 	obj = bpf_object__open_file(file, NULL);
-	if (CHECK(IS_ERR(obj), "obj_open", "err %ld\n", PTR_ERR(obj)))
+	if (!ASSERT_OK_PTR(obj, "obj_open"))
 		return;
 
 	err = bpf_object__load(obj);
@@ -58,11 +58,8 @@ void test_rdonly_maps(void)
 			goto cleanup;
 
 		link = bpf_program__attach_raw_tracepoint(prog, "sys_enter");
-		if (CHECK(IS_ERR(link), "attach_prog", "prog '%s', err %ld\n",
-			  t->prog_name, PTR_ERR(link))) {
-			link = NULL;
+		if (!ASSERT_OK_PTR(link, "attach_prog"))
 			goto cleanup;
-		}
 
 		/* trigger probe */
 		usleep(1);
diff --git a/tools/testing/selftests/bpf/prog_tests/reference_tracking.c b/tools/testing/selftests/bpf/prog_tests/reference_tracking.c
index ac1ee10cffd8..de2688166696 100644
--- a/tools/testing/selftests/bpf/prog_tests/reference_tracking.c
+++ b/tools/testing/selftests/bpf/prog_tests/reference_tracking.c
@@ -15,7 +15,7 @@ void test_reference_tracking(void)
 	int err = 0;
 
 	obj = bpf_object__open_file(file, &open_opts);
-	if (CHECK_FAIL(IS_ERR(obj)))
+	if (!ASSERT_OK_PTR(obj, "obj_open_file"))
 		return;
 
 	if (CHECK(strcmp(bpf_object__name(obj), obj_name), "obj_name",
diff --git a/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c b/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c
index d3c2de2c24d1..f62361306f6d 100644
--- a/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c
+++ b/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c
@@ -76,7 +76,7 @@ __resolve_symbol(struct btf *btf, int type_id)
 	}
 
 	for (i = 0; i < ARRAY_SIZE(test_symbols); i++) {
-		if (test_symbols[i].id != -1)
+		if (test_symbols[i].id >= 0)
 			continue;
 
 		if (BTF_INFO_KIND(type->info) != test_symbols[i].type)
diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c
index cef63e703924..167cd8a2edfd 100644
--- a/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c
+++ b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c
@@ -63,7 +63,7 @@ void test_ringbuf_multi(void)
 		goto cleanup;
 
 	proto_fd = bpf_create_map(BPF_MAP_TYPE_RINGBUF, 0, 0, page_size, 0);
-	if (CHECK(proto_fd == -1, "bpf_create_map", "bpf_create_map failed\n"))
+	if (CHECK(proto_fd < 0, "bpf_create_map", "bpf_create_map failed\n"))
 		goto cleanup;
 
 	err = bpf_map__set_inner_map_fd(skel->maps.ringbuf_hash, proto_fd);
diff --git a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c
index 821b4146b7b6..4efd337d6a3c 100644
--- a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c
+++ b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c
@@ -78,7 +78,7 @@ static int create_maps(enum bpf_map_type inner_type)
 	attr.max_entries = REUSEPORT_ARRAY_SIZE;
 
 	reuseport_array = bpf_create_map_xattr(&attr);
-	RET_ERR(reuseport_array == -1, "creating reuseport_array",
+	RET_ERR(reuseport_array < 0, "creating reuseport_array",
 		"reuseport_array:%d errno:%d\n", reuseport_array, errno);
 
 	/* Creating outer_map */
@@ -89,7 +89,7 @@ static int create_maps(enum bpf_map_type inner_type)
 	attr.max_entries = 1;
 	attr.inner_map_fd = reuseport_array;
 	outer_map = bpf_create_map_xattr(&attr);
-	RET_ERR(outer_map == -1, "creating outer_map",
+	RET_ERR(outer_map < 0, "creating outer_map",
 		"outer_map:%d errno:%d\n", outer_map, errno);
 
 	return 0;
@@ -102,8 +102,9 @@ static int prepare_bpf_obj(void)
 	int err;
 
 	obj = bpf_object__open("test_select_reuseport_kern.o");
-	RET_ERR(IS_ERR_OR_NULL(obj), "open test_select_reuseport_kern.o",
-		"obj:%p PTR_ERR(obj):%ld\n", obj, PTR_ERR(obj));
+	err = libbpf_get_error(obj);
+	RET_ERR(err, "open test_select_reuseport_kern.o",
+		"obj:%p PTR_ERR(obj):%d\n", obj, err);
 
 	map = bpf_object__find_map_by_name(obj, "outer_map");
 	RET_ERR(!map, "find outer_map", "!map\n");
@@ -116,31 +117,31 @@ static int prepare_bpf_obj(void)
 	prog = bpf_program__next(NULL, obj);
 	RET_ERR(!prog, "get first bpf_program", "!prog\n");
 	select_by_skb_data_prog = bpf_program__fd(prog);
-	RET_ERR(select_by_skb_data_prog == -1, "get prog fd",
+	RET_ERR(select_by_skb_data_prog < 0, "get prog fd",
 		"select_by_skb_data_prog:%d\n", select_by_skb_data_prog);
 
 	map = bpf_object__find_map_by_name(obj, "result_map");
 	RET_ERR(!map, "find result_map", "!map\n");
 	result_map = bpf_map__fd(map);
-	RET_ERR(result_map == -1, "get result_map fd",
+	RET_ERR(result_map < 0, "get result_map fd",
 		"result_map:%d\n", result_map);
 
 	map = bpf_object__find_map_by_name(obj, "tmp_index_ovr_map");
 	RET_ERR(!map, "find tmp_index_ovr_map\n", "!map");
 	tmp_index_ovr_map = bpf_map__fd(map);
-	RET_ERR(tmp_index_ovr_map == -1, "get tmp_index_ovr_map fd",
+	RET_ERR(tmp_index_ovr_map < 0, "get tmp_index_ovr_map fd",
 		"tmp_index_ovr_map:%d\n", tmp_index_ovr_map);
 
 	map = bpf_object__find_map_by_name(obj, "linum_map");
 	RET_ERR(!map, "find linum_map", "!map\n");
 	linum_map = bpf_map__fd(map);
-	RET_ERR(linum_map == -1, "get linum_map fd",
+	RET_ERR(linum_map < 0, "get linum_map fd",
 		"linum_map:%d\n", linum_map);
 
 	map = bpf_object__find_map_by_name(obj, "data_check_map");
 	RET_ERR(!map, "find data_check_map", "!map\n");
 	data_check_map = bpf_map__fd(map);
-	RET_ERR(data_check_map == -1, "get data_check_map fd",
+	RET_ERR(data_check_map < 0, "get data_check_map fd",
 		"data_check_map:%d\n", data_check_map);
 
 	return 0;
@@ -237,7 +238,7 @@ static long get_linum(void)
 	int err;
 
 	err = bpf_map_lookup_elem(linum_map, &index_zero, &linum);
-	RET_ERR(err == -1, "lookup_elem(linum_map)", "err:%d errno:%d\n",
+	RET_ERR(err < 0, "lookup_elem(linum_map)", "err:%d errno:%d\n",
 		err, errno);
 
 	return linum;
@@ -254,11 +255,11 @@ static void check_data(int type, sa_family_t family, const struct cmd *cmd,
 	addrlen = sizeof(cli_sa);
 	err = getsockname(cli_fd, (struct sockaddr *)&cli_sa,
 			  &addrlen);
-	RET_IF(err == -1, "getsockname(cli_fd)", "err:%d errno:%d\n",
+	RET_IF(err < 0, "getsockname(cli_fd)", "err:%d errno:%d\n",
 	       err, errno);
 
 	err = bpf_map_lookup_elem(data_check_map, &index_zero, &result);
-	RET_IF(err == -1, "lookup_elem(data_check_map)", "err:%d errno:%d\n",
+	RET_IF(err < 0, "lookup_elem(data_check_map)", "err:%d errno:%d\n",
 	       err, errno);
 
 	if (type == SOCK_STREAM) {
@@ -347,7 +348,7 @@ static void check_results(void)
 
 	for (i = 0; i < NR_RESULTS; i++) {
 		err = bpf_map_lookup_elem(result_map, &i, &results[i]);
-		RET_IF(err == -1, "lookup_elem(result_map)",
+		RET_IF(err < 0, "lookup_elem(result_map)",
 		       "i:%u err:%d errno:%d\n", i, err, errno);
 	}
 
@@ -524,12 +525,12 @@ static void test_syncookie(int type, sa_family_t family)
 	 */
 	err = bpf_map_update_elem(tmp_index_ovr_map, &index_zero,
 				  &tmp_index, BPF_ANY);
-	RET_IF(err == -1, "update_elem(tmp_index_ovr_map, 0, 1)",
+	RET_IF(err < 0, "update_elem(tmp_index_ovr_map, 0, 1)",
 	       "err:%d errno:%d\n", err, errno);
 	do_test(type, family, &cmd, PASS);
 	err = bpf_map_lookup_elem(tmp_index_ovr_map, &index_zero,
 				  &tmp_index);
-	RET_IF(err == -1 || tmp_index != -1,
+	RET_IF(err < 0 || tmp_index >= 0,
 	       "lookup_elem(tmp_index_ovr_map)",
 	       "err:%d errno:%d tmp_index:%d\n",
 	       err, errno, tmp_index);
@@ -569,7 +570,7 @@ static void test_detach_bpf(int type, sa_family_t family)
 
 	for (i = 0; i < NR_RESULTS; i++) {
 		err = bpf_map_lookup_elem(result_map, &i, &tmp);
-		RET_IF(err == -1, "lookup_elem(result_map)",
+		RET_IF(err < 0, "lookup_elem(result_map)",
 		       "i:%u err:%d errno:%d\n", i, err, errno);
 		nr_run_before += tmp;
 	}
@@ -584,7 +585,7 @@ static void test_detach_bpf(int type, sa_family_t family)
 
 	for (i = 0; i < NR_RESULTS; i++) {
 		err = bpf_map_lookup_elem(result_map, &i, &tmp);
-		RET_IF(err == -1, "lookup_elem(result_map)",
+		RET_IF(err < 0, "lookup_elem(result_map)",
 		       "i:%u err:%d errno:%d\n", i, err, errno);
 		nr_run_after += tmp;
 	}
@@ -632,24 +633,24 @@ static void prepare_sk_fds(int type, sa_family_t family, bool inany)
 					 SO_ATTACH_REUSEPORT_EBPF,
 					 &select_by_skb_data_prog,
 					 sizeof(select_by_skb_data_prog));
-			RET_IF(err == -1, "setsockopt(SO_ATTACH_REUEPORT_EBPF)",
+			RET_IF(err < 0, "setsockopt(SO_ATTACH_REUEPORT_EBPF)",
 			       "err:%d errno:%d\n", err, errno);
 		}
 
 		err = bind(sk_fds[i], (struct sockaddr *)&srv_sa, addrlen);
-		RET_IF(err == -1, "bind()", "sk_fds[%d] err:%d errno:%d\n",
+		RET_IF(err < 0, "bind()", "sk_fds[%d] err:%d errno:%d\n",
 		       i, err, errno);
 
 		if (type == SOCK_STREAM) {
 			err = listen(sk_fds[i], 10);
-			RET_IF(err == -1, "listen()",
+			RET_IF(err < 0, "listen()",
 			       "sk_fds[%d] err:%d errno:%d\n",
 			       i, err, errno);
 		}
 
 		err = bpf_map_update_elem(reuseport_array, &i, &sk_fds[i],
 					  BPF_NOEXIST);
-		RET_IF(err == -1, "update_elem(reuseport_array)",
+		RET_IF(err < 0, "update_elem(reuseport_array)",
 		       "sk_fds[%d] err:%d errno:%d\n", i, err, errno);
 
 		if (i == first) {
@@ -682,7 +683,7 @@ static void setup_per_test(int type, sa_family_t family, bool inany,
 	prepare_sk_fds(type, family, inany);
 	err = bpf_map_update_elem(tmp_index_ovr_map, &index_zero, &ovr,
 				  BPF_ANY);
-	RET_IF(err == -1, "update_elem(tmp_index_ovr_map, 0, -1)",
+	RET_IF(err < 0, "update_elem(tmp_index_ovr_map, 0, -1)",
 	       "err:%d errno:%d\n", err, errno);
 
 	/* Install reuseport_array to outer_map? */
@@ -691,7 +692,7 @@ static void setup_per_test(int type, sa_family_t family, bool inany,
 
 	err = bpf_map_update_elem(outer_map, &index_zero, &reuseport_array,
 				  BPF_ANY);
-	RET_IF(err == -1, "update_elem(outer_map, 0, reuseport_array)",
+	RET_IF(err < 0, "update_elem(outer_map, 0, reuseport_array)",
 	       "err:%d errno:%d\n", err, errno);
 }
 
@@ -720,18 +721,18 @@ static void cleanup_per_test(bool no_inner_map)
 		return;
 
 	err = bpf_map_delete_elem(outer_map, &index_zero);
-	RET_IF(err == -1, "delete_elem(outer_map)",
+	RET_IF(err < 0, "delete_elem(outer_map)",
 	       "err:%d errno:%d\n", err, errno);
 }
 
 static void cleanup(void)
 {
-	if (outer_map != -1) {
+	if (outer_map >= 0) {
 		close(outer_map);
 		outer_map = -1;
 	}
 
-	if (reuseport_array != -1) {
+	if (reuseport_array >= 0) {
 		close(reuseport_array);
 		reuseport_array = -1;
 	}
diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c
index a1eade51d440..023cc532992d 100644
--- a/tools/testing/selftests/bpf/prog_tests/send_signal.c
+++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c
@@ -91,8 +91,7 @@ static void test_send_signal_common(struct perf_event_attr *attr,
 
 		skel->links.send_signal_perf =
 			bpf_program__attach_perf_event(skel->progs.send_signal_perf, pmu_fd);
-		if (CHECK(IS_ERR(skel->links.send_signal_perf), "attach_perf_event",
-			  "err %ld\n", PTR_ERR(skel->links.send_signal_perf)))
+		if (!ASSERT_OK_PTR(skel->links.send_signal_perf, "attach_perf_event"))
 			goto disable_pmu;
 	}
 
diff --git a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c
index 45c82db3c58c..aee41547e7f4 100644
--- a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c
+++ b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c
@@ -480,7 +480,7 @@ static struct bpf_link *attach_lookup_prog(struct bpf_program *prog)
 	}
 
 	link = bpf_program__attach_netns(prog, net_fd);
-	if (CHECK(IS_ERR(link), "bpf_program__attach_netns", "failed\n")) {
+	if (!ASSERT_OK_PTR(link, "bpf_program__attach_netns")) {
 		errno = -PTR_ERR(link);
 		log_err("failed to attach program '%s' to netns",
 			bpf_program__name(prog));
diff --git a/tools/testing/selftests/bpf/prog_tests/sock_fields.c b/tools/testing/selftests/bpf/prog_tests/sock_fields.c
index af87118e748e..577d619fb07e 100644
--- a/tools/testing/selftests/bpf/prog_tests/sock_fields.c
+++ b/tools/testing/selftests/bpf/prog_tests/sock_fields.c
@@ -97,12 +97,12 @@ static void check_result(void)
 
 	err = bpf_map_lookup_elem(linum_map_fd, &egress_linum_idx,
 				  &egress_linum);
-	CHECK(err == -1, "bpf_map_lookup_elem(linum_map_fd)",
+	CHECK(err < 0, "bpf_map_lookup_elem(linum_map_fd)",
 	      "err:%d errno:%d\n", err, errno);
 
 	err = bpf_map_lookup_elem(linum_map_fd, &ingress_linum_idx,
 				  &ingress_linum);
-	CHECK(err == -1, "bpf_map_lookup_elem(linum_map_fd)",
+	CHECK(err < 0, "bpf_map_lookup_elem(linum_map_fd)",
 	      "err:%d errno:%d\n", err, errno);
 
 	memcpy(&srv_sk, &skel->bss->srv_sk, sizeof(srv_sk));
@@ -355,14 +355,12 @@ void test_sock_fields(void)
 
 	egress_link = bpf_program__attach_cgroup(skel->progs.egress_read_sock_fields,
 						 child_cg_fd);
-	if (CHECK(IS_ERR(egress_link), "attach_cgroup(egress)", "err:%ld\n",
-		  PTR_ERR(egress_link)))
+	if (!ASSERT_OK_PTR(egress_link, "attach_cgroup(egress)"))
 		goto done;
 
 	ingress_link = bpf_program__attach_cgroup(skel->progs.ingress_read_sock_fields,
 						  child_cg_fd);
-	if (CHECK(IS_ERR(ingress_link), "attach_cgroup(ingress)", "err:%ld\n",
-		  PTR_ERR(ingress_link)))
+	if (!ASSERT_OK_PTR(ingress_link, "attach_cgroup(ingress)"))
 		goto done;
 
 	linum_map_fd = bpf_map__fd(skel->maps.linum_map);
@@ -375,8 +373,8 @@ done:
 	bpf_link__destroy(egress_link);
 	bpf_link__destroy(ingress_link);
 	test_sock_fields__destroy(skel);
-	if (child_cg_fd != -1)
+	if (child_cg_fd >= 0)
 		close(child_cg_fd);
-	if (parent_cg_fd != -1)
+	if (parent_cg_fd >= 0)
 		close(parent_cg_fd);
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
index ab77596b64e3..1352ec104149 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
@@ -88,11 +88,11 @@ static void test_sockmap_create_update_free(enum bpf_map_type map_type)
 	int s, map, err;
 
 	s = connected_socket_v4();
-	if (CHECK_FAIL(s == -1))
+	if (CHECK_FAIL(s < 0))
 		return;
 
 	map = bpf_create_map(map_type, sizeof(int), sizeof(int), 1, 0);
-	if (CHECK_FAIL(map == -1)) {
+	if (CHECK_FAIL(map < 0)) {
 		perror("bpf_create_map");
 		goto out;
 	}
@@ -245,7 +245,7 @@ static void test_sockmap_copy(enum bpf_map_type map_type)
 	opts.link_info = &linfo;
 	opts.link_info_len = sizeof(linfo);
 	link = bpf_program__attach_iter(skel->progs.copy, &opts);
-	if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n"))
+	if (!ASSERT_OK_PTR(link, "attach_iter"))
 		goto out;
 
 	iter_fd = bpf_iter_create(bpf_link__fd(link));
@@ -304,7 +304,7 @@ static void test_sockmap_skb_verdict_attach(enum bpf_attach_type first,
 	}
 
 	err = bpf_prog_attach(verdict, map, second, 0);
-	assert(err == -1 && errno == EBUSY);
+	ASSERT_EQ(err, -EBUSY, "prog_attach_fail");
 
 	err = bpf_prog_detach2(verdict, map, first);
 	if (CHECK_FAIL(err)) {
diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c b/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c
index 06b86addc181..7a0d64fdc192 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c
@@ -98,7 +98,7 @@ static void run_tests(int family, enum bpf_map_type map_type)
 	int map;
 
 	map = bpf_create_map(map_type, sizeof(int), sizeof(int), 1, 0);
-	if (CHECK_FAIL(map == -1)) {
+	if (CHECK_FAIL(map < 0)) {
 		perror("bpf_map_create");
 		return;
 	}
diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
index 648d9ae898d2..0f066b89b4af 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
@@ -139,7 +139,7 @@
 #define xbpf_map_delete_elem(fd, key)                                          \
 	({                                                                     \
 		int __ret = bpf_map_delete_elem((fd), (key));                  \
-		if (__ret == -1)                                               \
+		if (__ret < 0)                                               \
 			FAIL_ERRNO("map_delete");                              \
 		__ret;                                                         \
 	})
@@ -147,7 +147,7 @@
 #define xbpf_map_lookup_elem(fd, key, val)                                     \
 	({                                                                     \
 		int __ret = bpf_map_lookup_elem((fd), (key), (val));           \
-		if (__ret == -1)                                               \
+		if (__ret < 0)                                               \
 			FAIL_ERRNO("map_lookup");                              \
 		__ret;                                                         \
 	})
@@ -155,7 +155,7 @@
 #define xbpf_map_update_elem(fd, key, val, flags)                              \
 	({                                                                     \
 		int __ret = bpf_map_update_elem((fd), (key), (val), (flags));  \
-		if (__ret == -1)                                               \
+		if (__ret < 0)                                               \
 			FAIL_ERRNO("map_update");                              \
 		__ret;                                                         \
 	})
@@ -164,7 +164,7 @@
 	({                                                                     \
 		int __ret =                                                    \
 			bpf_prog_attach((prog), (target), (type), (flags));    \
-		if (__ret == -1)                                               \
+		if (__ret < 0)                                               \
 			FAIL_ERRNO("prog_attach(" #type ")");                  \
 		__ret;                                                         \
 	})
@@ -172,7 +172,7 @@
 #define xbpf_prog_detach2(prog, target, type)                                  \
 	({                                                                     \
 		int __ret = bpf_prog_detach2((prog), (target), (type));        \
-		if (__ret == -1)                                               \
+		if (__ret < 0)                                               \
 			FAIL_ERRNO("prog_detach2(" #type ")");                 \
 		__ret;                                                         \
 	})
diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c
index 11a769e18f5d..0a91d8d9954b 100644
--- a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c
+++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c
@@ -62,8 +62,7 @@ retry:
 
 	skel->links.oncpu = bpf_program__attach_perf_event(skel->progs.oncpu,
 							   pmu_fd);
-	if (CHECK(IS_ERR(skel->links.oncpu), "attach_perf_event",
-		  "err %ld\n", PTR_ERR(skel->links.oncpu))) {
+	if (!ASSERT_OK_PTR(skel->links.oncpu, "attach_perf_event")) {
 		close(pmu_fd);
 		goto cleanup;
 	}
diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c
index 37269d23df93..04b476bd62b9 100644
--- a/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c
+++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c
@@ -21,7 +21,7 @@ void test_stacktrace_map(void)
 		goto close_prog;
 
 	link = bpf_program__attach_tracepoint(prog, "sched", "sched_switch");
-	if (CHECK(IS_ERR(link), "attach_tp", "err %ld\n", PTR_ERR(link)))
+	if (!ASSERT_OK_PTR(link, "attach_tp"))
 		goto close_prog;
 
 	/* find map fds */
diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c
index 404a5498e1a3..4fd30bb651ad 100644
--- a/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c
+++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c
@@ -21,7 +21,7 @@ void test_stacktrace_map_raw_tp(void)
 		goto close_prog;
 
 	link = bpf_program__attach_raw_tracepoint(prog, "sched_switch");
-	if (CHECK(IS_ERR(link), "attach_raw_tp", "err %ld\n", PTR_ERR(link)))
+	if (!ASSERT_OK_PTR(link, "attach_raw_tp"))
 		goto close_prog;
 
 	/* find map fds */
@@ -59,7 +59,6 @@ void test_stacktrace_map_raw_tp(void)
 		goto close_prog;
 
 close_prog:
-	if (!IS_ERR_OR_NULL(link))
-		bpf_link__destroy(link);
+	bpf_link__destroy(link);
 	bpf_object__close(obj);
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c
index 08d19cafd5e8..1fa772079967 100644
--- a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c
+++ b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c
@@ -353,8 +353,7 @@ static void fastopen_estab(void)
 		return;
 
 	link = bpf_program__attach_cgroup(skel->progs.estab, cg_fd);
-	if (CHECK(IS_ERR(link), "attach_cgroup(estab)", "err: %ld\n",
-		  PTR_ERR(link)))
+	if (!ASSERT_OK_PTR(link, "attach_cgroup(estab)"))
 		return;
 
 	if (sk_fds_connect(&sk_fds, true)) {
@@ -398,8 +397,7 @@ static void syncookie_estab(void)
 		return;
 
 	link = bpf_program__attach_cgroup(skel->progs.estab, cg_fd);
-	if (CHECK(IS_ERR(link), "attach_cgroup(estab)", "err: %ld\n",
-		  PTR_ERR(link)))
+	if (!ASSERT_OK_PTR(link, "attach_cgroup(estab)"))
 		return;
 
 	if (sk_fds_connect(&sk_fds, false)) {
@@ -431,8 +429,7 @@ static void fin(void)
 		return;
 
 	link = bpf_program__attach_cgroup(skel->progs.estab, cg_fd);
-	if (CHECK(IS_ERR(link), "attach_cgroup(estab)", "err: %ld\n",
-		  PTR_ERR(link)))
+	if (!ASSERT_OK_PTR(link, "attach_cgroup(estab)"))
 		return;
 
 	if (sk_fds_connect(&sk_fds, false)) {
@@ -471,8 +468,7 @@ static void __simple_estab(bool exprm)
 		return;
 
 	link = bpf_program__attach_cgroup(skel->progs.estab, cg_fd);
-	if (CHECK(IS_ERR(link), "attach_cgroup(estab)", "err: %ld\n",
-		  PTR_ERR(link)))
+	if (!ASSERT_OK_PTR(link, "attach_cgroup(estab)"))
 		return;
 
 	if (sk_fds_connect(&sk_fds, false)) {
@@ -509,8 +505,7 @@ static void misc(void)
 		return;
 
 	link = bpf_program__attach_cgroup(misc_skel->progs.misc_estab, cg_fd);
-	if (CHECK(IS_ERR(link), "attach_cgroup(misc_estab)", "err: %ld\n",
-		  PTR_ERR(link)))
+	if (!ASSERT_OK_PTR(link, "attach_cgroup(misc_estab)"))
 		return;
 
 	if (sk_fds_connect(&sk_fds, false)) {
diff --git a/tools/testing/selftests/bpf/prog_tests/test_overhead.c b/tools/testing/selftests/bpf/prog_tests/test_overhead.c
index 9966685866fd..123c68c1917d 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_overhead.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_overhead.c
@@ -73,7 +73,7 @@ void test_test_overhead(void)
 		return;
 
 	obj = bpf_object__open_file("./test_overhead.o", NULL);
-	if (CHECK(IS_ERR(obj), "obj_open_file", "err %ld\n", PTR_ERR(obj)))
+	if (!ASSERT_OK_PTR(obj, "obj_open_file"))
 		return;
 
 	kprobe_prog = bpf_object__find_program_by_title(obj, kprobe_name);
@@ -108,7 +108,7 @@ void test_test_overhead(void)
 	/* attach kprobe */
 	link = bpf_program__attach_kprobe(kprobe_prog, false /* retprobe */,
 					  kprobe_func);
-	if (CHECK(IS_ERR(link), "attach_kprobe", "err %ld\n", PTR_ERR(link)))
+	if (!ASSERT_OK_PTR(link, "attach_kprobe"))
 		goto cleanup;
 	test_run("kprobe");
 	bpf_link__destroy(link);
@@ -116,28 +116,28 @@ void test_test_overhead(void)
 	/* attach kretprobe */
 	link = bpf_program__attach_kprobe(kretprobe_prog, true /* retprobe */,
 					  kprobe_func);
-	if (CHECK(IS_ERR(link), "attach kretprobe", "err %ld\n", PTR_ERR(link)))
+	if (!ASSERT_OK_PTR(link, "attach_kretprobe"))
 		goto cleanup;
 	test_run("kretprobe");
 	bpf_link__destroy(link);
 
 	/* attach raw_tp */
 	link = bpf_program__attach_raw_tracepoint(raw_tp_prog, "task_rename");
-	if (CHECK(IS_ERR(link), "attach fentry", "err %ld\n", PTR_ERR(link)))
+	if (!ASSERT_OK_PTR(link, "attach_raw_tp"))
 		goto cleanup;
 	test_run("raw_tp");
 	bpf_link__destroy(link);
 
 	/* attach fentry */
 	link = bpf_program__attach_trace(fentry_prog);
-	if (CHECK(IS_ERR(link), "attach fentry", "err %ld\n", PTR_ERR(link)))
+	if (!ASSERT_OK_PTR(link, "attach_fentry"))
 		goto cleanup;
 	test_run("fentry");
 	bpf_link__destroy(link);
 
 	/* attach fexit */
 	link = bpf_program__attach_trace(fexit_prog);
-	if (CHECK(IS_ERR(link), "attach fexit", "err %ld\n", PTR_ERR(link)))
+	if (!ASSERT_OK_PTR(link, "attach_fexit"))
 		goto cleanup;
 	test_run("fexit");
 	bpf_link__destroy(link);
diff --git a/tools/testing/selftests/bpf/prog_tests/trampoline_count.c b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c
index f3022d934e2d..d7f5a931d7f3 100644
--- a/tools/testing/selftests/bpf/prog_tests/trampoline_count.c
+++ b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c
@@ -55,7 +55,7 @@ void test_trampoline_count(void)
 	/* attach 'allowed' trampoline programs */
 	for (i = 0; i < MAX_TRAMP_PROGS; i++) {
 		obj = bpf_object__open_file(object, NULL);
-		if (CHECK(IS_ERR(obj), "obj_open_file", "err %ld\n", PTR_ERR(obj))) {
+		if (!ASSERT_OK_PTR(obj, "obj_open_file")) {
 			obj = NULL;
 			goto cleanup;
 		}
@@ -68,14 +68,14 @@ void test_trampoline_count(void)
 
 		if (rand() % 2) {
 			link = load(inst[i].obj, fentry_name);
-			if (CHECK(IS_ERR(link), "attach prog", "err %ld\n", PTR_ERR(link))) {
+			if (!ASSERT_OK_PTR(link, "attach_prog")) {
 				link = NULL;
 				goto cleanup;
 			}
 			inst[i].link_fentry = link;
 		} else {
 			link = load(inst[i].obj, fexit_name);
-			if (CHECK(IS_ERR(link), "attach prog", "err %ld\n", PTR_ERR(link))) {
+			if (!ASSERT_OK_PTR(link, "attach_prog")) {
 				link = NULL;
 				goto cleanup;
 			}
@@ -85,7 +85,7 @@ void test_trampoline_count(void)
 
 	/* and try 1 extra.. */
 	obj = bpf_object__open_file(object, NULL);
-	if (CHECK(IS_ERR(obj), "obj_open_file", "err %ld\n", PTR_ERR(obj))) {
+	if (!ASSERT_OK_PTR(obj, "obj_open_file")) {
 		obj = NULL;
 		goto cleanup;
 	}
@@ -96,13 +96,15 @@ void test_trampoline_count(void)
 
 	/* ..that needs to fail */
 	link = load(obj, fentry_name);
-	if (CHECK(!IS_ERR(link), "cannot attach over the limit", "err %ld\n", PTR_ERR(link))) {
+	err = libbpf_get_error(link);
+	if (!ASSERT_ERR_PTR(link, "cannot attach over the limit")) {
 		bpf_link__destroy(link);
 		goto cleanup_extra;
 	}
 
 	/* with E2BIG error */
-	CHECK(PTR_ERR(link) != -E2BIG, "proper error check", "err %ld\n", PTR_ERR(link));
+	ASSERT_EQ(err, -E2BIG, "proper error check");
+	ASSERT_EQ(link, NULL, "ptr_is_null");
 
 	/* and finaly execute the probe */
 	if (CHECK_FAIL(prctl(PR_GET_NAME, comm, 0L, 0L, 0L)))
diff --git a/tools/testing/selftests/bpf/prog_tests/udp_limit.c b/tools/testing/selftests/bpf/prog_tests/udp_limit.c
index 2aba09d4d01b..56c9d6bd38a3 100644
--- a/tools/testing/selftests/bpf/prog_tests/udp_limit.c
+++ b/tools/testing/selftests/bpf/prog_tests/udp_limit.c
@@ -22,11 +22,10 @@ void test_udp_limit(void)
 		goto close_cgroup_fd;
 
 	skel->links.sock = bpf_program__attach_cgroup(skel->progs.sock, cgroup_fd);
+	if (!ASSERT_OK_PTR(skel->links.sock, "cg_attach_sock"))
+		goto close_skeleton;
 	skel->links.sock_release = bpf_program__attach_cgroup(skel->progs.sock_release, cgroup_fd);
-	if (CHECK(IS_ERR(skel->links.sock) || IS_ERR(skel->links.sock_release),
-		  "cg-attach", "sock %ld sock_release %ld",
-		  PTR_ERR(skel->links.sock),
-		  PTR_ERR(skel->links.sock_release)))
+	if (!ASSERT_OK_PTR(skel->links.sock_release, "cg_attach_sock_release"))
 		goto close_skeleton;
 
 	/* BPF program enforces a single UDP socket per cgroup,
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c
index 2c6c570b21f8..3bd5904b4db5 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c
@@ -90,7 +90,7 @@ void test_xdp_bpf2bpf(void)
 	pb_opts.ctx = &passed;
 	pb = perf_buffer__new(bpf_map__fd(ftrace_skel->maps.perf_buf_map),
 			      1, &pb_opts);
-	if (CHECK(IS_ERR(pb), "perf_buf__new", "err %ld\n", PTR_ERR(pb)))
+	if (!ASSERT_OK_PTR(pb, "perf_buf__new"))
 		goto out;
 
 	/* Run test program */
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_link.c b/tools/testing/selftests/bpf/prog_tests/xdp_link.c
index 6f814999b395..46eed0a33c23 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_link.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_link.c
@@ -51,7 +51,7 @@ void test_xdp_link(void)
 
 	/* BPF link is not allowed to replace prog attachment */
 	link = bpf_program__attach_xdp(skel1->progs.xdp_handler, IFINDEX_LO);
-	if (CHECK(!IS_ERR(link), "link_attach_fail", "unexpected success\n")) {
+	if (!ASSERT_ERR_PTR(link, "link_attach_should_fail")) {
 		bpf_link__destroy(link);
 		/* best-effort detach prog */
 		opts.old_fd = prog_fd1;
@@ -67,7 +67,7 @@ void test_xdp_link(void)
 
 	/* now BPF link should attach successfully */
 	link = bpf_program__attach_xdp(skel1->progs.xdp_handler, IFINDEX_LO);
-	if (CHECK(IS_ERR(link), "link_attach", "failed: %ld\n", PTR_ERR(link)))
+	if (!ASSERT_OK_PTR(link, "link_attach"))
 		goto cleanup;
 	skel1->links.xdp_handler = link;
 
@@ -95,7 +95,7 @@ void test_xdp_link(void)
 
 	/* BPF link is not allowed to replace another BPF link */
 	link = bpf_program__attach_xdp(skel2->progs.xdp_handler, IFINDEX_LO);
-	if (CHECK(!IS_ERR(link), "link_attach_fail", "unexpected success\n")) {
+	if (!ASSERT_ERR_PTR(link, "link_attach_should_fail")) {
 		bpf_link__destroy(link);
 		goto cleanup;
 	}
@@ -105,7 +105,7 @@ void test_xdp_link(void)
 
 	/* new link attach should succeed */
 	link = bpf_program__attach_xdp(skel2->progs.xdp_handler, IFINDEX_LO);
-	if (CHECK(IS_ERR(link), "link_attach", "failed: %ld\n", PTR_ERR(link)))
+	if (!ASSERT_OK_PTR(link, "link_attach"))
 		goto cleanup;
 	skel2->links.xdp_handler = link;
 
diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c
index 8410a730c82f..30cbf5d98f7d 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -53,12 +53,12 @@ static void test_hashmap(unsigned int task, void *data)
 
 	value = 0;
 	/* BPF_NOEXIST means add new element if it doesn't exist. */
-	assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) == -1 &&
+	assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) < 0 &&
 	       /* key=1 already exists. */
 	       errno == EEXIST);
 
 	/* -1 is an invalid flag. */
-	assert(bpf_map_update_elem(fd, &key, &value, -1) == -1 &&
+	assert(bpf_map_update_elem(fd, &key, &value, -1) < 0 &&
 	       errno == EINVAL);
 
 	/* Check that key=1 can be found. */
@@ -73,10 +73,10 @@ static void test_hashmap(unsigned int task, void *data)
 	assert(bpf_map_lookup_and_delete_elem(fd, &key, &value) == 0 && value == 1234);
 
 	/* Check that key=2 is not found. */
-	assert(bpf_map_lookup_elem(fd, &key, &value) == -1 && errno == ENOENT);
+	assert(bpf_map_lookup_elem(fd, &key, &value) < 0 && errno == ENOENT);
 
 	/* BPF_EXIST means update existing element. */
-	assert(bpf_map_update_elem(fd, &key, &value, BPF_EXIST) == -1 &&
+	assert(bpf_map_update_elem(fd, &key, &value, BPF_EXIST) < 0 &&
 	       /* key=2 is not there. */
 	       errno == ENOENT);
 
@@ -87,7 +87,7 @@ static void test_hashmap(unsigned int task, void *data)
 	 * inserted due to max_entries limit.
 	 */
 	key = 0;
-	assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) == -1 &&
+	assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) < 0 &&
 	       errno == E2BIG);
 
 	/* Update existing element, though the map is full. */
@@ -96,12 +96,12 @@ static void test_hashmap(unsigned int task, void *data)
 	key = 2;
 	assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) == 0);
 	key = 3;
-	assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) == -1 &&
+	assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) < 0 &&
 	       errno == E2BIG);
 
 	/* Check that key = 0 doesn't exist. */
 	key = 0;
-	assert(bpf_map_delete_elem(fd, &key) == -1 && errno == ENOENT);
+	assert(bpf_map_delete_elem(fd, &key) < 0 && errno == ENOENT);
 
 	/* Iterate over two elements. */
 	assert(bpf_map_get_next_key(fd, NULL, &first_key) == 0 &&
@@ -111,7 +111,7 @@ static void test_hashmap(unsigned int task, void *data)
 	assert(bpf_map_get_next_key(fd, &next_key, &next_key) == 0 &&
 	       (next_key == 1 || next_key == 2) &&
 	       (next_key != first_key));
-	assert(bpf_map_get_next_key(fd, &next_key, &next_key) == -1 &&
+	assert(bpf_map_get_next_key(fd, &next_key, &next_key) < 0 &&
 	       errno == ENOENT);
 
 	/* Delete both elements. */
@@ -119,13 +119,13 @@ static void test_hashmap(unsigned int task, void *data)
 	assert(bpf_map_delete_elem(fd, &key) == 0);
 	key = 2;
 	assert(bpf_map_delete_elem(fd, &key) == 0);
-	assert(bpf_map_delete_elem(fd, &key) == -1 && errno == ENOENT);
+	assert(bpf_map_delete_elem(fd, &key) < 0 && errno == ENOENT);
 
 	key = 0;
 	/* Check that map is empty. */
-	assert(bpf_map_get_next_key(fd, NULL, &next_key) == -1 &&
+	assert(bpf_map_get_next_key(fd, NULL, &next_key) < 0 &&
 	       errno == ENOENT);
-	assert(bpf_map_get_next_key(fd, &key, &next_key) == -1 &&
+	assert(bpf_map_get_next_key(fd, &key, &next_key) < 0 &&
 	       errno == ENOENT);
 
 	close(fd);
@@ -186,12 +186,12 @@ static void test_hashmap_percpu(unsigned int task, void *data)
 	expected_key_mask |= key;
 
 	/* BPF_NOEXIST means add new element if it doesn't exist. */
-	assert(bpf_map_update_elem(fd, &key, value, BPF_NOEXIST) == -1 &&
+	assert(bpf_map_update_elem(fd, &key, value, BPF_NOEXIST) < 0 &&
 	       /* key=1 already exists. */
 	       errno == EEXIST);
 
 	/* -1 is an invalid flag. */
-	assert(bpf_map_update_elem(fd, &key, value, -1) == -1 &&
+	assert(bpf_map_update_elem(fd, &key, value, -1) < 0 &&
 	       errno == EINVAL);
 
 	/* Check that key=1 can be found. Value could be 0 if the lookup
@@ -203,10 +203,10 @@ static void test_hashmap_percpu(unsigned int task, void *data)
 
 	key = 2;
 	/* Check that key=2 is not found. */
-	assert(bpf_map_lookup_elem(fd, &key, value) == -1 && errno == ENOENT);
+	assert(bpf_map_lookup_elem(fd, &key, value) < 0 && errno == ENOENT);
 
 	/* BPF_EXIST means update existing element. */
-	assert(bpf_map_update_elem(fd, &key, value, BPF_EXIST) == -1 &&
+	assert(bpf_map_update_elem(fd, &key, value, BPF_EXIST) < 0 &&
 	       /* key=2 is not there. */
 	       errno == ENOENT);
 
@@ -219,11 +219,11 @@ static void test_hashmap_percpu(unsigned int task, void *data)
 	 * inserted due to max_entries limit.
 	 */
 	key = 0;
-	assert(bpf_map_update_elem(fd, &key, value, BPF_NOEXIST) == -1 &&
+	assert(bpf_map_update_elem(fd, &key, value, BPF_NOEXIST) < 0 &&
 	       errno == E2BIG);
 
 	/* Check that key = 0 doesn't exist. */
-	assert(bpf_map_delete_elem(fd, &key) == -1 && errno == ENOENT);
+	assert(bpf_map_delete_elem(fd, &key) < 0 && errno == ENOENT);
 
 	/* Iterate over two elements. */
 	assert(bpf_map_get_next_key(fd, NULL, &first_key) == 0 &&
@@ -254,13 +254,13 @@ static void test_hashmap_percpu(unsigned int task, void *data)
 	assert(bpf_map_delete_elem(fd, &key) == 0);
 	key = 2;
 	assert(bpf_map_delete_elem(fd, &key) == 0);
-	assert(bpf_map_delete_elem(fd, &key) == -1 && errno == ENOENT);
+	assert(bpf_map_delete_elem(fd, &key) < 0 && errno == ENOENT);
 
 	key = 0;
 	/* Check that map is empty. */
-	assert(bpf_map_get_next_key(fd, NULL, &next_key) == -1 &&
+	assert(bpf_map_get_next_key(fd, NULL, &next_key) < 0 &&
 	       errno == ENOENT);
-	assert(bpf_map_get_next_key(fd, &key, &next_key) == -1 &&
+	assert(bpf_map_get_next_key(fd, &key, &next_key) < 0 &&
 	       errno == ENOENT);
 
 	close(fd);
@@ -377,7 +377,7 @@ static void test_arraymap(unsigned int task, void *data)
 	assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) == 0);
 
 	value = 0;
-	assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) == -1 &&
+	assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) < 0 &&
 	       errno == EEXIST);
 
 	/* Check that key=1 can be found. */
@@ -391,11 +391,11 @@ static void test_arraymap(unsigned int task, void *data)
 	 * due to max_entries limit.
 	 */
 	key = 2;
-	assert(bpf_map_update_elem(fd, &key, &value, BPF_EXIST) == -1 &&
+	assert(bpf_map_update_elem(fd, &key, &value, BPF_EXIST) < 0 &&
 	       errno == E2BIG);
 
 	/* Check that key = 2 doesn't exist. */
-	assert(bpf_map_lookup_elem(fd, &key, &value) == -1 && errno == ENOENT);
+	assert(bpf_map_lookup_elem(fd, &key, &value) < 0 && errno == ENOENT);
 
 	/* Iterate over two elements. */
 	assert(bpf_map_get_next_key(fd, NULL, &next_key) == 0 &&
@@ -404,12 +404,12 @@ static void test_arraymap(unsigned int task, void *data)
 	       next_key == 0);
 	assert(bpf_map_get_next_key(fd, &next_key, &next_key) == 0 &&
 	       next_key == 1);
-	assert(bpf_map_get_next_key(fd, &next_key, &next_key) == -1 &&
+	assert(bpf_map_get_next_key(fd, &next_key, &next_key) < 0 &&
 	       errno == ENOENT);
 
 	/* Delete shouldn't succeed. */
 	key = 1;
-	assert(bpf_map_delete_elem(fd, &key) == -1 && errno == EINVAL);
+	assert(bpf_map_delete_elem(fd, &key) < 0 && errno == EINVAL);
 
 	close(fd);
 }
@@ -435,7 +435,7 @@ static void test_arraymap_percpu(unsigned int task, void *data)
 	assert(bpf_map_update_elem(fd, &key, values, BPF_ANY) == 0);
 
 	bpf_percpu(values, 0) = 0;
-	assert(bpf_map_update_elem(fd, &key, values, BPF_NOEXIST) == -1 &&
+	assert(bpf_map_update_elem(fd, &key, values, BPF_NOEXIST) < 0 &&
 	       errno == EEXIST);
 
 	/* Check that key=1 can be found. */
@@ -450,11 +450,11 @@ static void test_arraymap_percpu(unsigned int task, void *data)
 
 	/* Check that key=2 cannot be inserted due to max_entries limit. */
 	key = 2;
-	assert(bpf_map_update_elem(fd, &key, values, BPF_EXIST) == -1 &&
+	assert(bpf_map_update_elem(fd, &key, values, BPF_EXIST) < 0 &&
 	       errno == E2BIG);
 
 	/* Check that key = 2 doesn't exist. */
-	assert(bpf_map_lookup_elem(fd, &key, values) == -1 && errno == ENOENT);
+	assert(bpf_map_lookup_elem(fd, &key, values) < 0 && errno == ENOENT);
 
 	/* Iterate over two elements. */
 	assert(bpf_map_get_next_key(fd, NULL, &next_key) == 0 &&
@@ -463,12 +463,12 @@ static void test_arraymap_percpu(unsigned int task, void *data)
 	       next_key == 0);
 	assert(bpf_map_get_next_key(fd, &next_key, &next_key) == 0 &&
 	       next_key == 1);
-	assert(bpf_map_get_next_key(fd, &next_key, &next_key) == -1 &&
+	assert(bpf_map_get_next_key(fd, &next_key, &next_key) < 0 &&
 	       errno == ENOENT);
 
 	/* Delete shouldn't succeed. */
 	key = 1;
-	assert(bpf_map_delete_elem(fd, &key) == -1 && errno == EINVAL);
+	assert(bpf_map_delete_elem(fd, &key) < 0 && errno == EINVAL);
 
 	close(fd);
 }
@@ -572,7 +572,7 @@ static void test_queuemap(unsigned int task, void *data)
 		assert(bpf_map_update_elem(fd, NULL, &vals[i], 0) == 0);
 
 	/* Check that element cannot be pushed due to max_entries limit */
-	assert(bpf_map_update_elem(fd, NULL, &val, 0) == -1 &&
+	assert(bpf_map_update_elem(fd, NULL, &val, 0) < 0 &&
 	       errno == E2BIG);
 
 	/* Peek element */
@@ -588,12 +588,12 @@ static void test_queuemap(unsigned int task, void *data)
 		       val == vals[i]);
 
 	/* Check that there are not elements left */
-	assert(bpf_map_lookup_and_delete_elem(fd, NULL, &val) == -1 &&
+	assert(bpf_map_lookup_and_delete_elem(fd, NULL, &val) < 0 &&
 	       errno == ENOENT);
 
 	/* Check that non supported functions set errno to EINVAL */
-	assert(bpf_map_delete_elem(fd, NULL) == -1 && errno == EINVAL);
-	assert(bpf_map_get_next_key(fd, NULL, NULL) == -1 && errno == EINVAL);
+	assert(bpf_map_delete_elem(fd, NULL) < 0 && errno == EINVAL);
+	assert(bpf_map_get_next_key(fd, NULL, NULL) < 0 && errno == EINVAL);
 
 	close(fd);
 }
@@ -630,7 +630,7 @@ static void test_stackmap(unsigned int task, void *data)
 		assert(bpf_map_update_elem(fd, NULL, &vals[i], 0) == 0);
 
 	/* Check that element cannot be pushed due to max_entries limit */
-	assert(bpf_map_update_elem(fd, NULL, &val, 0) == -1 &&
+	assert(bpf_map_update_elem(fd, NULL, &val, 0) < 0 &&
 	       errno == E2BIG);
 
 	/* Peek element */
@@ -646,12 +646,12 @@ static void test_stackmap(unsigned int task, void *data)
 		       val == vals[i]);
 
 	/* Check that there are not elements left */
-	assert(bpf_map_lookup_and_delete_elem(fd, NULL, &val) == -1 &&
+	assert(bpf_map_lookup_and_delete_elem(fd, NULL, &val) < 0 &&
 	       errno == ENOENT);
 
 	/* Check that non supported functions set errno to EINVAL */
-	assert(bpf_map_delete_elem(fd, NULL) == -1 && errno == EINVAL);
-	assert(bpf_map_get_next_key(fd, NULL, NULL) == -1 && errno == EINVAL);
+	assert(bpf_map_delete_elem(fd, NULL) < 0 && errno == EINVAL);
+	assert(bpf_map_get_next_key(fd, NULL, NULL) < 0 && errno == EINVAL);
 
 	close(fd);
 }
@@ -852,7 +852,7 @@ static void test_sockmap(unsigned int tasks, void *data)
 	}
 
 	bpf_map_rx = bpf_object__find_map_by_name(obj, "sock_map_rx");
-	if (IS_ERR(bpf_map_rx)) {
+	if (!bpf_map_rx) {
 		printf("Failed to load map rx from verdict prog\n");
 		goto out_sockmap;
 	}
@@ -864,7 +864,7 @@ static void test_sockmap(unsigned int tasks, void *data)
 	}
 
 	bpf_map_tx = bpf_object__find_map_by_name(obj, "sock_map_tx");
-	if (IS_ERR(bpf_map_tx)) {
+	if (!bpf_map_tx) {
 		printf("Failed to load map tx from verdict prog\n");
 		goto out_sockmap;
 	}
@@ -876,7 +876,7 @@ static void test_sockmap(unsigned int tasks, void *data)
 	}
 
 	bpf_map_msg = bpf_object__find_map_by_name(obj, "sock_map_msg");
-	if (IS_ERR(bpf_map_msg)) {
+	if (!bpf_map_msg) {
 		printf("Failed to load map msg from msg_verdict prog\n");
 		goto out_sockmap;
 	}
@@ -888,7 +888,7 @@ static void test_sockmap(unsigned int tasks, void *data)
 	}
 
 	bpf_map_break = bpf_object__find_map_by_name(obj, "sock_map_break");
-	if (IS_ERR(bpf_map_break)) {
+	if (!bpf_map_break) {
 		printf("Failed to load map tx from verdict prog\n");
 		goto out_sockmap;
 	}
@@ -1170,7 +1170,7 @@ static void test_map_in_map(void)
 	}
 
 	map = bpf_object__find_map_by_name(obj, "mim_array");
-	if (IS_ERR(map)) {
+	if (!map) {
 		printf("Failed to load array of maps from test prog\n");
 		goto out_map_in_map;
 	}
@@ -1181,7 +1181,7 @@ static void test_map_in_map(void)
 	}
 
 	map = bpf_object__find_map_by_name(obj, "mim_hash");
-	if (IS_ERR(map)) {
+	if (!map) {
 		printf("Failed to load hash of maps from test prog\n");
 		goto out_map_in_map;
 	}
@@ -1194,7 +1194,7 @@ static void test_map_in_map(void)
 	bpf_object__load(obj);
 
 	map = bpf_object__find_map_by_name(obj, "mim_array");
-	if (IS_ERR(map)) {
+	if (!map) {
 		printf("Failed to load array of maps from test prog\n");
 		goto out_map_in_map;
 	}
@@ -1211,7 +1211,7 @@ static void test_map_in_map(void)
 	}
 
 	map = bpf_object__find_map_by_name(obj, "mim_hash");
-	if (IS_ERR(map)) {
+	if (!map) {
 		printf("Failed to load hash of maps from test prog\n");
 		goto out_map_in_map;
 	}
@@ -1263,7 +1263,7 @@ static void test_map_large(void)
 	}
 
 	key.c = -1;
-	assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) == -1 &&
+	assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) < 0 &&
 	       errno == E2BIG);
 
 	/* Iterate through all elements. */
@@ -1271,12 +1271,12 @@ static void test_map_large(void)
 	key.c = -1;
 	for (i = 0; i < MAP_SIZE; i++)
 		assert(bpf_map_get_next_key(fd, &key, &key) == 0);
-	assert(bpf_map_get_next_key(fd, &key, &key) == -1 && errno == ENOENT);
+	assert(bpf_map_get_next_key(fd, &key, &key) < 0 && errno == ENOENT);
 
 	key.c = 0;
 	assert(bpf_map_lookup_elem(fd, &key, &value) == 0 && value == 0);
 	key.a = 1;
-	assert(bpf_map_lookup_elem(fd, &key, &value) == -1 && errno == ENOENT);
+	assert(bpf_map_lookup_elem(fd, &key, &value) < 0 && errno == ENOENT);
 
 	close(fd);
 }
@@ -1408,7 +1408,7 @@ static void test_map_parallel(void)
 	run_parallel(TASKS, test_update_delete, data);
 
 	/* Check that key=0 is already there. */
-	assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) == -1 &&
+	assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) < 0 &&
 	       errno == EEXIST);
 
 	/* Check that all elements were inserted. */
@@ -1416,7 +1416,7 @@ static void test_map_parallel(void)
 	key = -1;
 	for (i = 0; i < MAP_SIZE; i++)
 		assert(bpf_map_get_next_key(fd, &key, &key) == 0);
-	assert(bpf_map_get_next_key(fd, &key, &key) == -1 && errno == ENOENT);
+	assert(bpf_map_get_next_key(fd, &key, &key) < 0 && errno == ENOENT);
 
 	/* Another check for all elements */
 	for (i = 0; i < MAP_SIZE; i++) {
@@ -1432,8 +1432,8 @@ static void test_map_parallel(void)
 
 	/* Nothing should be left. */
 	key = -1;
-	assert(bpf_map_get_next_key(fd, NULL, &key) == -1 && errno == ENOENT);
-	assert(bpf_map_get_next_key(fd, &key, &key) == -1 && errno == ENOENT);
+	assert(bpf_map_get_next_key(fd, NULL, &key) < 0 && errno == ENOENT);
+	assert(bpf_map_get_next_key(fd, &key, &key) < 0 && errno == ENOENT);
 }
 
 static void test_map_rdonly(void)
@@ -1451,12 +1451,12 @@ static void test_map_rdonly(void)
 	key = 1;
 	value = 1234;
 	/* Try to insert key=1 element. */
-	assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) == -1 &&
+	assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) < 0 &&
 	       errno == EPERM);
 
 	/* Check that key=1 is not found. */
-	assert(bpf_map_lookup_elem(fd, &key, &value) == -1 && errno == ENOENT);
-	assert(bpf_map_get_next_key(fd, &key, &value) == -1 && errno == ENOENT);
+	assert(bpf_map_lookup_elem(fd, &key, &value) < 0 && errno == ENOENT);
+	assert(bpf_map_get_next_key(fd, &key, &value) < 0 && errno == ENOENT);
 
 	close(fd);
 }
@@ -1479,8 +1479,8 @@ static void test_map_wronly_hash(void)
 	assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) == 0);
 
 	/* Check that reading elements and keys from the map is not allowed. */
-	assert(bpf_map_lookup_elem(fd, &key, &value) == -1 && errno == EPERM);
-	assert(bpf_map_get_next_key(fd, &key, &value) == -1 && errno == EPERM);
+	assert(bpf_map_lookup_elem(fd, &key, &value) < 0 && errno == EPERM);
+	assert(bpf_map_get_next_key(fd, &key, &value) < 0 && errno == EPERM);
 
 	close(fd);
 }
@@ -1507,10 +1507,10 @@ static void test_map_wronly_stack_or_queue(enum bpf_map_type map_type)
 	assert(bpf_map_update_elem(fd, NULL, &value, BPF_ANY) == 0);
 
 	/* Peek element should fail */
-	assert(bpf_map_lookup_elem(fd, NULL, &value) == -1 && errno == EPERM);
+	assert(bpf_map_lookup_elem(fd, NULL, &value) < 0 && errno == EPERM);
 
 	/* Pop element should fail */
-	assert(bpf_map_lookup_and_delete_elem(fd, NULL, &value) == -1 &&
+	assert(bpf_map_lookup_and_delete_elem(fd, NULL, &value) < 0 &&
 	       errno == EPERM);
 
 	close(fd);
@@ -1564,7 +1564,7 @@ static void prepare_reuseport_grp(int type, int map_fd, size_t map_elem_size,
 			value = &fd32;
 		}
 		err = bpf_map_update_elem(map_fd, &index0, value, BPF_ANY);
-		CHECK(err != -1 || errno != EINVAL,
+		CHECK(err >= 0 || errno != EINVAL,
 		      "reuseport array update unbound sk",
 		      "sock_type:%d err:%d errno:%d\n",
 		      type, err, errno);
@@ -1593,7 +1593,7 @@ static void prepare_reuseport_grp(int type, int map_fd, size_t map_elem_size,
 			 */
 			err = bpf_map_update_elem(map_fd, &index0, value,
 						  BPF_ANY);
-			CHECK(err != -1 || errno != EINVAL,
+			CHECK(err >= 0 || errno != EINVAL,
 			      "reuseport array update non-listening sk",
 			      "sock_type:%d err:%d errno:%d\n",
 			      type, err, errno);
@@ -1623,31 +1623,31 @@ static void test_reuseport_array(void)
 
 	map_fd = bpf_create_map(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
 				sizeof(__u32), sizeof(__u64), array_size, 0);
-	CHECK(map_fd == -1, "reuseport array create",
+	CHECK(map_fd < 0, "reuseport array create",
 	      "map_fd:%d, errno:%d\n", map_fd, errno);
 
 	/* Test lookup/update/delete with invalid index */
 	err = bpf_map_delete_elem(map_fd, &bad_index);
-	CHECK(err != -1 || errno != E2BIG, "reuseport array del >=max_entries",
+	CHECK(err >= 0 || errno != E2BIG, "reuseport array del >=max_entries",
 	      "err:%d errno:%d\n", err, errno);
 
 	err = bpf_map_update_elem(map_fd, &bad_index, &fd64, BPF_ANY);
-	CHECK(err != -1 || errno != E2BIG,
+	CHECK(err >= 0 || errno != E2BIG,
 	      "reuseport array update >=max_entries",
 	      "err:%d errno:%d\n", err, errno);
 
 	err = bpf_map_lookup_elem(map_fd, &bad_index, &map_cookie);
-	CHECK(err != -1 || errno != ENOENT,
+	CHECK(err >= 0 || errno != ENOENT,
 	      "reuseport array update >=max_entries",
 	      "err:%d errno:%d\n", err, errno);
 
 	/* Test lookup/delete non existence elem */
 	err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie);
-	CHECK(err != -1 || errno != ENOENT,
+	CHECK(err >= 0 || errno != ENOENT,
 	      "reuseport array lookup not-exist elem",
 	      "err:%d errno:%d\n", err, errno);
 	err = bpf_map_delete_elem(map_fd, &index3);
-	CHECK(err != -1 || errno != ENOENT,
+	CHECK(err >= 0 || errno != ENOENT,
 	      "reuseport array del not-exist elem",
 	      "err:%d errno:%d\n", err, errno);
 
@@ -1661,7 +1661,7 @@ static void test_reuseport_array(void)
 		/* BPF_EXIST failure case */
 		err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx],
 					  BPF_EXIST);
-		CHECK(err != -1 || errno != ENOENT,
+		CHECK(err >= 0 || errno != ENOENT,
 		      "reuseport array update empty elem BPF_EXIST",
 		      "sock_type:%d err:%d errno:%d\n",
 		      type, err, errno);
@@ -1670,7 +1670,7 @@ static void test_reuseport_array(void)
 		/* BPF_NOEXIST success case */
 		err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx],
 					  BPF_NOEXIST);
-		CHECK(err == -1,
+		CHECK(err < 0,
 		      "reuseport array update empty elem BPF_NOEXIST",
 		      "sock_type:%d err:%d errno:%d\n",
 		      type, err, errno);
@@ -1679,7 +1679,7 @@ static void test_reuseport_array(void)
 		/* BPF_EXIST success case. */
 		err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx],
 					  BPF_EXIST);
-		CHECK(err == -1,
+		CHECK(err < 0,
 		      "reuseport array update same elem BPF_EXIST",
 		      "sock_type:%d err:%d errno:%d\n", type, err, errno);
 		fds_idx = REUSEPORT_FD_IDX(err, fds_idx);
@@ -1687,7 +1687,7 @@ static void test_reuseport_array(void)
 		/* BPF_NOEXIST failure case */
 		err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx],
 					  BPF_NOEXIST);
-		CHECK(err != -1 || errno != EEXIST,
+		CHECK(err >= 0 || errno != EEXIST,
 		      "reuseport array update non-empty elem BPF_NOEXIST",
 		      "sock_type:%d err:%d errno:%d\n",
 		      type, err, errno);
@@ -1696,7 +1696,7 @@ static void test_reuseport_array(void)
 		/* BPF_ANY case (always succeed) */
 		err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx],
 					  BPF_ANY);
-		CHECK(err == -1,
+		CHECK(err < 0,
 		      "reuseport array update same sk with BPF_ANY",
 		      "sock_type:%d err:%d errno:%d\n", type, err, errno);
 
@@ -1705,32 +1705,32 @@ static void test_reuseport_array(void)
 
 		/* The same sk cannot be added to reuseport_array twice */
 		err = bpf_map_update_elem(map_fd, &index3, &fd64, BPF_ANY);
-		CHECK(err != -1 || errno != EBUSY,
+		CHECK(err >= 0 || errno != EBUSY,
 		      "reuseport array update same sk with same index",
 		      "sock_type:%d err:%d errno:%d\n",
 		      type, err, errno);
 
 		err = bpf_map_update_elem(map_fd, &index0, &fd64, BPF_ANY);
-		CHECK(err != -1 || errno != EBUSY,
+		CHECK(err >= 0 || errno != EBUSY,
 		      "reuseport array update same sk with different index",
 		      "sock_type:%d err:%d errno:%d\n",
 		      type, err, errno);
 
 		/* Test delete elem */
 		err = bpf_map_delete_elem(map_fd, &index3);
-		CHECK(err == -1, "reuseport array delete sk",
+		CHECK(err < 0, "reuseport array delete sk",
 		      "sock_type:%d err:%d errno:%d\n",
 		      type, err, errno);
 
 		/* Add it back with BPF_NOEXIST */
 		err = bpf_map_update_elem(map_fd, &index3, &fd64, BPF_NOEXIST);
-		CHECK(err == -1,
+		CHECK(err < 0,
 		      "reuseport array re-add with BPF_NOEXIST after del",
 		      "sock_type:%d err:%d errno:%d\n", type, err, errno);
 
 		/* Test cookie */
 		err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie);
-		CHECK(err == -1 || sk_cookie != map_cookie,
+		CHECK(err < 0 || sk_cookie != map_cookie,
 		      "reuseport array lookup re-added sk",
 		      "sock_type:%d err:%d errno:%d sk_cookie:0x%llx map_cookie:0x%llxn",
 		      type, err, errno, sk_cookie, map_cookie);
@@ -1739,7 +1739,7 @@ static void test_reuseport_array(void)
 		for (f = 0; f < ARRAY_SIZE(grpa_fds64); f++)
 			close(grpa_fds64[f]);
 		err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie);
-		CHECK(err != -1 || errno != ENOENT,
+		CHECK(err >= 0 || errno != ENOENT,
 		      "reuseport array lookup after close()",
 		      "sock_type:%d err:%d errno:%d\n",
 		      type, err, errno);
@@ -1750,7 +1750,7 @@ static void test_reuseport_array(void)
 	CHECK(fd64 == -1, "socket(SOCK_RAW)", "err:%d errno:%d\n",
 	      err, errno);
 	err = bpf_map_update_elem(map_fd, &index3, &fd64, BPF_NOEXIST);
-	CHECK(err != -1 || errno != ENOTSUPP, "reuseport array update SOCK_RAW",
+	CHECK(err >= 0 || errno != ENOTSUPP, "reuseport array update SOCK_RAW",
 	      "err:%d errno:%d\n", err, errno);
 	close(fd64);
 
@@ -1760,16 +1760,16 @@ static void test_reuseport_array(void)
 	/* Test 32 bit fd */
 	map_fd = bpf_create_map(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
 				sizeof(__u32), sizeof(__u32), array_size, 0);
-	CHECK(map_fd == -1, "reuseport array create",
+	CHECK(map_fd < 0, "reuseport array create",
 	      "map_fd:%d, errno:%d\n", map_fd, errno);
 	prepare_reuseport_grp(SOCK_STREAM, map_fd, sizeof(__u32), &fd64,
 			      &sk_cookie, 1);
 	fd = fd64;
 	err = bpf_map_update_elem(map_fd, &index3, &fd, BPF_NOEXIST);
-	CHECK(err == -1, "reuseport array update 32 bit fd",
+	CHECK(err < 0, "reuseport array update 32 bit fd",
 	      "err:%d errno:%d\n", err, errno);
 	err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie);
-	CHECK(err != -1 || errno != ENOSPC,
+	CHECK(err >= 0 || errno != ENOSPC,
 	      "reuseport array lookup 32 bit fd",
 	      "err:%d errno:%d\n", err, errno);
 	close(fd);
@@ -1815,6 +1815,8 @@ int main(void)
 {
 	srand(time(NULL));
 
+	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
+
 	map_flags = 0;
 	run_all_tests();
 
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index 6396932b97e2..6f103106a39b 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -737,6 +737,9 @@ int main(int argc, char **argv)
 	if (err)
 		return err;
 
+	/* Use libbpf 1.0 API mode */
+	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
+
 	libbpf_set_print(libbpf_print_fn);
 
 	srand(time(NULL));
diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h
index dda52cb649dc..8ef7f334e715 100644
--- a/tools/testing/selftests/bpf/test_progs.h
+++ b/tools/testing/selftests/bpf/test_progs.h
@@ -249,16 +249,17 @@ extern int test__join_cgroup(const char *path);
 #define ASSERT_OK_PTR(ptr, name) ({					\
 	static int duration = 0;					\
 	const void *___res = (ptr);					\
-	bool ___ok = !IS_ERR_OR_NULL(___res);				\
-	CHECK(!___ok, (name),						\
-	      "unexpected error: %ld\n", PTR_ERR(___res));		\
+	int ___err = libbpf_get_error(___res);				\
+	bool ___ok = ___err == 0;					\
+	CHECK(!___ok, (name), "unexpected error: %d\n", ___err);	\
 	___ok;								\
 })
 
 #define ASSERT_ERR_PTR(ptr, name) ({					\
 	static int duration = 0;					\
 	const void *___res = (ptr);					\
-	bool ___ok = IS_ERR(___res);					\
+	int ___err = libbpf_get_error(___res);				\
+	bool ___ok = ___err != 0;					\
 	CHECK(!___ok, (name), "unexpected pointer: %p\n", ___res);	\
 	___ok;								\
 })
diff --git a/tools/testing/selftests/bpf/test_tcpnotify_user.c b/tools/testing/selftests/bpf/test_tcpnotify_user.c
index 73da7fe8c152..4a39304cc5a6 100644
--- a/tools/testing/selftests/bpf/test_tcpnotify_user.c
+++ b/tools/testing/selftests/bpf/test_tcpnotify_user.c
@@ -82,6 +82,8 @@ int main(int argc, char **argv)
 	cpu_set_t cpuset;
 	__u32 key = 0;
 
+	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
+
 	CPU_ZERO(&cpuset);
 	CPU_SET(0, &cpuset);
 	pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
@@ -116,7 +118,7 @@ int main(int argc, char **argv)
 
 	pb_opts.sample_cb = dummyfn;
 	pb = perf_buffer__new(bpf_map__fd(perf_map), 8, &pb_opts);
-	if (IS_ERR(pb))
+	if (!pb)
 		goto err;
 
 	pthread_create(&tid, NULL, poller_thread, pb);
@@ -163,7 +165,6 @@ err:
 	bpf_prog_detach(cg_fd, BPF_CGROUP_SOCK_OPS);
 	close(cg_fd);
 	cleanup_cgroup_environment();
-	if (!IS_ERR_OR_NULL(pb))
-		perf_buffer__free(pb);
+	perf_buffer__free(pb);
 	return error;
 }
-- 
cgit v1.2.3-70-g09d2


From f12b654327283d158de0af170943ec5dd8cd02e5 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 24 May 2021 20:59:33 -0700
Subject: libbpf: Streamline error reporting for low-level APIs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Ensure that low-level APIs behave uniformly across the libbpf as follows:
  - in case of an error, errno is always set to the correct error code;
  - when libbpf 1.0 mode is enabled with LIBBPF_STRICT_DIRECT_ERRS option to
    libbpf_set_strict_mode(), return -Exxx error value directly, instead of -1;
  - by default, until libbpf 1.0 is released, keep returning -1 directly.

More context, justification, and discussion can be found in "Libbpf: the road
to v1.0" document ([0]).

  [0] https://docs.google.com/document/d/1UyjTZuPFWiPFyKk1tV5an11_iaRuec6U-ZESZ54nNTY

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/bpf/20210525035935.1461796-4-andrii@kernel.org
---
 tools/lib/bpf/bpf.c             | 168 ++++++++++++++++++++++++++++------------
 tools/lib/bpf/libbpf_internal.h |  26 +++++++
 tools/lib/bpf/libbpf_legacy.h   |  12 +++
 3 files changed, 156 insertions(+), 50 deletions(-)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index b7c2cc12034c..86dcac44f32f 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -80,6 +80,7 @@ static inline int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size)
 int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr)
 {
 	union bpf_attr attr;
+	int fd;
 
 	memset(&attr, '\0', sizeof(attr));
 
@@ -102,7 +103,8 @@ int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr)
 	else
 		attr.inner_map_fd = create_attr->inner_map_fd;
 
-	return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
+	fd = sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
+	return libbpf_err_errno(fd);
 }
 
 int bpf_create_map_node(enum bpf_map_type map_type, const char *name,
@@ -160,6 +162,7 @@ int bpf_create_map_in_map_node(enum bpf_map_type map_type, const char *name,
 			       __u32 map_flags, int node)
 {
 	union bpf_attr attr;
+	int fd;
 
 	memset(&attr, '\0', sizeof(attr));
 
@@ -178,7 +181,8 @@ int bpf_create_map_in_map_node(enum bpf_map_type map_type, const char *name,
 		attr.numa_node = node;
 	}
 
-	return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
+	fd = sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
+	return libbpf_err_errno(fd);
 }
 
 int bpf_create_map_in_map(enum bpf_map_type map_type, const char *name,
@@ -222,10 +226,10 @@ int libbpf__bpf_prog_load(const struct bpf_prog_load_params *load_attr)
 	int fd;
 
 	if (!load_attr->log_buf != !load_attr->log_buf_sz)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	if (load_attr->log_level > (4 | 2 | 1) || (load_attr->log_level && !load_attr->log_buf))
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	memset(&attr, 0, sizeof(attr));
 	attr.prog_type = load_attr->prog_type;
@@ -281,8 +285,10 @@ int libbpf__bpf_prog_load(const struct bpf_prog_load_params *load_attr)
 							load_attr->func_info_cnt,
 							load_attr->func_info_rec_size,
 							attr.func_info_rec_size);
-			if (!finfo)
+			if (!finfo) {
+				errno = E2BIG;
 				goto done;
+			}
 
 			attr.func_info = ptr_to_u64(finfo);
 			attr.func_info_rec_size = load_attr->func_info_rec_size;
@@ -293,8 +299,10 @@ int libbpf__bpf_prog_load(const struct bpf_prog_load_params *load_attr)
 							load_attr->line_info_cnt,
 							load_attr->line_info_rec_size,
 							attr.line_info_rec_size);
-			if (!linfo)
+			if (!linfo) {
+				errno = E2BIG;
 				goto done;
+			}
 
 			attr.line_info = ptr_to_u64(linfo);
 			attr.line_info_rec_size = load_attr->line_info_rec_size;
@@ -318,9 +326,10 @@ int libbpf__bpf_prog_load(const struct bpf_prog_load_params *load_attr)
 
 	fd = sys_bpf_prog_load(&attr, sizeof(attr));
 done:
+	/* free() doesn't affect errno, so we don't need to restore it */
 	free(finfo);
 	free(linfo);
-	return fd;
+	return libbpf_err_errno(fd);
 }
 
 int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr,
@@ -329,7 +338,7 @@ int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr,
 	struct bpf_prog_load_params p = {};
 
 	if (!load_attr || !log_buf != !log_buf_sz)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	p.prog_type = load_attr->prog_type;
 	p.expected_attach_type = load_attr->expected_attach_type;
@@ -391,6 +400,7 @@ int bpf_verify_program(enum bpf_prog_type type, const struct bpf_insn *insns,
 		       int log_level)
 {
 	union bpf_attr attr;
+	int fd;
 
 	memset(&attr, 0, sizeof(attr));
 	attr.prog_type = type;
@@ -404,13 +414,15 @@ int bpf_verify_program(enum bpf_prog_type type, const struct bpf_insn *insns,
 	attr.kern_version = kern_version;
 	attr.prog_flags = prog_flags;
 
-	return sys_bpf_prog_load(&attr, sizeof(attr));
+	fd = sys_bpf_prog_load(&attr, sizeof(attr));
+	return libbpf_err_errno(fd);
 }
 
 int bpf_map_update_elem(int fd, const void *key, const void *value,
 			__u64 flags)
 {
 	union bpf_attr attr;
+	int ret;
 
 	memset(&attr, 0, sizeof(attr));
 	attr.map_fd = fd;
@@ -418,24 +430,28 @@ int bpf_map_update_elem(int fd, const void *key, const void *value,
 	attr.value = ptr_to_u64(value);
 	attr.flags = flags;
 
-	return sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
+	ret = sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
+	return libbpf_err_errno(ret);
 }
 
 int bpf_map_lookup_elem(int fd, const void *key, void *value)
 {
 	union bpf_attr attr;
+	int ret;
 
 	memset(&attr, 0, sizeof(attr));
 	attr.map_fd = fd;
 	attr.key = ptr_to_u64(key);
 	attr.value = ptr_to_u64(value);
 
-	return sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
+	ret = sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
+	return libbpf_err_errno(ret);
 }
 
 int bpf_map_lookup_elem_flags(int fd, const void *key, void *value, __u64 flags)
 {
 	union bpf_attr attr;
+	int ret;
 
 	memset(&attr, 0, sizeof(attr));
 	attr.map_fd = fd;
@@ -443,19 +459,22 @@ int bpf_map_lookup_elem_flags(int fd, const void *key, void *value, __u64 flags)
 	attr.value = ptr_to_u64(value);
 	attr.flags = flags;
 
-	return sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
+	ret = sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
+	return libbpf_err_errno(ret);
 }
 
 int bpf_map_lookup_and_delete_elem(int fd, const void *key, void *value)
 {
 	union bpf_attr attr;
+	int ret;
 
 	memset(&attr, 0, sizeof(attr));
 	attr.map_fd = fd;
 	attr.key = ptr_to_u64(key);
 	attr.value = ptr_to_u64(value);
 
-	return sys_bpf(BPF_MAP_LOOKUP_AND_DELETE_ELEM, &attr, sizeof(attr));
+	ret = sys_bpf(BPF_MAP_LOOKUP_AND_DELETE_ELEM, &attr, sizeof(attr));
+	return libbpf_err_errno(ret);
 }
 
 int bpf_map_lookup_and_delete_elem_flags(int fd, const void *key, void *value, __u64 flags)
@@ -474,34 +493,40 @@ int bpf_map_lookup_and_delete_elem_flags(int fd, const void *key, void *value, _
 int bpf_map_delete_elem(int fd, const void *key)
 {
 	union bpf_attr attr;
+	int ret;
 
 	memset(&attr, 0, sizeof(attr));
 	attr.map_fd = fd;
 	attr.key = ptr_to_u64(key);
 
-	return sys_bpf(BPF_MAP_DELETE_ELEM, &attr, sizeof(attr));
+	ret = sys_bpf(BPF_MAP_DELETE_ELEM, &attr, sizeof(attr));
+	return libbpf_err_errno(ret);
 }
 
 int bpf_map_get_next_key(int fd, const void *key, void *next_key)
 {
 	union bpf_attr attr;
+	int ret;
 
 	memset(&attr, 0, sizeof(attr));
 	attr.map_fd = fd;
 	attr.key = ptr_to_u64(key);
 	attr.next_key = ptr_to_u64(next_key);
 
-	return sys_bpf(BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr));
+	ret = sys_bpf(BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr));
+	return libbpf_err_errno(ret);
 }
 
 int bpf_map_freeze(int fd)
 {
 	union bpf_attr attr;
+	int ret;
 
 	memset(&attr, 0, sizeof(attr));
 	attr.map_fd = fd;
 
-	return sys_bpf(BPF_MAP_FREEZE, &attr, sizeof(attr));
+	ret = sys_bpf(BPF_MAP_FREEZE, &attr, sizeof(attr));
+	return libbpf_err_errno(ret);
 }
 
 static int bpf_map_batch_common(int cmd, int fd, void  *in_batch,
@@ -513,7 +538,7 @@ static int bpf_map_batch_common(int cmd, int fd, void  *in_batch,
 	int ret;
 
 	if (!OPTS_VALID(opts, bpf_map_batch_opts))
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	memset(&attr, 0, sizeof(attr));
 	attr.batch.map_fd = fd;
@@ -528,7 +553,7 @@ static int bpf_map_batch_common(int cmd, int fd, void  *in_batch,
 	ret = sys_bpf(cmd, &attr, sizeof(attr));
 	*count = attr.batch.count;
 
-	return ret;
+	return libbpf_err_errno(ret);
 }
 
 int bpf_map_delete_batch(int fd, void *keys, __u32 *count,
@@ -565,22 +590,26 @@ int bpf_map_update_batch(int fd, void *keys, void *values, __u32 *count,
 int bpf_obj_pin(int fd, const char *pathname)
 {
 	union bpf_attr attr;
+	int ret;
 
 	memset(&attr, 0, sizeof(attr));
 	attr.pathname = ptr_to_u64((void *)pathname);
 	attr.bpf_fd = fd;
 
-	return sys_bpf(BPF_OBJ_PIN, &attr, sizeof(attr));
+	ret = sys_bpf(BPF_OBJ_PIN, &attr, sizeof(attr));
+	return libbpf_err_errno(ret);
 }
 
 int bpf_obj_get(const char *pathname)
 {
 	union bpf_attr attr;
+	int fd;
 
 	memset(&attr, 0, sizeof(attr));
 	attr.pathname = ptr_to_u64((void *)pathname);
 
-	return sys_bpf(BPF_OBJ_GET, &attr, sizeof(attr));
+	fd = sys_bpf(BPF_OBJ_GET, &attr, sizeof(attr));
+	return libbpf_err_errno(fd);
 }
 
 int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type,
@@ -598,9 +627,10 @@ int bpf_prog_attach_xattr(int prog_fd, int target_fd,
 			  const struct bpf_prog_attach_opts *opts)
 {
 	union bpf_attr attr;
+	int ret;
 
 	if (!OPTS_VALID(opts, bpf_prog_attach_opts))
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	memset(&attr, 0, sizeof(attr));
 	attr.target_fd	   = target_fd;
@@ -609,30 +639,35 @@ int bpf_prog_attach_xattr(int prog_fd, int target_fd,
 	attr.attach_flags  = OPTS_GET(opts, flags, 0);
 	attr.replace_bpf_fd = OPTS_GET(opts, replace_prog_fd, 0);
 
-	return sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr));
+	ret = sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr));
+	return libbpf_err_errno(ret);
 }
 
 int bpf_prog_detach(int target_fd, enum bpf_attach_type type)
 {
 	union bpf_attr attr;
+	int ret;
 
 	memset(&attr, 0, sizeof(attr));
 	attr.target_fd	 = target_fd;
 	attr.attach_type = type;
 
-	return sys_bpf(BPF_PROG_DETACH, &attr, sizeof(attr));
+	ret = sys_bpf(BPF_PROG_DETACH, &attr, sizeof(attr));
+	return libbpf_err_errno(ret);
 }
 
 int bpf_prog_detach2(int prog_fd, int target_fd, enum bpf_attach_type type)
 {
 	union bpf_attr attr;
+	int ret;
 
 	memset(&attr, 0, sizeof(attr));
 	attr.target_fd	 = target_fd;
 	attr.attach_bpf_fd = prog_fd;
 	attr.attach_type = type;
 
-	return sys_bpf(BPF_PROG_DETACH, &attr, sizeof(attr));
+	ret = sys_bpf(BPF_PROG_DETACH, &attr, sizeof(attr));
+	return libbpf_err_errno(ret);
 }
 
 int bpf_link_create(int prog_fd, int target_fd,
@@ -641,15 +676,16 @@ int bpf_link_create(int prog_fd, int target_fd,
 {
 	__u32 target_btf_id, iter_info_len;
 	union bpf_attr attr;
+	int fd;
 
 	if (!OPTS_VALID(opts, bpf_link_create_opts))
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	iter_info_len = OPTS_GET(opts, iter_info_len, 0);
 	target_btf_id = OPTS_GET(opts, target_btf_id, 0);
 
 	if (iter_info_len && target_btf_id)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	memset(&attr, 0, sizeof(attr));
 	attr.link_create.prog_fd = prog_fd;
@@ -665,26 +701,30 @@ int bpf_link_create(int prog_fd, int target_fd,
 		attr.link_create.target_btf_id = target_btf_id;
 	}
 
-	return sys_bpf(BPF_LINK_CREATE, &attr, sizeof(attr));
+	fd = sys_bpf(BPF_LINK_CREATE, &attr, sizeof(attr));
+	return libbpf_err_errno(fd);
 }
 
 int bpf_link_detach(int link_fd)
 {
 	union bpf_attr attr;
+	int ret;
 
 	memset(&attr, 0, sizeof(attr));
 	attr.link_detach.link_fd = link_fd;
 
-	return sys_bpf(BPF_LINK_DETACH, &attr, sizeof(attr));
+	ret = sys_bpf(BPF_LINK_DETACH, &attr, sizeof(attr));
+	return libbpf_err_errno(ret);
 }
 
 int bpf_link_update(int link_fd, int new_prog_fd,
 		    const struct bpf_link_update_opts *opts)
 {
 	union bpf_attr attr;
+	int ret;
 
 	if (!OPTS_VALID(opts, bpf_link_update_opts))
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	memset(&attr, 0, sizeof(attr));
 	attr.link_update.link_fd = link_fd;
@@ -692,17 +732,20 @@ int bpf_link_update(int link_fd, int new_prog_fd,
 	attr.link_update.flags = OPTS_GET(opts, flags, 0);
 	attr.link_update.old_prog_fd = OPTS_GET(opts, old_prog_fd, 0);
 
-	return sys_bpf(BPF_LINK_UPDATE, &attr, sizeof(attr));
+	ret = sys_bpf(BPF_LINK_UPDATE, &attr, sizeof(attr));
+	return libbpf_err_errno(ret);
 }
 
 int bpf_iter_create(int link_fd)
 {
 	union bpf_attr attr;
+	int fd;
 
 	memset(&attr, 0, sizeof(attr));
 	attr.iter_create.link_fd = link_fd;
 
-	return sys_bpf(BPF_ITER_CREATE, &attr, sizeof(attr));
+	fd = sys_bpf(BPF_ITER_CREATE, &attr, sizeof(attr));
+	return libbpf_err_errno(fd);
 }
 
 int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags,
@@ -719,10 +762,12 @@ int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags,
 	attr.query.prog_ids	= ptr_to_u64(prog_ids);
 
 	ret = sys_bpf(BPF_PROG_QUERY, &attr, sizeof(attr));
+
 	if (attach_flags)
 		*attach_flags = attr.query.attach_flags;
 	*prog_cnt = attr.query.prog_cnt;
-	return ret;
+
+	return libbpf_err_errno(ret);
 }
 
 int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size,
@@ -740,13 +785,15 @@ int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size,
 	attr.test.repeat = repeat;
 
 	ret = sys_bpf(BPF_PROG_TEST_RUN, &attr, sizeof(attr));
+
 	if (size_out)
 		*size_out = attr.test.data_size_out;
 	if (retval)
 		*retval = attr.test.retval;
 	if (duration)
 		*duration = attr.test.duration;
-	return ret;
+
+	return libbpf_err_errno(ret);
 }
 
 int bpf_prog_test_run_xattr(struct bpf_prog_test_run_attr *test_attr)
@@ -755,7 +802,7 @@ int bpf_prog_test_run_xattr(struct bpf_prog_test_run_attr *test_attr)
 	int ret;
 
 	if (!test_attr->data_out && test_attr->data_size_out > 0)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	memset(&attr, 0, sizeof(attr));
 	attr.test.prog_fd = test_attr->prog_fd;
@@ -770,11 +817,13 @@ int bpf_prog_test_run_xattr(struct bpf_prog_test_run_attr *test_attr)
 	attr.test.repeat = test_attr->repeat;
 
 	ret = sys_bpf(BPF_PROG_TEST_RUN, &attr, sizeof(attr));
+
 	test_attr->data_size_out = attr.test.data_size_out;
 	test_attr->ctx_size_out = attr.test.ctx_size_out;
 	test_attr->retval = attr.test.retval;
 	test_attr->duration = attr.test.duration;
-	return ret;
+
+	return libbpf_err_errno(ret);
 }
 
 int bpf_prog_test_run_opts(int prog_fd, struct bpf_test_run_opts *opts)
@@ -783,7 +832,7 @@ int bpf_prog_test_run_opts(int prog_fd, struct bpf_test_run_opts *opts)
 	int ret;
 
 	if (!OPTS_VALID(opts, bpf_test_run_opts))
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	memset(&attr, 0, sizeof(attr));
 	attr.test.prog_fd = prog_fd;
@@ -801,11 +850,13 @@ int bpf_prog_test_run_opts(int prog_fd, struct bpf_test_run_opts *opts)
 	attr.test.data_out = ptr_to_u64(OPTS_GET(opts, data_out, NULL));
 
 	ret = sys_bpf(BPF_PROG_TEST_RUN, &attr, sizeof(attr));
+
 	OPTS_SET(opts, data_size_out, attr.test.data_size_out);
 	OPTS_SET(opts, ctx_size_out, attr.test.ctx_size_out);
 	OPTS_SET(opts, duration, attr.test.duration);
 	OPTS_SET(opts, retval, attr.test.retval);
-	return ret;
+
+	return libbpf_err_errno(ret);
 }
 
 static int bpf_obj_get_next_id(__u32 start_id, __u32 *next_id, int cmd)
@@ -820,7 +871,7 @@ static int bpf_obj_get_next_id(__u32 start_id, __u32 *next_id, int cmd)
 	if (!err)
 		*next_id = attr.next_id;
 
-	return err;
+	return libbpf_err_errno(err);
 }
 
 int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id)
@@ -846,41 +897,49 @@ int bpf_link_get_next_id(__u32 start_id, __u32 *next_id)
 int bpf_prog_get_fd_by_id(__u32 id)
 {
 	union bpf_attr attr;
+	int fd;
 
 	memset(&attr, 0, sizeof(attr));
 	attr.prog_id = id;
 
-	return sys_bpf(BPF_PROG_GET_FD_BY_ID, &attr, sizeof(attr));
+	fd = sys_bpf(BPF_PROG_GET_FD_BY_ID, &attr, sizeof(attr));
+	return libbpf_err_errno(fd);
 }
 
 int bpf_map_get_fd_by_id(__u32 id)
 {
 	union bpf_attr attr;
+	int fd;
 
 	memset(&attr, 0, sizeof(attr));
 	attr.map_id = id;
 
-	return sys_bpf(BPF_MAP_GET_FD_BY_ID, &attr, sizeof(attr));
+	fd = sys_bpf(BPF_MAP_GET_FD_BY_ID, &attr, sizeof(attr));
+	return libbpf_err_errno(fd);
 }
 
 int bpf_btf_get_fd_by_id(__u32 id)
 {
 	union bpf_attr attr;
+	int fd;
 
 	memset(&attr, 0, sizeof(attr));
 	attr.btf_id = id;
 
-	return sys_bpf(BPF_BTF_GET_FD_BY_ID, &attr, sizeof(attr));
+	fd = sys_bpf(BPF_BTF_GET_FD_BY_ID, &attr, sizeof(attr));
+	return libbpf_err_errno(fd);
 }
 
 int bpf_link_get_fd_by_id(__u32 id)
 {
 	union bpf_attr attr;
+	int fd;
 
 	memset(&attr, 0, sizeof(attr));
 	attr.link_id = id;
 
-	return sys_bpf(BPF_LINK_GET_FD_BY_ID, &attr, sizeof(attr));
+	fd = sys_bpf(BPF_LINK_GET_FD_BY_ID, &attr, sizeof(attr));
+	return libbpf_err_errno(fd);
 }
 
 int bpf_obj_get_info_by_fd(int bpf_fd, void *info, __u32 *info_len)
@@ -894,21 +953,24 @@ int bpf_obj_get_info_by_fd(int bpf_fd, void *info, __u32 *info_len)
 	attr.info.info = ptr_to_u64(info);
 
 	err = sys_bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr));
+
 	if (!err)
 		*info_len = attr.info.info_len;
 
-	return err;
+	return libbpf_err_errno(err);
 }
 
 int bpf_raw_tracepoint_open(const char *name, int prog_fd)
 {
 	union bpf_attr attr;
+	int fd;
 
 	memset(&attr, 0, sizeof(attr));
 	attr.raw_tracepoint.name = ptr_to_u64(name);
 	attr.raw_tracepoint.prog_fd = prog_fd;
 
-	return sys_bpf(BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr));
+	fd = sys_bpf(BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr));
+	return libbpf_err_errno(fd);
 }
 
 int bpf_load_btf(const void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size,
@@ -928,12 +990,13 @@ retry:
 	}
 
 	fd = sys_bpf(BPF_BTF_LOAD, &attr, sizeof(attr));
-	if (fd == -1 && !do_log && log_buf && log_buf_size) {
+
+	if (fd < 0 && !do_log && log_buf && log_buf_size) {
 		do_log = true;
 		goto retry;
 	}
 
-	return fd;
+	return libbpf_err_errno(fd);
 }
 
 int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len,
@@ -950,37 +1013,42 @@ int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len,
 	attr.task_fd_query.buf_len = *buf_len;
 
 	err = sys_bpf(BPF_TASK_FD_QUERY, &attr, sizeof(attr));
+
 	*buf_len = attr.task_fd_query.buf_len;
 	*prog_id = attr.task_fd_query.prog_id;
 	*fd_type = attr.task_fd_query.fd_type;
 	*probe_offset = attr.task_fd_query.probe_offset;
 	*probe_addr = attr.task_fd_query.probe_addr;
 
-	return err;
+	return libbpf_err_errno(err);
 }
 
 int bpf_enable_stats(enum bpf_stats_type type)
 {
 	union bpf_attr attr;
+	int fd;
 
 	memset(&attr, 0, sizeof(attr));
 	attr.enable_stats.type = type;
 
-	return sys_bpf(BPF_ENABLE_STATS, &attr, sizeof(attr));
+	fd = sys_bpf(BPF_ENABLE_STATS, &attr, sizeof(attr));
+	return libbpf_err_errno(fd);
 }
 
 int bpf_prog_bind_map(int prog_fd, int map_fd,
 		      const struct bpf_prog_bind_opts *opts)
 {
 	union bpf_attr attr;
+	int ret;
 
 	if (!OPTS_VALID(opts, bpf_prog_bind_opts))
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	memset(&attr, 0, sizeof(attr));
 	attr.prog_bind_map.prog_fd = prog_fd;
 	attr.prog_bind_map.map_fd = map_fd;
 	attr.prog_bind_map.flags = OPTS_GET(opts, flags, 0);
 
-	return sys_bpf(BPF_PROG_BIND_MAP, &attr, sizeof(attr));
+	ret = sys_bpf(BPF_PROG_BIND_MAP, &attr, sizeof(attr));
+	return libbpf_err_errno(ret);
 }
diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h
index e2db08573bf0..8d59683ab246 100644
--- a/tools/lib/bpf/libbpf_internal.h
+++ b/tools/lib/bpf/libbpf_internal.h
@@ -11,6 +11,9 @@
 
 #include <stdlib.h>
 #include <limits.h>
+#include <errno.h>
+#include <linux/err.h>
+#include "libbpf_legacy.h"
 
 /* make sure libbpf doesn't use kernel-only integer typedefs */
 #pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64
@@ -436,4 +439,27 @@ int btf_type_visit_str_offs(struct btf_type *t, str_off_visit_fn visit, void *ct
 int btf_ext_visit_type_ids(struct btf_ext *btf_ext, type_id_visit_fn visit, void *ctx);
 int btf_ext_visit_str_offs(struct btf_ext *btf_ext, str_off_visit_fn visit, void *ctx);
 
+extern enum libbpf_strict_mode libbpf_mode;
+
+/* handle direct returned errors */
+static inline int libbpf_err(int ret)
+{
+	if (ret < 0)
+		errno = -ret;
+	return ret;
+}
+
+/* handle errno-based (e.g., syscall or libc) errors according to libbpf's
+ * strict mode settings
+ */
+static inline int libbpf_err_errno(int ret)
+{
+	if (libbpf_mode & LIBBPF_STRICT_DIRECT_ERRS)
+		/* errno is already assumed to be set on error */
+		return ret < 0 ? -errno : ret;
+
+	/* legacy: on error return -1 directly and don't touch errno */
+	return ret;
+}
+
 #endif /* __LIBBPF_LIBBPF_INTERNAL_H */
diff --git a/tools/lib/bpf/libbpf_legacy.h b/tools/lib/bpf/libbpf_legacy.h
index 7482cfe22ab2..df0d03dcffab 100644
--- a/tools/lib/bpf/libbpf_legacy.h
+++ b/tools/lib/bpf/libbpf_legacy.h
@@ -33,6 +33,18 @@ enum libbpf_strict_mode {
 	 * code so that it handles LIBBPF_STRICT_ALL mode before libbpf v1.0.
 	 */
 	LIBBPF_STRICT_NONE = 0x00,
+	/*
+	 * Return NULL pointers on error, not ERR_PTR(err).
+	 * Additionally, libbpf also always sets errno to corresponding Exx
+	 * (positive) error code.
+	 */
+	LIBBPF_STRICT_CLEAN_PTRS = 0x01,
+	/*
+	 * Return actual error codes from low-level APIs directly, not just -1.
+	 * Additionally, libbpf also always sets errno to corresponding Exx
+	 * (positive) error code.
+	 */
+	LIBBPF_STRICT_DIRECT_ERRS = 0x02,
 
 	__LIBBPF_STRICT_LAST,
 };
-- 
cgit v1.2.3-70-g09d2


From e9fc3ce99b3485586e7e4803b63df8b4c681f897 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 24 May 2021 20:59:34 -0700
Subject: libbpf: Streamline error reporting for high-level APIs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implement changes to error reporting for high-level libbpf APIs to make them
less surprising and less error-prone to users:
  - in all the cases when error happens, errno is set to an appropriate error
    value;
  - in libbpf 1.0 mode, all pointer-returning APIs return NULL on error and
    error code is communicated through errno; this applies both to APIs that
    already returned NULL before (so now they communicate more detailed error
    codes), as well as for many APIs that used ERR_PTR() macro and encoded
    error numbers as fake pointers.
  - in legacy (default) mode, those APIs that were returning ERR_PTR(err),
    continue doing so, but still set errno.

With these changes, errno can be always used to extract actual error,
regardless of legacy or libbpf 1.0 modes. This is utilized internally in
libbpf in places where libbpf uses it's own high-level APIs.
libbpf_get_error() is adapted to handle both cases completely transparently to
end-users (and is used by libbpf consistently as well).

More context, justification, and discussion can be found in "Libbpf: the road
to v1.0" document ([0]).

  [0] https://docs.google.com/document/d/1UyjTZuPFWiPFyKk1tV5an11_iaRuec6U-ZESZ54nNTY

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/bpf/20210525035935.1461796-5-andrii@kernel.org
---
 tools/lib/bpf/bpf_prog_linfo.c  |  18 +-
 tools/lib/bpf/btf.c             | 302 ++++++++++++------------
 tools/lib/bpf/btf_dump.c        |  14 +-
 tools/lib/bpf/libbpf.c          | 502 +++++++++++++++++++++-------------------
 tools/lib/bpf/libbpf_errno.c    |   7 +-
 tools/lib/bpf/libbpf_internal.h |  27 +++
 tools/lib/bpf/linker.c          |  22 +-
 tools/lib/bpf/netlink.c         |  81 ++++---
 tools/lib/bpf/ringbuf.c         |  26 +--
 9 files changed, 531 insertions(+), 468 deletions(-)

diff --git a/tools/lib/bpf/bpf_prog_linfo.c b/tools/lib/bpf/bpf_prog_linfo.c
index 3ed1a27b5f7c..5c503096ef43 100644
--- a/tools/lib/bpf/bpf_prog_linfo.c
+++ b/tools/lib/bpf/bpf_prog_linfo.c
@@ -106,7 +106,7 @@ struct bpf_prog_linfo *bpf_prog_linfo__new(const struct bpf_prog_info *info)
 	nr_linfo = info->nr_line_info;
 
 	if (!nr_linfo)
-		return NULL;
+		return errno = EINVAL, NULL;
 
 	/*
 	 * The min size that bpf_prog_linfo has to access for
@@ -114,11 +114,11 @@ struct bpf_prog_linfo *bpf_prog_linfo__new(const struct bpf_prog_info *info)
 	 */
 	if (info->line_info_rec_size <
 	    offsetof(struct bpf_line_info, file_name_off))
-		return NULL;
+		return errno = EINVAL, NULL;
 
 	prog_linfo = calloc(1, sizeof(*prog_linfo));
 	if (!prog_linfo)
-		return NULL;
+		return errno = ENOMEM, NULL;
 
 	/* Copy xlated line_info */
 	prog_linfo->nr_linfo = nr_linfo;
@@ -174,7 +174,7 @@ struct bpf_prog_linfo *bpf_prog_linfo__new(const struct bpf_prog_info *info)
 
 err_free:
 	bpf_prog_linfo__free(prog_linfo);
-	return NULL;
+	return errno = EINVAL, NULL;
 }
 
 const struct bpf_line_info *
@@ -186,11 +186,11 @@ bpf_prog_linfo__lfind_addr_func(const struct bpf_prog_linfo *prog_linfo,
 	const __u64 *jited_linfo;
 
 	if (func_idx >= prog_linfo->nr_jited_func)
-		return NULL;
+		return errno = ENOENT, NULL;
 
 	nr_linfo = prog_linfo->nr_jited_linfo_per_func[func_idx];
 	if (nr_skip >= nr_linfo)
-		return NULL;
+		return errno = ENOENT, NULL;
 
 	start = prog_linfo->jited_linfo_func_idx[func_idx] + nr_skip;
 	jited_rec_size = prog_linfo->jited_rec_size;
@@ -198,7 +198,7 @@ bpf_prog_linfo__lfind_addr_func(const struct bpf_prog_linfo *prog_linfo,
 		(start * jited_rec_size);
 	jited_linfo = raw_jited_linfo;
 	if (addr < *jited_linfo)
-		return NULL;
+		return errno = ENOENT, NULL;
 
 	nr_linfo -= nr_skip;
 	rec_size = prog_linfo->rec_size;
@@ -225,13 +225,13 @@ bpf_prog_linfo__lfind(const struct bpf_prog_linfo *prog_linfo,
 
 	nr_linfo = prog_linfo->nr_linfo;
 	if (nr_skip >= nr_linfo)
-		return NULL;
+		return errno = ENOENT, NULL;
 
 	rec_size = prog_linfo->rec_size;
 	raw_linfo = prog_linfo->raw_linfo + (nr_skip * rec_size);
 	linfo = raw_linfo;
 	if (insn_off < linfo->insn_off)
-		return NULL;
+		return errno = ENOENT, NULL;
 
 	nr_linfo -= nr_skip;
 	for (i = 0; i < nr_linfo; i++) {
diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index d57e13a13798..b46760b93bb4 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -443,7 +443,7 @@ struct btf_type *btf_type_by_id(struct btf *btf, __u32 type_id)
 const struct btf_type *btf__type_by_id(const struct btf *btf, __u32 type_id)
 {
 	if (type_id >= btf->start_id + btf->nr_types)
-		return NULL;
+		return errno = EINVAL, NULL;
 	return btf_type_by_id((struct btf *)btf, type_id);
 }
 
@@ -510,7 +510,7 @@ size_t btf__pointer_size(const struct btf *btf)
 int btf__set_pointer_size(struct btf *btf, size_t ptr_sz)
 {
 	if (ptr_sz != 4 && ptr_sz != 8)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	btf->ptr_sz = ptr_sz;
 	return 0;
 }
@@ -537,7 +537,7 @@ enum btf_endianness btf__endianness(const struct btf *btf)
 int btf__set_endianness(struct btf *btf, enum btf_endianness endian)
 {
 	if (endian != BTF_LITTLE_ENDIAN && endian != BTF_BIG_ENDIAN)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	btf->swapped_endian = is_host_big_endian() != (endian == BTF_BIG_ENDIAN);
 	if (!btf->swapped_endian) {
@@ -568,8 +568,7 @@ __s64 btf__resolve_size(const struct btf *btf, __u32 type_id)
 	int i;
 
 	t = btf__type_by_id(btf, type_id);
-	for (i = 0; i < MAX_RESOLVE_DEPTH && !btf_type_is_void_or_null(t);
-	     i++) {
+	for (i = 0; i < MAX_RESOLVE_DEPTH && !btf_type_is_void_or_null(t); i++) {
 		switch (btf_kind(t)) {
 		case BTF_KIND_INT:
 		case BTF_KIND_STRUCT:
@@ -592,12 +591,12 @@ __s64 btf__resolve_size(const struct btf *btf, __u32 type_id)
 		case BTF_KIND_ARRAY:
 			array = btf_array(t);
 			if (nelems && array->nelems > UINT32_MAX / nelems)
-				return -E2BIG;
+				return libbpf_err(-E2BIG);
 			nelems *= array->nelems;
 			type_id = array->type;
 			break;
 		default:
-			return -EINVAL;
+			return libbpf_err(-EINVAL);
 		}
 
 		t = btf__type_by_id(btf, type_id);
@@ -605,9 +604,9 @@ __s64 btf__resolve_size(const struct btf *btf, __u32 type_id)
 
 done:
 	if (size < 0)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	if (nelems && size > UINT32_MAX / nelems)
-		return -E2BIG;
+		return libbpf_err(-E2BIG);
 
 	return nelems * size;
 }
@@ -640,7 +639,7 @@ int btf__align_of(const struct btf *btf, __u32 id)
 		for (i = 0; i < vlen; i++, m++) {
 			align = btf__align_of(btf, m->type);
 			if (align <= 0)
-				return align;
+				return libbpf_err(align);
 			max_align = max(max_align, align);
 		}
 
@@ -648,7 +647,7 @@ int btf__align_of(const struct btf *btf, __u32 id)
 	}
 	default:
 		pr_warn("unsupported BTF_KIND:%u\n", btf_kind(t));
-		return 0;
+		return errno = EINVAL, 0;
 	}
 }
 
@@ -667,7 +666,7 @@ int btf__resolve_type(const struct btf *btf, __u32 type_id)
 	}
 
 	if (depth == MAX_RESOLVE_DEPTH || btf_type_is_void_or_null(t))
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	return type_id;
 }
@@ -687,7 +686,7 @@ __s32 btf__find_by_name(const struct btf *btf, const char *type_name)
 			return i;
 	}
 
-	return -ENOENT;
+	return libbpf_err(-ENOENT);
 }
 
 __s32 btf__find_by_name_kind(const struct btf *btf, const char *type_name,
@@ -709,7 +708,7 @@ __s32 btf__find_by_name_kind(const struct btf *btf, const char *type_name,
 			return i;
 	}
 
-	return -ENOENT;
+	return libbpf_err(-ENOENT);
 }
 
 static bool btf_is_modifiable(const struct btf *btf)
@@ -785,12 +784,12 @@ static struct btf *btf_new_empty(struct btf *base_btf)
 
 struct btf *btf__new_empty(void)
 {
-	return btf_new_empty(NULL);
+	return libbpf_ptr(btf_new_empty(NULL));
 }
 
 struct btf *btf__new_empty_split(struct btf *base_btf)
 {
-	return btf_new_empty(base_btf);
+	return libbpf_ptr(btf_new_empty(base_btf));
 }
 
 static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf)
@@ -846,7 +845,7 @@ done:
 
 struct btf *btf__new(const void *data, __u32 size)
 {
-	return btf_new(data, size, NULL);
+	return libbpf_ptr(btf_new(data, size, NULL));
 }
 
 static struct btf *btf_parse_elf(const char *path, struct btf *base_btf,
@@ -937,7 +936,8 @@ static struct btf *btf_parse_elf(const char *path, struct btf *base_btf,
 		goto done;
 	}
 	btf = btf_new(btf_data->d_buf, btf_data->d_size, base_btf);
-	if (IS_ERR(btf))
+	err = libbpf_get_error(btf);
+	if (err)
 		goto done;
 
 	switch (gelf_getclass(elf)) {
@@ -953,9 +953,9 @@ static struct btf *btf_parse_elf(const char *path, struct btf *base_btf,
 	}
 
 	if (btf_ext && btf_ext_data) {
-		*btf_ext = btf_ext__new(btf_ext_data->d_buf,
-					btf_ext_data->d_size);
-		if (IS_ERR(*btf_ext))
+		*btf_ext = btf_ext__new(btf_ext_data->d_buf, btf_ext_data->d_size);
+		err = libbpf_get_error(*btf_ext);
+		if (err)
 			goto done;
 	} else if (btf_ext) {
 		*btf_ext = NULL;
@@ -965,30 +965,24 @@ done:
 		elf_end(elf);
 	close(fd);
 
-	if (err)
-		return ERR_PTR(err);
-	/*
-	 * btf is always parsed before btf_ext, so no need to clean up
-	 * btf_ext, if btf loading failed
-	 */
-	if (IS_ERR(btf))
+	if (!err)
 		return btf;
-	if (btf_ext && IS_ERR(*btf_ext)) {
-		btf__free(btf);
-		err = PTR_ERR(*btf_ext);
-		return ERR_PTR(err);
-	}
-	return btf;
+
+	if (btf_ext)
+		btf_ext__free(*btf_ext);
+	btf__free(btf);
+
+	return ERR_PTR(err);
 }
 
 struct btf *btf__parse_elf(const char *path, struct btf_ext **btf_ext)
 {
-	return btf_parse_elf(path, NULL, btf_ext);
+	return libbpf_ptr(btf_parse_elf(path, NULL, btf_ext));
 }
 
 struct btf *btf__parse_elf_split(const char *path, struct btf *base_btf)
 {
-	return btf_parse_elf(path, base_btf, NULL);
+	return libbpf_ptr(btf_parse_elf(path, base_btf, NULL));
 }
 
 static struct btf *btf_parse_raw(const char *path, struct btf *base_btf)
@@ -1056,36 +1050,39 @@ err_out:
 
 struct btf *btf__parse_raw(const char *path)
 {
-	return btf_parse_raw(path, NULL);
+	return libbpf_ptr(btf_parse_raw(path, NULL));
 }
 
 struct btf *btf__parse_raw_split(const char *path, struct btf *base_btf)
 {
-	return btf_parse_raw(path, base_btf);
+	return libbpf_ptr(btf_parse_raw(path, base_btf));
 }
 
 static struct btf *btf_parse(const char *path, struct btf *base_btf, struct btf_ext **btf_ext)
 {
 	struct btf *btf;
+	int err;
 
 	if (btf_ext)
 		*btf_ext = NULL;
 
 	btf = btf_parse_raw(path, base_btf);
-	if (!IS_ERR(btf) || PTR_ERR(btf) != -EPROTO)
+	err = libbpf_get_error(btf);
+	if (!err)
 		return btf;
-
+	if (err != -EPROTO)
+		return ERR_PTR(err);
 	return btf_parse_elf(path, base_btf, btf_ext);
 }
 
 struct btf *btf__parse(const char *path, struct btf_ext **btf_ext)
 {
-	return btf_parse(path, NULL, btf_ext);
+	return libbpf_ptr(btf_parse(path, NULL, btf_ext));
 }
 
 struct btf *btf__parse_split(const char *path, struct btf *base_btf)
 {
-	return btf_parse(path, base_btf, NULL);
+	return libbpf_ptr(btf_parse(path, base_btf, NULL));
 }
 
 static int compare_vsi_off(const void *_a, const void *_b)
@@ -1178,7 +1175,7 @@ int btf__finalize_data(struct bpf_object *obj, struct btf *btf)
 		}
 	}
 
-	return err;
+	return libbpf_err(err);
 }
 
 static void *btf_get_raw_data(const struct btf *btf, __u32 *size, bool swap_endian);
@@ -1191,13 +1188,13 @@ int btf__load(struct btf *btf)
 	int err = 0;
 
 	if (btf->fd >= 0)
-		return -EEXIST;
+		return libbpf_err(-EEXIST);
 
 retry_load:
 	if (log_buf_size) {
 		log_buf = malloc(log_buf_size);
 		if (!log_buf)
-			return -ENOMEM;
+			return libbpf_err(-ENOMEM);
 
 		*log_buf = 0;
 	}
@@ -1229,7 +1226,7 @@ retry_load:
 
 done:
 	free(log_buf);
-	return err;
+	return libbpf_err(err);
 }
 
 int btf__fd(const struct btf *btf)
@@ -1305,7 +1302,7 @@ const void *btf__get_raw_data(const struct btf *btf_ro, __u32 *size)
 
 	data = btf_get_raw_data(btf, &data_sz, btf->swapped_endian);
 	if (!data)
-		return NULL;
+		return errno = -ENOMEM, NULL;
 
 	btf->raw_size = data_sz;
 	if (btf->swapped_endian)
@@ -1323,7 +1320,7 @@ const char *btf__str_by_offset(const struct btf *btf, __u32 offset)
 	else if (offset - btf->start_str_off < btf->hdr->str_len)
 		return btf_strs_data(btf) + (offset - btf->start_str_off);
 	else
-		return NULL;
+		return errno = EINVAL, NULL;
 }
 
 const char *btf__name_by_offset(const struct btf *btf, __u32 offset)
@@ -1388,17 +1385,20 @@ exit_free:
 int btf__get_from_id(__u32 id, struct btf **btf)
 {
 	struct btf *res;
-	int btf_fd;
+	int err, btf_fd;
 
 	*btf = NULL;
 	btf_fd = bpf_btf_get_fd_by_id(id);
 	if (btf_fd < 0)
-		return -errno;
+		return libbpf_err(-errno);
 
 	res = btf_get_from_fd(btf_fd, NULL);
+	err = libbpf_get_error(res);
+
 	close(btf_fd);
-	if (IS_ERR(res))
-		return PTR_ERR(res);
+
+	if (err)
+		return libbpf_err(err);
 
 	*btf = res;
 	return 0;
@@ -1415,31 +1415,30 @@ int btf__get_map_kv_tids(const struct btf *btf, const char *map_name,
 	__s64 key_size, value_size;
 	__s32 container_id;
 
-	if (snprintf(container_name, max_name, "____btf_map_%s", map_name) ==
-	    max_name) {
+	if (snprintf(container_name, max_name, "____btf_map_%s", map_name) == max_name) {
 		pr_warn("map:%s length of '____btf_map_%s' is too long\n",
 			map_name, map_name);
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	}
 
 	container_id = btf__find_by_name(btf, container_name);
 	if (container_id < 0) {
 		pr_debug("map:%s container_name:%s cannot be found in BTF. Missing BPF_ANNOTATE_KV_PAIR?\n",
 			 map_name, container_name);
-		return container_id;
+		return libbpf_err(container_id);
 	}
 
 	container_type = btf__type_by_id(btf, container_id);
 	if (!container_type) {
 		pr_warn("map:%s cannot find BTF type for container_id:%u\n",
 			map_name, container_id);
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	}
 
 	if (!btf_is_struct(container_type) || btf_vlen(container_type) < 2) {
 		pr_warn("map:%s container_name:%s is an invalid container struct\n",
 			map_name, container_name);
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	}
 
 	key = btf_members(container_type);
@@ -1448,25 +1447,25 @@ int btf__get_map_kv_tids(const struct btf *btf, const char *map_name,
 	key_size = btf__resolve_size(btf, key->type);
 	if (key_size < 0) {
 		pr_warn("map:%s invalid BTF key_type_size\n", map_name);
-		return key_size;
+		return libbpf_err(key_size);
 	}
 
 	if (expected_key_size != key_size) {
 		pr_warn("map:%s btf_key_type_size:%u != map_def_key_size:%u\n",
 			map_name, (__u32)key_size, expected_key_size);
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	}
 
 	value_size = btf__resolve_size(btf, value->type);
 	if (value_size < 0) {
 		pr_warn("map:%s invalid BTF value_type_size\n", map_name);
-		return value_size;
+		return libbpf_err(value_size);
 	}
 
 	if (expected_value_size != value_size) {
 		pr_warn("map:%s btf_value_type_size:%u != map_def_value_size:%u\n",
 			map_name, (__u32)value_size, expected_value_size);
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	}
 
 	*key_type_id = key->type;
@@ -1563,11 +1562,11 @@ int btf__find_str(struct btf *btf, const char *s)
 
 	/* BTF needs to be in a modifiable state to build string lookup index */
 	if (btf_ensure_modifiable(btf))
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 
 	off = strset__find_str(btf->strs_set, s);
 	if (off < 0)
-		return off;
+		return libbpf_err(off);
 
 	return btf->start_str_off + off;
 }
@@ -1588,11 +1587,11 @@ int btf__add_str(struct btf *btf, const char *s)
 	}
 
 	if (btf_ensure_modifiable(btf))
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 
 	off = strset__add_str(btf->strs_set, s);
 	if (off < 0)
-		return off;
+		return libbpf_err(off);
 
 	btf->hdr->str_len = strset__data_size(btf->strs_set);
 
@@ -1616,7 +1615,7 @@ static int btf_commit_type(struct btf *btf, int data_sz)
 
 	err = btf_add_type_idx_entry(btf, btf->hdr->type_len);
 	if (err)
-		return err;
+		return libbpf_err(err);
 
 	btf->hdr->type_len += data_sz;
 	btf->hdr->str_off += data_sz;
@@ -1653,21 +1652,21 @@ int btf__add_type(struct btf *btf, const struct btf *src_btf, const struct btf_t
 
 	sz = btf_type_size(src_type);
 	if (sz < 0)
-		return sz;
+		return libbpf_err(sz);
 
 	/* deconstruct BTF, if necessary, and invalidate raw_data */
 	if (btf_ensure_modifiable(btf))
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 
 	t = btf_add_type_mem(btf, sz);
 	if (!t)
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 
 	memcpy(t, src_type, sz);
 
 	err = btf_type_visit_str_offs(t, btf_rewrite_str, &p);
 	if (err)
-		return err;
+		return libbpf_err(err);
 
 	return btf_commit_type(btf, sz);
 }
@@ -1688,21 +1687,21 @@ int btf__add_int(struct btf *btf, const char *name, size_t byte_sz, int encoding
 
 	/* non-empty name */
 	if (!name || !name[0])
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	/* byte_sz must be power of 2 */
 	if (!byte_sz || (byte_sz & (byte_sz - 1)) || byte_sz > 16)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	if (encoding & ~(BTF_INT_SIGNED | BTF_INT_CHAR | BTF_INT_BOOL))
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	/* deconstruct BTF, if necessary, and invalidate raw_data */
 	if (btf_ensure_modifiable(btf))
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 
 	sz = sizeof(struct btf_type) + sizeof(int);
 	t = btf_add_type_mem(btf, sz);
 	if (!t)
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 
 	/* if something goes wrong later, we might end up with an extra string,
 	 * but that shouldn't be a problem, because BTF can't be constructed
@@ -1736,20 +1735,20 @@ int btf__add_float(struct btf *btf, const char *name, size_t byte_sz)
 
 	/* non-empty name */
 	if (!name || !name[0])
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	/* byte_sz must be one of the explicitly allowed values */
 	if (byte_sz != 2 && byte_sz != 4 && byte_sz != 8 && byte_sz != 12 &&
 	    byte_sz != 16)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	if (btf_ensure_modifiable(btf))
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 
 	sz = sizeof(struct btf_type);
 	t = btf_add_type_mem(btf, sz);
 	if (!t)
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 
 	name_off = btf__add_str(btf, name);
 	if (name_off < 0)
@@ -1780,15 +1779,15 @@ static int btf_add_ref_kind(struct btf *btf, int kind, const char *name, int ref
 	int sz, name_off = 0;
 
 	if (validate_type_id(ref_type_id))
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	if (btf_ensure_modifiable(btf))
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 
 	sz = sizeof(struct btf_type);
 	t = btf_add_type_mem(btf, sz);
 	if (!t)
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 
 	if (name && name[0]) {
 		name_off = btf__add_str(btf, name);
@@ -1831,15 +1830,15 @@ int btf__add_array(struct btf *btf, int index_type_id, int elem_type_id, __u32 n
 	int sz;
 
 	if (validate_type_id(index_type_id) || validate_type_id(elem_type_id))
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	if (btf_ensure_modifiable(btf))
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 
 	sz = sizeof(struct btf_type) + sizeof(struct btf_array);
 	t = btf_add_type_mem(btf, sz);
 	if (!t)
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 
 	t->name_off = 0;
 	t->info = btf_type_info(BTF_KIND_ARRAY, 0, 0);
@@ -1860,12 +1859,12 @@ static int btf_add_composite(struct btf *btf, int kind, const char *name, __u32
 	int sz, name_off = 0;
 
 	if (btf_ensure_modifiable(btf))
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 
 	sz = sizeof(struct btf_type);
 	t = btf_add_type_mem(btf, sz);
 	if (!t)
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 
 	if (name && name[0]) {
 		name_off = btf__add_str(btf, name);
@@ -1943,30 +1942,30 @@ int btf__add_field(struct btf *btf, const char *name, int type_id,
 
 	/* last type should be union/struct */
 	if (btf->nr_types == 0)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	t = btf_last_type(btf);
 	if (!btf_is_composite(t))
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	if (validate_type_id(type_id))
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	/* best-effort bit field offset/size enforcement */
 	is_bitfield = bit_size || (bit_offset % 8 != 0);
 	if (is_bitfield && (bit_size == 0 || bit_size > 255 || bit_offset > 0xffffff))
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	/* only offset 0 is allowed for unions */
 	if (btf_is_union(t) && bit_offset)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	/* decompose and invalidate raw data */
 	if (btf_ensure_modifiable(btf))
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 
 	sz = sizeof(struct btf_member);
 	m = btf_add_type_mem(btf, sz);
 	if (!m)
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 
 	if (name && name[0]) {
 		name_off = btf__add_str(btf, name);
@@ -2008,15 +2007,15 @@ int btf__add_enum(struct btf *btf, const char *name, __u32 byte_sz)
 
 	/* byte_sz must be power of 2 */
 	if (!byte_sz || (byte_sz & (byte_sz - 1)) || byte_sz > 8)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	if (btf_ensure_modifiable(btf))
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 
 	sz = sizeof(struct btf_type);
 	t = btf_add_type_mem(btf, sz);
 	if (!t)
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 
 	if (name && name[0]) {
 		name_off = btf__add_str(btf, name);
@@ -2048,25 +2047,25 @@ int btf__add_enum_value(struct btf *btf, const char *name, __s64 value)
 
 	/* last type should be BTF_KIND_ENUM */
 	if (btf->nr_types == 0)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	t = btf_last_type(btf);
 	if (!btf_is_enum(t))
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	/* non-empty name */
 	if (!name || !name[0])
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	if (value < INT_MIN || value > UINT_MAX)
-		return -E2BIG;
+		return libbpf_err(-E2BIG);
 
 	/* decompose and invalidate raw data */
 	if (btf_ensure_modifiable(btf))
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 
 	sz = sizeof(struct btf_enum);
 	v = btf_add_type_mem(btf, sz);
 	if (!v)
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 
 	name_off = btf__add_str(btf, name);
 	if (name_off < 0)
@@ -2096,7 +2095,7 @@ int btf__add_enum_value(struct btf *btf, const char *name, __s64 value)
 int btf__add_fwd(struct btf *btf, const char *name, enum btf_fwd_kind fwd_kind)
 {
 	if (!name || !name[0])
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	switch (fwd_kind) {
 	case BTF_FWD_STRUCT:
@@ -2117,7 +2116,7 @@ int btf__add_fwd(struct btf *btf, const char *name, enum btf_fwd_kind fwd_kind)
 		 */
 		return btf__add_enum(btf, name, sizeof(int));
 	default:
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	}
 }
 
@@ -2132,7 +2131,7 @@ int btf__add_fwd(struct btf *btf, const char *name, enum btf_fwd_kind fwd_kind)
 int btf__add_typedef(struct btf *btf, const char *name, int ref_type_id)
 {
 	if (!name || !name[0])
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	return btf_add_ref_kind(btf, BTF_KIND_TYPEDEF, name, ref_type_id);
 }
@@ -2187,10 +2186,10 @@ int btf__add_func(struct btf *btf, const char *name,
 	int id;
 
 	if (!name || !name[0])
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	if (linkage != BTF_FUNC_STATIC && linkage != BTF_FUNC_GLOBAL &&
 	    linkage != BTF_FUNC_EXTERN)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	id = btf_add_ref_kind(btf, BTF_KIND_FUNC, name, proto_type_id);
 	if (id > 0) {
@@ -2198,7 +2197,7 @@ int btf__add_func(struct btf *btf, const char *name,
 
 		t->info = btf_type_info(BTF_KIND_FUNC, linkage, 0);
 	}
-	return id;
+	return libbpf_err(id);
 }
 
 /*
@@ -2219,15 +2218,15 @@ int btf__add_func_proto(struct btf *btf, int ret_type_id)
 	int sz;
 
 	if (validate_type_id(ret_type_id))
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	if (btf_ensure_modifiable(btf))
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 
 	sz = sizeof(struct btf_type);
 	t = btf_add_type_mem(btf, sz);
 	if (!t)
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 
 	/* start out with vlen=0; this will be adjusted when adding enum
 	 * values, if necessary
@@ -2254,23 +2253,23 @@ int btf__add_func_param(struct btf *btf, const char *name, int type_id)
 	int sz, name_off = 0;
 
 	if (validate_type_id(type_id))
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	/* last type should be BTF_KIND_FUNC_PROTO */
 	if (btf->nr_types == 0)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	t = btf_last_type(btf);
 	if (!btf_is_func_proto(t))
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	/* decompose and invalidate raw data */
 	if (btf_ensure_modifiable(btf))
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 
 	sz = sizeof(struct btf_param);
 	p = btf_add_type_mem(btf, sz);
 	if (!p)
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 
 	if (name && name[0]) {
 		name_off = btf__add_str(btf, name);
@@ -2308,21 +2307,21 @@ int btf__add_var(struct btf *btf, const char *name, int linkage, int type_id)
 
 	/* non-empty name */
 	if (!name || !name[0])
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	if (linkage != BTF_VAR_STATIC && linkage != BTF_VAR_GLOBAL_ALLOCATED &&
 	    linkage != BTF_VAR_GLOBAL_EXTERN)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	if (validate_type_id(type_id))
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	/* deconstruct BTF, if necessary, and invalidate raw_data */
 	if (btf_ensure_modifiable(btf))
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 
 	sz = sizeof(struct btf_type) + sizeof(struct btf_var);
 	t = btf_add_type_mem(btf, sz);
 	if (!t)
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 
 	name_off = btf__add_str(btf, name);
 	if (name_off < 0)
@@ -2357,15 +2356,15 @@ int btf__add_datasec(struct btf *btf, const char *name, __u32 byte_sz)
 
 	/* non-empty name */
 	if (!name || !name[0])
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	if (btf_ensure_modifiable(btf))
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 
 	sz = sizeof(struct btf_type);
 	t = btf_add_type_mem(btf, sz);
 	if (!t)
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 
 	name_off = btf__add_str(btf, name);
 	if (name_off < 0)
@@ -2397,22 +2396,22 @@ int btf__add_datasec_var_info(struct btf *btf, int var_type_id, __u32 offset, __
 
 	/* last type should be BTF_KIND_DATASEC */
 	if (btf->nr_types == 0)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	t = btf_last_type(btf);
 	if (!btf_is_datasec(t))
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	if (validate_type_id(var_type_id))
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	/* decompose and invalidate raw data */
 	if (btf_ensure_modifiable(btf))
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 
 	sz = sizeof(struct btf_var_secinfo);
 	v = btf_add_type_mem(btf, sz);
 	if (!v)
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 
 	v->type = var_type_id;
 	v->offset = offset;
@@ -2614,11 +2613,11 @@ struct btf_ext *btf_ext__new(__u8 *data, __u32 size)
 
 	err = btf_ext_parse_hdr(data, size);
 	if (err)
-		return ERR_PTR(err);
+		return libbpf_err_ptr(err);
 
 	btf_ext = calloc(1, sizeof(struct btf_ext));
 	if (!btf_ext)
-		return ERR_PTR(-ENOMEM);
+		return libbpf_err_ptr(-ENOMEM);
 
 	btf_ext->data_size = size;
 	btf_ext->data = malloc(size);
@@ -2628,9 +2627,11 @@ struct btf_ext *btf_ext__new(__u8 *data, __u32 size)
 	}
 	memcpy(btf_ext->data, data, size);
 
-	if (btf_ext->hdr->hdr_len <
-	    offsetofend(struct btf_ext_header, line_info_len))
+	if (btf_ext->hdr->hdr_len < offsetofend(struct btf_ext_header, line_info_len)) {
+		err = -EINVAL;
 		goto done;
+	}
+
 	err = btf_ext_setup_func_info(btf_ext);
 	if (err)
 		goto done;
@@ -2639,8 +2640,11 @@ struct btf_ext *btf_ext__new(__u8 *data, __u32 size)
 	if (err)
 		goto done;
 
-	if (btf_ext->hdr->hdr_len < offsetofend(struct btf_ext_header, core_relo_len))
+	if (btf_ext->hdr->hdr_len < offsetofend(struct btf_ext_header, core_relo_len)) {
+		err = -EINVAL;
 		goto done;
+	}
+
 	err = btf_ext_setup_core_relos(btf_ext);
 	if (err)
 		goto done;
@@ -2648,7 +2652,7 @@ struct btf_ext *btf_ext__new(__u8 *data, __u32 size)
 done:
 	if (err) {
 		btf_ext__free(btf_ext);
-		return ERR_PTR(err);
+		return libbpf_err_ptr(err);
 	}
 
 	return btf_ext;
@@ -2687,7 +2691,7 @@ static int btf_ext_reloc_info(const struct btf *btf,
 		existing_len = (*cnt) * record_size;
 		data = realloc(*info, existing_len + records_len);
 		if (!data)
-			return -ENOMEM;
+			return libbpf_err(-ENOMEM);
 
 		memcpy(data + existing_len, sinfo->data, records_len);
 		/* adjust insn_off only, the rest data will be passed
@@ -2697,15 +2701,14 @@ static int btf_ext_reloc_info(const struct btf *btf,
 			__u32 *insn_off;
 
 			insn_off = data + existing_len + (i * record_size);
-			*insn_off = *insn_off / sizeof(struct bpf_insn) +
-				insns_cnt;
+			*insn_off = *insn_off / sizeof(struct bpf_insn) + insns_cnt;
 		}
 		*info = data;
 		*cnt += sinfo->num_info;
 		return 0;
 	}
 
-	return -ENOENT;
+	return libbpf_err(-ENOENT);
 }
 
 int btf_ext__reloc_func_info(const struct btf *btf,
@@ -2894,11 +2897,11 @@ int btf__dedup(struct btf *btf, struct btf_ext *btf_ext,
 
 	if (IS_ERR(d)) {
 		pr_debug("btf_dedup_new failed: %ld", PTR_ERR(d));
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	}
 
 	if (btf_ensure_modifiable(btf))
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 
 	err = btf_dedup_prep(d);
 	if (err) {
@@ -2938,7 +2941,7 @@ int btf__dedup(struct btf *btf, struct btf_ext *btf_ext,
 
 done:
 	btf_dedup_free(d);
-	return err;
+	return libbpf_err(err);
 }
 
 #define BTF_UNPROCESSED_ID ((__u32)-1)
@@ -4411,7 +4414,7 @@ struct btf *libbpf_find_kernel_btf(void)
 	char path[PATH_MAX + 1];
 	struct utsname buf;
 	struct btf *btf;
-	int i;
+	int i, err;
 
 	uname(&buf);
 
@@ -4425,17 +4428,16 @@ struct btf *libbpf_find_kernel_btf(void)
 			btf = btf__parse_raw(path);
 		else
 			btf = btf__parse_elf(path, NULL);
-
-		pr_debug("loading kernel BTF '%s': %ld\n",
-			 path, IS_ERR(btf) ? PTR_ERR(btf) : 0);
-		if (IS_ERR(btf))
+		err = libbpf_get_error(btf);
+		pr_debug("loading kernel BTF '%s': %d\n", path, err);
+		if (err)
 			continue;
 
 		return btf;
 	}
 
 	pr_warn("failed to find valid kernel BTF\n");
-	return ERR_PTR(-ESRCH);
+	return libbpf_err_ptr(-ESRCH);
 }
 
 int btf_type_visit_type_ids(struct btf_type *t, type_id_visit_fn visit, void *ctx)
diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c
index 5e2809d685bf..5dc6b5172bb3 100644
--- a/tools/lib/bpf/btf_dump.c
+++ b/tools/lib/bpf/btf_dump.c
@@ -128,7 +128,7 @@ struct btf_dump *btf_dump__new(const struct btf *btf,
 
 	d = calloc(1, sizeof(struct btf_dump));
 	if (!d)
-		return ERR_PTR(-ENOMEM);
+		return libbpf_err_ptr(-ENOMEM);
 
 	d->btf = btf;
 	d->btf_ext = btf_ext;
@@ -156,7 +156,7 @@ struct btf_dump *btf_dump__new(const struct btf *btf,
 	return d;
 err:
 	btf_dump__free(d);
-	return ERR_PTR(err);
+	return libbpf_err_ptr(err);
 }
 
 static int btf_dump_resize(struct btf_dump *d)
@@ -236,16 +236,16 @@ int btf_dump__dump_type(struct btf_dump *d, __u32 id)
 	int err, i;
 
 	if (id > btf__get_nr_types(d->btf))
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	err = btf_dump_resize(d);
 	if (err)
-		return err;
+		return libbpf_err(err);
 
 	d->emit_queue_cnt = 0;
 	err = btf_dump_order_type(d, id, false);
 	if (err < 0)
-		return err;
+		return libbpf_err(err);
 
 	for (i = 0; i < d->emit_queue_cnt; i++)
 		btf_dump_emit_type(d, d->emit_queue[i], 0 /*top-level*/);
@@ -1075,11 +1075,11 @@ int btf_dump__emit_type_decl(struct btf_dump *d, __u32 id,
 	int lvl, err;
 
 	if (!OPTS_VALID(opts, btf_dump_emit_type_decl_opts))
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	err = btf_dump_resize(d);
 	if (err)
-		return -EINVAL;
+		return libbpf_err(err);
 
 	fname = OPTS_GET(opts, field_name, "");
 	lvl = OPTS_GET(opts, indent_level, 0);
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 1425d7ed0f2f..1c4e20e75237 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -2579,16 +2579,14 @@ static int bpf_object__init_btf(struct bpf_object *obj,
 
 	if (btf_data) {
 		obj->btf = btf__new(btf_data->d_buf, btf_data->d_size);
-		if (IS_ERR(obj->btf)) {
-			err = PTR_ERR(obj->btf);
+		err = libbpf_get_error(obj->btf);
+		if (err) {
 			obj->btf = NULL;
-			pr_warn("Error loading ELF section %s: %d.\n",
-				BTF_ELF_SEC, err);
+			pr_warn("Error loading ELF section %s: %d.\n", BTF_ELF_SEC, err);
 			goto out;
 		}
 		/* enforce 8-byte pointers for BPF-targeted BTFs */
 		btf__set_pointer_size(obj->btf, 8);
-		err = 0;
 	}
 	if (btf_ext_data) {
 		if (!obj->btf) {
@@ -2596,11 +2594,11 @@ static int bpf_object__init_btf(struct bpf_object *obj,
 				 BTF_EXT_ELF_SEC, BTF_ELF_SEC);
 			goto out;
 		}
-		obj->btf_ext = btf_ext__new(btf_ext_data->d_buf,
-					    btf_ext_data->d_size);
-		if (IS_ERR(obj->btf_ext)) {
-			pr_warn("Error loading ELF section %s: %ld. Ignored and continue.\n",
-				BTF_EXT_ELF_SEC, PTR_ERR(obj->btf_ext));
+		obj->btf_ext = btf_ext__new(btf_ext_data->d_buf, btf_ext_data->d_size);
+		err = libbpf_get_error(obj->btf_ext);
+		if (err) {
+			pr_warn("Error loading ELF section %s: %d. Ignored and continue.\n",
+				BTF_EXT_ELF_SEC, err);
 			obj->btf_ext = NULL;
 			goto out;
 		}
@@ -2684,8 +2682,8 @@ static int bpf_object__load_vmlinux_btf(struct bpf_object *obj, bool force)
 		return 0;
 
 	obj->btf_vmlinux = libbpf_find_kernel_btf();
-	if (IS_ERR(obj->btf_vmlinux)) {
-		err = PTR_ERR(obj->btf_vmlinux);
+	err = libbpf_get_error(obj->btf_vmlinux);
+	if (err) {
 		pr_warn("Error loading vmlinux BTF: %d\n", err);
 		obj->btf_vmlinux = NULL;
 		return err;
@@ -2751,8 +2749,9 @@ static int bpf_object__sanitize_and_load_btf(struct bpf_object *obj)
 		/* clone BTF to sanitize a copy and leave the original intact */
 		raw_data = btf__get_raw_data(obj->btf, &sz);
 		kern_btf = btf__new(raw_data, sz);
-		if (IS_ERR(kern_btf))
-			return PTR_ERR(kern_btf);
+		err = libbpf_get_error(kern_btf);
+		if (err)
+			return err;
 
 		/* enforce 8-byte pointers for BPF-targeted BTFs */
 		btf__set_pointer_size(obj->btf, 8);
@@ -3523,7 +3522,7 @@ bpf_object__find_program_by_title(const struct bpf_object *obj,
 		if (pos->sec_name && !strcmp(pos->sec_name, title))
 			return pos;
 	}
-	return NULL;
+	return errno = ENOENT, NULL;
 }
 
 static bool prog_is_subprog(const struct bpf_object *obj,
@@ -3556,7 +3555,7 @@ bpf_object__find_program_by_name(const struct bpf_object *obj,
 		if (!strcmp(prog->name, name))
 			return prog;
 	}
-	return NULL;
+	return errno = ENOENT, NULL;
 }
 
 static bool bpf_object__shndx_is_data(const struct bpf_object *obj,
@@ -3903,11 +3902,11 @@ int bpf_map__reuse_fd(struct bpf_map *map, int fd)
 
 	err = bpf_obj_get_info_by_fd(fd, &info, &len);
 	if (err)
-		return err;
+		return libbpf_err(err);
 
 	new_name = strdup(info.name);
 	if (!new_name)
-		return -errno;
+		return libbpf_err(-errno);
 
 	new_fd = open("/", O_RDONLY | O_CLOEXEC);
 	if (new_fd < 0) {
@@ -3945,7 +3944,7 @@ err_close_new_fd:
 	close(new_fd);
 err_free_new_name:
 	free(new_name);
-	return err;
+	return libbpf_err(err);
 }
 
 __u32 bpf_map__max_entries(const struct bpf_map *map)
@@ -3956,7 +3955,7 @@ __u32 bpf_map__max_entries(const struct bpf_map *map)
 struct bpf_map *bpf_map__inner_map(struct bpf_map *map)
 {
 	if (!bpf_map_type__is_map_in_map(map->def.type))
-		return NULL;
+		return errno = EINVAL, NULL;
 
 	return map->inner_map;
 }
@@ -3964,7 +3963,7 @@ struct bpf_map *bpf_map__inner_map(struct bpf_map *map)
 int bpf_map__set_max_entries(struct bpf_map *map, __u32 max_entries)
 {
 	if (map->fd >= 0)
-		return -EBUSY;
+		return libbpf_err(-EBUSY);
 	map->def.max_entries = max_entries;
 	return 0;
 }
@@ -3972,7 +3971,7 @@ int bpf_map__set_max_entries(struct bpf_map *map, __u32 max_entries)
 int bpf_map__resize(struct bpf_map *map, __u32 max_entries)
 {
 	if (!map || !max_entries)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	return bpf_map__set_max_entries(map, max_entries);
 }
@@ -5103,10 +5102,10 @@ static int load_module_btfs(struct bpf_object *obj)
 		}
 
 		btf = btf_get_from_fd(fd, obj->btf_vmlinux);
-		if (IS_ERR(btf)) {
-			pr_warn("failed to load module [%s]'s BTF object #%d: %ld\n",
-				name, id, PTR_ERR(btf));
-			err = PTR_ERR(btf);
+		err = libbpf_get_error(btf);
+		if (err) {
+			pr_warn("failed to load module [%s]'s BTF object #%d: %d\n",
+				name, id, err);
 			goto err_out;
 		}
 
@@ -6366,8 +6365,8 @@ bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path)
 
 	if (targ_btf_path) {
 		obj->btf_vmlinux_override = btf__parse(targ_btf_path, NULL);
-		if (IS_ERR_OR_NULL(obj->btf_vmlinux_override)) {
-			err = PTR_ERR(obj->btf_vmlinux_override);
+		err = libbpf_get_error(obj->btf_vmlinux_override);
+		if (err) {
 			pr_warn("failed to parse target BTF: %d\n", err);
 			return err;
 		}
@@ -7424,7 +7423,7 @@ int bpf_program__load(struct bpf_program *prog, char *license, __u32 kern_ver)
 
 	if (prog->obj->loaded) {
 		pr_warn("prog '%s': can't load after object was loaded\n", prog->name);
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	}
 
 	if ((prog->type == BPF_PROG_TYPE_TRACING ||
@@ -7434,7 +7433,7 @@ int bpf_program__load(struct bpf_program *prog, char *license, __u32 kern_ver)
 
 		err = libbpf_find_attach_btf_id(prog, &btf_obj_fd, &btf_type_id);
 		if (err)
-			return err;
+			return libbpf_err(err);
 
 		prog->attach_btf_obj_fd = btf_obj_fd;
 		prog->attach_btf_id = btf_type_id;
@@ -7444,13 +7443,13 @@ int bpf_program__load(struct bpf_program *prog, char *license, __u32 kern_ver)
 		if (prog->preprocessor) {
 			pr_warn("Internal error: can't load program '%s'\n",
 				prog->name);
-			return -LIBBPF_ERRNO__INTERNAL;
+			return libbpf_err(-LIBBPF_ERRNO__INTERNAL);
 		}
 
 		prog->instances.fds = malloc(sizeof(int));
 		if (!prog->instances.fds) {
 			pr_warn("Not enough memory for BPF fds\n");
-			return -ENOMEM;
+			return libbpf_err(-ENOMEM);
 		}
 		prog->instances.nr = 1;
 		prog->instances.fds[0] = -1;
@@ -7509,7 +7508,7 @@ out:
 		pr_warn("failed to load program '%s'\n", prog->name);
 	zfree(&prog->insns);
 	prog->insns_cnt = 0;
-	return err;
+	return libbpf_err(err);
 }
 
 static int
@@ -7642,7 +7641,7 @@ __bpf_object__open_xattr(struct bpf_object_open_attr *attr, int flags)
 
 struct bpf_object *bpf_object__open_xattr(struct bpf_object_open_attr *attr)
 {
-	return __bpf_object__open_xattr(attr, 0);
+	return libbpf_ptr(__bpf_object__open_xattr(attr, 0));
 }
 
 struct bpf_object *bpf_object__open(const char *path)
@@ -7652,18 +7651,18 @@ struct bpf_object *bpf_object__open(const char *path)
 		.prog_type	= BPF_PROG_TYPE_UNSPEC,
 	};
 
-	return bpf_object__open_xattr(&attr);
+	return libbpf_ptr(__bpf_object__open_xattr(&attr, 0));
 }
 
 struct bpf_object *
 bpf_object__open_file(const char *path, const struct bpf_object_open_opts *opts)
 {
 	if (!path)
-		return ERR_PTR(-EINVAL);
+		return libbpf_err_ptr(-EINVAL);
 
 	pr_debug("loading %s\n", path);
 
-	return __bpf_object__open(path, NULL, 0, opts);
+	return libbpf_ptr(__bpf_object__open(path, NULL, 0, opts));
 }
 
 struct bpf_object *
@@ -7671,9 +7670,9 @@ bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz,
 		     const struct bpf_object_open_opts *opts)
 {
 	if (!obj_buf || obj_buf_sz == 0)
-		return ERR_PTR(-EINVAL);
+		return libbpf_err_ptr(-EINVAL);
 
-	return __bpf_object__open(NULL, obj_buf, obj_buf_sz, opts);
+	return libbpf_ptr(__bpf_object__open(NULL, obj_buf, obj_buf_sz, opts));
 }
 
 struct bpf_object *
@@ -7688,9 +7687,9 @@ bpf_object__open_buffer(const void *obj_buf, size_t obj_buf_sz,
 
 	/* returning NULL is wrong, but backwards-compatible */
 	if (!obj_buf || obj_buf_sz == 0)
-		return NULL;
+		return errno = EINVAL, NULL;
 
-	return bpf_object__open_mem(obj_buf, obj_buf_sz, &opts);
+	return libbpf_ptr(__bpf_object__open(NULL, obj_buf, obj_buf_sz, &opts));
 }
 
 int bpf_object__unload(struct bpf_object *obj)
@@ -7698,7 +7697,7 @@ int bpf_object__unload(struct bpf_object *obj)
 	size_t i;
 
 	if (!obj)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	for (i = 0; i < obj->nr_maps; i++) {
 		zclose(obj->maps[i].fd);
@@ -8031,14 +8030,14 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr)
 	int err, i;
 
 	if (!attr)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	obj = attr->obj;
 	if (!obj)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	if (obj->loaded) {
 		pr_warn("object '%s': load can't be attempted twice\n", obj->name);
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	}
 
 	if (obj->gen_loader)
@@ -8089,7 +8088,7 @@ out:
 
 	bpf_object__unload(obj);
 	pr_warn("failed to load object '%s'\n", obj->path);
-	return err;
+	return libbpf_err(err);
 }
 
 int bpf_object__load(struct bpf_object *obj)
@@ -8161,28 +8160,28 @@ int bpf_program__pin_instance(struct bpf_program *prog, const char *path,
 
 	err = make_parent_dir(path);
 	if (err)
-		return err;
+		return libbpf_err(err);
 
 	err = check_path(path);
 	if (err)
-		return err;
+		return libbpf_err(err);
 
 	if (prog == NULL) {
 		pr_warn("invalid program pointer\n");
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	}
 
 	if (instance < 0 || instance >= prog->instances.nr) {
 		pr_warn("invalid prog instance %d of prog %s (max %d)\n",
 			instance, prog->name, prog->instances.nr);
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	}
 
 	if (bpf_obj_pin(prog->instances.fds[instance], path)) {
 		err = -errno;
 		cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg));
 		pr_warn("failed to pin program: %s\n", cp);
-		return err;
+		return libbpf_err(err);
 	}
 	pr_debug("pinned program '%s'\n", path);
 
@@ -8196,22 +8195,23 @@ int bpf_program__unpin_instance(struct bpf_program *prog, const char *path,
 
 	err = check_path(path);
 	if (err)
-		return err;
+		return libbpf_err(err);
 
 	if (prog == NULL) {
 		pr_warn("invalid program pointer\n");
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	}
 
 	if (instance < 0 || instance >= prog->instances.nr) {
 		pr_warn("invalid prog instance %d of prog %s (max %d)\n",
 			instance, prog->name, prog->instances.nr);
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	}
 
 	err = unlink(path);
 	if (err != 0)
-		return -errno;
+		return libbpf_err(-errno);
+
 	pr_debug("unpinned program '%s'\n", path);
 
 	return 0;
@@ -8223,20 +8223,20 @@ int bpf_program__pin(struct bpf_program *prog, const char *path)
 
 	err = make_parent_dir(path);
 	if (err)
-		return err;
+		return libbpf_err(err);
 
 	err = check_path(path);
 	if (err)
-		return err;
+		return libbpf_err(err);
 
 	if (prog == NULL) {
 		pr_warn("invalid program pointer\n");
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	}
 
 	if (prog->instances.nr <= 0) {
 		pr_warn("no instances of prog %s to pin\n", prog->name);
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	}
 
 	if (prog->instances.nr == 1) {
@@ -8280,7 +8280,7 @@ err_unpin:
 
 	rmdir(path);
 
-	return err;
+	return libbpf_err(err);
 }
 
 int bpf_program__unpin(struct bpf_program *prog, const char *path)
@@ -8289,16 +8289,16 @@ int bpf_program__unpin(struct bpf_program *prog, const char *path)
 
 	err = check_path(path);
 	if (err)
-		return err;
+		return libbpf_err(err);
 
 	if (prog == NULL) {
 		pr_warn("invalid program pointer\n");
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	}
 
 	if (prog->instances.nr <= 0) {
 		pr_warn("no instances of prog %s to pin\n", prog->name);
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	}
 
 	if (prog->instances.nr == 1) {
@@ -8312,9 +8312,9 @@ int bpf_program__unpin(struct bpf_program *prog, const char *path)
 
 		len = snprintf(buf, PATH_MAX, "%s/%d", path, i);
 		if (len < 0)
-			return -EINVAL;
+			return libbpf_err(-EINVAL);
 		else if (len >= PATH_MAX)
-			return -ENAMETOOLONG;
+			return libbpf_err(-ENAMETOOLONG);
 
 		err = bpf_program__unpin_instance(prog, buf, i);
 		if (err)
@@ -8323,7 +8323,7 @@ int bpf_program__unpin(struct bpf_program *prog, const char *path)
 
 	err = rmdir(path);
 	if (err)
-		return -errno;
+		return libbpf_err(-errno);
 
 	return 0;
 }
@@ -8335,14 +8335,14 @@ int bpf_map__pin(struct bpf_map *map, const char *path)
 
 	if (map == NULL) {
 		pr_warn("invalid map pointer\n");
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	}
 
 	if (map->pin_path) {
 		if (path && strcmp(path, map->pin_path)) {
 			pr_warn("map '%s' already has pin path '%s' different from '%s'\n",
 				bpf_map__name(map), map->pin_path, path);
-			return -EINVAL;
+			return libbpf_err(-EINVAL);
 		} else if (map->pinned) {
 			pr_debug("map '%s' already pinned at '%s'; not re-pinning\n",
 				 bpf_map__name(map), map->pin_path);
@@ -8352,10 +8352,10 @@ int bpf_map__pin(struct bpf_map *map, const char *path)
 		if (!path) {
 			pr_warn("missing a path to pin map '%s' at\n",
 				bpf_map__name(map));
-			return -EINVAL;
+			return libbpf_err(-EINVAL);
 		} else if (map->pinned) {
 			pr_warn("map '%s' already pinned\n", bpf_map__name(map));
-			return -EEXIST;
+			return libbpf_err(-EEXIST);
 		}
 
 		map->pin_path = strdup(path);
@@ -8367,11 +8367,11 @@ int bpf_map__pin(struct bpf_map *map, const char *path)
 
 	err = make_parent_dir(map->pin_path);
 	if (err)
-		return err;
+		return libbpf_err(err);
 
 	err = check_path(map->pin_path);
 	if (err)
-		return err;
+		return libbpf_err(err);
 
 	if (bpf_obj_pin(map->fd, map->pin_path)) {
 		err = -errno;
@@ -8386,7 +8386,7 @@ int bpf_map__pin(struct bpf_map *map, const char *path)
 out_err:
 	cp = libbpf_strerror_r(-err, errmsg, sizeof(errmsg));
 	pr_warn("failed to pin map: %s\n", cp);
-	return err;
+	return libbpf_err(err);
 }
 
 int bpf_map__unpin(struct bpf_map *map, const char *path)
@@ -8395,29 +8395,29 @@ int bpf_map__unpin(struct bpf_map *map, const char *path)
 
 	if (map == NULL) {
 		pr_warn("invalid map pointer\n");
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	}
 
 	if (map->pin_path) {
 		if (path && strcmp(path, map->pin_path)) {
 			pr_warn("map '%s' already has pin path '%s' different from '%s'\n",
 				bpf_map__name(map), map->pin_path, path);
-			return -EINVAL;
+			return libbpf_err(-EINVAL);
 		}
 		path = map->pin_path;
 	} else if (!path) {
 		pr_warn("no path to unpin map '%s' from\n",
 			bpf_map__name(map));
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	}
 
 	err = check_path(path);
 	if (err)
-		return err;
+		return libbpf_err(err);
 
 	err = unlink(path);
 	if (err != 0)
-		return -errno;
+		return libbpf_err(-errno);
 
 	map->pinned = false;
 	pr_debug("unpinned map '%s' from '%s'\n", bpf_map__name(map), path);
@@ -8432,7 +8432,7 @@ int bpf_map__set_pin_path(struct bpf_map *map, const char *path)
 	if (path) {
 		new = strdup(path);
 		if (!new)
-			return -errno;
+			return libbpf_err(-errno);
 	}
 
 	free(map->pin_path);
@@ -8466,11 +8466,11 @@ int bpf_object__pin_maps(struct bpf_object *obj, const char *path)
 	int err;
 
 	if (!obj)
-		return -ENOENT;
+		return libbpf_err(-ENOENT);
 
 	if (!obj->loaded) {
 		pr_warn("object not yet loaded; load it first\n");
-		return -ENOENT;
+		return libbpf_err(-ENOENT);
 	}
 
 	bpf_object__for_each_map(map, obj) {
@@ -8510,7 +8510,7 @@ err_unpin_maps:
 		bpf_map__unpin(map, NULL);
 	}
 
-	return err;
+	return libbpf_err(err);
 }
 
 int bpf_object__unpin_maps(struct bpf_object *obj, const char *path)
@@ -8519,7 +8519,7 @@ int bpf_object__unpin_maps(struct bpf_object *obj, const char *path)
 	int err;
 
 	if (!obj)
-		return -ENOENT;
+		return libbpf_err(-ENOENT);
 
 	bpf_object__for_each_map(map, obj) {
 		char *pin_path = NULL;
@@ -8531,9 +8531,9 @@ int bpf_object__unpin_maps(struct bpf_object *obj, const char *path)
 			len = snprintf(buf, PATH_MAX, "%s/%s", path,
 				       bpf_map__name(map));
 			if (len < 0)
-				return -EINVAL;
+				return libbpf_err(-EINVAL);
 			else if (len >= PATH_MAX)
-				return -ENAMETOOLONG;
+				return libbpf_err(-ENAMETOOLONG);
 			sanitize_pin_path(buf);
 			pin_path = buf;
 		} else if (!map->pin_path) {
@@ -8542,7 +8542,7 @@ int bpf_object__unpin_maps(struct bpf_object *obj, const char *path)
 
 		err = bpf_map__unpin(map, pin_path);
 		if (err)
-			return err;
+			return libbpf_err(err);
 	}
 
 	return 0;
@@ -8554,11 +8554,11 @@ int bpf_object__pin_programs(struct bpf_object *obj, const char *path)
 	int err;
 
 	if (!obj)
-		return -ENOENT;
+		return libbpf_err(-ENOENT);
 
 	if (!obj->loaded) {
 		pr_warn("object not yet loaded; load it first\n");
-		return -ENOENT;
+		return libbpf_err(-ENOENT);
 	}
 
 	bpf_object__for_each_program(prog, obj) {
@@ -8597,7 +8597,7 @@ err_unpin_programs:
 		bpf_program__unpin(prog, buf);
 	}
 
-	return err;
+	return libbpf_err(err);
 }
 
 int bpf_object__unpin_programs(struct bpf_object *obj, const char *path)
@@ -8606,7 +8606,7 @@ int bpf_object__unpin_programs(struct bpf_object *obj, const char *path)
 	int err;
 
 	if (!obj)
-		return -ENOENT;
+		return libbpf_err(-ENOENT);
 
 	bpf_object__for_each_program(prog, obj) {
 		char buf[PATH_MAX];
@@ -8615,13 +8615,13 @@ int bpf_object__unpin_programs(struct bpf_object *obj, const char *path)
 		len = snprintf(buf, PATH_MAX, "%s/%s", path,
 			       prog->pin_name);
 		if (len < 0)
-			return -EINVAL;
+			return libbpf_err(-EINVAL);
 		else if (len >= PATH_MAX)
-			return -ENAMETOOLONG;
+			return libbpf_err(-ENAMETOOLONG);
 
 		err = bpf_program__unpin(prog, buf);
 		if (err)
-			return err;
+			return libbpf_err(err);
 	}
 
 	return 0;
@@ -8633,12 +8633,12 @@ int bpf_object__pin(struct bpf_object *obj, const char *path)
 
 	err = bpf_object__pin_maps(obj, path);
 	if (err)
-		return err;
+		return libbpf_err(err);
 
 	err = bpf_object__pin_programs(obj, path);
 	if (err) {
 		bpf_object__unpin_maps(obj, path);
-		return err;
+		return libbpf_err(err);
 	}
 
 	return 0;
@@ -8735,7 +8735,7 @@ bpf_object__next(struct bpf_object *prev)
 
 const char *bpf_object__name(const struct bpf_object *obj)
 {
-	return obj ? obj->name : ERR_PTR(-EINVAL);
+	return obj ? obj->name : libbpf_err_ptr(-EINVAL);
 }
 
 unsigned int bpf_object__kversion(const struct bpf_object *obj)
@@ -8756,7 +8756,7 @@ int bpf_object__btf_fd(const struct bpf_object *obj)
 int bpf_object__set_kversion(struct bpf_object *obj, __u32 kern_version)
 {
 	if (obj->loaded)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	obj->kern_version = kern_version;
 
@@ -8776,7 +8776,7 @@ int bpf_object__set_priv(struct bpf_object *obj, void *priv,
 
 void *bpf_object__priv(const struct bpf_object *obj)
 {
-	return obj ? obj->priv : ERR_PTR(-EINVAL);
+	return obj ? obj->priv : libbpf_err_ptr(-EINVAL);
 }
 
 int bpf_object__gen_loader(struct bpf_object *obj, struct gen_loader_opts *opts)
@@ -8812,7 +8812,7 @@ __bpf_program__iter(const struct bpf_program *p, const struct bpf_object *obj,
 
 	if (p->obj != obj) {
 		pr_warn("error: program handler doesn't match object\n");
-		return NULL;
+		return errno = EINVAL, NULL;
 	}
 
 	idx = (p - obj->programs) + (forward ? 1 : -1);
@@ -8858,7 +8858,7 @@ int bpf_program__set_priv(struct bpf_program *prog, void *priv,
 
 void *bpf_program__priv(const struct bpf_program *prog)
 {
-	return prog ? prog->priv : ERR_PTR(-EINVAL);
+	return prog ? prog->priv : libbpf_err_ptr(-EINVAL);
 }
 
 void bpf_program__set_ifindex(struct bpf_program *prog, __u32 ifindex)
@@ -8885,7 +8885,7 @@ const char *bpf_program__title(const struct bpf_program *prog, bool needs_copy)
 		title = strdup(title);
 		if (!title) {
 			pr_warn("failed to strdup program title\n");
-			return ERR_PTR(-ENOMEM);
+			return libbpf_err_ptr(-ENOMEM);
 		}
 	}
 
@@ -8900,7 +8900,7 @@ bool bpf_program__autoload(const struct bpf_program *prog)
 int bpf_program__set_autoload(struct bpf_program *prog, bool autoload)
 {
 	if (prog->obj->loaded)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	prog->load = autoload;
 	return 0;
@@ -8922,17 +8922,17 @@ int bpf_program__set_prep(struct bpf_program *prog, int nr_instances,
 	int *instances_fds;
 
 	if (nr_instances <= 0 || !prep)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	if (prog->instances.nr > 0 || prog->instances.fds) {
 		pr_warn("Can't set pre-processor after loading\n");
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	}
 
 	instances_fds = malloc(sizeof(int) * nr_instances);
 	if (!instances_fds) {
 		pr_warn("alloc memory failed for fds\n");
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 	}
 
 	/* fill all fd with -1 */
@@ -8949,19 +8949,19 @@ int bpf_program__nth_fd(const struct bpf_program *prog, int n)
 	int fd;
 
 	if (!prog)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	if (n >= prog->instances.nr || n < 0) {
 		pr_warn("Can't get the %dth fd from program %s: only %d instances\n",
 			n, prog->name, prog->instances.nr);
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	}
 
 	fd = prog->instances.fds[n];
 	if (fd < 0) {
 		pr_warn("%dth instance of program '%s' is invalid\n",
 			n, prog->name);
-		return -ENOENT;
+		return libbpf_err(-ENOENT);
 	}
 
 	return fd;
@@ -8987,7 +8987,7 @@ static bool bpf_program__is_type(const struct bpf_program *prog,
 int bpf_program__set_##NAME(struct bpf_program *prog)		\
 {								\
 	if (!prog)						\
-		return -EINVAL;					\
+		return libbpf_err(-EINVAL);			\
 	bpf_program__set_type(prog, TYPE);			\
 	return 0;						\
 }								\
@@ -9274,7 +9274,7 @@ int libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type,
 	char *type_names;
 
 	if (!name)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	sec_def = find_sec_def(name);
 	if (sec_def) {
@@ -9290,7 +9290,7 @@ int libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type,
 		free(type_names);
 	}
 
-	return -ESRCH;
+	return libbpf_err(-ESRCH);
 }
 
 static struct bpf_map *find_struct_ops_map_by_offset(struct bpf_object *obj,
@@ -9488,9 +9488,10 @@ int libbpf_find_vmlinux_btf_id(const char *name,
 	int err;
 
 	btf = libbpf_find_kernel_btf();
-	if (IS_ERR(btf)) {
+	err = libbpf_get_error(btf);
+	if (err) {
 		pr_warn("vmlinux BTF is not found\n");
-		return -EINVAL;
+		return libbpf_err(err);
 	}
 
 	err = find_attach_btf_id(btf, name, attach_type);
@@ -9498,7 +9499,7 @@ int libbpf_find_vmlinux_btf_id(const char *name,
 		pr_warn("%s is not found in vmlinux BTF\n", name);
 
 	btf__free(btf);
-	return err;
+	return libbpf_err(err);
 }
 
 static int libbpf_find_prog_btf_id(const char *name, __u32 attach_prog_fd)
@@ -9509,10 +9510,11 @@ static int libbpf_find_prog_btf_id(const char *name, __u32 attach_prog_fd)
 	int err = -EINVAL;
 
 	info_linear = bpf_program__get_prog_info_linear(attach_prog_fd, 0);
-	if (IS_ERR_OR_NULL(info_linear)) {
+	err = libbpf_get_error(info_linear);
+	if (err) {
 		pr_warn("failed get_prog_info_linear for FD %d\n",
 			attach_prog_fd);
-		return -EINVAL;
+		return err;
 	}
 	info = &info_linear->info;
 	if (!info->btf_id) {
@@ -9633,13 +9635,13 @@ int libbpf_attach_type_by_name(const char *name,
 	int i;
 
 	if (!name)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	for (i = 0; i < ARRAY_SIZE(section_defs); i++) {
 		if (strncmp(name, section_defs[i].sec, section_defs[i].len))
 			continue;
 		if (!section_defs[i].is_attachable)
-			return -EINVAL;
+			return libbpf_err(-EINVAL);
 		*attach_type = section_defs[i].expected_attach_type;
 		return 0;
 	}
@@ -9650,17 +9652,17 @@ int libbpf_attach_type_by_name(const char *name,
 		free(type_names);
 	}
 
-	return -EINVAL;
+	return libbpf_err(-EINVAL);
 }
 
 int bpf_map__fd(const struct bpf_map *map)
 {
-	return map ? map->fd : -EINVAL;
+	return map ? map->fd : libbpf_err(-EINVAL);
 }
 
 const struct bpf_map_def *bpf_map__def(const struct bpf_map *map)
 {
-	return map ? &map->def : ERR_PTR(-EINVAL);
+	return map ? &map->def : libbpf_err_ptr(-EINVAL);
 }
 
 const char *bpf_map__name(const struct bpf_map *map)
@@ -9676,7 +9678,7 @@ enum bpf_map_type bpf_map__type(const struct bpf_map *map)
 int bpf_map__set_type(struct bpf_map *map, enum bpf_map_type type)
 {
 	if (map->fd >= 0)
-		return -EBUSY;
+		return libbpf_err(-EBUSY);
 	map->def.type = type;
 	return 0;
 }
@@ -9689,7 +9691,7 @@ __u32 bpf_map__map_flags(const struct bpf_map *map)
 int bpf_map__set_map_flags(struct bpf_map *map, __u32 flags)
 {
 	if (map->fd >= 0)
-		return -EBUSY;
+		return libbpf_err(-EBUSY);
 	map->def.map_flags = flags;
 	return 0;
 }
@@ -9702,7 +9704,7 @@ __u32 bpf_map__numa_node(const struct bpf_map *map)
 int bpf_map__set_numa_node(struct bpf_map *map, __u32 numa_node)
 {
 	if (map->fd >= 0)
-		return -EBUSY;
+		return libbpf_err(-EBUSY);
 	map->numa_node = numa_node;
 	return 0;
 }
@@ -9715,7 +9717,7 @@ __u32 bpf_map__key_size(const struct bpf_map *map)
 int bpf_map__set_key_size(struct bpf_map *map, __u32 size)
 {
 	if (map->fd >= 0)
-		return -EBUSY;
+		return libbpf_err(-EBUSY);
 	map->def.key_size = size;
 	return 0;
 }
@@ -9728,7 +9730,7 @@ __u32 bpf_map__value_size(const struct bpf_map *map)
 int bpf_map__set_value_size(struct bpf_map *map, __u32 size)
 {
 	if (map->fd >= 0)
-		return -EBUSY;
+		return libbpf_err(-EBUSY);
 	map->def.value_size = size;
 	return 0;
 }
@@ -9747,7 +9749,7 @@ int bpf_map__set_priv(struct bpf_map *map, void *priv,
 		     bpf_map_clear_priv_t clear_priv)
 {
 	if (!map)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	if (map->priv) {
 		if (map->clear_priv)
@@ -9761,7 +9763,7 @@ int bpf_map__set_priv(struct bpf_map *map, void *priv,
 
 void *bpf_map__priv(const struct bpf_map *map)
 {
-	return map ? map->priv : ERR_PTR(-EINVAL);
+	return map ? map->priv : libbpf_err_ptr(-EINVAL);
 }
 
 int bpf_map__set_initial_value(struct bpf_map *map,
@@ -9769,7 +9771,7 @@ int bpf_map__set_initial_value(struct bpf_map *map,
 {
 	if (!map->mmaped || map->libbpf_type == LIBBPF_MAP_KCONFIG ||
 	    size != map->def.value_size || map->fd >= 0)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	memcpy(map->mmaped, data, size);
 	return 0;
@@ -9801,7 +9803,7 @@ __u32 bpf_map__ifindex(const struct bpf_map *map)
 int bpf_map__set_ifindex(struct bpf_map *map, __u32 ifindex)
 {
 	if (map->fd >= 0)
-		return -EBUSY;
+		return libbpf_err(-EBUSY);
 	map->map_ifindex = ifindex;
 	return 0;
 }
@@ -9810,11 +9812,11 @@ int bpf_map__set_inner_map_fd(struct bpf_map *map, int fd)
 {
 	if (!bpf_map_type__is_map_in_map(map->def.type)) {
 		pr_warn("error: unsupported map type\n");
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	}
 	if (map->inner_map_fd != -1) {
 		pr_warn("error: inner_map_fd already specified\n");
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	}
 	zfree(&map->inner_map);
 	map->inner_map_fd = fd;
@@ -9828,7 +9830,7 @@ __bpf_map__iter(const struct bpf_map *m, const struct bpf_object *obj, int i)
 	struct bpf_map *s, *e;
 
 	if (!obj || !obj->maps)
-		return NULL;
+		return errno = EINVAL, NULL;
 
 	s = obj->maps;
 	e = obj->maps + obj->nr_maps;
@@ -9836,7 +9838,7 @@ __bpf_map__iter(const struct bpf_map *m, const struct bpf_object *obj, int i)
 	if ((m < s) || (m >= e)) {
 		pr_warn("error in %s: map handler doesn't belong to object\n",
 			 __func__);
-		return NULL;
+		return errno = EINVAL, NULL;
 	}
 
 	idx = (m - obj->maps) + i;
@@ -9875,7 +9877,7 @@ bpf_object__find_map_by_name(const struct bpf_object *obj, const char *name)
 		if (pos->name && !strcmp(pos->name, name))
 			return pos;
 	}
-	return NULL;
+	return errno = ENOENT, NULL;
 }
 
 int
@@ -9887,12 +9889,23 @@ bpf_object__find_map_fd_by_name(const struct bpf_object *obj, const char *name)
 struct bpf_map *
 bpf_object__find_map_by_offset(struct bpf_object *obj, size_t offset)
 {
-	return ERR_PTR(-ENOTSUP);
+	return libbpf_err_ptr(-ENOTSUP);
 }
 
 long libbpf_get_error(const void *ptr)
 {
-	return PTR_ERR_OR_ZERO(ptr);
+	if (!IS_ERR_OR_NULL(ptr))
+		return 0;
+
+	if (IS_ERR(ptr))
+		errno = -PTR_ERR(ptr);
+
+	/* If ptr == NULL, then errno should be already set by the failing
+	 * API, because libbpf never returns NULL on success and it now always
+	 * sets errno on error. So no extra errno handling for ptr == NULL
+	 * case.
+	 */
+	return -errno;
 }
 
 int bpf_prog_load(const char *file, enum bpf_prog_type type,
@@ -9918,16 +9931,17 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr,
 	int err;
 
 	if (!attr)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	if (!attr->file)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	open_attr.file = attr->file;
 	open_attr.prog_type = attr->prog_type;
 
 	obj = bpf_object__open_xattr(&open_attr);
-	if (IS_ERR_OR_NULL(obj))
-		return -ENOENT;
+	err = libbpf_get_error(obj);
+	if (err)
+		return libbpf_err(-ENOENT);
 
 	bpf_object__for_each_program(prog, obj) {
 		enum bpf_attach_type attach_type = attr->expected_attach_type;
@@ -9947,7 +9961,7 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr,
 			 * didn't provide a fallback type, too bad...
 			 */
 			bpf_object__close(obj);
-			return -EINVAL;
+			return libbpf_err(-EINVAL);
 		}
 
 		prog->prog_ifindex = attr->ifindex;
@@ -9965,13 +9979,13 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr,
 	if (!first_prog) {
 		pr_warn("object file doesn't contain bpf program\n");
 		bpf_object__close(obj);
-		return -ENOENT;
+		return libbpf_err(-ENOENT);
 	}
 
 	err = bpf_object__load(obj);
 	if (err) {
 		bpf_object__close(obj);
-		return err;
+		return libbpf_err(err);
 	}
 
 	*pobj = obj;
@@ -9990,7 +10004,10 @@ struct bpf_link {
 /* Replace link's underlying BPF program with the new one */
 int bpf_link__update_program(struct bpf_link *link, struct bpf_program *prog)
 {
-	return bpf_link_update(bpf_link__fd(link), bpf_program__fd(prog), NULL);
+	int ret;
+	
+	ret = bpf_link_update(bpf_link__fd(link), bpf_program__fd(prog), NULL);
+	return libbpf_err_errno(ret);
 }
 
 /* Release "ownership" of underlying BPF resource (typically, BPF program
@@ -10023,7 +10040,7 @@ int bpf_link__destroy(struct bpf_link *link)
 		free(link->pin_path);
 	free(link);
 
-	return err;
+	return libbpf_err(err);
 }
 
 int bpf_link__fd(const struct bpf_link *link)
@@ -10038,7 +10055,7 @@ const char *bpf_link__pin_path(const struct bpf_link *link)
 
 static int bpf_link__detach_fd(struct bpf_link *link)
 {
-	return close(link->fd);
+	return libbpf_err_errno(close(link->fd));
 }
 
 struct bpf_link *bpf_link__open(const char *path)
@@ -10050,13 +10067,13 @@ struct bpf_link *bpf_link__open(const char *path)
 	if (fd < 0) {
 		fd = -errno;
 		pr_warn("failed to open link at %s: %d\n", path, fd);
-		return ERR_PTR(fd);
+		return libbpf_err_ptr(fd);
 	}
 
 	link = calloc(1, sizeof(*link));
 	if (!link) {
 		close(fd);
-		return ERR_PTR(-ENOMEM);
+		return libbpf_err_ptr(-ENOMEM);
 	}
 	link->detach = &bpf_link__detach_fd;
 	link->fd = fd;
@@ -10064,7 +10081,7 @@ struct bpf_link *bpf_link__open(const char *path)
 	link->pin_path = strdup(path);
 	if (!link->pin_path) {
 		bpf_link__destroy(link);
-		return ERR_PTR(-ENOMEM);
+		return libbpf_err_ptr(-ENOMEM);
 	}
 
 	return link;
@@ -10080,22 +10097,22 @@ int bpf_link__pin(struct bpf_link *link, const char *path)
 	int err;
 
 	if (link->pin_path)
-		return -EBUSY;
+		return libbpf_err(-EBUSY);
 	err = make_parent_dir(path);
 	if (err)
-		return err;
+		return libbpf_err(err);
 	err = check_path(path);
 	if (err)
-		return err;
+		return libbpf_err(err);
 
 	link->pin_path = strdup(path);
 	if (!link->pin_path)
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 
 	if (bpf_obj_pin(link->fd, link->pin_path)) {
 		err = -errno;
 		zfree(&link->pin_path);
-		return err;
+		return libbpf_err(err);
 	}
 
 	pr_debug("link fd=%d: pinned at %s\n", link->fd, link->pin_path);
@@ -10107,11 +10124,11 @@ int bpf_link__unpin(struct bpf_link *link)
 	int err;
 
 	if (!link->pin_path)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	err = unlink(link->pin_path);
 	if (err != 0)
-		return -errno;
+		return libbpf_err_errno(err);
 
 	pr_debug("link fd=%d: unpinned from %s\n", link->fd, link->pin_path);
 	zfree(&link->pin_path);
@@ -10127,11 +10144,10 @@ static int bpf_link__detach_perf_event(struct bpf_link *link)
 		err = -errno;
 
 	close(link->fd);
-	return err;
+	return libbpf_err(err);
 }
 
-struct bpf_link *bpf_program__attach_perf_event(struct bpf_program *prog,
-						int pfd)
+struct bpf_link *bpf_program__attach_perf_event(struct bpf_program *prog, int pfd)
 {
 	char errmsg[STRERR_BUFSIZE];
 	struct bpf_link *link;
@@ -10140,18 +10156,18 @@ struct bpf_link *bpf_program__attach_perf_event(struct bpf_program *prog,
 	if (pfd < 0) {
 		pr_warn("prog '%s': invalid perf event FD %d\n",
 			prog->name, pfd);
-		return ERR_PTR(-EINVAL);
+		return libbpf_err_ptr(-EINVAL);
 	}
 	prog_fd = bpf_program__fd(prog);
 	if (prog_fd < 0) {
 		pr_warn("prog '%s': can't attach BPF program w/o FD (did you load it?)\n",
 			prog->name);
-		return ERR_PTR(-EINVAL);
+		return libbpf_err_ptr(-EINVAL);
 	}
 
 	link = calloc(1, sizeof(*link));
 	if (!link)
-		return ERR_PTR(-ENOMEM);
+		return libbpf_err_ptr(-ENOMEM);
 	link->detach = &bpf_link__detach_perf_event;
 	link->fd = pfd;
 
@@ -10163,14 +10179,14 @@ struct bpf_link *bpf_program__attach_perf_event(struct bpf_program *prog,
 		if (err == -EPROTO)
 			pr_warn("prog '%s': try add PERF_SAMPLE_CALLCHAIN to or remove exclude_callchain_[kernel|user] from pfd %d\n",
 				prog->name, pfd);
-		return ERR_PTR(err);
+		return libbpf_err_ptr(err);
 	}
 	if (ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0) < 0) {
 		err = -errno;
 		free(link);
 		pr_warn("prog '%s': failed to enable pfd %d: %s\n",
 			prog->name, pfd, libbpf_strerror_r(err, errmsg, sizeof(errmsg)));
-		return ERR_PTR(err);
+		return libbpf_err_ptr(err);
 	}
 	return link;
 }
@@ -10294,16 +10310,16 @@ struct bpf_link *bpf_program__attach_kprobe(struct bpf_program *prog,
 		pr_warn("prog '%s': failed to create %s '%s' perf event: %s\n",
 			prog->name, retprobe ? "kretprobe" : "kprobe", func_name,
 			libbpf_strerror_r(pfd, errmsg, sizeof(errmsg)));
-		return ERR_PTR(pfd);
+		return libbpf_err_ptr(pfd);
 	}
 	link = bpf_program__attach_perf_event(prog, pfd);
-	if (IS_ERR(link)) {
+	err = libbpf_get_error(link);
+	if (err) {
 		close(pfd);
-		err = PTR_ERR(link);
 		pr_warn("prog '%s': failed to attach to %s '%s': %s\n",
 			prog->name, retprobe ? "kretprobe" : "kprobe", func_name,
 			libbpf_strerror_r(err, errmsg, sizeof(errmsg)));
-		return link;
+		return libbpf_err_ptr(err);
 	}
 	return link;
 }
@@ -10336,17 +10352,17 @@ struct bpf_link *bpf_program__attach_uprobe(struct bpf_program *prog,
 			prog->name, retprobe ? "uretprobe" : "uprobe",
 			binary_path, func_offset,
 			libbpf_strerror_r(pfd, errmsg, sizeof(errmsg)));
-		return ERR_PTR(pfd);
+		return libbpf_err_ptr(pfd);
 	}
 	link = bpf_program__attach_perf_event(prog, pfd);
-	if (IS_ERR(link)) {
+	err = libbpf_get_error(link);
+	if (err) {
 		close(pfd);
-		err = PTR_ERR(link);
 		pr_warn("prog '%s': failed to attach to %s '%s:0x%zx': %s\n",
 			prog->name, retprobe ? "uretprobe" : "uprobe",
 			binary_path, func_offset,
 			libbpf_strerror_r(err, errmsg, sizeof(errmsg)));
-		return link;
+		return libbpf_err_ptr(err);
 	}
 	return link;
 }
@@ -10414,16 +10430,16 @@ struct bpf_link *bpf_program__attach_tracepoint(struct bpf_program *prog,
 		pr_warn("prog '%s': failed to create tracepoint '%s/%s' perf event: %s\n",
 			prog->name, tp_category, tp_name,
 			libbpf_strerror_r(pfd, errmsg, sizeof(errmsg)));
-		return ERR_PTR(pfd);
+		return libbpf_err_ptr(pfd);
 	}
 	link = bpf_program__attach_perf_event(prog, pfd);
-	if (IS_ERR(link)) {
+	err = libbpf_get_error(link);
+	if (err) {
 		close(pfd);
-		err = PTR_ERR(link);
 		pr_warn("prog '%s': failed to attach to tracepoint '%s/%s': %s\n",
 			prog->name, tp_category, tp_name,
 			libbpf_strerror_r(err, errmsg, sizeof(errmsg)));
-		return link;
+		return libbpf_err_ptr(err);
 	}
 	return link;
 }
@@ -10436,20 +10452,19 @@ static struct bpf_link *attach_tp(const struct bpf_sec_def *sec,
 
 	sec_name = strdup(prog->sec_name);
 	if (!sec_name)
-		return ERR_PTR(-ENOMEM);
+		return libbpf_err_ptr(-ENOMEM);
 
 	/* extract "tp/<category>/<name>" */
 	tp_cat = sec_name + sec->len;
 	tp_name = strchr(tp_cat, '/');
 	if (!tp_name) {
-		link = ERR_PTR(-EINVAL);
-		goto out;
+		free(sec_name);
+		return libbpf_err_ptr(-EINVAL);
 	}
 	*tp_name = '\0';
 	tp_name++;
 
 	link = bpf_program__attach_tracepoint(prog, tp_cat, tp_name);
-out:
 	free(sec_name);
 	return link;
 }
@@ -10464,12 +10479,12 @@ struct bpf_link *bpf_program__attach_raw_tracepoint(struct bpf_program *prog,
 	prog_fd = bpf_program__fd(prog);
 	if (prog_fd < 0) {
 		pr_warn("prog '%s': can't attach before loaded\n", prog->name);
-		return ERR_PTR(-EINVAL);
+		return libbpf_err_ptr(-EINVAL);
 	}
 
 	link = calloc(1, sizeof(*link));
 	if (!link)
-		return ERR_PTR(-ENOMEM);
+		return libbpf_err_ptr(-ENOMEM);
 	link->detach = &bpf_link__detach_fd;
 
 	pfd = bpf_raw_tracepoint_open(tp_name, prog_fd);
@@ -10478,7 +10493,7 @@ struct bpf_link *bpf_program__attach_raw_tracepoint(struct bpf_program *prog,
 		free(link);
 		pr_warn("prog '%s': failed to attach to raw tracepoint '%s': %s\n",
 			prog->name, tp_name, libbpf_strerror_r(pfd, errmsg, sizeof(errmsg)));
-		return ERR_PTR(pfd);
+		return libbpf_err_ptr(pfd);
 	}
 	link->fd = pfd;
 	return link;
@@ -10502,12 +10517,12 @@ static struct bpf_link *bpf_program__attach_btf_id(struct bpf_program *prog)
 	prog_fd = bpf_program__fd(prog);
 	if (prog_fd < 0) {
 		pr_warn("prog '%s': can't attach before loaded\n", prog->name);
-		return ERR_PTR(-EINVAL);
+		return libbpf_err_ptr(-EINVAL);
 	}
 
 	link = calloc(1, sizeof(*link));
 	if (!link)
-		return ERR_PTR(-ENOMEM);
+		return libbpf_err_ptr(-ENOMEM);
 	link->detach = &bpf_link__detach_fd;
 
 	pfd = bpf_raw_tracepoint_open(NULL, prog_fd);
@@ -10516,7 +10531,7 @@ static struct bpf_link *bpf_program__attach_btf_id(struct bpf_program *prog)
 		free(link);
 		pr_warn("prog '%s': failed to attach: %s\n",
 			prog->name, libbpf_strerror_r(pfd, errmsg, sizeof(errmsg)));
-		return ERR_PTR(pfd);
+		return libbpf_err_ptr(pfd);
 	}
 	link->fd = pfd;
 	return (struct bpf_link *)link;
@@ -10544,12 +10559,6 @@ static struct bpf_link *attach_lsm(const struct bpf_sec_def *sec,
 	return bpf_program__attach_lsm(prog);
 }
 
-static struct bpf_link *attach_iter(const struct bpf_sec_def *sec,
-				    struct bpf_program *prog)
-{
-	return bpf_program__attach_iter(prog, NULL);
-}
-
 static struct bpf_link *
 bpf_program__attach_fd(struct bpf_program *prog, int target_fd, int btf_id,
 		       const char *target_name)
@@ -10564,12 +10573,12 @@ bpf_program__attach_fd(struct bpf_program *prog, int target_fd, int btf_id,
 	prog_fd = bpf_program__fd(prog);
 	if (prog_fd < 0) {
 		pr_warn("prog '%s': can't attach before loaded\n", prog->name);
-		return ERR_PTR(-EINVAL);
+		return libbpf_err_ptr(-EINVAL);
 	}
 
 	link = calloc(1, sizeof(*link));
 	if (!link)
-		return ERR_PTR(-ENOMEM);
+		return libbpf_err_ptr(-ENOMEM);
 	link->detach = &bpf_link__detach_fd;
 
 	attach_type = bpf_program__get_expected_attach_type(prog);
@@ -10580,7 +10589,7 @@ bpf_program__attach_fd(struct bpf_program *prog, int target_fd, int btf_id,
 		pr_warn("prog '%s': failed to attach to %s: %s\n",
 			prog->name, target_name,
 			libbpf_strerror_r(link_fd, errmsg, sizeof(errmsg)));
-		return ERR_PTR(link_fd);
+		return libbpf_err_ptr(link_fd);
 	}
 	link->fd = link_fd;
 	return link;
@@ -10613,19 +10622,19 @@ struct bpf_link *bpf_program__attach_freplace(struct bpf_program *prog,
 	if (!!target_fd != !!attach_func_name) {
 		pr_warn("prog '%s': supply none or both of target_fd and attach_func_name\n",
 			prog->name);
-		return ERR_PTR(-EINVAL);
+		return libbpf_err_ptr(-EINVAL);
 	}
 
 	if (prog->type != BPF_PROG_TYPE_EXT) {
 		pr_warn("prog '%s': only BPF_PROG_TYPE_EXT can attach as freplace",
 			prog->name);
-		return ERR_PTR(-EINVAL);
+		return libbpf_err_ptr(-EINVAL);
 	}
 
 	if (target_fd) {
 		btf_id = libbpf_find_prog_btf_id(attach_func_name, target_fd);
 		if (btf_id < 0)
-			return ERR_PTR(btf_id);
+			return libbpf_err_ptr(btf_id);
 
 		return bpf_program__attach_fd(prog, target_fd, btf_id, "freplace");
 	} else {
@@ -10647,7 +10656,7 @@ bpf_program__attach_iter(struct bpf_program *prog,
 	__u32 target_fd = 0;
 
 	if (!OPTS_VALID(opts, bpf_iter_attach_opts))
-		return ERR_PTR(-EINVAL);
+		return libbpf_err_ptr(-EINVAL);
 
 	link_create_opts.iter_info = OPTS_GET(opts, link_info, (void *)0);
 	link_create_opts.iter_info_len = OPTS_GET(opts, link_info_len, 0);
@@ -10655,12 +10664,12 @@ bpf_program__attach_iter(struct bpf_program *prog,
 	prog_fd = bpf_program__fd(prog);
 	if (prog_fd < 0) {
 		pr_warn("prog '%s': can't attach before loaded\n", prog->name);
-		return ERR_PTR(-EINVAL);
+		return libbpf_err_ptr(-EINVAL);
 	}
 
 	link = calloc(1, sizeof(*link));
 	if (!link)
-		return ERR_PTR(-ENOMEM);
+		return libbpf_err_ptr(-ENOMEM);
 	link->detach = &bpf_link__detach_fd;
 
 	link_fd = bpf_link_create(prog_fd, target_fd, BPF_TRACE_ITER,
@@ -10670,19 +10679,25 @@ bpf_program__attach_iter(struct bpf_program *prog,
 		free(link);
 		pr_warn("prog '%s': failed to attach to iterator: %s\n",
 			prog->name, libbpf_strerror_r(link_fd, errmsg, sizeof(errmsg)));
-		return ERR_PTR(link_fd);
+		return libbpf_err_ptr(link_fd);
 	}
 	link->fd = link_fd;
 	return link;
 }
 
+static struct bpf_link *attach_iter(const struct bpf_sec_def *sec,
+				    struct bpf_program *prog)
+{
+	return bpf_program__attach_iter(prog, NULL);
+}
+
 struct bpf_link *bpf_program__attach(struct bpf_program *prog)
 {
 	const struct bpf_sec_def *sec_def;
 
 	sec_def = find_sec_def(prog->sec_name);
 	if (!sec_def || !sec_def->attach_fn)
-		return ERR_PTR(-ESRCH);
+		return libbpf_err_ptr(-ESRCH);
 
 	return sec_def->attach_fn(sec_def, prog);
 }
@@ -10705,11 +10720,11 @@ struct bpf_link *bpf_map__attach_struct_ops(struct bpf_map *map)
 	int err;
 
 	if (!bpf_map__is_struct_ops(map) || map->fd == -1)
-		return ERR_PTR(-EINVAL);
+		return libbpf_err_ptr(-EINVAL);
 
 	link = calloc(1, sizeof(*link));
 	if (!link)
-		return ERR_PTR(-EINVAL);
+		return libbpf_err_ptr(-EINVAL);
 
 	st_ops = map->st_ops;
 	for (i = 0; i < btf_vlen(st_ops->type); i++) {
@@ -10729,7 +10744,7 @@ struct bpf_link *bpf_map__attach_struct_ops(struct bpf_map *map)
 	if (err) {
 		err = -errno;
 		free(link);
-		return ERR_PTR(err);
+		return libbpf_err_ptr(err);
 	}
 
 	link->detach = bpf_link__detach_struct_ops;
@@ -10783,7 +10798,7 @@ bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size,
 	}
 
 	ring_buffer_write_tail(header, data_tail);
-	return ret;
+	return libbpf_err(ret);
 }
 
 struct perf_buffer;
@@ -10936,7 +10951,7 @@ struct perf_buffer *perf_buffer__new(int map_fd, size_t page_cnt,
 	p.lost_cb = opts ? opts->lost_cb : NULL;
 	p.ctx = opts ? opts->ctx : NULL;
 
-	return __perf_buffer__new(map_fd, page_cnt, &p);
+	return libbpf_ptr(__perf_buffer__new(map_fd, page_cnt, &p));
 }
 
 struct perf_buffer *
@@ -10952,7 +10967,7 @@ perf_buffer__new_raw(int map_fd, size_t page_cnt,
 	p.cpus = opts->cpus;
 	p.map_keys = opts->map_keys;
 
-	return __perf_buffer__new(map_fd, page_cnt, &p);
+	return libbpf_ptr(__perf_buffer__new(map_fd, page_cnt, &p));
 }
 
 static struct perf_buffer *__perf_buffer__new(int map_fd, size_t page_cnt,
@@ -11173,16 +11188,19 @@ int perf_buffer__poll(struct perf_buffer *pb, int timeout_ms)
 	int i, cnt, err;
 
 	cnt = epoll_wait(pb->epoll_fd, pb->events, pb->cpu_cnt, timeout_ms);
+	if (cnt < 0)
+		return libbpf_err_errno(cnt);
+
 	for (i = 0; i < cnt; i++) {
 		struct perf_cpu_buf *cpu_buf = pb->events[i].data.ptr;
 
 		err = perf_buffer__process_records(pb, cpu_buf);
 		if (err) {
 			pr_warn("error while processing records: %d\n", err);
-			return err;
+			return libbpf_err(err);
 		}
 	}
-	return cnt < 0 ? -errno : cnt;
+	return cnt;
 }
 
 /* Return number of PERF_EVENT_ARRAY map slots set up by this perf_buffer
@@ -11203,11 +11221,11 @@ int perf_buffer__buffer_fd(const struct perf_buffer *pb, size_t buf_idx)
 	struct perf_cpu_buf *cpu_buf;
 
 	if (buf_idx >= pb->cpu_cnt)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	cpu_buf = pb->cpu_bufs[buf_idx];
 	if (!cpu_buf)
-		return -ENOENT;
+		return libbpf_err(-ENOENT);
 
 	return cpu_buf->fd;
 }
@@ -11225,11 +11243,11 @@ int perf_buffer__consume_buffer(struct perf_buffer *pb, size_t buf_idx)
 	struct perf_cpu_buf *cpu_buf;
 
 	if (buf_idx >= pb->cpu_cnt)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	cpu_buf = pb->cpu_bufs[buf_idx];
 	if (!cpu_buf)
-		return -ENOENT;
+		return libbpf_err(-ENOENT);
 
 	return perf_buffer__process_records(pb, cpu_buf);
 }
@@ -11247,7 +11265,7 @@ int perf_buffer__consume(struct perf_buffer *pb)
 		err = perf_buffer__process_records(pb, cpu_buf);
 		if (err) {
 			pr_warn("perf_buffer: failed to process records in buffer #%d: %d\n", i, err);
-			return err;
+			return libbpf_err(err);
 		}
 	}
 	return 0;
@@ -11359,13 +11377,13 @@ bpf_program__get_prog_info_linear(int fd, __u64 arrays)
 	void *ptr;
 
 	if (arrays >> BPF_PROG_INFO_LAST_ARRAY)
-		return ERR_PTR(-EINVAL);
+		return libbpf_err_ptr(-EINVAL);
 
 	/* step 1: get array dimensions */
 	err = bpf_obj_get_info_by_fd(fd, &info, &info_len);
 	if (err) {
 		pr_debug("can't get prog info: %s", strerror(errno));
-		return ERR_PTR(-EFAULT);
+		return libbpf_err_ptr(-EFAULT);
 	}
 
 	/* step 2: calculate total size of all arrays */
@@ -11397,7 +11415,7 @@ bpf_program__get_prog_info_linear(int fd, __u64 arrays)
 	data_len = roundup(data_len, sizeof(__u64));
 	info_linear = malloc(sizeof(struct bpf_prog_info_linear) + data_len);
 	if (!info_linear)
-		return ERR_PTR(-ENOMEM);
+		return libbpf_err_ptr(-ENOMEM);
 
 	/* step 4: fill data to info_linear->info */
 	info_linear->arrays = arrays;
@@ -11429,7 +11447,7 @@ bpf_program__get_prog_info_linear(int fd, __u64 arrays)
 	if (err) {
 		pr_debug("can't get prog info: %s", strerror(errno));
 		free(info_linear);
-		return ERR_PTR(-EFAULT);
+		return libbpf_err_ptr(-EFAULT);
 	}
 
 	/* step 6: verify the data */
@@ -11508,26 +11526,26 @@ int bpf_program__set_attach_target(struct bpf_program *prog,
 	int btf_obj_fd = 0, btf_id = 0, err;
 
 	if (!prog || attach_prog_fd < 0 || !attach_func_name)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	if (prog->obj->loaded)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	if (attach_prog_fd) {
 		btf_id = libbpf_find_prog_btf_id(attach_func_name,
 						 attach_prog_fd);
 		if (btf_id < 0)
-			return btf_id;
+			return libbpf_err(btf_id);
 	} else {
 		/* load btf_vmlinux, if not yet */
 		err = bpf_object__load_vmlinux_btf(prog->obj, true);
 		if (err)
-			return err;
+			return libbpf_err(err);
 		err = find_kernel_btf_id(prog->obj, attach_func_name,
 					 prog->expected_attach_type,
 					 &btf_obj_fd, &btf_id);
 		if (err)
-			return err;
+			return libbpf_err(err);
 	}
 
 	prog->attach_btf_id = btf_id;
@@ -11626,7 +11644,7 @@ int libbpf_num_possible_cpus(void)
 
 	err = parse_cpu_mask_file(fcpu, &mask, &n);
 	if (err)
-		return err;
+		return libbpf_err(err);
 
 	tmp_cpus = 0;
 	for (i = 0; i < n; i++) {
@@ -11646,7 +11664,7 @@ int bpf_object__open_skeleton(struct bpf_object_skeleton *s,
 		.object_name = s->name,
 	);
 	struct bpf_object *obj;
-	int i;
+	int i, err;
 
 	/* Attempt to preserve opts->object_name, unless overriden by user
 	 * explicitly. Overwriting object name for skeletons is discouraged,
@@ -11661,10 +11679,11 @@ int bpf_object__open_skeleton(struct bpf_object_skeleton *s,
 	}
 
 	obj = bpf_object__open_mem(s->data, s->data_sz, &skel_opts);
-	if (IS_ERR(obj)) {
-		pr_warn("failed to initialize skeleton BPF object '%s': %ld\n",
-			s->name, PTR_ERR(obj));
-		return PTR_ERR(obj);
+	err = libbpf_get_error(obj);
+	if (err) {
+		pr_warn("failed to initialize skeleton BPF object '%s': %d\n",
+			s->name, err);
+		return libbpf_err(err);
 	}
 
 	*s->obj = obj;
@@ -11677,7 +11696,7 @@ int bpf_object__open_skeleton(struct bpf_object_skeleton *s,
 		*map = bpf_object__find_map_by_name(obj, name);
 		if (!*map) {
 			pr_warn("failed to find skeleton map '%s'\n", name);
-			return -ESRCH;
+			return libbpf_err(-ESRCH);
 		}
 
 		/* externs shouldn't be pre-setup from user code */
@@ -11692,7 +11711,7 @@ int bpf_object__open_skeleton(struct bpf_object_skeleton *s,
 		*prog = bpf_object__find_program_by_name(obj, name);
 		if (!*prog) {
 			pr_warn("failed to find skeleton program '%s'\n", name);
-			return -ESRCH;
+			return libbpf_err(-ESRCH);
 		}
 	}
 
@@ -11706,7 +11725,7 @@ int bpf_object__load_skeleton(struct bpf_object_skeleton *s)
 	err = bpf_object__load(*s->obj);
 	if (err) {
 		pr_warn("failed to load BPF skeleton '%s': %d\n", s->name, err);
-		return err;
+		return libbpf_err(err);
 	}
 
 	for (i = 0; i < s->map_cnt; i++) {
@@ -11745,7 +11764,7 @@ int bpf_object__load_skeleton(struct bpf_object_skeleton *s)
 			*mmaped = NULL;
 			pr_warn("failed to re-mmap() map '%s': %d\n",
 				 bpf_map__name(map), err);
-			return err;
+			return libbpf_err(err);
 		}
 	}
 
@@ -11754,7 +11773,7 @@ int bpf_object__load_skeleton(struct bpf_object_skeleton *s)
 
 int bpf_object__attach_skeleton(struct bpf_object_skeleton *s)
 {
-	int i;
+	int i, err;
 
 	for (i = 0; i < s->prog_cnt; i++) {
 		struct bpf_program *prog = *s->progs[i].prog;
@@ -11769,10 +11788,11 @@ int bpf_object__attach_skeleton(struct bpf_object_skeleton *s)
 			continue;
 
 		*link = sec_def->attach_fn(sec_def, prog);
-		if (IS_ERR(*link)) {
-			pr_warn("failed to auto-attach program '%s': %ld\n",
-				bpf_program__name(prog), PTR_ERR(*link));
-			return PTR_ERR(*link);
+		err = libbpf_get_error(*link);
+		if (err) {
+			pr_warn("failed to auto-attach program '%s': %d\n",
+				bpf_program__name(prog), err);
+			return libbpf_err(err);
 		}
 	}
 
diff --git a/tools/lib/bpf/libbpf_errno.c b/tools/lib/bpf/libbpf_errno.c
index 0afb51f7a919..96f67a772a1b 100644
--- a/tools/lib/bpf/libbpf_errno.c
+++ b/tools/lib/bpf/libbpf_errno.c
@@ -12,6 +12,7 @@
 #include <string.h>
 
 #include "libbpf.h"
+#include "libbpf_internal.h"
 
 /* make sure libbpf doesn't use kernel-only integer typedefs */
 #pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64
@@ -39,7 +40,7 @@ static const char *libbpf_strerror_table[NR_ERRNO] = {
 int libbpf_strerror(int err, char *buf, size_t size)
 {
 	if (!buf || !size)
-		return -1;
+		return libbpf_err(-EINVAL);
 
 	err = err > 0 ? err : -err;
 
@@ -48,7 +49,7 @@ int libbpf_strerror(int err, char *buf, size_t size)
 
 		ret = strerror_r(err, buf, size);
 		buf[size - 1] = '\0';
-		return ret;
+		return libbpf_err_errno(ret);
 	}
 
 	if (err < __LIBBPF_ERRNO__END) {
@@ -62,5 +63,5 @@ int libbpf_strerror(int err, char *buf, size_t size)
 
 	snprintf(buf, size, "Unknown libbpf error %d", err);
 	buf[size - 1] = '\0';
-	return -1;
+	return libbpf_err(-ENOENT);
 }
diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h
index 8d59683ab246..bc3a6b402e8e 100644
--- a/tools/lib/bpf/libbpf_internal.h
+++ b/tools/lib/bpf/libbpf_internal.h
@@ -462,4 +462,31 @@ static inline int libbpf_err_errno(int ret)
 	return ret;
 }
 
+/* handle error for pointer-returning APIs, err is assumed to be < 0 always */
+static inline void *libbpf_err_ptr(int err)
+{
+	/* set errno on error, this doesn't break anything */
+	errno = -err;
+
+	if (libbpf_mode & LIBBPF_STRICT_CLEAN_PTRS)
+		return NULL;
+
+	/* legacy: encode err as ptr */
+	return ERR_PTR(err);
+}
+
+/* handle pointer-returning APIs' error handling */
+static inline void *libbpf_ptr(void *ret)
+{
+	/* set errno on error, this doesn't break anything */
+	if (IS_ERR(ret))
+		errno = -PTR_ERR(ret);
+
+	if (libbpf_mode & LIBBPF_STRICT_CLEAN_PTRS)
+		return IS_ERR(ret) ? NULL : ret;
+
+	/* legacy: pass-through original pointer */
+	return ret;
+}
+
 #endif /* __LIBBPF_LIBBPF_INTERNAL_H */
diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c
index 1dca41a24f75..10911a8cad0f 100644
--- a/tools/lib/bpf/linker.c
+++ b/tools/lib/bpf/linker.c
@@ -220,16 +220,16 @@ struct bpf_linker *bpf_linker__new(const char *filename, struct bpf_linker_opts
 	int err;
 
 	if (!OPTS_VALID(opts, bpf_linker_opts))
-		return NULL;
+		return errno = EINVAL, NULL;
 
 	if (elf_version(EV_CURRENT) == EV_NONE) {
 		pr_warn_elf("libelf initialization failed");
-		return NULL;
+		return errno = EINVAL, NULL;
 	}
 
 	linker = calloc(1, sizeof(*linker));
 	if (!linker)
-		return NULL;
+		return errno = ENOMEM, NULL;
 
 	linker->fd = -1;
 
@@ -241,7 +241,7 @@ struct bpf_linker *bpf_linker__new(const char *filename, struct bpf_linker_opts
 
 err_out:
 	bpf_linker__free(linker);
-	return NULL;
+	return errno = -err, NULL;
 }
 
 static struct dst_sec *add_dst_sec(struct bpf_linker *linker, const char *sec_name)
@@ -444,10 +444,10 @@ int bpf_linker__add_file(struct bpf_linker *linker, const char *filename,
 	int err = 0;
 
 	if (!OPTS_VALID(opts, bpf_linker_file_opts))
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	if (!linker->elf)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	err = err ?: linker_load_obj_file(linker, filename, opts, &obj);
 	err = err ?: linker_append_sec_data(linker, &obj);
@@ -467,7 +467,7 @@ int bpf_linker__add_file(struct bpf_linker *linker, const char *filename,
 	if (obj.fd >= 0)
 		close(obj.fd);
 
-	return err;
+	return libbpf_err(err);
 }
 
 static bool is_dwarf_sec_name(const char *name)
@@ -2548,11 +2548,11 @@ int bpf_linker__finalize(struct bpf_linker *linker)
 	int err, i;
 
 	if (!linker->elf)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	err = finalize_btf(linker);
 	if (err)
-		return err;
+		return libbpf_err(err);
 
 	/* Finalize strings */
 	strs_sz = strset__data_size(linker->strtab_strs);
@@ -2584,14 +2584,14 @@ int bpf_linker__finalize(struct bpf_linker *linker)
 	if (elf_update(linker->elf, ELF_C_NULL) < 0) {
 		err = -errno;
 		pr_warn_elf("failed to finalize ELF layout");
-		return err;
+		return libbpf_err(err);
 	}
 
 	/* Write out final ELF contents */
 	if (elf_update(linker->elf, ELF_C_WRITE) < 0) {
 		err = -errno;
 		pr_warn_elf("failed to write ELF contents");
-		return err;
+		return libbpf_err(err);
 	}
 
 	elf_end(linker->elf);
diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c
index 47444588e0d2..d743c8721aa7 100644
--- a/tools/lib/bpf/netlink.c
+++ b/tools/lib/bpf/netlink.c
@@ -225,22 +225,26 @@ static int __bpf_set_link_xdp_fd_replace(int ifindex, int fd, int old_fd,
 int bpf_set_link_xdp_fd_opts(int ifindex, int fd, __u32 flags,
 			     const struct bpf_xdp_set_link_opts *opts)
 {
-	int old_fd = -1;
+	int old_fd = -1, ret;
 
 	if (!OPTS_VALID(opts, bpf_xdp_set_link_opts))
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	if (OPTS_HAS(opts, old_fd)) {
 		old_fd = OPTS_GET(opts, old_fd, -1);
 		flags |= XDP_FLAGS_REPLACE;
 	}
 
-	return __bpf_set_link_xdp_fd_replace(ifindex, fd, old_fd, flags);
+	ret = __bpf_set_link_xdp_fd_replace(ifindex, fd, old_fd, flags);
+	return libbpf_err(ret);
 }
 
 int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags)
 {
-	return __bpf_set_link_xdp_fd_replace(ifindex, fd, 0, flags);
+	int ret;
+
+	ret = __bpf_set_link_xdp_fd_replace(ifindex, fd, 0, flags);
+	return libbpf_err(ret);
 }
 
 static int __dump_link_nlmsg(struct nlmsghdr *nlh,
@@ -321,13 +325,13 @@ int bpf_get_link_xdp_info(int ifindex, struct xdp_link_info *info,
 	};
 
 	if (flags & ~XDP_FLAGS_MASK || !info_size)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	/* Check whether the single {HW,DRV,SKB} mode is set */
 	flags &= (XDP_FLAGS_SKB_MODE | XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE);
 	mask = flags - 1;
 	if (flags && flags & mask)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	xdp_id.ifindex = ifindex;
 	xdp_id.flags = flags;
@@ -341,7 +345,7 @@ int bpf_get_link_xdp_info(int ifindex, struct xdp_link_info *info,
 		memset((void *) info + sz, 0, info_size - sz);
 	}
 
-	return ret;
+	return libbpf_err(ret);
 }
 
 static __u32 get_xdp_id(struct xdp_link_info *info, __u32 flags)
@@ -369,7 +373,7 @@ int bpf_get_link_xdp_id(int ifindex, __u32 *prog_id, __u32 flags)
 	if (!ret)
 		*prog_id = get_xdp_id(&info, flags);
 
-	return ret;
+	return libbpf_err(ret);
 }
 
 typedef int (*qdisc_config_t)(struct nlmsghdr *nh, struct tcmsg *t,
@@ -463,11 +467,14 @@ static int tc_qdisc_delete(struct bpf_tc_hook *hook)
 
 int bpf_tc_hook_create(struct bpf_tc_hook *hook)
 {
+	int ret;
+
 	if (!hook || !OPTS_VALID(hook, bpf_tc_hook) ||
 	    OPTS_GET(hook, ifindex, 0) <= 0)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
-	return tc_qdisc_create_excl(hook);
+	ret = tc_qdisc_create_excl(hook);
+	return libbpf_err(ret);
 }
 
 static int __bpf_tc_detach(const struct bpf_tc_hook *hook,
@@ -478,18 +485,18 @@ int bpf_tc_hook_destroy(struct bpf_tc_hook *hook)
 {
 	if (!hook || !OPTS_VALID(hook, bpf_tc_hook) ||
 	    OPTS_GET(hook, ifindex, 0) <= 0)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	switch (OPTS_GET(hook, attach_point, 0)) {
 	case BPF_TC_INGRESS:
 	case BPF_TC_EGRESS:
-		return __bpf_tc_detach(hook, NULL, true);
+		return libbpf_err(__bpf_tc_detach(hook, NULL, true));
 	case BPF_TC_INGRESS | BPF_TC_EGRESS:
-		return tc_qdisc_delete(hook);
+		return libbpf_err(tc_qdisc_delete(hook));
 	case BPF_TC_CUSTOM:
-		return -EOPNOTSUPP;
+		return libbpf_err(-EOPNOTSUPP);
 	default:
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	}
 }
 
@@ -574,7 +581,7 @@ int bpf_tc_attach(const struct bpf_tc_hook *hook, struct bpf_tc_opts *opts)
 	if (!hook || !opts ||
 	    !OPTS_VALID(hook, bpf_tc_hook) ||
 	    !OPTS_VALID(opts, bpf_tc_opts))
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	ifindex      = OPTS_GET(hook, ifindex, 0);
 	parent       = OPTS_GET(hook, parent, 0);
@@ -587,11 +594,11 @@ int bpf_tc_attach(const struct bpf_tc_hook *hook, struct bpf_tc_opts *opts)
 	flags        = OPTS_GET(opts, flags, 0);
 
 	if (ifindex <= 0 || !prog_fd || prog_id)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	if (priority > UINT16_MAX)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	if (flags & ~BPF_TC_F_REPLACE)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	flags = (flags & BPF_TC_F_REPLACE) ? NLM_F_REPLACE : NLM_F_EXCL;
 	protocol = ETH_P_ALL;
@@ -608,32 +615,32 @@ int bpf_tc_attach(const struct bpf_tc_hook *hook, struct bpf_tc_opts *opts)
 
 	ret = tc_get_tcm_parent(attach_point, &parent);
 	if (ret < 0)
-		return ret;
+		return libbpf_err(ret);
 	req.tc.tcm_parent = parent;
 
 	ret = nlattr_add(&req.nh, sizeof(req), TCA_KIND, "bpf", sizeof("bpf"));
 	if (ret < 0)
-		return ret;
+		return libbpf_err(ret);
 	nla = nlattr_begin_nested(&req.nh, sizeof(req), TCA_OPTIONS);
 	if (!nla)
-		return -EMSGSIZE;
+		return libbpf_err(-EMSGSIZE);
 	ret = tc_add_fd_and_name(&req.nh, sizeof(req), prog_fd);
 	if (ret < 0)
-		return ret;
+		return libbpf_err(ret);
 	bpf_flags = TCA_BPF_FLAG_ACT_DIRECT;
 	ret = nlattr_add(&req.nh, sizeof(req), TCA_BPF_FLAGS, &bpf_flags,
 			 sizeof(bpf_flags));
 	if (ret < 0)
-		return ret;
+		return libbpf_err(ret);
 	nlattr_end_nested(&req.nh, nla);
 
 	info.opts = opts;
 
 	ret = libbpf_netlink_send_recv(&req.nh, get_tc_info, NULL, &info);
 	if (ret < 0)
-		return ret;
+		return libbpf_err(ret);
 	if (!info.processed)
-		return -ENOENT;
+		return libbpf_err(-ENOENT);
 	return ret;
 }
 
@@ -708,7 +715,13 @@ static int __bpf_tc_detach(const struct bpf_tc_hook *hook,
 int bpf_tc_detach(const struct bpf_tc_hook *hook,
 		  const struct bpf_tc_opts *opts)
 {
-	return !opts ? -EINVAL : __bpf_tc_detach(hook, opts, false);
+	int ret;
+
+	if (!opts)
+		return libbpf_err(-EINVAL);
+
+	ret = __bpf_tc_detach(hook, opts, false);
+	return libbpf_err(ret);
 }
 
 int bpf_tc_query(const struct bpf_tc_hook *hook, struct bpf_tc_opts *opts)
@@ -725,7 +738,7 @@ int bpf_tc_query(const struct bpf_tc_hook *hook, struct bpf_tc_opts *opts)
 	if (!hook || !opts ||
 	    !OPTS_VALID(hook, bpf_tc_hook) ||
 	    !OPTS_VALID(opts, bpf_tc_opts))
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	ifindex      = OPTS_GET(hook, ifindex, 0);
 	parent       = OPTS_GET(hook, parent, 0);
@@ -739,9 +752,9 @@ int bpf_tc_query(const struct bpf_tc_hook *hook, struct bpf_tc_opts *opts)
 
 	if (ifindex <= 0 || flags || prog_fd || prog_id ||
 	    !handle || !priority)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	if (priority > UINT16_MAX)
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 
 	protocol = ETH_P_ALL;
 
@@ -756,19 +769,19 @@ int bpf_tc_query(const struct bpf_tc_hook *hook, struct bpf_tc_opts *opts)
 
 	ret = tc_get_tcm_parent(attach_point, &parent);
 	if (ret < 0)
-		return ret;
+		return libbpf_err(ret);
 	req.tc.tcm_parent = parent;
 
 	ret = nlattr_add(&req.nh, sizeof(req), TCA_KIND, "bpf", sizeof("bpf"));
 	if (ret < 0)
-		return ret;
+		return libbpf_err(ret);
 
 	info.opts = opts;
 
 	ret = libbpf_netlink_send_recv(&req.nh, get_tc_info, NULL, &info);
 	if (ret < 0)
-		return ret;
+		return libbpf_err(ret);
 	if (!info.processed)
-		return -ENOENT;
+		return libbpf_err(-ENOENT);
 	return ret;
 }
diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c
index 1d80ad4e0de8..8bc117bcc7bc 100644
--- a/tools/lib/bpf/ringbuf.c
+++ b/tools/lib/bpf/ringbuf.c
@@ -69,23 +69,23 @@ int ring_buffer__add(struct ring_buffer *rb, int map_fd,
 		err = -errno;
 		pr_warn("ringbuf: failed to get map info for fd=%d: %d\n",
 			map_fd, err);
-		return err;
+		return libbpf_err(err);
 	}
 
 	if (info.type != BPF_MAP_TYPE_RINGBUF) {
 		pr_warn("ringbuf: map fd=%d is not BPF_MAP_TYPE_RINGBUF\n",
 			map_fd);
-		return -EINVAL;
+		return libbpf_err(-EINVAL);
 	}
 
 	tmp = libbpf_reallocarray(rb->rings, rb->ring_cnt + 1, sizeof(*rb->rings));
 	if (!tmp)
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 	rb->rings = tmp;
 
 	tmp = libbpf_reallocarray(rb->events, rb->ring_cnt + 1, sizeof(*rb->events));
 	if (!tmp)
-		return -ENOMEM;
+		return libbpf_err(-ENOMEM);
 	rb->events = tmp;
 
 	r = &rb->rings[rb->ring_cnt];
@@ -103,7 +103,7 @@ int ring_buffer__add(struct ring_buffer *rb, int map_fd,
 		err = -errno;
 		pr_warn("ringbuf: failed to mmap consumer page for map fd=%d: %d\n",
 			map_fd, err);
-		return err;
+		return libbpf_err(err);
 	}
 	r->consumer_pos = tmp;
 
@@ -118,7 +118,7 @@ int ring_buffer__add(struct ring_buffer *rb, int map_fd,
 		ringbuf_unmap_ring(rb, r);
 		pr_warn("ringbuf: failed to mmap data pages for map fd=%d: %d\n",
 			map_fd, err);
-		return err;
+		return libbpf_err(err);
 	}
 	r->producer_pos = tmp;
 	r->data = tmp + rb->page_size;
@@ -133,7 +133,7 @@ int ring_buffer__add(struct ring_buffer *rb, int map_fd,
 		ringbuf_unmap_ring(rb, r);
 		pr_warn("ringbuf: failed to epoll add map fd=%d: %d\n",
 			map_fd, err);
-		return err;
+		return libbpf_err(err);
 	}
 
 	rb->ring_cnt++;
@@ -165,11 +165,11 @@ ring_buffer__new(int map_fd, ring_buffer_sample_fn sample_cb, void *ctx,
 	int err;
 
 	if (!OPTS_VALID(opts, ring_buffer_opts))
-		return NULL;
+		return errno = EINVAL, NULL;
 
 	rb = calloc(1, sizeof(*rb));
 	if (!rb)
-		return NULL;
+		return errno = ENOMEM, NULL;
 
 	rb->page_size = getpagesize();
 
@@ -188,7 +188,7 @@ ring_buffer__new(int map_fd, ring_buffer_sample_fn sample_cb, void *ctx,
 
 err_out:
 	ring_buffer__free(rb);
-	return NULL;
+	return errno = -err, NULL;
 }
 
 static inline int roundup_len(__u32 len)
@@ -260,7 +260,7 @@ int ring_buffer__consume(struct ring_buffer *rb)
 
 		err = ringbuf_process_ring(ring);
 		if (err < 0)
-			return err;
+			return libbpf_err(err);
 		res += err;
 	}
 	if (res > INT_MAX)
@@ -279,7 +279,7 @@ int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms)
 
 	cnt = epoll_wait(rb->epoll_fd, rb->events, rb->ring_cnt, timeout_ms);
 	if (cnt < 0)
-		return -errno;
+		return libbpf_err(-errno);
 
 	for (i = 0; i < cnt; i++) {
 		__u32 ring_id = rb->events[i].data.fd;
@@ -287,7 +287,7 @@ int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms)
 
 		err = ringbuf_process_ring(ring);
 		if (err < 0)
-			return err;
+			return libbpf_err(err);
 		res += err;
 	}
 	if (res > INT_MAX)
-- 
cgit v1.2.3-70-g09d2


From 9c6c0449deb41dbe3a66ab9adfd08020bba6c43d Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 24 May 2021 20:59:35 -0700
Subject: bpftool: Set errno on skeleton failures and propagate errors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Follow libbpf's error handling conventions and pass through errors and errno
properly. Skeleton code always returned NULL on errors (not ERR_PTR(err)), so
there are no backwards compatibility concerns. But now we also set errno
properly, so it's possible to distinguish different reasons for failure, if
necessary.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/bpf/20210525035935.1461796-6-andrii@kernel.org
---
 tools/bpf/bpftool/gen.c | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c
index 13b0aa789178..1d71ff8c52fa 100644
--- a/tools/bpf/bpftool/gen.c
+++ b/tools/bpf/bpftool/gen.c
@@ -713,6 +713,7 @@ static int do_skeleton(int argc, char **argv)
 		#ifndef %2$s						    \n\
 		#define %2$s						    \n\
 									    \n\
+		#include <errno.h>					    \n\
 		#include <stdlib.h>					    \n\
 		#include <bpf/libbpf.h>					    \n\
 									    \n\
@@ -793,18 +794,23 @@ static int do_skeleton(int argc, char **argv)
 		%1$s__open_opts(const struct bpf_object_open_opts *opts)    \n\
 		{							    \n\
 			struct %1$s *obj;				    \n\
+			int err;					    \n\
 									    \n\
 			obj = (struct %1$s *)calloc(1, sizeof(*obj));	    \n\
-			if (!obj)					    \n\
+			if (!obj) {					    \n\
+				errno = ENOMEM;				    \n\
 				return NULL;				    \n\
-			if (%1$s__create_skeleton(obj))			    \n\
-				goto err;				    \n\
-			if (bpf_object__open_skeleton(obj->skeleton, opts)) \n\
-				goto err;				    \n\
+			}						    \n\
+									    \n\
+			err = %1$s__create_skeleton(obj);		    \n\
+			err = err ?: bpf_object__open_skeleton(obj->skeleton, opts);\n\
+			if (err)					    \n\
+				goto err_out;				    \n\
 									    \n\
 			return obj;					    \n\
-		err:							    \n\
+		err_out:						    \n\
 			%1$s__destroy(obj);				    \n\
+			errno = -err;					    \n\
 			return NULL;					    \n\
 		}							    \n\
 									    \n\
@@ -824,12 +830,15 @@ static int do_skeleton(int argc, char **argv)
 		%1$s__open_and_load(void)				    \n\
 		{							    \n\
 			struct %1$s *obj;				    \n\
+			int err;					    \n\
 									    \n\
 			obj = %1$s__open();				    \n\
 			if (!obj)					    \n\
 				return NULL;				    \n\
-			if (%1$s__load(obj)) {				    \n\
+			err = %1$s__load(obj);				    \n\
+			if (err) {					    \n\
 				%1$s__destroy(obj);			    \n\
+				errno = -err;				    \n\
 				return NULL;				    \n\
 			}						    \n\
 			return obj;					    \n\
@@ -860,7 +869,7 @@ static int do_skeleton(int argc, char **argv)
 									    \n\
 			s = (struct bpf_object_skeleton *)calloc(1, sizeof(*s));\n\
 			if (!s)						    \n\
-				return -1;				    \n\
+				goto err;				    \n\
 			obj->skeleton = s;				    \n\
 									    \n\
 			s->sz = sizeof(*s);				    \n\
@@ -949,7 +958,7 @@ static int do_skeleton(int argc, char **argv)
 			return 0;					    \n\
 		err:							    \n\
 			bpf_object__destroy_skeleton(s);		    \n\
-			return -1;					    \n\
+			return -ENOMEM;					    \n\
 		}							    \n\
 									    \n\
 		#endif /* %s */						    \n\
-- 
cgit v1.2.3-70-g09d2


From cb261b594b4108668e00f565184c7c221efe0359 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 19 May 2021 17:07:44 +0800
Subject: bpf: Run devmap xdp_prog on flush instead of bulk enqueue
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This changes the devmap XDP program support to run the program when the
bulk queue is flushed instead of before the frame is enqueued. This has
a couple of benefits:

- It "sorts" the packets by destination devmap entry, and then runs the
  same BPF program on all the packets in sequence. This ensures that we
  keep the XDP program and destination device properties hot in I-cache.

- It makes the multicast implementation simpler because it can just
  enqueue packets using bq_enqueue() without having to deal with the
  devmap program at all.

The drawback is that if the devmap program drops the packet, the enqueue
step is redundant. However, arguably this is mostly visible in a
micro-benchmark, and with more mixed traffic the I-cache benefit should
win out. The performance impact of just this patch is as follows:

Using 2 10Gb i40e NIC, redirecting one to another, or into a veth interface,
which do XDP_DROP on veth peer. With xdp_redirect_map in sample/bpf, send
pkts via pktgen cmd:
./pktgen_sample03_burst_single_flow.sh -i eno1 -d $dst_ip -m $dst_mac -t 10 -s 64

There are about +/- 0.1M deviation for native testing, the performance
improved for the base-case, but some drop back with xdp devmap prog attached.

Version          | Test                           | Generic | Native | Native + 2nd xdp_prog
5.12 rc4         | xdp_redirect_map   i40e->i40e  |    1.9M |   9.6M |  8.4M
5.12 rc4         | xdp_redirect_map   i40e->veth  |    1.7M |  11.7M |  9.8M
5.12 rc4 + patch | xdp_redirect_map   i40e->i40e  |    1.9M |   9.8M |  8.0M
5.12 rc4 + patch | xdp_redirect_map   i40e->veth  |    1.7M |  12.0M |  9.4M

When bq_xmit_all() is called from bq_enqueue(), another packet will
always be enqueued immediately after, so clearing dev_rx, xdp_prog and
flush_node in bq_xmit_all() is redundant. Move the clear to __dev_flush(),
and only check them once in bq_enqueue() since they are all modified
together.

This change also has the side effect of extending the lifetime of the
RCU-protected xdp_prog that lives inside the devmap entries: Instead of
just living for the duration of the XDP program invocation, the
reference now lives all the way until the bq is flushed. This is safe
because the bq flush happens at the end of the NAPI poll loop, so
everything happens between a local_bh_disable()/local_bh_enable() pair.
However, this is by no means obvious from looking at the call sites; in
particular, some drivers have an additional rcu_read_lock() around only
the XDP program invocation, which only confuses matters further.
Cleaning this up will be done in a separate patch series.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20210519090747.1655268-2-liuhangbin@gmail.com
---
 kernel/bpf/devmap.c | 127 +++++++++++++++++++++++++++++++---------------------
 1 file changed, 76 insertions(+), 51 deletions(-)

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index d60d617ec0d7..642264e32abd 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -57,6 +57,7 @@ struct xdp_dev_bulk_queue {
 	struct list_head flush_node;
 	struct net_device *dev;
 	struct net_device *dev_rx;
+	struct bpf_prog *xdp_prog;
 	unsigned int count;
 };
 
@@ -326,22 +327,71 @@ bool dev_map_can_have_prog(struct bpf_map *map)
 	return false;
 }
 
+static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
+				struct xdp_frame **frames, int n,
+				struct net_device *dev)
+{
+	struct xdp_txq_info txq = { .dev = dev };
+	struct xdp_buff xdp;
+	int i, nframes = 0;
+
+	for (i = 0; i < n; i++) {
+		struct xdp_frame *xdpf = frames[i];
+		u32 act;
+		int err;
+
+		xdp_convert_frame_to_buff(xdpf, &xdp);
+		xdp.txq = &txq;
+
+		act = bpf_prog_run_xdp(xdp_prog, &xdp);
+		switch (act) {
+		case XDP_PASS:
+			err = xdp_update_frame_from_buff(&xdp, xdpf);
+			if (unlikely(err < 0))
+				xdp_return_frame_rx_napi(xdpf);
+			else
+				frames[nframes++] = xdpf;
+			break;
+		default:
+			bpf_warn_invalid_xdp_action(act);
+			fallthrough;
+		case XDP_ABORTED:
+			trace_xdp_exception(dev, xdp_prog, act);
+			fallthrough;
+		case XDP_DROP:
+			xdp_return_frame_rx_napi(xdpf);
+			break;
+		}
+	}
+	return nframes; /* sent frames count */
+}
+
 static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
 {
 	struct net_device *dev = bq->dev;
-	int sent = 0, err = 0;
+	int sent = 0, drops = 0, err = 0;
+	unsigned int cnt = bq->count;
+	int to_send = cnt;
 	int i;
 
-	if (unlikely(!bq->count))
+	if (unlikely(!cnt))
 		return;
 
-	for (i = 0; i < bq->count; i++) {
+	for (i = 0; i < cnt; i++) {
 		struct xdp_frame *xdpf = bq->q[i];
 
 		prefetch(xdpf);
 	}
 
-	sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags);
+	if (bq->xdp_prog) {
+		to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev);
+		if (!to_send)
+			goto out;
+
+		drops = cnt - to_send;
+	}
+
+	sent = dev->netdev_ops->ndo_xdp_xmit(dev, to_send, bq->q, flags);
 	if (sent < 0) {
 		/* If ndo_xdp_xmit fails with an errno, no frames have
 		 * been xmit'ed.
@@ -353,13 +403,13 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
 	/* If not all frames have been transmitted, it is our
 	 * responsibility to free them
 	 */
-	for (i = sent; unlikely(i < bq->count); i++)
+	for (i = sent; unlikely(i < to_send); i++)
 		xdp_return_frame_rx_napi(bq->q[i]);
 
-	trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, bq->count - sent, err);
-	bq->dev_rx = NULL;
+out:
+	drops = cnt - sent;
 	bq->count = 0;
-	__list_del_clearprev(&bq->flush_node);
+	trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err);
 }
 
 /* __dev_flush is called from xdp_do_flush() which _must_ be signaled
@@ -377,8 +427,12 @@ void __dev_flush(void)
 	struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
 	struct xdp_dev_bulk_queue *bq, *tmp;
 
-	list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
+	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
 		bq_xmit_all(bq, XDP_XMIT_FLUSH);
+		bq->dev_rx = NULL;
+		bq->xdp_prog = NULL;
+		__list_del_clearprev(&bq->flush_node);
+	}
 }
 
 /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
@@ -401,7 +455,7 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
  * Thus, safe percpu variable access.
  */
 static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
-		       struct net_device *dev_rx)
+		       struct net_device *dev_rx, struct bpf_prog *xdp_prog)
 {
 	struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
 	struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
@@ -412,18 +466,22 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
 	/* Ingress dev_rx will be the same for all xdp_frame's in
 	 * bulk_queue, because bq stored per-CPU and must be flushed
 	 * from net_device drivers NAPI func end.
+	 *
+	 * Do the same with xdp_prog and flush_list since these fields
+	 * are only ever modified together.
 	 */
-	if (!bq->dev_rx)
+	if (!bq->dev_rx) {
 		bq->dev_rx = dev_rx;
+		bq->xdp_prog = xdp_prog;
+		list_add(&bq->flush_node, flush_list);
+	}
 
 	bq->q[bq->count++] = xdpf;
-
-	if (!bq->flush_node.prev)
-		list_add(&bq->flush_node, flush_list);
 }
 
 static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
-			       struct net_device *dev_rx)
+				struct net_device *dev_rx,
+				struct bpf_prog *xdp_prog)
 {
 	struct xdp_frame *xdpf;
 	int err;
@@ -439,42 +497,14 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
 	if (unlikely(!xdpf))
 		return -EOVERFLOW;
 
-	bq_enqueue(dev, xdpf, dev_rx);
+	bq_enqueue(dev, xdpf, dev_rx, xdp_prog);
 	return 0;
 }
 
-static struct xdp_buff *dev_map_run_prog(struct net_device *dev,
-					 struct xdp_buff *xdp,
-					 struct bpf_prog *xdp_prog)
-{
-	struct xdp_txq_info txq = { .dev = dev };
-	u32 act;
-
-	xdp_set_data_meta_invalid(xdp);
-	xdp->txq = &txq;
-
-	act = bpf_prog_run_xdp(xdp_prog, xdp);
-	switch (act) {
-	case XDP_PASS:
-		return xdp;
-	case XDP_DROP:
-		break;
-	default:
-		bpf_warn_invalid_xdp_action(act);
-		fallthrough;
-	case XDP_ABORTED:
-		trace_xdp_exception(dev, xdp_prog, act);
-		break;
-	}
-
-	xdp_return_buff(xdp);
-	return NULL;
-}
-
 int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
 		    struct net_device *dev_rx)
 {
-	return __xdp_enqueue(dev, xdp, dev_rx);
+	return __xdp_enqueue(dev, xdp, dev_rx, NULL);
 }
 
 int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
@@ -482,12 +512,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
 {
 	struct net_device *dev = dst->dev;
 
-	if (dst->xdp_prog) {
-		xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog);
-		if (!xdp)
-			return 0;
-	}
-	return __xdp_enqueue(dev, xdp, dev_rx);
+	return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog);
 }
 
 int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
-- 
cgit v1.2.3-70-g09d2


From e624d4ed4aa8cc3c69d1359b0aaea539203ed266 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Wed, 19 May 2021 17:07:45 +0800
Subject: xdp: Extend xdp_redirect_map with broadcast support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds two flags BPF_F_BROADCAST and BPF_F_EXCLUDE_INGRESS to
extend xdp_redirect_map for broadcast support.

With BPF_F_BROADCAST the packet will be broadcasted to all the interfaces
in the map. with BPF_F_EXCLUDE_INGRESS the ingress interface will be
excluded when do broadcasting.

When getting the devices in dev hash map via dev_map_hash_get_next_key(),
there is a possibility that we fall back to the first key when a device
was removed. This will duplicate packets on some interfaces. So just walk
the whole buckets to avoid this issue. For dev array map, we also walk the
whole map to find valid interfaces.

Function bpf_clear_redirect_map() was removed in
commit ee75aef23afe ("bpf, xdp: Restructure redirect actions").
Add it back as we need to use ri->map again.

With test topology:
  +-------------------+             +-------------------+
  | Host A (i40e 10G) |  ---------- | eno1(i40e 10G)    |
  +-------------------+             |                   |
                                    |   Host B          |
  +-------------------+             |                   |
  | Host C (i40e 10G) |  ---------- | eno2(i40e 10G)    |
  +-------------------+             |                   |
                                    |          +------+ |
                                    | veth0 -- | Peer | |
                                    | veth1 -- |      | |
                                    | veth2 -- |  NS  | |
                                    |          +------+ |
                                    +-------------------+

On Host A:
 # pktgen/pktgen_sample03_burst_single_flow.sh -i eno1 -d $dst_ip -m $dst_mac -s 64

On Host B(Intel(R) Xeon(R) CPU E5-2690 v3 @ 2.60GHz, 128G Memory):
Use xdp_redirect_map and xdp_redirect_map_multi in samples/bpf for testing.
All the veth peers in the NS have a XDP_DROP program loaded. The
forward_map max_entries in xdp_redirect_map_multi is modify to 4.

Testing the performance impact on the regular xdp_redirect path with and
without patch (to check impact of additional check for broadcast mode):

5.12 rc4         | redirect_map        i40e->i40e      |    2.0M |  9.7M
5.12 rc4         | redirect_map        i40e->veth      |    1.7M | 11.8M
5.12 rc4 + patch | redirect_map        i40e->i40e      |    2.0M |  9.6M
5.12 rc4 + patch | redirect_map        i40e->veth      |    1.7M | 11.7M

Testing the performance when cloning packets with the redirect_map_multi
test, using a redirect map size of 4, filled with 1-3 devices:

5.12 rc4 + patch | redirect_map multi  i40e->veth (x1) |    1.7M | 11.4M
5.12 rc4 + patch | redirect_map multi  i40e->veth (x2) |    1.1M |  4.3M
5.12 rc4 + patch | redirect_map multi  i40e->veth (x3) |    0.8M |  2.6M

Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Link: https://lore.kernel.org/bpf/20210519090747.1655268-3-liuhangbin@gmail.com
---
 include/linux/bpf.h            |  20 +++++
 include/linux/filter.h         |  19 ++++-
 include/net/xdp.h              |   1 +
 include/trace/events/xdp.h     |   6 +-
 include/uapi/linux/bpf.h       |  14 +++-
 kernel/bpf/cpumap.c            |   3 +-
 kernel/bpf/devmap.c            | 183 ++++++++++++++++++++++++++++++++++++++++-
 net/core/filter.c              |  37 ++++++++-
 net/core/xdp.c                 |  28 +++++++
 net/xdp/xskmap.c               |   3 +-
 tools/include/uapi/linux/bpf.h |  14 +++-
 11 files changed, 313 insertions(+), 15 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1e9a0ff3217b..86dec5001ae2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1501,8 +1501,13 @@ int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
 		    struct net_device *dev_rx);
 int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
 		    struct net_device *dev_rx);
+int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
+			  struct bpf_map *map, bool exclude_ingress);
 int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
 			     struct bpf_prog *xdp_prog);
+int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
+			   struct bpf_prog *xdp_prog, struct bpf_map *map,
+			   bool exclude_ingress);
 bool dev_map_can_have_prog(struct bpf_map *map);
 
 void __cpu_map_flush(void);
@@ -1670,6 +1675,13 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
 	return 0;
 }
 
+static inline
+int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
+			  struct bpf_map *map, bool exclude_ingress)
+{
+	return 0;
+}
+
 struct sk_buff;
 
 static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst,
@@ -1679,6 +1691,14 @@ static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst,
 	return 0;
 }
 
+static inline
+int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
+			   struct bpf_prog *xdp_prog, struct bpf_map *map,
+			   bool exclude_ingress)
+{
+	return 0;
+}
+
 static inline void __cpu_map_flush(void)
 {
 }
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 9a09547bc7ba..c5ad7df029ed 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -646,6 +646,7 @@ struct bpf_redirect_info {
 	u32 flags;
 	u32 tgt_index;
 	void *tgt_value;
+	struct bpf_map *map;
 	u32 map_id;
 	enum bpf_map_type map_type;
 	u32 kern_flags;
@@ -1464,17 +1465,19 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol,
 }
 #endif /* IS_ENABLED(CONFIG_IPV6) */
 
-static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex, u64 flags,
+static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex,
+						  u64 flags, const u64 flag_mask,
 						  void *lookup_elem(struct bpf_map *map, u32 key))
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+	const u64 action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX;
 
 	/* Lower bits of the flags are used as return code on lookup failure */
-	if (unlikely(flags > XDP_TX))
+	if (unlikely(flags & ~(action_mask | flag_mask)))
 		return XDP_ABORTED;
 
 	ri->tgt_value = lookup_elem(map, ifindex);
-	if (unlikely(!ri->tgt_value)) {
+	if (unlikely(!ri->tgt_value) && !(flags & BPF_F_BROADCAST)) {
 		/* If the lookup fails we want to clear out the state in the
 		 * redirect_info struct completely, so that if an eBPF program
 		 * performs multiple lookups, the last one always takes
@@ -1482,13 +1485,21 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind
 		 */
 		ri->map_id = INT_MAX; /* Valid map id idr range: [1,INT_MAX[ */
 		ri->map_type = BPF_MAP_TYPE_UNSPEC;
-		return flags;
+		return flags & action_mask;
 	}
 
 	ri->tgt_index = ifindex;
 	ri->map_id = map->id;
 	ri->map_type = map->map_type;
 
+	if (flags & BPF_F_BROADCAST) {
+		WRITE_ONCE(ri->map, map);
+		ri->flags = flags;
+	} else {
+		WRITE_ONCE(ri->map, NULL);
+		ri->flags = 0;
+	}
+
 	return XDP_REDIRECT;
 }
 
diff --git a/include/net/xdp.h b/include/net/xdp.h
index a5bc214a49d9..5533f0ab2afc 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -170,6 +170,7 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
 struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
 					 struct net_device *dev);
 int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp);
+struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf);
 
 static inline
 void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp)
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index fcad3645a70b..c40fc97f9417 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -110,7 +110,11 @@ DECLARE_EVENT_CLASS(xdp_redirect_template,
 		u32 ifindex = 0, map_index = index;
 
 		if (map_type == BPF_MAP_TYPE_DEVMAP || map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
-			ifindex = ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex;
+			/* Just leave to_ifindex to 0 if do broadcast redirect,
+			 * as tgt will be NULL.
+			 */
+			if (tgt)
+				ifindex = ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex;
 		} else if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) {
 			ifindex = index;
 			map_index = 0;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 562adeac1d67..2c1ba70abbf1 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2555,8 +2555,12 @@ union bpf_attr {
  * 		The lower two bits of *flags* are used as the return code if
  * 		the map lookup fails. This is so that the return value can be
  * 		one of the XDP program return codes up to **XDP_TX**, as chosen
- * 		by the caller. Any higher bits in the *flags* argument must be
- * 		unset.
+ * 		by the caller. The higher bits of *flags* can be set to
+ * 		BPF_F_BROADCAST or BPF_F_EXCLUDE_INGRESS as defined below.
+ *
+ * 		With BPF_F_BROADCAST the packet will be broadcasted to all the
+ * 		interfaces in the map, with BPF_F_EXCLUDE_INGRESS the ingress
+ * 		interface will be excluded when do broadcasting.
  *
  * 		See also **bpf_redirect**\ (), which only supports redirecting
  * 		to an ifindex, but doesn't require a map to do so.
@@ -5122,6 +5126,12 @@ enum {
 	BPF_F_BPRM_SECUREEXEC	= (1ULL << 0),
 };
 
+/* Flags for bpf_redirect_map helper */
+enum {
+	BPF_F_BROADCAST		= (1ULL << 3),
+	BPF_F_EXCLUDE_INGRESS	= (1ULL << 4),
+};
+
 #define __bpf_md_ptr(type, name)	\
 union {					\
 	type name;			\
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 5dd3e866599a..a1a0c4e791c6 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -601,7 +601,8 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 
 static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
 {
-	return __bpf_xdp_redirect_map(map, ifindex, flags, __cpu_map_lookup_elem);
+	return __bpf_xdp_redirect_map(map, ifindex, flags, 0,
+				      __cpu_map_lookup_elem);
 }
 
 static int cpu_map_btf_id;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 642264e32abd..f9148daab0e3 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -198,6 +198,7 @@ static void dev_map_free(struct bpf_map *map)
 	list_del_rcu(&dtab->list);
 	spin_unlock(&dev_map_lock);
 
+	bpf_clear_redirect_map(map);
 	synchronize_rcu();
 
 	/* Make sure prior __dev_map_entry_free() have completed. */
@@ -515,6 +516,99 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
 	return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog);
 }
 
+static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_buff *xdp,
+			 int exclude_ifindex)
+{
+	if (!obj || obj->dev->ifindex == exclude_ifindex ||
+	    !obj->dev->netdev_ops->ndo_xdp_xmit)
+		return false;
+
+	if (xdp_ok_fwd_dev(obj->dev, xdp->data_end - xdp->data))
+		return false;
+
+	return true;
+}
+
+static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj,
+				 struct net_device *dev_rx,
+				 struct xdp_frame *xdpf)
+{
+	struct xdp_frame *nxdpf;
+
+	nxdpf = xdpf_clone(xdpf);
+	if (!nxdpf)
+		return -ENOMEM;
+
+	bq_enqueue(obj->dev, nxdpf, dev_rx, obj->xdp_prog);
+
+	return 0;
+}
+
+int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
+			  struct bpf_map *map, bool exclude_ingress)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	int exclude_ifindex = exclude_ingress ? dev_rx->ifindex : 0;
+	struct bpf_dtab_netdev *dst, *last_dst = NULL;
+	struct hlist_head *head;
+	struct xdp_frame *xdpf;
+	unsigned int i;
+	int err;
+
+	xdpf = xdp_convert_buff_to_frame(xdp);
+	if (unlikely(!xdpf))
+		return -EOVERFLOW;
+
+	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
+		for (i = 0; i < map->max_entries; i++) {
+			dst = READ_ONCE(dtab->netdev_map[i]);
+			if (!is_valid_dst(dst, xdp, exclude_ifindex))
+				continue;
+
+			/* we only need n-1 clones; last_dst enqueued below */
+			if (!last_dst) {
+				last_dst = dst;
+				continue;
+			}
+
+			err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
+			if (err)
+				return err;
+
+			last_dst = dst;
+		}
+	} else { /* BPF_MAP_TYPE_DEVMAP_HASH */
+		for (i = 0; i < dtab->n_buckets; i++) {
+			head = dev_map_index_hash(dtab, i);
+			hlist_for_each_entry_rcu(dst, head, index_hlist,
+						 lockdep_is_held(&dtab->index_lock)) {
+				if (!is_valid_dst(dst, xdp, exclude_ifindex))
+					continue;
+
+				/* we only need n-1 clones; last_dst enqueued below */
+				if (!last_dst) {
+					last_dst = dst;
+					continue;
+				}
+
+				err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
+				if (err)
+					return err;
+
+				last_dst = dst;
+			}
+		}
+	}
+
+	/* consume the last copy of the frame */
+	if (last_dst)
+		bq_enqueue(last_dst->dev, xdpf, dev_rx, last_dst->xdp_prog);
+	else
+		xdp_return_frame_rx_napi(xdpf); /* dtab is empty */
+
+	return 0;
+}
+
 int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
 			     struct bpf_prog *xdp_prog)
 {
@@ -529,6 +623,87 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
 	return 0;
 }
 
+static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst,
+				  struct sk_buff *skb,
+				  struct bpf_prog *xdp_prog)
+{
+	struct sk_buff *nskb;
+	int err;
+
+	nskb = skb_clone(skb, GFP_ATOMIC);
+	if (!nskb)
+		return -ENOMEM;
+
+	err = dev_map_generic_redirect(dst, nskb, xdp_prog);
+	if (unlikely(err)) {
+		consume_skb(nskb);
+		return err;
+	}
+
+	return 0;
+}
+
+int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
+			   struct bpf_prog *xdp_prog, struct bpf_map *map,
+			   bool exclude_ingress)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	int exclude_ifindex = exclude_ingress ? dev->ifindex : 0;
+	struct bpf_dtab_netdev *dst, *last_dst = NULL;
+	struct hlist_head *head;
+	struct hlist_node *next;
+	unsigned int i;
+	int err;
+
+	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
+		for (i = 0; i < map->max_entries; i++) {
+			dst = READ_ONCE(dtab->netdev_map[i]);
+			if (!dst || dst->dev->ifindex == exclude_ifindex)
+				continue;
+
+			/* we only need n-1 clones; last_dst enqueued below */
+			if (!last_dst) {
+				last_dst = dst;
+				continue;
+			}
+
+			err = dev_map_redirect_clone(last_dst, skb, xdp_prog);
+			if (err)
+				return err;
+
+			last_dst = dst;
+		}
+	} else { /* BPF_MAP_TYPE_DEVMAP_HASH */
+		for (i = 0; i < dtab->n_buckets; i++) {
+			head = dev_map_index_hash(dtab, i);
+			hlist_for_each_entry_safe(dst, next, head, index_hlist) {
+				if (!dst || dst->dev->ifindex == exclude_ifindex)
+					continue;
+
+				/* we only need n-1 clones; last_dst enqueued below */
+				if (!last_dst) {
+					last_dst = dst;
+					continue;
+				}
+
+				err = dev_map_redirect_clone(last_dst, skb, xdp_prog);
+				if (err)
+					return err;
+
+				last_dst = dst;
+			}
+		}
+	}
+
+	/* consume the first skb and return */
+	if (last_dst)
+		return dev_map_generic_redirect(last_dst, skb, xdp_prog);
+
+	/* dtab is empty */
+	consume_skb(skb);
+	return 0;
+}
+
 static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
@@ -755,12 +930,16 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
 
 static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
 {
-	return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_lookup_elem);
+	return __bpf_xdp_redirect_map(map, ifindex, flags,
+				      BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
+				      __dev_map_lookup_elem);
 }
 
 static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
 {
-	return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_hash_lookup_elem);
+	return __bpf_xdp_redirect_map(map, ifindex, flags,
+				      BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
+				      __dev_map_hash_lookup_elem);
 }
 
 static int dev_map_btf_id;
diff --git a/net/core/filter.c b/net/core/filter.c
index 582ac196fd94..caa88955562e 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3930,6 +3930,23 @@ void xdp_do_flush(void)
 }
 EXPORT_SYMBOL_GPL(xdp_do_flush);
 
+void bpf_clear_redirect_map(struct bpf_map *map)
+{
+	struct bpf_redirect_info *ri;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		ri = per_cpu_ptr(&bpf_redirect_info, cpu);
+		/* Avoid polluting remote cacheline due to writes if
+		 * not needed. Once we pass this test, we need the
+		 * cmpxchg() to make sure it hasn't been changed in
+		 * the meantime by remote CPU.
+		 */
+		if (unlikely(READ_ONCE(ri->map) == map))
+			cmpxchg(&ri->map, map, NULL);
+	}
+}
+
 int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
 		    struct bpf_prog *xdp_prog)
 {
@@ -3937,6 +3954,7 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
 	enum bpf_map_type map_type = ri->map_type;
 	void *fwd = ri->tgt_value;
 	u32 map_id = ri->map_id;
+	struct bpf_map *map;
 	int err;
 
 	ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
@@ -3946,7 +3964,14 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
 	case BPF_MAP_TYPE_DEVMAP:
 		fallthrough;
 	case BPF_MAP_TYPE_DEVMAP_HASH:
-		err = dev_map_enqueue(fwd, xdp, dev);
+		map = READ_ONCE(ri->map);
+		if (unlikely(map)) {
+			WRITE_ONCE(ri->map, NULL);
+			err = dev_map_enqueue_multi(xdp, dev, map,
+						    ri->flags & BPF_F_EXCLUDE_INGRESS);
+		} else {
+			err = dev_map_enqueue(fwd, xdp, dev);
+		}
 		break;
 	case BPF_MAP_TYPE_CPUMAP:
 		err = cpu_map_enqueue(fwd, xdp, dev);
@@ -3988,13 +4013,21 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
 				       enum bpf_map_type map_type, u32 map_id)
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+	struct bpf_map *map;
 	int err;
 
 	switch (map_type) {
 	case BPF_MAP_TYPE_DEVMAP:
 		fallthrough;
 	case BPF_MAP_TYPE_DEVMAP_HASH:
-		err = dev_map_generic_redirect(fwd, skb, xdp_prog);
+		map = READ_ONCE(ri->map);
+		if (unlikely(map)) {
+			WRITE_ONCE(ri->map, NULL);
+			err = dev_map_redirect_multi(dev, skb, xdp_prog, map,
+						     ri->flags & BPF_F_EXCLUDE_INGRESS);
+		} else {
+			err = dev_map_generic_redirect(fwd, skb, xdp_prog);
+		}
 		if (unlikely(err))
 			goto err;
 		break;
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 858276e72c68..725d20f1b100 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -584,3 +584,31 @@ struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
 	return __xdp_build_skb_from_frame(xdpf, skb, dev);
 }
 EXPORT_SYMBOL_GPL(xdp_build_skb_from_frame);
+
+struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf)
+{
+	unsigned int headroom, totalsize;
+	struct xdp_frame *nxdpf;
+	struct page *page;
+	void *addr;
+
+	headroom = xdpf->headroom + sizeof(*xdpf);
+	totalsize = headroom + xdpf->len;
+
+	if (unlikely(totalsize > PAGE_SIZE))
+		return NULL;
+	page = dev_alloc_page();
+	if (!page)
+		return NULL;
+	addr = page_to_virt(page);
+
+	memcpy(addr, xdpf, totalsize);
+
+	nxdpf = addr;
+	nxdpf->data = addr + headroom;
+	nxdpf->frame_sz = PAGE_SIZE;
+	nxdpf->mem.type = MEM_TYPE_PAGE_ORDER0;
+	nxdpf->mem.id = 0;
+
+	return nxdpf;
+}
diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c
index 67b4ce504852..9df75ea4a567 100644
--- a/net/xdp/xskmap.c
+++ b/net/xdp/xskmap.c
@@ -226,7 +226,8 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key)
 
 static int xsk_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
 {
-	return __bpf_xdp_redirect_map(map, ifindex, flags, __xsk_map_lookup_elem);
+	return __bpf_xdp_redirect_map(map, ifindex, flags, 0,
+				      __xsk_map_lookup_elem);
 }
 
 void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 562adeac1d67..2c1ba70abbf1 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2555,8 +2555,12 @@ union bpf_attr {
  * 		The lower two bits of *flags* are used as the return code if
  * 		the map lookup fails. This is so that the return value can be
  * 		one of the XDP program return codes up to **XDP_TX**, as chosen
- * 		by the caller. Any higher bits in the *flags* argument must be
- * 		unset.
+ * 		by the caller. The higher bits of *flags* can be set to
+ * 		BPF_F_BROADCAST or BPF_F_EXCLUDE_INGRESS as defined below.
+ *
+ * 		With BPF_F_BROADCAST the packet will be broadcasted to all the
+ * 		interfaces in the map, with BPF_F_EXCLUDE_INGRESS the ingress
+ * 		interface will be excluded when do broadcasting.
  *
  * 		See also **bpf_redirect**\ (), which only supports redirecting
  * 		to an ifindex, but doesn't require a map to do so.
@@ -5122,6 +5126,12 @@ enum {
 	BPF_F_BPRM_SECUREEXEC	= (1ULL << 0),
 };
 
+/* Flags for bpf_redirect_map helper */
+enum {
+	BPF_F_BROADCAST		= (1ULL << 3),
+	BPF_F_EXCLUDE_INGRESS	= (1ULL << 4),
+};
+
 #define __bpf_md_ptr(type, name)	\
 union {					\
 	type name;			\
-- 
cgit v1.2.3-70-g09d2


From e48cfe4bbfadd7b88821fe98f625a6b5a6d1cbb4 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Wed, 19 May 2021 17:07:46 +0800
Subject: sample/bpf: Add xdp_redirect_map_multi for redirect_map broadcast
 test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is a sample for xdp redirect broadcast. In the sample we could forward
all packets between given interfaces. There is also an option -X that could
enable 2nd xdp_prog on egress interface.

Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/bpf/20210519090747.1655268-4-liuhangbin@gmail.com
---
 samples/bpf/Makefile                      |   3 +
 samples/bpf/xdp_redirect_map_multi_kern.c |  88 +++++++++
 samples/bpf/xdp_redirect_map_multi_user.c | 302 ++++++++++++++++++++++++++++++
 3 files changed, 393 insertions(+)
 create mode 100644 samples/bpf/xdp_redirect_map_multi_kern.c
 create mode 100644 samples/bpf/xdp_redirect_map_multi_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 45ceca4e2c70..520434ea966f 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -41,6 +41,7 @@ tprogs-y += test_map_in_map
 tprogs-y += per_socket_stats_example
 tprogs-y += xdp_redirect
 tprogs-y += xdp_redirect_map
+tprogs-y += xdp_redirect_map_multi
 tprogs-y += xdp_redirect_cpu
 tprogs-y += xdp_monitor
 tprogs-y += xdp_rxq_info
@@ -99,6 +100,7 @@ test_map_in_map-objs := test_map_in_map_user.o
 per_socket_stats_example-objs := cookie_uid_helper_example.o
 xdp_redirect-objs := xdp_redirect_user.o
 xdp_redirect_map-objs := xdp_redirect_map_user.o
+xdp_redirect_map_multi-objs := xdp_redirect_map_multi_user.o
 xdp_redirect_cpu-objs := xdp_redirect_cpu_user.o
 xdp_monitor-objs := xdp_monitor_user.o
 xdp_rxq_info-objs := xdp_rxq_info_user.o
@@ -160,6 +162,7 @@ always-y += tcp_tos_reflect_kern.o
 always-y += tcp_dumpstats_kern.o
 always-y += xdp_redirect_kern.o
 always-y += xdp_redirect_map_kern.o
+always-y += xdp_redirect_map_multi_kern.o
 always-y += xdp_redirect_cpu_kern.o
 always-y += xdp_monitor_kern.o
 always-y += xdp_rxq_info_kern.o
diff --git a/samples/bpf/xdp_redirect_map_multi_kern.c b/samples/bpf/xdp_redirect_map_multi_kern.c
new file mode 100644
index 000000000000..71aa23d1cb2b
--- /dev/null
+++ b/samples/bpf/xdp_redirect_map_multi_kern.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <bpf/bpf_helpers.h>
+
+struct {
+	__uint(type, BPF_MAP_TYPE_DEVMAP_HASH);
+	__uint(key_size, sizeof(int));
+	__uint(value_size, sizeof(int));
+	__uint(max_entries, 32);
+} forward_map_general SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_DEVMAP_HASH);
+	__uint(key_size, sizeof(int));
+	__uint(value_size, sizeof(struct bpf_devmap_val));
+	__uint(max_entries, 32);
+} forward_map_native SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__type(key, u32);
+	__type(value, long);
+	__uint(max_entries, 1);
+} rxcnt SEC(".maps");
+
+/* map to store egress interfaces mac addresses, set the
+ * max_entries to 1 and extend it in user sapce prog.
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, u32);
+	__type(value, __be64);
+	__uint(max_entries, 1);
+} mac_map SEC(".maps");
+
+static int xdp_redirect_map(struct xdp_md *ctx, void *forward_map)
+{
+	long *value;
+	u32 key = 0;
+
+	/* count packet in global counter */
+	value = bpf_map_lookup_elem(&rxcnt, &key);
+	if (value)
+		*value += 1;
+
+	return bpf_redirect_map(forward_map, key,
+				BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS);
+}
+
+SEC("xdp_redirect_general")
+int xdp_redirect_map_general(struct xdp_md *ctx)
+{
+	return xdp_redirect_map(ctx, &forward_map_general);
+}
+
+SEC("xdp_redirect_native")
+int xdp_redirect_map_native(struct xdp_md *ctx)
+{
+	return xdp_redirect_map(ctx, &forward_map_native);
+}
+
+SEC("xdp_devmap/map_prog")
+int xdp_devmap_prog(struct xdp_md *ctx)
+{
+	void *data_end = (void *)(long)ctx->data_end;
+	void *data = (void *)(long)ctx->data;
+	u32 key = ctx->egress_ifindex;
+	struct ethhdr *eth = data;
+	__be64 *mac;
+	u64 nh_off;
+
+	nh_off = sizeof(*eth);
+	if (data + nh_off > data_end)
+		return XDP_DROP;
+
+	mac = bpf_map_lookup_elem(&mac_map, &key);
+	if (mac)
+		__builtin_memcpy(eth->h_source, mac, ETH_ALEN);
+
+	return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp_redirect_map_multi_user.c b/samples/bpf/xdp_redirect_map_multi_user.c
new file mode 100644
index 000000000000..84cdbbed20b7
--- /dev/null
+++ b/samples/bpf/xdp_redirect_map_multi_user.c
@@ -0,0 +1,302 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <linux/if_link.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <net/if.h>
+#include <unistd.h>
+#include <libgen.h>
+#include <sys/resource.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+
+#include "bpf_util.h"
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#define MAX_IFACE_NUM 32
+
+static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
+static int ifaces[MAX_IFACE_NUM] = {};
+static int rxcnt_map_fd;
+
+static void int_exit(int sig)
+{
+	__u32 prog_id = 0;
+	int i;
+
+	for (i = 0; ifaces[i] > 0; i++) {
+		if (bpf_get_link_xdp_id(ifaces[i], &prog_id, xdp_flags)) {
+			printf("bpf_get_link_xdp_id failed\n");
+			exit(1);
+		}
+		if (prog_id)
+			bpf_set_link_xdp_fd(ifaces[i], -1, xdp_flags);
+	}
+
+	exit(0);
+}
+
+static void poll_stats(int interval)
+{
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	__u64 values[nr_cpus], prev[nr_cpus];
+
+	memset(prev, 0, sizeof(prev));
+
+	while (1) {
+		__u64 sum = 0;
+		__u32 key = 0;
+		int i;
+
+		sleep(interval);
+		assert(bpf_map_lookup_elem(rxcnt_map_fd, &key, values) == 0);
+		for (i = 0; i < nr_cpus; i++)
+			sum += (values[i] - prev[i]);
+		if (sum)
+			printf("Forwarding %10llu pkt/s\n", sum / interval);
+		memcpy(prev, values, sizeof(values));
+	}
+}
+
+static int get_mac_addr(unsigned int ifindex, void *mac_addr)
+{
+	char ifname[IF_NAMESIZE];
+	struct ifreq ifr;
+	int fd, ret = -1;
+
+	fd = socket(AF_INET, SOCK_DGRAM, 0);
+	if (fd < 0)
+		return ret;
+
+	if (!if_indextoname(ifindex, ifname))
+		goto err_out;
+
+	strcpy(ifr.ifr_name, ifname);
+
+	if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0)
+		goto err_out;
+
+	memcpy(mac_addr, ifr.ifr_hwaddr.sa_data, 6 * sizeof(char));
+	ret = 0;
+
+err_out:
+	close(fd);
+	return ret;
+}
+
+static int update_mac_map(struct bpf_object *obj)
+{
+	int i, ret = -1, mac_map_fd;
+	unsigned char mac_addr[6];
+	unsigned int ifindex;
+
+	mac_map_fd = bpf_object__find_map_fd_by_name(obj, "mac_map");
+	if (mac_map_fd < 0) {
+		printf("find mac map fd failed\n");
+		return ret;
+	}
+
+	for (i = 0; ifaces[i] > 0; i++) {
+		ifindex = ifaces[i];
+
+		ret = get_mac_addr(ifindex, mac_addr);
+		if (ret < 0) {
+			printf("get interface %d mac failed\n", ifindex);
+			return ret;
+		}
+
+		ret = bpf_map_update_elem(mac_map_fd, &ifindex, mac_addr, 0);
+		if (ret) {
+			perror("bpf_update_elem mac_map_fd");
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+static void usage(const char *prog)
+{
+	fprintf(stderr,
+		"usage: %s [OPTS] <IFNAME|IFINDEX> <IFNAME|IFINDEX> ...\n"
+		"OPTS:\n"
+		"    -S    use skb-mode\n"
+		"    -N    enforce native mode\n"
+		"    -F    force loading prog\n"
+		"    -X    load xdp program on egress\n",
+		prog);
+}
+
+int main(int argc, char **argv)
+{
+	int i, ret, opt, forward_map_fd, max_ifindex = 0;
+	struct bpf_program *ingress_prog, *egress_prog;
+	int ingress_prog_fd, egress_prog_fd = 0;
+	struct bpf_devmap_val devmap_val;
+	bool attach_egress_prog = false;
+	char ifname[IF_NAMESIZE];
+	struct bpf_map *mac_map;
+	struct bpf_object *obj;
+	unsigned int ifindex;
+	char filename[256];
+
+	while ((opt = getopt(argc, argv, "SNFX")) != -1) {
+		switch (opt) {
+		case 'S':
+			xdp_flags |= XDP_FLAGS_SKB_MODE;
+			break;
+		case 'N':
+			/* default, set below */
+			break;
+		case 'F':
+			xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
+			break;
+		case 'X':
+			attach_egress_prog = true;
+			break;
+		default:
+			usage(basename(argv[0]));
+			return 1;
+		}
+	}
+
+	if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) {
+		xdp_flags |= XDP_FLAGS_DRV_MODE;
+	} else if (attach_egress_prog) {
+		printf("Load xdp program on egress with SKB mode not supported yet\n");
+		return 1;
+	}
+
+	if (optind == argc) {
+		printf("usage: %s <IFNAME|IFINDEX> <IFNAME|IFINDEX> ...\n", argv[0]);
+		return 1;
+	}
+
+	printf("Get interfaces");
+	for (i = 0; i < MAX_IFACE_NUM && argv[optind + i]; i++) {
+		ifaces[i] = if_nametoindex(argv[optind + i]);
+		if (!ifaces[i])
+			ifaces[i] = strtoul(argv[optind + i], NULL, 0);
+		if (!if_indextoname(ifaces[i], ifname)) {
+			perror("Invalid interface name or i");
+			return 1;
+		}
+
+		/* Find the largest index number */
+		if (ifaces[i] > max_ifindex)
+			max_ifindex = ifaces[i];
+
+		printf(" %d", ifaces[i]);
+	}
+	printf("\n");
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	obj = bpf_object__open(filename);
+	if (libbpf_get_error(obj)) {
+		printf("ERROR: opening BPF object file failed\n");
+		obj = NULL;
+		goto err_out;
+	}
+
+	/* Reset the map size to max ifindex + 1 */
+	if (attach_egress_prog) {
+		mac_map = bpf_object__find_map_by_name(obj, "mac_map");
+		ret = bpf_map__resize(mac_map, max_ifindex + 1);
+		if (ret < 0) {
+			printf("ERROR: reset mac map size failed\n");
+			goto err_out;
+		}
+	}
+
+	/* load BPF program */
+	if (bpf_object__load(obj)) {
+		printf("ERROR: loading BPF object file failed\n");
+		goto err_out;
+	}
+
+	if (xdp_flags & XDP_FLAGS_SKB_MODE) {
+		ingress_prog = bpf_object__find_program_by_name(obj, "xdp_redirect_map_general");
+		forward_map_fd = bpf_object__find_map_fd_by_name(obj, "forward_map_general");
+	} else {
+		ingress_prog = bpf_object__find_program_by_name(obj, "xdp_redirect_map_native");
+		forward_map_fd = bpf_object__find_map_fd_by_name(obj, "forward_map_native");
+	}
+	if (!ingress_prog || forward_map_fd < 0) {
+		printf("finding ingress_prog/forward_map in obj file failed\n");
+		goto err_out;
+	}
+
+	ingress_prog_fd = bpf_program__fd(ingress_prog);
+	if (ingress_prog_fd < 0) {
+		printf("find ingress_prog fd failed\n");
+		goto err_out;
+	}
+
+	rxcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rxcnt");
+	if (rxcnt_map_fd < 0) {
+		printf("bpf_object__find_map_fd_by_name failed\n");
+		goto err_out;
+	}
+
+	if (attach_egress_prog) {
+		/* Update mac_map with all egress interfaces' mac addr */
+		if (update_mac_map(obj) < 0) {
+			printf("Error: update mac map failed");
+			goto err_out;
+		}
+
+		/* Find egress prog fd */
+		egress_prog = bpf_object__find_program_by_name(obj, "xdp_devmap_prog");
+		if (!egress_prog) {
+			printf("finding egress_prog in obj file failed\n");
+			goto err_out;
+		}
+		egress_prog_fd = bpf_program__fd(egress_prog);
+		if (egress_prog_fd < 0) {
+			printf("find egress_prog fd failed\n");
+			goto err_out;
+		}
+	}
+
+	/* Remove attached program when program is interrupted or killed */
+	signal(SIGINT, int_exit);
+	signal(SIGTERM, int_exit);
+
+	/* Init forward multicast groups */
+	for (i = 0; ifaces[i] > 0; i++) {
+		ifindex = ifaces[i];
+
+		/* bind prog_fd to each interface */
+		ret = bpf_set_link_xdp_fd(ifindex, ingress_prog_fd, xdp_flags);
+		if (ret) {
+			printf("Set xdp fd failed on %d\n", ifindex);
+			goto err_out;
+		}
+
+		/* Add all the interfaces to forward group and attach
+		 * egress devmap programe if exist
+		 */
+		devmap_val.ifindex = ifindex;
+		devmap_val.bpf_prog.fd = egress_prog_fd;
+		ret = bpf_map_update_elem(forward_map_fd, &ifindex, &devmap_val, 0);
+		if (ret) {
+			perror("bpf_map_update_elem forward_map");
+			goto err_out;
+		}
+	}
+
+	poll_stats(2);
+
+	return 0;
+
+err_out:
+	return 1;
+}
-- 
cgit v1.2.3-70-g09d2


From d232924762971fe2698011bc244e05949e544541 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Wed, 19 May 2021 17:07:47 +0800
Subject: selftests/bpf: Add xdp_redirect_multi test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a bpf selftest for new helper xdp_redirect_map_multi(). In this
test there are 3 forward groups and 1 exclude group. The test will
redirect each interface's packets to all the interfaces in the forward
group, and exclude the interface in exclude map.

Two maps (DEVMAP, DEVMAP_HASH) and two xdp modes (generic, drive) will
be tested. XDP egress program will also be tested by setting pkt src MAC
to egress interface's MAC address.

For more test details, you can find it in the test script. Here is
the test result.
]# time ./test_xdp_redirect_multi.sh
Pass: xdpgeneric arp(F_BROADCAST) ns1-1
Pass: xdpgeneric arp(F_BROADCAST) ns1-2
Pass: xdpgeneric arp(F_BROADCAST) ns1-3
Pass: xdpgeneric IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-1
Pass: xdpgeneric IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-2
Pass: xdpgeneric IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-3
Pass: xdpgeneric IPv6 (no flags) ns1-1
Pass: xdpgeneric IPv6 (no flags) ns1-2
Pass: xdpdrv arp(F_BROADCAST) ns1-1
Pass: xdpdrv arp(F_BROADCAST) ns1-2
Pass: xdpdrv arp(F_BROADCAST) ns1-3
Pass: xdpdrv IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-1
Pass: xdpdrv IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-2
Pass: xdpdrv IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-3
Pass: xdpdrv IPv6 (no flags) ns1-1
Pass: xdpdrv IPv6 (no flags) ns1-2
Pass: xdpegress mac ns1-2
Pass: xdpegress mac ns1-3
Summary: PASS 18, FAIL 0

real    1m18.321s
user    0m0.123s
sys     0m0.350s

Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/bpf/20210519090747.1655268-5-liuhangbin@gmail.com
---
 tools/testing/selftests/bpf/Makefile               |   3 +-
 .../selftests/bpf/progs/xdp_redirect_multi_kern.c  |  94 +++++++++
 .../selftests/bpf/test_xdp_redirect_multi.sh       | 204 +++++++++++++++++++
 tools/testing/selftests/bpf/xdp_redirect_multi.c   | 226 +++++++++++++++++++++
 4 files changed, 526 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/progs/xdp_redirect_multi_kern.c
 create mode 100755 tools/testing/selftests/bpf/test_xdp_redirect_multi.sh
 create mode 100644 tools/testing/selftests/bpf/xdp_redirect_multi.c

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 525e4b3fb514..f405b20c1e6c 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -54,6 +54,7 @@ TEST_FILES = xsk_prereqs.sh \
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
 	test_xdp_redirect.sh \
+	test_xdp_redirect_multi.sh \
 	test_xdp_meta.sh \
 	test_xdp_veth.sh \
 	test_offload.py \
@@ -84,7 +85,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \
 TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \
 	flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \
 	test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \
-	xdpxceiver
+	xdpxceiver xdp_redirect_multi
 
 TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read
 
diff --git a/tools/testing/selftests/bpf/progs/xdp_redirect_multi_kern.c b/tools/testing/selftests/bpf/progs/xdp_redirect_multi_kern.c
new file mode 100644
index 000000000000..880debcbcd65
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/xdp_redirect_multi_kern.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0
+#define KBUILD_MODNAME "foo"
+#include <string.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+/* One map use devmap, another one use devmap_hash for testing */
+struct {
+	__uint(type, BPF_MAP_TYPE_DEVMAP);
+	__uint(key_size, sizeof(int));
+	__uint(value_size, sizeof(int));
+	__uint(max_entries, 1024);
+} map_all SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_DEVMAP_HASH);
+	__uint(key_size, sizeof(int));
+	__uint(value_size, sizeof(struct bpf_devmap_val));
+	__uint(max_entries, 128);
+} map_egress SEC(".maps");
+
+/* map to store egress interfaces mac addresses */
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, __u32);
+	__type(value, __be64);
+	__uint(max_entries, 128);
+} mac_map SEC(".maps");
+
+SEC("xdp_redirect_map_multi")
+int xdp_redirect_map_multi_prog(struct xdp_md *ctx)
+{
+	void *data_end = (void *)(long)ctx->data_end;
+	void *data = (void *)(long)ctx->data;
+	int if_index = ctx->ingress_ifindex;
+	struct ethhdr *eth = data;
+	__u16 h_proto;
+	__u64 nh_off;
+
+	nh_off = sizeof(*eth);
+	if (data + nh_off > data_end)
+		return XDP_DROP;
+
+	h_proto = eth->h_proto;
+
+	/* Using IPv4 for (BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS) testing */
+	if (h_proto == bpf_htons(ETH_P_IP))
+		return bpf_redirect_map(&map_all, 0,
+					BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS);
+	/* Using IPv6 for none flag testing */
+	else if (h_proto == bpf_htons(ETH_P_IPV6))
+		return bpf_redirect_map(&map_all, if_index, 0);
+	/* All others for BPF_F_BROADCAST testing */
+	else
+		return bpf_redirect_map(&map_all, 0, BPF_F_BROADCAST);
+}
+
+/* The following 2 progs are for 2nd devmap prog testing */
+SEC("xdp_redirect_map_ingress")
+int xdp_redirect_map_all_prog(struct xdp_md *ctx)
+{
+	return bpf_redirect_map(&map_egress, 0,
+				BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS);
+}
+
+SEC("xdp_devmap/map_prog")
+int xdp_devmap_prog(struct xdp_md *ctx)
+{
+	void *data_end = (void *)(long)ctx->data_end;
+	void *data = (void *)(long)ctx->data;
+	__u32 key = ctx->egress_ifindex;
+	struct ethhdr *eth = data;
+	__u64 nh_off;
+	__be64 *mac;
+
+	nh_off = sizeof(*eth);
+	if (data + nh_off > data_end)
+		return XDP_DROP;
+
+	mac = bpf_map_lookup_elem(&mac_map, &key);
+	if (mac)
+		__builtin_memcpy(eth->h_source, mac, ETH_ALEN);
+
+	return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_xdp_redirect_multi.sh b/tools/testing/selftests/bpf/test_xdp_redirect_multi.sh
new file mode 100755
index 000000000000..1538373157e3
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_xdp_redirect_multi.sh
@@ -0,0 +1,204 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test topology:
+#     - - - - - - - - - - - - - - - - - - - - - - - - -
+#    | veth1         veth2         veth3 |  ... init net
+#     - -| - - - - - - | - - - - - - | - -
+#    ---------     ---------     ---------
+#    | veth0 |     | veth0 |     | veth0 |  ...
+#    ---------     ---------     ---------
+#       ns1           ns2           ns3
+#
+# Test modules:
+# XDP modes: generic, native, native + egress_prog
+#
+# Test cases:
+#   ARP: Testing BPF_F_BROADCAST, the ingress interface also should receive
+#   the redirects.
+#      ns1 -> gw: ns1, ns2, ns3, should receive the arp request
+#   IPv4: Testing BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, the ingress
+#   interface should not receive the redirects.
+#      ns1 -> gw: ns1 should not receive, ns2, ns3 should receive redirects.
+#   IPv6: Testing none flag, all the pkts should be redirected back
+#      ping test: ns1 -> ns2 (block), echo requests will be redirect back
+#   egress_prog:
+#      all src mac should be egress interface's mac
+
+# netns numbers
+NUM=3
+IFACES=""
+DRV_MODE="xdpgeneric xdpdrv xdpegress"
+PASS=0
+FAIL=0
+
+test_pass()
+{
+	echo "Pass: $@"
+	PASS=$((PASS + 1))
+}
+
+test_fail()
+{
+	echo "fail: $@"
+	FAIL=$((FAIL + 1))
+}
+
+clean_up()
+{
+	for i in $(seq $NUM); do
+		ip link del veth$i 2> /dev/null
+		ip netns del ns$i 2> /dev/null
+	done
+}
+
+# Kselftest framework requirement - SKIP code is 4.
+check_env()
+{
+	ip link set dev lo xdpgeneric off &>/dev/null
+	if [ $? -ne 0 ];then
+		echo "selftests: [SKIP] Could not run test without the ip xdpgeneric support"
+		exit 4
+	fi
+
+	which tcpdump &>/dev/null
+	if [ $? -ne 0 ];then
+		echo "selftests: [SKIP] Could not run test without tcpdump"
+		exit 4
+	fi
+}
+
+setup_ns()
+{
+	local mode=$1
+	IFACES=""
+
+	if [ "$mode" = "xdpegress" ]; then
+		mode="xdpdrv"
+	fi
+
+	for i in $(seq $NUM); do
+	        ip netns add ns$i
+	        ip link add veth$i type veth peer name veth0 netns ns$i
+		ip link set veth$i up
+		ip -n ns$i link set veth0 up
+
+		ip -n ns$i addr add 192.0.2.$i/24 dev veth0
+		ip -n ns$i addr add 2001:db8::$i/64 dev veth0
+		# Add a neigh entry for IPv4 ping test
+		ip -n ns$i neigh add 192.0.2.253 lladdr 00:00:00:00:00:01 dev veth0
+		ip -n ns$i link set veth0 $mode obj \
+			xdp_dummy.o sec xdp_dummy &> /dev/null || \
+			{ test_fail "Unable to load dummy xdp" && exit 1; }
+		IFACES="$IFACES veth$i"
+		veth_mac[$i]=$(ip link show veth$i | awk '/link\/ether/ {print $2}')
+	done
+}
+
+do_egress_tests()
+{
+	local mode=$1
+
+	# mac test
+	ip netns exec ns2 tcpdump -e -i veth0 -nn -l -e &> mac_ns1-2_${mode}.log &
+	ip netns exec ns3 tcpdump -e -i veth0 -nn -l -e &> mac_ns1-3_${mode}.log &
+	sleep 0.5
+	ip netns exec ns1 ping 192.0.2.254 -i 0.1 -c 4 &> /dev/null
+	sleep 0.5
+	pkill -9 tcpdump
+
+	# mac check
+	grep -q "${veth_mac[2]} > ff:ff:ff:ff:ff:ff" mac_ns1-2_${mode}.log && \
+	       test_pass "$mode mac ns1-2" || test_fail "$mode mac ns1-2"
+	grep -q "${veth_mac[3]} > ff:ff:ff:ff:ff:ff" mac_ns1-3_${mode}.log && \
+		test_pass "$mode mac ns1-3" || test_fail "$mode mac ns1-3"
+}
+
+do_ping_tests()
+{
+	local mode=$1
+
+	# ping6 test: echo request should be redirect back to itself, not others
+	ip netns exec ns1 ip neigh add 2001:db8::2 dev veth0 lladdr 00:00:00:00:00:02
+
+	ip netns exec ns1 tcpdump -i veth0 -nn -l -e &> ns1-1_${mode}.log &
+	ip netns exec ns2 tcpdump -i veth0 -nn -l -e &> ns1-2_${mode}.log &
+	ip netns exec ns3 tcpdump -i veth0 -nn -l -e &> ns1-3_${mode}.log &
+	sleep 0.5
+	# ARP test
+	ip netns exec ns1 ping 192.0.2.254 -i 0.1 -c 4 &> /dev/null
+	# IPv4 test
+	ip netns exec ns1 ping 192.0.2.253 -i 0.1 -c 4 &> /dev/null
+	# IPv6 test
+	ip netns exec ns1 ping6 2001:db8::2 -i 0.1 -c 2 &> /dev/null
+	sleep 0.5
+	pkill -9 tcpdump
+
+	# All netns should receive the redirect arp requests
+	[ $(grep -c "who-has 192.0.2.254" ns1-1_${mode}.log) -gt 4 ] && \
+		test_pass "$mode arp(F_BROADCAST) ns1-1" || \
+		test_fail "$mode arp(F_BROADCAST) ns1-1"
+	[ $(grep -c "who-has 192.0.2.254" ns1-2_${mode}.log) -le 4 ] && \
+		test_pass "$mode arp(F_BROADCAST) ns1-2" || \
+		test_fail "$mode arp(F_BROADCAST) ns1-2"
+	[ $(grep -c "who-has 192.0.2.254" ns1-3_${mode}.log) -le 4 ] && \
+		test_pass "$mode arp(F_BROADCAST) ns1-3" || \
+		test_fail "$mode arp(F_BROADCAST) ns1-3"
+
+	# ns1 should not receive the redirect echo request, others should
+	[ $(grep -c "ICMP echo request" ns1-1_${mode}.log) -eq 4 ] && \
+		test_pass "$mode IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-1" || \
+		test_fail "$mode IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-1"
+	[ $(grep -c "ICMP echo request" ns1-2_${mode}.log) -eq 4 ] && \
+		test_pass "$mode IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-2" || \
+		test_fail "$mode IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-2"
+	[ $(grep -c "ICMP echo request" ns1-3_${mode}.log) -eq 4 ] && \
+		test_pass "$mode IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-3" || \
+		test_fail "$mode IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-3"
+
+	# ns1 should receive the echo request, ns2 should not
+	[ $(grep -c "ICMP6, echo request" ns1-1_${mode}.log) -eq 4 ] && \
+		test_pass "$mode IPv6 (no flags) ns1-1" || \
+		test_fail "$mode IPv6 (no flags) ns1-1"
+	[ $(grep -c "ICMP6, echo request" ns1-2_${mode}.log) -eq 0 ] && \
+		test_pass "$mode IPv6 (no flags) ns1-2" || \
+		test_fail "$mode IPv6 (no flags) ns1-2"
+}
+
+do_tests()
+{
+	local mode=$1
+	local drv_p
+
+	case ${mode} in
+		xdpdrv)  drv_p="-N";;
+		xdpegress) drv_p="-X";;
+		xdpgeneric) drv_p="-S";;
+	esac
+
+	./xdp_redirect_multi $drv_p $IFACES &> xdp_redirect_${mode}.log &
+	xdp_pid=$!
+	sleep 1
+
+	if [ "$mode" = "xdpegress" ]; then
+		do_egress_tests $mode
+	else
+		do_ping_tests $mode
+	fi
+
+	kill $xdp_pid
+}
+
+trap clean_up 0 2 3 6 9
+
+check_env
+rm -f xdp_redirect_*.log ns*.log mac_ns*.log
+
+for mode in ${DRV_MODE}; do
+	setup_ns $mode
+	do_tests $mode
+	clean_up
+done
+
+echo "Summary: PASS $PASS, FAIL $FAIL"
+[ $FAIL -eq 0 ] && exit 0 || exit 1
diff --git a/tools/testing/selftests/bpf/xdp_redirect_multi.c b/tools/testing/selftests/bpf/xdp_redirect_multi.c
new file mode 100644
index 000000000000..3696a8f32c23
--- /dev/null
+++ b/tools/testing/selftests/bpf/xdp_redirect_multi.c
@@ -0,0 +1,226 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <linux/if_link.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <net/if.h>
+#include <unistd.h>
+#include <libgen.h>
+#include <sys/resource.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+
+#include "bpf_util.h"
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#define MAX_IFACE_NUM 32
+#define MAX_INDEX_NUM 1024
+
+static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
+static int ifaces[MAX_IFACE_NUM] = {};
+
+static void int_exit(int sig)
+{
+	__u32 prog_id = 0;
+	int i;
+
+	for (i = 0; ifaces[i] > 0; i++) {
+		if (bpf_get_link_xdp_id(ifaces[i], &prog_id, xdp_flags)) {
+			printf("bpf_get_link_xdp_id failed\n");
+			exit(1);
+		}
+		if (prog_id)
+			bpf_set_link_xdp_fd(ifaces[i], -1, xdp_flags);
+	}
+
+	exit(0);
+}
+
+static int get_mac_addr(unsigned int ifindex, void *mac_addr)
+{
+	char ifname[IF_NAMESIZE];
+	struct ifreq ifr;
+	int fd, ret = -1;
+
+	fd = socket(AF_INET, SOCK_DGRAM, 0);
+	if (fd < 0)
+		return ret;
+
+	if (!if_indextoname(ifindex, ifname))
+		goto err_out;
+
+	strcpy(ifr.ifr_name, ifname);
+
+	if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0)
+		goto err_out;
+
+	memcpy(mac_addr, ifr.ifr_hwaddr.sa_data, 6 * sizeof(char));
+	ret = 0;
+
+err_out:
+	close(fd);
+	return ret;
+}
+
+static void usage(const char *prog)
+{
+	fprintf(stderr,
+		"usage: %s [OPTS] <IFNAME|IFINDEX> <IFNAME|IFINDEX> ...\n"
+		"OPTS:\n"
+		"    -S    use skb-mode\n"
+		"    -N    enforce native mode\n"
+		"    -F    force loading prog\n"
+		"    -X    load xdp program on egress\n",
+		prog);
+}
+
+int main(int argc, char **argv)
+{
+	int prog_fd, group_all, mac_map;
+	struct bpf_program *ingress_prog, *egress_prog;
+	struct bpf_prog_load_attr prog_load_attr = {
+		.prog_type = BPF_PROG_TYPE_UNSPEC,
+	};
+	int i, ret, opt, egress_prog_fd = 0;
+	struct bpf_devmap_val devmap_val;
+	bool attach_egress_prog = false;
+	unsigned char mac_addr[6];
+	char ifname[IF_NAMESIZE];
+	struct bpf_object *obj;
+	unsigned int ifindex;
+	char filename[256];
+
+	while ((opt = getopt(argc, argv, "SNFX")) != -1) {
+		switch (opt) {
+		case 'S':
+			xdp_flags |= XDP_FLAGS_SKB_MODE;
+			break;
+		case 'N':
+			/* default, set below */
+			break;
+		case 'F':
+			xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
+			break;
+		case 'X':
+			attach_egress_prog = true;
+			break;
+		default:
+			usage(basename(argv[0]));
+			return 1;
+		}
+	}
+
+	if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) {
+		xdp_flags |= XDP_FLAGS_DRV_MODE;
+	} else if (attach_egress_prog) {
+		printf("Load xdp program on egress with SKB mode not supported yet\n");
+		goto err_out;
+	}
+
+	if (optind == argc) {
+		printf("usage: %s <IFNAME|IFINDEX> <IFNAME|IFINDEX> ...\n", argv[0]);
+		goto err_out;
+	}
+
+	printf("Get interfaces");
+	for (i = 0; i < MAX_IFACE_NUM && argv[optind + i]; i++) {
+		ifaces[i] = if_nametoindex(argv[optind + i]);
+		if (!ifaces[i])
+			ifaces[i] = strtoul(argv[optind + i], NULL, 0);
+		if (!if_indextoname(ifaces[i], ifname)) {
+			perror("Invalid interface name or i");
+			goto err_out;
+		}
+		if (ifaces[i] > MAX_INDEX_NUM) {
+			printf("Interface index to large\n");
+			goto err_out;
+		}
+		printf(" %d", ifaces[i]);
+	}
+	printf("\n");
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+	prog_load_attr.file = filename;
+
+	if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
+		goto err_out;
+
+	if (attach_egress_prog)
+		group_all = bpf_object__find_map_fd_by_name(obj, "map_egress");
+	else
+		group_all = bpf_object__find_map_fd_by_name(obj, "map_all");
+	mac_map = bpf_object__find_map_fd_by_name(obj, "mac_map");
+
+	if (group_all < 0 || mac_map < 0) {
+		printf("bpf_object__find_map_fd_by_name failed\n");
+		goto err_out;
+	}
+
+	if (attach_egress_prog) {
+		/* Find ingress/egress prog for 2nd xdp prog */
+		ingress_prog = bpf_object__find_program_by_name(obj, "xdp_redirect_map_all_prog");
+		egress_prog = bpf_object__find_program_by_name(obj, "xdp_devmap_prog");
+		if (!ingress_prog || !egress_prog) {
+			printf("finding ingress/egress_prog in obj file failed\n");
+			goto err_out;
+		}
+		prog_fd = bpf_program__fd(ingress_prog);
+		egress_prog_fd = bpf_program__fd(egress_prog);
+		if (prog_fd < 0 || egress_prog_fd < 0) {
+			printf("find egress_prog fd failed\n");
+			goto err_out;
+		}
+	}
+
+	signal(SIGINT, int_exit);
+	signal(SIGTERM, int_exit);
+
+	/* Init forward multicast groups and exclude group */
+	for (i = 0; ifaces[i] > 0; i++) {
+		ifindex = ifaces[i];
+
+		if (attach_egress_prog) {
+			ret = get_mac_addr(ifindex, mac_addr);
+			if (ret < 0) {
+				printf("get interface %d mac failed\n", ifindex);
+				goto err_out;
+			}
+			ret = bpf_map_update_elem(mac_map, &ifindex, mac_addr, 0);
+			if (ret) {
+				perror("bpf_update_elem mac_map failed\n");
+				goto err_out;
+			}
+		}
+
+		/* Add all the interfaces to group all */
+		devmap_val.ifindex = ifindex;
+		devmap_val.bpf_prog.fd = egress_prog_fd;
+		ret = bpf_map_update_elem(group_all, &ifindex, &devmap_val, 0);
+		if (ret) {
+			perror("bpf_map_update_elem");
+			goto err_out;
+		}
+
+		/* bind prog_fd to each interface */
+		ret = bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags);
+		if (ret) {
+			printf("Set xdp fd failed on %d\n", ifindex);
+			goto err_out;
+		}
+	}
+
+	/* sleep some time for testing */
+	sleep(999);
+
+	return 0;
+
+err_out:
+	return 1;
+}
-- 
cgit v1.2.3-70-g09d2


From d6a6a55518c16040a369360255b355b7a2a261de Mon Sep 17 00:00:00 2001
From: Florent Revest <revest@chromium.org>
Date: Wed, 26 May 2021 18:46:43 +0200
Subject: libbpf: Move BPF_SEQ_PRINTF and BPF_SNPRINTF to bpf_helpers.h

These macros are convenient wrappers around the bpf_seq_printf and
bpf_snprintf helpers. They are currently provided by bpf_tracing.h which
targets low level tracing primitives. bpf_helpers.h is a better fit.

The __bpf_narg and __bpf_apply are needed in both files and provided
twice. __bpf_empty isn't used anywhere and is removed from bpf_tracing.h

Reported-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Florent Revest <revest@chromium.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210526164643.2881368-1-revest@chromium.org
---
 kernel/bpf/preload/iterators/iterators.bpf.c       |  1 -
 tools/lib/bpf/bpf_helpers.h                        | 66 ++++++++++++++++++++++
 tools/lib/bpf/bpf_tracing.h                        | 62 +++-----------------
 .../selftests/bpf/progs/bpf_iter_bpf_hash_map.c    |  1 -
 .../testing/selftests/bpf/progs/bpf_iter_bpf_map.c |  1 -
 .../selftests/bpf/progs/bpf_iter_ipv6_route.c      |  1 -
 .../testing/selftests/bpf/progs/bpf_iter_netlink.c |  1 -
 tools/testing/selftests/bpf/progs/bpf_iter_task.c  |  1 -
 .../selftests/bpf/progs/bpf_iter_task_btf.c        |  1 -
 .../selftests/bpf/progs/bpf_iter_task_file.c       |  1 -
 .../selftests/bpf/progs/bpf_iter_task_stack.c      |  1 -
 .../selftests/bpf/progs/bpf_iter_task_vma.c        |  1 -
 tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c  |  1 -
 tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c  |  1 -
 tools/testing/selftests/bpf/progs/bpf_iter_udp4.c  |  1 -
 tools/testing/selftests/bpf/progs/bpf_iter_udp6.c  |  1 -
 tools/testing/selftests/bpf/progs/test_snprintf.c  |  1 -
 17 files changed, 74 insertions(+), 69 deletions(-)

diff --git a/kernel/bpf/preload/iterators/iterators.bpf.c b/kernel/bpf/preload/iterators/iterators.bpf.c
index 52aa7b38e8b8..03af863314ea 100644
--- a/kernel/bpf/preload/iterators/iterators.bpf.c
+++ b/kernel/bpf/preload/iterators/iterators.bpf.c
@@ -2,7 +2,6 @@
 /* Copyright (c) 2020 Facebook */
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
-#include <bpf/bpf_tracing.h>
 #include <bpf/bpf_core_read.h>
 
 #pragma clang attribute push (__attribute__((preserve_access_index)), apply_to = record)
diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h
index 9720dc0b4605..b9987c3efa3c 100644
--- a/tools/lib/bpf/bpf_helpers.h
+++ b/tools/lib/bpf/bpf_helpers.h
@@ -158,4 +158,70 @@ enum libbpf_tristate {
 #define __kconfig __attribute__((section(".kconfig")))
 #define __ksym __attribute__((section(".ksyms")))
 
+#ifndef ___bpf_concat
+#define ___bpf_concat(a, b) a ## b
+#endif
+#ifndef ___bpf_apply
+#define ___bpf_apply(fn, n) ___bpf_concat(fn, n)
+#endif
+#ifndef ___bpf_nth
+#define ___bpf_nth(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _a, _b, _c, N, ...) N
+#endif
+#ifndef ___bpf_narg
+#define ___bpf_narg(...) \
+	___bpf_nth(_, ##__VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
+#endif
+
+#define ___bpf_fill0(arr, p, x) do {} while (0)
+#define ___bpf_fill1(arr, p, x) arr[p] = x
+#define ___bpf_fill2(arr, p, x, args...) arr[p] = x; ___bpf_fill1(arr, p + 1, args)
+#define ___bpf_fill3(arr, p, x, args...) arr[p] = x; ___bpf_fill2(arr, p + 1, args)
+#define ___bpf_fill4(arr, p, x, args...) arr[p] = x; ___bpf_fill3(arr, p + 1, args)
+#define ___bpf_fill5(arr, p, x, args...) arr[p] = x; ___bpf_fill4(arr, p + 1, args)
+#define ___bpf_fill6(arr, p, x, args...) arr[p] = x; ___bpf_fill5(arr, p + 1, args)
+#define ___bpf_fill7(arr, p, x, args...) arr[p] = x; ___bpf_fill6(arr, p + 1, args)
+#define ___bpf_fill8(arr, p, x, args...) arr[p] = x; ___bpf_fill7(arr, p + 1, args)
+#define ___bpf_fill9(arr, p, x, args...) arr[p] = x; ___bpf_fill8(arr, p + 1, args)
+#define ___bpf_fill10(arr, p, x, args...) arr[p] = x; ___bpf_fill9(arr, p + 1, args)
+#define ___bpf_fill11(arr, p, x, args...) arr[p] = x; ___bpf_fill10(arr, p + 1, args)
+#define ___bpf_fill12(arr, p, x, args...) arr[p] = x; ___bpf_fill11(arr, p + 1, args)
+#define ___bpf_fill(arr, args...) \
+	___bpf_apply(___bpf_fill, ___bpf_narg(args))(arr, 0, args)
+
+/*
+ * BPF_SEQ_PRINTF to wrap bpf_seq_printf to-be-printed values
+ * in a structure.
+ */
+#define BPF_SEQ_PRINTF(seq, fmt, args...)			\
+({								\
+	static const char ___fmt[] = fmt;			\
+	unsigned long long ___param[___bpf_narg(args)];		\
+								\
+	_Pragma("GCC diagnostic push")				\
+	_Pragma("GCC diagnostic ignored \"-Wint-conversion\"")	\
+	___bpf_fill(___param, args);				\
+	_Pragma("GCC diagnostic pop")				\
+								\
+	bpf_seq_printf(seq, ___fmt, sizeof(___fmt),		\
+		       ___param, sizeof(___param));		\
+})
+
+/*
+ * BPF_SNPRINTF wraps the bpf_snprintf helper with variadic arguments instead of
+ * an array of u64.
+ */
+#define BPF_SNPRINTF(out, out_size, fmt, args...)		\
+({								\
+	static const char ___fmt[] = fmt;			\
+	unsigned long long ___param[___bpf_narg(args)];		\
+								\
+	_Pragma("GCC diagnostic push")				\
+	_Pragma("GCC diagnostic ignored \"-Wint-conversion\"")	\
+	___bpf_fill(___param, args);				\
+	_Pragma("GCC diagnostic pop")				\
+								\
+	bpf_snprintf(out, out_size, ___fmt,			\
+		     ___param, sizeof(___param));		\
+})
+
 #endif
diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index 8c954ebc0c7c..c0f3a26aa582 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -295,13 +295,19 @@ struct pt_regs;
 			  (void *)(PT_REGS_FP(ctx) + sizeof(ip))); })
 #endif
 
+#ifndef ___bpf_concat
 #define ___bpf_concat(a, b) a ## b
+#endif
+#ifndef ___bpf_apply
 #define ___bpf_apply(fn, n) ___bpf_concat(fn, n)
+#endif
+#ifndef ___bpf_nth
 #define ___bpf_nth(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _a, _b, _c, N, ...) N
+#endif
+#ifndef ___bpf_narg
 #define ___bpf_narg(...) \
 	___bpf_nth(_, ##__VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
-#define ___bpf_empty(...) \
-	___bpf_nth(_, ##__VA_ARGS__, N, N, N, N, N, N, N, N, N, N, 0)
+#endif
 
 #define ___bpf_ctx_cast0() ctx
 #define ___bpf_ctx_cast1(x) ___bpf_ctx_cast0(), (void *)ctx[0]
@@ -413,56 +419,4 @@ typeof(name(0)) name(struct pt_regs *ctx)				    \
 }									    \
 static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args)
 
-#define ___bpf_fill0(arr, p, x) do {} while (0)
-#define ___bpf_fill1(arr, p, x) arr[p] = x
-#define ___bpf_fill2(arr, p, x, args...) arr[p] = x; ___bpf_fill1(arr, p + 1, args)
-#define ___bpf_fill3(arr, p, x, args...) arr[p] = x; ___bpf_fill2(arr, p + 1, args)
-#define ___bpf_fill4(arr, p, x, args...) arr[p] = x; ___bpf_fill3(arr, p + 1, args)
-#define ___bpf_fill5(arr, p, x, args...) arr[p] = x; ___bpf_fill4(arr, p + 1, args)
-#define ___bpf_fill6(arr, p, x, args...) arr[p] = x; ___bpf_fill5(arr, p + 1, args)
-#define ___bpf_fill7(arr, p, x, args...) arr[p] = x; ___bpf_fill6(arr, p + 1, args)
-#define ___bpf_fill8(arr, p, x, args...) arr[p] = x; ___bpf_fill7(arr, p + 1, args)
-#define ___bpf_fill9(arr, p, x, args...) arr[p] = x; ___bpf_fill8(arr, p + 1, args)
-#define ___bpf_fill10(arr, p, x, args...) arr[p] = x; ___bpf_fill9(arr, p + 1, args)
-#define ___bpf_fill11(arr, p, x, args...) arr[p] = x; ___bpf_fill10(arr, p + 1, args)
-#define ___bpf_fill12(arr, p, x, args...) arr[p] = x; ___bpf_fill11(arr, p + 1, args)
-#define ___bpf_fill(arr, args...) \
-	___bpf_apply(___bpf_fill, ___bpf_narg(args))(arr, 0, args)
-
-/*
- * BPF_SEQ_PRINTF to wrap bpf_seq_printf to-be-printed values
- * in a structure.
- */
-#define BPF_SEQ_PRINTF(seq, fmt, args...)			\
-({								\
-	static const char ___fmt[] = fmt;			\
-	unsigned long long ___param[___bpf_narg(args)];		\
-								\
-	_Pragma("GCC diagnostic push")				\
-	_Pragma("GCC diagnostic ignored \"-Wint-conversion\"")	\
-	___bpf_fill(___param, args);				\
-	_Pragma("GCC diagnostic pop")				\
-								\
-	bpf_seq_printf(seq, ___fmt, sizeof(___fmt),		\
-		       ___param, sizeof(___param));		\
-})
-
-/*
- * BPF_SNPRINTF wraps the bpf_snprintf helper with variadic arguments instead of
- * an array of u64.
- */
-#define BPF_SNPRINTF(out, out_size, fmt, args...)		\
-({								\
-	static const char ___fmt[] = fmt;			\
-	unsigned long long ___param[___bpf_narg(args)];		\
-								\
-	_Pragma("GCC diagnostic push")				\
-	_Pragma("GCC diagnostic ignored \"-Wint-conversion\"")	\
-	___bpf_fill(___param, args);				\
-	_Pragma("GCC diagnostic pop")				\
-								\
-	bpf_snprintf(out, out_size, ___fmt,			\
-		     ___param, sizeof(___param));		\
-})
-
 #endif
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_hash_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_hash_map.c
index 6dfce3fd68bc..0aa3cd34cbe3 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_hash_map.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_hash_map.c
@@ -2,7 +2,6 @@
 /* Copyright (c) 2020 Facebook */
 #include "bpf_iter.h"
 #include <bpf/bpf_helpers.h>
-#include <bpf/bpf_tracing.h>
 
 char _license[] SEC("license") = "GPL";
 
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c
index b83b5d2e17dc..6c39e86b666f 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c
@@ -2,7 +2,6 @@
 /* Copyright (c) 2020 Facebook */
 #include "bpf_iter.h"
 #include <bpf/bpf_helpers.h>
-#include <bpf/bpf_tracing.h>
 
 char _license[] SEC("license") = "GPL";
 
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c b/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c
index d58d9f1642b5..784a610ce039 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c
@@ -3,7 +3,6 @@
 #include "bpf_iter.h"
 #include "bpf_tracing_net.h"
 #include <bpf/bpf_helpers.h>
-#include <bpf/bpf_tracing.h>
 
 char _license[] SEC("license") = "GPL";
 
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c
index 95989f4c99b5..a28e51e2dcee 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c
@@ -3,7 +3,6 @@
 #include "bpf_iter.h"
 #include "bpf_tracing_net.h"
 #include <bpf/bpf_helpers.h>
-#include <bpf/bpf_tracing.h>
 
 char _license[] SEC("license") = "GPL";
 
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task.c b/tools/testing/selftests/bpf/progs/bpf_iter_task.c
index b7f32c160f4e..c86b93f33b32 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_task.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_task.c
@@ -2,7 +2,6 @@
 /* Copyright (c) 2020 Facebook */
 #include "bpf_iter.h"
 #include <bpf/bpf_helpers.h>
-#include <bpf/bpf_tracing.h>
 
 char _license[] SEC("license") = "GPL";
 
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_btf.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_btf.c
index a1ddc36f13ec..bca8b889cb10 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_task_btf.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_btf.c
@@ -2,7 +2,6 @@
 /* Copyright (c) 2020, Oracle and/or its affiliates. */
 #include "bpf_iter.h"
 #include <bpf/bpf_helpers.h>
-#include <bpf/bpf_tracing.h>
 #include <bpf/bpf_core_read.h>
 
 #include <errno.h>
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c
index b2f7c7c5f952..6e7b400888fe 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c
@@ -2,7 +2,6 @@
 /* Copyright (c) 2020 Facebook */
 #include "bpf_iter.h"
 #include <bpf/bpf_helpers.h>
-#include <bpf/bpf_tracing.h>
 
 char _license[] SEC("license") = "GPL";
 
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c
index 43c36f5f7649..f2b8167b72a8 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c
@@ -2,7 +2,6 @@
 /* Copyright (c) 2020 Facebook */
 #include "bpf_iter.h"
 #include <bpf/bpf_helpers.h>
-#include <bpf/bpf_tracing.h>
 
 char _license[] SEC("license") = "GPL";
 
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c
index 11d1aa37cf11..4ea6a37d1345 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c
@@ -2,7 +2,6 @@
 /* Copyright (c) 2020 Facebook */
 #include "bpf_iter.h"
 #include <bpf/bpf_helpers.h>
-#include <bpf/bpf_tracing.h>
 
 char _license[] SEC("license") = "GPL";
 
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c b/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c
index 54380c5e1069..2e4775c35414 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c
@@ -3,7 +3,6 @@
 #include "bpf_iter.h"
 #include "bpf_tracing_net.h"
 #include <bpf/bpf_helpers.h>
-#include <bpf/bpf_tracing.h>
 #include <bpf/bpf_endian.h>
 
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c b/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c
index b4fbddfa4e10..943f7bba180e 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c
@@ -3,7 +3,6 @@
 #include "bpf_iter.h"
 #include "bpf_tracing_net.h"
 #include <bpf/bpf_helpers.h>
-#include <bpf/bpf_tracing.h>
 #include <bpf/bpf_endian.h>
 
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_udp4.c b/tools/testing/selftests/bpf/progs/bpf_iter_udp4.c
index f258583afbbd..cf0c485b1ed7 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_udp4.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_udp4.c
@@ -3,7 +3,6 @@
 #include "bpf_iter.h"
 #include "bpf_tracing_net.h"
 #include <bpf/bpf_helpers.h>
-#include <bpf/bpf_tracing.h>
 #include <bpf/bpf_endian.h>
 
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_udp6.c b/tools/testing/selftests/bpf/progs/bpf_iter_udp6.c
index 65f93bb03f0f..5031e21c433f 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_udp6.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_udp6.c
@@ -3,7 +3,6 @@
 #include "bpf_iter.h"
 #include "bpf_tracing_net.h"
 #include <bpf/bpf_helpers.h>
-#include <bpf/bpf_tracing.h>
 #include <bpf/bpf_endian.h>
 
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_snprintf.c b/tools/testing/selftests/bpf/progs/test_snprintf.c
index e35129bea0a0..e2ad26150f9b 100644
--- a/tools/testing/selftests/bpf/progs/test_snprintf.c
+++ b/tools/testing/selftests/bpf/progs/test_snprintf.c
@@ -3,7 +3,6 @@
 
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
-#include <bpf/bpf_tracing.h>
 
 __u32 pid = 0;
 
-- 
cgit v1.2.3-70-g09d2


From fc8c262e0eb5aa248af5051377e4ff3348841ac5 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Wed, 26 May 2021 08:24:57 -0700
Subject: bpf, docs: Add llvm_reloc.rst to explain llvm bpf relocations

LLVM upstream commit https://reviews.llvm.org/D102712 made some changes
to bpf relocations to make them llvm linker lld friendly. The scope of
existing relocations R_BPF_64_{64,32} is narrowed and new relocations
R_BPF_64_{ABS32,ABS64,NODYLD32} are introduced.

Let us add some documentation about llvm bpf relocations so people can
understand how to resolve them properly in their respective tools.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20210526152457.335210-1-yhs@fb.com
---
 Documentation/bpf/index.rst            |   1 +
 Documentation/bpf/llvm_reloc.rst       | 240 +++++++++++++++++++++++++++++++++
 tools/testing/selftests/bpf/README.rst |  19 +++
 3 files changed, 260 insertions(+)
 create mode 100644 Documentation/bpf/llvm_reloc.rst

diff --git a/Documentation/bpf/index.rst b/Documentation/bpf/index.rst
index a702f67dd45f..93e8cf12a6d4 100644
--- a/Documentation/bpf/index.rst
+++ b/Documentation/bpf/index.rst
@@ -84,6 +84,7 @@ Other
    :maxdepth: 1
 
    ringbuf
+   llvm_reloc
 
 .. Links:
 .. _networking-filter: ../networking/filter.rst
diff --git a/Documentation/bpf/llvm_reloc.rst b/Documentation/bpf/llvm_reloc.rst
new file mode 100644
index 000000000000..ca8957d5b671
--- /dev/null
+++ b/Documentation/bpf/llvm_reloc.rst
@@ -0,0 +1,240 @@
+.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+
+====================
+BPF LLVM Relocations
+====================
+
+This document describes LLVM BPF backend relocation types.
+
+Relocation Record
+=================
+
+LLVM BPF backend records each relocation with the following 16-byte
+ELF structure::
+
+  typedef struct
+  {
+    Elf64_Addr    r_offset;  // Offset from the beginning of section.
+    Elf64_Xword   r_info;    // Relocation type and symbol index.
+  } Elf64_Rel;
+
+For example, for the following code::
+
+  int g1 __attribute__((section("sec")));
+  int g2 __attribute__((section("sec")));
+  static volatile int l1 __attribute__((section("sec")));
+  static volatile int l2 __attribute__((section("sec")));
+  int test() {
+    return g1 + g2 + l1 + l2;
+  }
+
+Compiled with ``clang -target bpf -O2 -c test.c``, the following is
+the code with ``llvm-objdump -dr test.o``::
+
+       0:       18 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r1 = 0 ll
+                0000000000000000:  R_BPF_64_64  g1
+       2:       61 11 00 00 00 00 00 00 r1 = *(u32 *)(r1 + 0)
+       3:       18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r2 = 0 ll
+                0000000000000018:  R_BPF_64_64  g2
+       5:       61 20 00 00 00 00 00 00 r0 = *(u32 *)(r2 + 0)
+       6:       0f 10 00 00 00 00 00 00 r0 += r1
+       7:       18 01 00 00 08 00 00 00 00 00 00 00 00 00 00 00 r1 = 8 ll
+                0000000000000038:  R_BPF_64_64  sec
+       9:       61 11 00 00 00 00 00 00 r1 = *(u32 *)(r1 + 0)
+      10:       0f 10 00 00 00 00 00 00 r0 += r1
+      11:       18 01 00 00 0c 00 00 00 00 00 00 00 00 00 00 00 r1 = 12 ll
+                0000000000000058:  R_BPF_64_64  sec
+      13:       61 11 00 00 00 00 00 00 r1 = *(u32 *)(r1 + 0)
+      14:       0f 10 00 00 00 00 00 00 r0 += r1
+      15:       95 00 00 00 00 00 00 00 exit
+
+There are four relations in the above for four ``LD_imm64`` instructions.
+The following ``llvm-readelf -r test.o`` shows the binary values of the four
+relocations::
+
+  Relocation section '.rel.text' at offset 0x190 contains 4 entries:
+      Offset             Info             Type               Symbol's Value  Symbol's Name
+  0000000000000000  0000000600000001 R_BPF_64_64            0000000000000000 g1
+  0000000000000018  0000000700000001 R_BPF_64_64            0000000000000004 g2
+  0000000000000038  0000000400000001 R_BPF_64_64            0000000000000000 sec
+  0000000000000058  0000000400000001 R_BPF_64_64            0000000000000000 sec
+
+Each relocation is represented by ``Offset`` (8 bytes) and ``Info`` (8 bytes).
+For example, the first relocation corresponds to the first instruction
+(Offset 0x0) and the corresponding ``Info`` indicates the relocation type
+of ``R_BPF_64_64`` (type 1) and the entry in the symbol table (entry 6).
+The following is the symbol table with ``llvm-readelf -s test.o``::
+
+  Symbol table '.symtab' contains 8 entries:
+     Num:    Value          Size Type    Bind   Vis       Ndx Name
+       0: 0000000000000000     0 NOTYPE  LOCAL  DEFAULT   UND
+       1: 0000000000000000     0 FILE    LOCAL  DEFAULT   ABS test.c
+       2: 0000000000000008     4 OBJECT  LOCAL  DEFAULT     4 l1
+       3: 000000000000000c     4 OBJECT  LOCAL  DEFAULT     4 l2
+       4: 0000000000000000     0 SECTION LOCAL  DEFAULT     4 sec
+       5: 0000000000000000   128 FUNC    GLOBAL DEFAULT     2 test
+       6: 0000000000000000     4 OBJECT  GLOBAL DEFAULT     4 g1
+       7: 0000000000000004     4 OBJECT  GLOBAL DEFAULT     4 g2
+
+The 6th entry is global variable ``g1`` with value 0.
+
+Similarly, the second relocation is at ``.text`` offset ``0x18``, instruction 3,
+for global variable ``g2`` which has a symbol value 4, the offset
+from the start of ``.data`` section.
+
+The third and fourth relocations refers to static variables ``l1``
+and ``l2``. From ``.rel.text`` section above, it is not clear
+which symbols they really refers to as they both refers to
+symbol table entry 4, symbol ``sec``, which has ``STT_SECTION`` type
+and represents a section. So for static variable or function,
+the section offset is written to the original insn
+buffer, which is called ``A`` (addend). Looking at
+above insn ``7`` and ``11``, they have section offset ``8`` and ``12``.
+From symbol table, we can find that they correspond to entries ``2``
+and ``3`` for ``l1`` and ``l2``.
+
+In general, the ``A`` is 0 for global variables and functions,
+and is the section offset or some computation result based on
+section offset for static variables/functions. The non-section-offset
+case refers to function calls. See below for more details.
+
+Different Relocation Types
+==========================
+
+Six relocation types are supported. The following is an overview and
+``S`` represents the value of the symbol in the symbol table::
+
+  Enum  ELF Reloc Type     Description      BitSize  Offset        Calculation
+  0     R_BPF_NONE         None
+  1     R_BPF_64_64        ld_imm64 insn    32       r_offset + 4  S + A
+  2     R_BPF_64_ABS64     normal data      64       r_offset      S + A
+  3     R_BPF_64_ABS32     normal data      32       r_offset      S + A
+  4     R_BPF_64_NODYLD32  .BTF[.ext] data  32       r_offset      S + A
+  10    R_BPF_64_32        call insn        32       r_offset + 4  (S + A) / 8 - 1
+
+For example, ``R_BPF_64_64`` relocation type is used for ``ld_imm64`` instruction.
+The actual to-be-relocated data (0 or section offset)
+is stored at ``r_offset + 4`` and the read/write
+data bitsize is 32 (4 bytes). The relocation can be resolved with
+the symbol value plus implicit addend. Note that the ``BitSize`` is 32 which
+means the section offset must be less than or equal to ``UINT32_MAX`` and this
+is enforced by LLVM BPF backend.
+
+In another case, ``R_BPF_64_ABS64`` relocation type is used for normal 64-bit data.
+The actual to-be-relocated data is stored at ``r_offset`` and the read/write data
+bitsize is 64 (8 bytes). The relocation can be resolved with
+the symbol value plus implicit addend.
+
+Both ``R_BPF_64_ABS32`` and ``R_BPF_64_NODYLD32`` types are for 32-bit data.
+But ``R_BPF_64_NODYLD32`` specifically refers to relocations in ``.BTF`` and
+``.BTF.ext`` sections. For cases like bcc where llvm ``ExecutionEngine RuntimeDyld``
+is involved, ``R_BPF_64_NODYLD32`` types of relocations should not be resolved
+to actual function/variable address. Otherwise, ``.BTF`` and ``.BTF.ext``
+become unusable by bcc and kernel.
+
+Type ``R_BPF_64_32`` is used for call instruction. The call target section
+offset is stored at ``r_offset + 4`` (32bit) and calculated as
+``(S + A) / 8 - 1``.
+
+Examples
+========
+
+Types ``R_BPF_64_64`` and ``R_BPF_64_32`` are used to resolve ``ld_imm64``
+and ``call`` instructions. For example::
+
+  __attribute__((noinline)) __attribute__((section("sec1")))
+  int gfunc(int a, int b) {
+    return a * b;
+  }
+  static __attribute__((noinline)) __attribute__((section("sec1")))
+  int lfunc(int a, int b) {
+    return a + b;
+  }
+  int global __attribute__((section("sec2")));
+  int test(int a, int b) {
+    return gfunc(a, b) +  lfunc(a, b) + global;
+  }
+
+Compiled with ``clang -target bpf -O2 -c test.c``, we will have
+following code with `llvm-objdump -dr test.o``::
+
+  Disassembly of section .text:
+
+  0000000000000000 <test>:
+         0:       bf 26 00 00 00 00 00 00 r6 = r2
+         1:       bf 17 00 00 00 00 00 00 r7 = r1
+         2:       85 10 00 00 ff ff ff ff call -1
+                  0000000000000010:  R_BPF_64_32  gfunc
+         3:       bf 08 00 00 00 00 00 00 r8 = r0
+         4:       bf 71 00 00 00 00 00 00 r1 = r7
+         5:       bf 62 00 00 00 00 00 00 r2 = r6
+         6:       85 10 00 00 02 00 00 00 call 2
+                  0000000000000030:  R_BPF_64_32  sec1
+         7:       0f 80 00 00 00 00 00 00 r0 += r8
+         8:       18 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r1 = 0 ll
+                  0000000000000040:  R_BPF_64_64  global
+        10:       61 11 00 00 00 00 00 00 r1 = *(u32 *)(r1 + 0)
+        11:       0f 10 00 00 00 00 00 00 r0 += r1
+        12:       95 00 00 00 00 00 00 00 exit
+
+  Disassembly of section sec1:
+
+  0000000000000000 <gfunc>:
+         0:       bf 20 00 00 00 00 00 00 r0 = r2
+         1:       2f 10 00 00 00 00 00 00 r0 *= r1
+         2:       95 00 00 00 00 00 00 00 exit
+
+  0000000000000018 <lfunc>:
+         3:       bf 20 00 00 00 00 00 00 r0 = r2
+         4:       0f 10 00 00 00 00 00 00 r0 += r1
+         5:       95 00 00 00 00 00 00 00 exit
+
+The first relocation corresponds to ``gfunc(a, b)`` where ``gfunc`` has a value of 0,
+so the ``call`` instruction offset is ``(0 + 0)/8 - 1 = -1``.
+The second relocation corresponds to ``lfunc(a, b)`` where ``lfunc`` has a section
+offset ``0x18``, so the ``call`` instruction offset is ``(0 + 0x18)/8 - 1 = 2``.
+The third relocation corresponds to ld_imm64 of ``global``, which has a section
+offset ``0``.
+
+The following is an example to show how R_BPF_64_ABS64 could be generated::
+
+  int global() { return 0; }
+  struct t { void *g; } gbl = { global };
+
+Compiled with ``clang -target bpf -O2 -g -c test.c``, we will see a
+relocation below in ``.data`` section with command
+``llvm-readelf -r test.o``::
+
+  Relocation section '.rel.data' at offset 0x458 contains 1 entries:
+      Offset             Info             Type               Symbol's Value  Symbol's Name
+  0000000000000000  0000000700000002 R_BPF_64_ABS64         0000000000000000 global
+
+The relocation says the first 8-byte of ``.data`` section should be
+filled with address of ``global`` variable.
+
+With ``llvm-readelf`` output, we can see that dwarf sections have a bunch of
+``R_BPF_64_ABS32`` and ``R_BPF_64_ABS64`` relocations::
+
+  Relocation section '.rel.debug_info' at offset 0x468 contains 13 entries:
+      Offset             Info             Type               Symbol's Value  Symbol's Name
+  0000000000000006  0000000300000003 R_BPF_64_ABS32         0000000000000000 .debug_abbrev
+  000000000000000c  0000000400000003 R_BPF_64_ABS32         0000000000000000 .debug_str
+  0000000000000012  0000000400000003 R_BPF_64_ABS32         0000000000000000 .debug_str
+  0000000000000016  0000000600000003 R_BPF_64_ABS32         0000000000000000 .debug_line
+  000000000000001a  0000000400000003 R_BPF_64_ABS32         0000000000000000 .debug_str
+  000000000000001e  0000000200000002 R_BPF_64_ABS64         0000000000000000 .text
+  000000000000002b  0000000400000003 R_BPF_64_ABS32         0000000000000000 .debug_str
+  0000000000000037  0000000800000002 R_BPF_64_ABS64         0000000000000000 gbl
+  0000000000000040  0000000400000003 R_BPF_64_ABS32         0000000000000000 .debug_str
+  ......
+
+The .BTF/.BTF.ext sections has R_BPF_64_NODYLD32 relocations::
+
+  Relocation section '.rel.BTF' at offset 0x538 contains 1 entries:
+      Offset             Info             Type               Symbol's Value  Symbol's Name
+  0000000000000084  0000000800000004 R_BPF_64_NODYLD32      0000000000000000 gbl
+
+  Relocation section '.rel.BTF.ext' at offset 0x548 contains 2 entries:
+      Offset             Info             Type               Symbol's Value  Symbol's Name
+  000000000000002c  0000000200000004 R_BPF_64_NODYLD32      0000000000000000 .text
+  0000000000000040  0000000200000004 R_BPF_64_NODYLD32      0000000000000000 .text
diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst
index 3353778c30f8..8deec1ca9150 100644
--- a/tools/testing/selftests/bpf/README.rst
+++ b/tools/testing/selftests/bpf/README.rst
@@ -202,3 +202,22 @@ generate valid BTF information for weak variables. Please make sure you use
 Clang that contains the fix.
 
 __ https://reviews.llvm.org/D100362
+
+Clang relocation changes
+========================
+
+Clang 13 patch `clang reloc patch`_  made some changes on relocations such
+that existing relocation types are broken into more types and
+each new type corresponds to only one way to resolve relocation.
+See `kernel llvm reloc`_ for more explanation and some examples.
+Using clang 13 to compile old libbpf which has static linker support,
+there will be a compilation failure::
+
+  libbpf: ELF relo #0 in section #6 has unexpected type 2 in .../bpf_tcp_nogpl.o
+
+Here, ``type 2`` refers to new relocation type ``R_BPF_64_ABS64``.
+To fix this issue, user newer libbpf.
+
+.. Links
+.. _clang reloc patch: https://reviews.llvm.org/D102712
+.. _kernel llvm reloc: /Documentation/bpf/llvm_reloc.rst
-- 
cgit v1.2.3-70-g09d2


From e8e0f0f484780d7b90a63ea50020ac4bb027178d Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Thu, 27 May 2021 22:43:56 -0400
Subject: bpf, devmap: Remove drops variable from bq_xmit_all()

As Colin pointed out, the first drops assignment after declaration will
be overwritten by the second drops assignment before using, which makes
it useless.

Since the drops variable will be used only once. Just remove it and
use "cnt - sent" in trace_xdp_devmap_xmit().

Fixes: cb261b594b41 ("bpf: Run devmap xdp_prog on flush instead of bulk enqueue")
Reported-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20210528024356.24333-1-liuhangbin@gmail.com
---
 kernel/bpf/devmap.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index f9148daab0e3..2a75e6c2d27d 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -370,8 +370,8 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
 static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
 {
 	struct net_device *dev = bq->dev;
-	int sent = 0, drops = 0, err = 0;
 	unsigned int cnt = bq->count;
+	int sent = 0, err = 0;
 	int to_send = cnt;
 	int i;
 
@@ -388,8 +388,6 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
 		to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev);
 		if (!to_send)
 			goto out;
-
-		drops = cnt - to_send;
 	}
 
 	sent = dev->netdev_ops->ndo_xdp_xmit(dev, to_send, bq->q, flags);
@@ -408,9 +406,8 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
 		xdp_return_frame_rx_napi(bq->q[i]);
 
 out:
-	drops = cnt - sent;
 	bq->count = 0;
-	trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err);
+	trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err);
 }
 
 /* __dev_flush is called from xdp_do_flush() which _must_ be signaled
-- 
cgit v1.2.3-70-g09d2


From 05924717ac704a868053652b20036aa3a2273e26 Mon Sep 17 00:00:00 2001
From: Harishankar Vishwanathan <harishankar.vishwanathan@rutgers.edu>
Date: Sun, 30 May 2021 22:01:57 -0400
Subject: bpf, tnums: Provably sound, faster, and more precise algorithm for
 tnum_mul

This patch introduces a new algorithm for multiplication of tristate
numbers (tnums) that is provably sound. It is faster and more precise when
compared to the existing method.

Like the existing method, this new algorithm follows the long
multiplication algorithm. The idea is to generate partial products by
multiplying each bit in the multiplier (tnum a) with the multiplicand
(tnum b), and adding the partial products after appropriately bit-shifting
them. The new algorithm, however, uses just a single loop over the bits of
the multiplier (tnum a) and accumulates only the uncertain components of
the multiplicand (tnum b) into a mask-only tnum. The following paper
explains the algorithm in more detail: https://arxiv.org/abs/2105.05398.

A natural way to construct the tnum product is by performing a tnum
addition on all the partial products. This algorithm presents another
method of doing this: decompose each partial product into two tnums,
consisting of the values and the masks separately. The mask-sum is
accumulated within the loop in acc_m. The value-sum tnum is generated
using a.value * b.value. The tnum constructed by tnum addition of the
value-sum and the mask-sum contains all possible summations of concrete
values drawn from the partial product tnums pairwise. We prove this result
in the paper.

Our evaluations show that the new algorithm is overall more precise
(producing tnums with less uncertain components) than the existing method.
As an illustrative example, consider the input tnums A and B. The numbers
in the parenthesis correspond to (value;mask).

  A                = 000000x1 (1;2)
  B                = 0010011x (38;1)
  A * B (existing) = xxxxxxxx (0;255)
  A * B (new)      = 0x1xxxxx (32;95)

Importantly, we present a proof of soundness of the new algorithm in the
aforementioned paper. Additionally, we show that this new algorithm is
empirically faster than the existing method.

Co-developed-by: Matan Shachnai <m.shachnai@rutgers.edu>
Co-developed-by: Srinivas Narayana <srinivas.narayana@rutgers.edu>
Co-developed-by: Santosh Nagarakatte <santosh.nagarakatte@rutgers.edu>
Signed-off-by: Matan Shachnai <m.shachnai@rutgers.edu>
Signed-off-by: Srinivas Narayana <srinivas.narayana@rutgers.edu>
Signed-off-by: Santosh Nagarakatte <santosh.nagarakatte@rutgers.edu>
Signed-off-by: Harishankar Vishwanathan <harishankar.vishwanathan@rutgers.edu>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Edward Cree <ecree.xilinx@gmail.com>
Link: https://arxiv.org/abs/2105.05398
Link: https://lore.kernel.org/bpf/20210531020157.7386-1-harishankar.vishwanathan@rutgers.edu
---
 kernel/bpf/tnum.c | 41 ++++++++++++++++++++++-------------------
 1 file changed, 22 insertions(+), 19 deletions(-)

diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index ceac5281bd31..3d7127f439a1 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -111,28 +111,31 @@ struct tnum tnum_xor(struct tnum a, struct tnum b)
 	return TNUM(v & ~mu, mu);
 }
 
-/* half-multiply add: acc += (unknown * mask * value).
- * An intermediate step in the multiply algorithm.
+/* Generate partial products by multiplying each bit in the multiplier (tnum a)
+ * with the multiplicand (tnum b), and add the partial products after
+ * appropriately bit-shifting them. Instead of directly performing tnum addition
+ * on the generated partial products, equivalenty, decompose each partial
+ * product into two tnums, consisting of the value-sum (acc_v) and the
+ * mask-sum (acc_m) and then perform tnum addition on them. The following paper
+ * explains the algorithm in more detail: https://arxiv.org/abs/2105.05398.
  */
-static struct tnum hma(struct tnum acc, u64 value, u64 mask)
-{
-	while (mask) {
-		if (mask & 1)
-			acc = tnum_add(acc, TNUM(0, value));
-		mask >>= 1;
-		value <<= 1;
-	}
-	return acc;
-}
-
 struct tnum tnum_mul(struct tnum a, struct tnum b)
 {
-	struct tnum acc;
-	u64 pi;
-
-	pi = a.value * b.value;
-	acc = hma(TNUM(pi, 0), a.mask, b.mask | b.value);
-	return hma(acc, b.mask, a.value);
+	u64 acc_v = a.value * b.value;
+	struct tnum acc_m = TNUM(0, 0);
+
+	while (a.value || a.mask) {
+		/* LSB of tnum a is a certain 1 */
+		if (a.value & 1)
+			acc_m = tnum_add(acc_m, TNUM(0, b.mask));
+		/* LSB of tnum a is uncertain */
+		else if (a.mask & 1)
+			acc_m = tnum_add(acc_m, TNUM(0, b.value | b.mask));
+		/* Note: no case for LSB is certain 0 */
+		a = tnum_rshift(a, 1);
+		b = tnum_lshift(b, 1);
+	}
+	return tnum_add(TNUM(acc_v, 0), acc_m);
 }
 
 /* Note that if a and b disagree - i.e. one has a 'known 1' where the other has
-- 
cgit v1.2.3-70-g09d2


From 16cac0060680c11bb82c325c4fe95cb66fc8dfaf Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 2 Jun 2021 17:40:23 -0700
Subject: libbpf: Move few APIs from 0.4 to 0.5 version

Official libbpf 0.4 release doesn't include three APIs that were tentatively
put into 0.4 section. Fix libbpf.map and move these three APIs:

  - bpf_map__initial_value;
  - bpf_map_lookup_and_delete_elem_flags;
  - bpf_object__gen_loader.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210603004026.2698513-2-andrii@kernel.org
---
 tools/lib/bpf/libbpf.map | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index bbe99b1db1a9..944c99d1ded3 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -359,10 +359,7 @@ LIBBPF_0.4.0 {
 		bpf_linker__finalize;
 		bpf_linker__free;
 		bpf_linker__new;
-		bpf_map__initial_value;
 		bpf_map__inner_map;
-		bpf_map_lookup_and_delete_elem_flags;
-		bpf_object__gen_loader;
 		bpf_object__set_kversion;
 		bpf_tc_attach;
 		bpf_tc_detach;
@@ -373,5 +370,8 @@ LIBBPF_0.4.0 {
 
 LIBBPF_0.5.0 {
 	global:
+		bpf_map__initial_value;
+		bpf_map_lookup_and_delete_elem_flags;
+		bpf_object__gen_loader;
 		libbpf_set_strict_mode;
 } LIBBPF_0.4.0;
-- 
cgit v1.2.3-70-g09d2


From 232c9e8bd5ebfb43563b58f31e685fde06d9441f Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 2 Jun 2021 17:40:24 -0700
Subject: libbpf: Refactor header installation portions of Makefile

As we gradually get more headers that have to be installed, it's quite
annoying to copy/paste long $(call) commands. So extract that logic and do
a simple $(foreach) over the list of headers.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210603004026.2698513-3-andrii@kernel.org
---
 tools/lib/bpf/Makefile | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
index 15420303cf06..d1b909e005dc 100644
--- a/tools/lib/bpf/Makefile
+++ b/tools/lib/bpf/Makefile
@@ -223,19 +223,14 @@ install_lib: all_cmd
 		$(call do_install_mkdir,$(libdir_SQ)); \
 		cp -fpR $(LIB_FILE) $(DESTDIR)$(libdir_SQ)
 
+INSTALL_HEADERS = bpf.h libbpf.h btf.h libbpf_common.h libbpf_legacy.h xsk.h \
+		  bpf_helpers.h $(BPF_HELPER_DEFS) bpf_tracing.h	     \
+		  bpf_endian.h bpf_core_read.h
+
 install_headers: $(BPF_HELPER_DEFS)
-	$(call QUIET_INSTALL, headers) \
-		$(call do_install,bpf.h,$(prefix)/include/bpf,644); \
-		$(call do_install,libbpf.h,$(prefix)/include/bpf,644); \
-		$(call do_install,btf.h,$(prefix)/include/bpf,644); \
-		$(call do_install,libbpf_common.h,$(prefix)/include/bpf,644); \
-		$(call do_install,libbpf_legacy.h,$(prefix)/include/bpf,644); \
-		$(call do_install,xsk.h,$(prefix)/include/bpf,644); \
-		$(call do_install,bpf_helpers.h,$(prefix)/include/bpf,644); \
-		$(call do_install,$(BPF_HELPER_DEFS),$(prefix)/include/bpf,644); \
-		$(call do_install,bpf_tracing.h,$(prefix)/include/bpf,644); \
-		$(call do_install,bpf_endian.h,$(prefix)/include/bpf,644); \
-		$(call do_install,bpf_core_read.h,$(prefix)/include/bpf,644);
+	$(call QUIET_INSTALL, headers)					     \
+		$(foreach hdr,$(INSTALL_HEADERS),			     \
+			$(call do_install,$(hdr),$(prefix)/include/bpf,644);)
 
 install_pkgconfig: $(PC_FILE)
 	$(call QUIET_INSTALL, $(PC_FILE)) \
-- 
cgit v1.2.3-70-g09d2


From 7d8a819dd31672f02ece93b6a9b9491daba4f0f2 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 2 Jun 2021 17:40:25 -0700
Subject: libbpf: Install skel_internal.h header used from light skeletons

Light skeleton code assumes skel_internal.h header to be installed system-wide
by libbpf package. Make sure it is actually installed.

Fixes: 67234743736a ("libbpf: Generate loader program out of BPF ELF file.")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210603004026.2698513-4-andrii@kernel.org
---
 tools/lib/bpf/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
index d1b909e005dc..ec14aa725bb0 100644
--- a/tools/lib/bpf/Makefile
+++ b/tools/lib/bpf/Makefile
@@ -225,7 +225,7 @@ install_lib: all_cmd
 
 INSTALL_HEADERS = bpf.h libbpf.h btf.h libbpf_common.h libbpf_legacy.h xsk.h \
 		  bpf_helpers.h $(BPF_HELPER_DEFS) bpf_tracing.h	     \
-		  bpf_endian.h bpf_core_read.h
+		  bpf_endian.h bpf_core_read.h skel_internal.h
 
 install_headers: $(BPF_HELPER_DEFS)
 	$(call QUIET_INSTALL, headers)					     \
-- 
cgit v1.2.3-70-g09d2


From 56b8b7f9533b5c40cbc1266b5cc6a3b19dfd2aad Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 2 Jun 2021 17:40:26 -0700
Subject: selftests/bpf: Add xdp_redirect_multi into .gitignore

When xdp_redirect_multi test binary was added recently, it wasn't added to
.gitignore. Fix that.

Fixes: d23292476297 ("selftests/bpf: Add xdp_redirect_multi test")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210603004026.2698513-5-andrii@kernel.org
---
 tools/testing/selftests/bpf/.gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index a030aa4a8a9e..a18f57044014 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -38,3 +38,4 @@ test_cpp
 /bench
 *.ko
 xdpxceiver
+xdp_redirect_multi
-- 
cgit v1.2.3-70-g09d2


From 0779890fed7817725f399d6ec85730e08ebfdeee Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker <jean-philippe@linaro.org>
Date: Thu, 3 Jun 2021 19:05:16 +0200
Subject: tools/bpftool: Fix cross-build

When the bootstrap and final bpftool have different architectures, we
need to build two distinct disasm.o objects. Add a recipe for the
bootstrap disasm.o.

After commit d510296d331a ("bpftool: Use syscall/loader program in
"prog load" and "gen skeleton" command.") cross-building bpftool didn't
work anymore, because the bootstrap bpftool was linked using objects
from different architectures:

  $ make O=/tmp/bpftool ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- -C tools/bpf/bpftool/ V=1
  [...]
  aarch64-linux-gnu-gcc ... -c -MMD -o /tmp/bpftool/disasm.o /home/z/src/linux/kernel/bpf/disasm.c
  gcc ... -c -MMD -o /tmp/bpftool//bootstrap/main.o main.c
  gcc ... -o /tmp/bpftool//bootstrap/bpftool /tmp/bpftool//bootstrap/main.o ... /tmp/bpftool/disasm.o
  /usr/bin/ld: /tmp/bpftool/disasm.o: Relocations in generic ELF (EM: 183)
  /usr/bin/ld: /tmp/bpftool/disasm.o: Relocations in generic ELF (EM: 183)
  /usr/bin/ld: /tmp/bpftool/disasm.o: Relocations in generic ELF (EM: 183)
  /usr/bin/ld: /tmp/bpftool/disasm.o: error adding symbols: file in wrong format
  collect2: error: ld returned 1 exit status
  [...]

The final bpftool was built for e.g. arm64, while the bootstrap bpftool,
executed on the host, was built for x86. The problem here was that disasm.o
linked into the bootstrap bpftool was arm64 rather than x86. With the fix
we build two disasm.o, one for the target bpftool in arm64, and one for
the bootstrap bpftool in x86.

Fixes: d510296d331a ("bpftool: Use syscall/loader program in "prog load" and "gen skeleton" command.")
Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210603170515.1854642-1-jean-philippe@linaro.org
---
 tools/bpf/bpftool/Makefile | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
index d16d289ade7a..d73232be1e99 100644
--- a/tools/bpf/bpftool/Makefile
+++ b/tools/bpf/bpftool/Makefile
@@ -136,7 +136,7 @@ endif
 
 BPFTOOL_BOOTSTRAP := $(BOOTSTRAP_OUTPUT)bpftool
 
-BOOTSTRAP_OBJS = $(addprefix $(BOOTSTRAP_OUTPUT),main.o common.o json_writer.o gen.o btf.o xlated_dumper.o btf_dumper.o) $(OUTPUT)disasm.o
+BOOTSTRAP_OBJS = $(addprefix $(BOOTSTRAP_OUTPUT),main.o common.o json_writer.o gen.o btf.o xlated_dumper.o btf_dumper.o disasm.o)
 OBJS = $(patsubst %.c,$(OUTPUT)%.o,$(SRCS)) $(OUTPUT)disasm.o
 
 VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux)				\
@@ -180,6 +180,9 @@ endif
 
 CFLAGS += $(if $(BUILD_BPF_SKELS),,-DBPFTOOL_WITHOUT_SKELETONS)
 
+$(BOOTSTRAP_OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c
+	$(QUIET_CC)$(HOSTCC) $(CFLAGS) -c -MMD -o $@ $<
+
 $(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c
 	$(QUIET_CC)$(CC) $(CFLAGS) -c -MMD -o $@ $<
 
-- 
cgit v1.2.3-70-g09d2


From edc0571c5f67c7e24958149a8ec6a904ca84840b Mon Sep 17 00:00:00 2001
From: Michal Suchanek <msuchanek@suse.de>
Date: Fri, 4 Jun 2021 13:24:48 +0200
Subject: libbpf: Fix pr_warn type warnings on 32bit

The printed value is ptrdiff_t and is formatted wiht %ld. This works on
64bit but produces a warning on 32bit. Fix the format specifier to %td.

Fixes: 67234743736a ("libbpf: Generate loader program out of BPF ELF file.")
Signed-off-by: Michal Suchanek <msuchanek@suse.de>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210604112448.32297-1-msuchanek@suse.de
---
 tools/lib/bpf/libbpf.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 1c4e20e75237..65f87cc1220c 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -4581,7 +4581,7 @@ static int init_map_slots(struct bpf_object *obj, struct bpf_map *map)
 		targ_map = map->init_slots[i];
 		fd = bpf_map__fd(targ_map);
 		if (obj->gen_loader) {
-			pr_warn("// TODO map_update_elem: idx %ld key %d value==map_idx %ld\n",
+			pr_warn("// TODO map_update_elem: idx %td key %d value==map_idx %td\n",
 				map - obj->maps, i, targ_map - obj->maps);
 			return -ENOTSUP;
 		} else {
@@ -6205,7 +6205,7 @@ static int bpf_core_apply_relo(struct bpf_program *prog,
 		return -EINVAL;
 
 	if (prog->obj->gen_loader) {
-		pr_warn("// TODO core_relo: prog %ld insn[%d] %s %s kind %d\n",
+		pr_warn("// TODO core_relo: prog %td insn[%d] %s %s kind %d\n",
 			prog - prog->obj->programs, relo->insn_off / 8,
 			local_name, spec_str, relo->kind);
 		return -ENOTSUP;
-- 
cgit v1.2.3-70-g09d2


From 380afe7208966ab59e5215c9daab3f6b06193d8c Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@cilium.io>
Date: Mon, 7 Jun 2021 18:57:56 -0700
Subject: selftests, bpf: Make docs tests fail more reliably

Previously, if rst2man caught errors, then these would be ignored and
the output file would be written anyway. This would allow developers to
introduce regressions in the docs comments in the BPF headers.

Additionally, even if you instruct rst2man to fail out, it will still
write out to the destination target file, so if you ran the tests twice
in a row it would always pass. Use a temporary file for the initial run
to ensure that if rst2man fails out under "--strict" mode, subsequent
runs will not automatically pass.

Tested via ./tools/testing/selftests/bpf/test_doc_build.sh

Signed-off-by: Joe Stringer <joe@cilium.io>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Quentin Monnet <quentin@isovalent.com>
Link: https://lore.kernel.org/bpf/20210608015756.340385-1-joe@cilium.io
---
 tools/testing/selftests/bpf/.gitignore        | 1 +
 tools/testing/selftests/bpf/Makefile.docs     | 3 ++-
 tools/testing/selftests/bpf/test_doc_build.sh | 1 +
 3 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index a18f57044014..eae6fc7d3ed8 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -37,5 +37,6 @@ test_cpp
 /runqslower
 /bench
 *.ko
+*.tmp
 xdpxceiver
 xdp_redirect_multi
diff --git a/tools/testing/selftests/bpf/Makefile.docs b/tools/testing/selftests/bpf/Makefile.docs
index ccf260021e83..eb6a4fea8c79 100644
--- a/tools/testing/selftests/bpf/Makefile.docs
+++ b/tools/testing/selftests/bpf/Makefile.docs
@@ -52,7 +52,8 @@ $(OUTPUT)%.$2: $(OUTPUT)%.rst
 ifndef RST2MAN_DEP
 	$$(error "rst2man not found, but required to generate man pages")
 endif
-	$$(QUIET_GEN)rst2man $$< > $$@
+	$$(QUIET_GEN)rst2man --exit-status=1 $$< > $$@.tmp
+	$$(QUIET_GEN)mv $$@.tmp $$@
 
 docs-clean-$1:
 	$$(call QUIET_CLEAN, eBPF_$1-manpage)
diff --git a/tools/testing/selftests/bpf/test_doc_build.sh b/tools/testing/selftests/bpf/test_doc_build.sh
index 7eb940a7b2eb..ed12111cd2f0 100755
--- a/tools/testing/selftests/bpf/test_doc_build.sh
+++ b/tools/testing/selftests/bpf/test_doc_build.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
 # SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+set -e
 
 # Assume script is located under tools/testing/selftests/bpf/. We want to start
 # build attempts from the top of kernel repository.
-- 
cgit v1.2.3-70-g09d2


From 3b3af91cb6893967bbec30f5c14562d0f7f00c2a Mon Sep 17 00:00:00 2001
From: Wang Hai <wanghai38@huawei.com>
Date: Wed, 9 Jun 2021 19:56:51 +0800
Subject: libbpf: Simplify the return expression of bpf_object__init_maps
 function

There is no need for special treatment of the 'ret == 0' case.
This patch simplifies the return expression.

Signed-off-by: Wang Hai <wanghai38@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210609115651.3392580-1-wanghai38@huawei.com
---
 tools/lib/bpf/libbpf.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 65f87cc1220c..128715b8794b 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -2465,10 +2465,8 @@ static int bpf_object__init_maps(struct bpf_object *obj,
 	err = err ?: bpf_object__init_global_data_maps(obj);
 	err = err ?: bpf_object__init_kconfig_map(obj);
 	err = err ?: bpf_object__init_struct_ops_maps(obj);
-	if (err)
-		return err;
 
-	return 0;
+	return err;
 }
 
 static bool section_have_execinstr(struct bpf_object *obj, int idx)
-- 
cgit v1.2.3-70-g09d2


From ca16b429f39b4ce013bfa7e197f25681e65a2a42 Mon Sep 17 00:00:00 2001
From: Zhihao Cheng <chengzhihao1@huawei.com>
Date: Wed, 9 Jun 2021 19:59:16 +0800
Subject: tools/bpftool: Fix error return code in do_batch()

Fix to return a negative error code from the error handling
case instead of 0, as done elsewhere in this function.

Fixes: 668da745af3c2 ("tools: bpftool: add support for quotations ...")
Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Quentin Monnet <quentin@isovalent.com>
Link: https://lore.kernel.org/bpf/20210609115916.2186872-1-chengzhihao1@huawei.com
---
 tools/bpf/bpftool/main.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c
index 7f2817d97079..3ddfd4843738 100644
--- a/tools/bpf/bpftool/main.c
+++ b/tools/bpf/bpftool/main.c
@@ -341,8 +341,10 @@ static int do_batch(int argc, char **argv)
 		n_argc = make_args(buf, n_argv, BATCH_ARG_NB_MAX, lines);
 		if (!n_argc)
 			continue;
-		if (n_argc < 0)
+		if (n_argc < 0) {
+			err = n_argc;
 			goto err_close;
+		}
 
 		if (json_output) {
 			jsonw_start_object(json_wtr);
-- 
cgit v1.2.3-70-g09d2


From 4e164f8716853b879e2b1a21a12d54c57f11372e Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sat, 12 Jun 2021 08:05:00 +0530
Subject: libbpf: Remove unneeded check for flags during tc detach

Coverity complained about this being unreachable code. It is right
because we already enforce flags to be unset, so a check validating
the flag value is redundant.

Fixes: 715c5ce454a6 ("libbpf: Add low level TC-BPF management API")
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210612023502.1283837-2-memxor@gmail.com
---
 tools/lib/bpf/netlink.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c
index d743c8721aa7..efbb50ad59d8 100644
--- a/tools/lib/bpf/netlink.c
+++ b/tools/lib/bpf/netlink.c
@@ -675,8 +675,6 @@ static int __bpf_tc_detach(const struct bpf_tc_hook *hook,
 		return -EINVAL;
 	if (priority > UINT16_MAX)
 		return -EINVAL;
-	if (flags & ~BPF_TC_F_REPLACE)
-		return -EINVAL;
 	if (!flush) {
 		if (!handle || !priority)
 			return -EINVAL;
-- 
cgit v1.2.3-70-g09d2


From bbf29d3a2e49e482d5267311798aec42f00e88f3 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sat, 12 Jun 2021 08:05:01 +0530
Subject: libbpf: Set NLM_F_EXCL when creating qdisc

This got lost during the refactoring across versions. We always use
NLM_F_EXCL when creating some TC object, so reflect what the function
says and set the flag.

Fixes: 715c5ce454a6 ("libbpf: Add low level TC-BPF management API")
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210612023502.1283837-3-memxor@gmail.com
---
 tools/lib/bpf/netlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c
index efbb50ad59d8..cf9381f03b16 100644
--- a/tools/lib/bpf/netlink.c
+++ b/tools/lib/bpf/netlink.c
@@ -457,7 +457,7 @@ static int tc_qdisc_modify(struct bpf_tc_hook *hook, int cmd, int flags)
 
 static int tc_qdisc_create_excl(struct bpf_tc_hook *hook)
 {
-	return tc_qdisc_modify(hook, RTM_NEWQDISC, NLM_F_CREATE);
+	return tc_qdisc_modify(hook, RTM_NEWQDISC, NLM_F_CREATE | NLM_F_EXCL);
 }
 
 static int tc_qdisc_delete(struct bpf_tc_hook *hook)
-- 
cgit v1.2.3-70-g09d2


From f9ac779f881c2ec3d1cdcd7fa9d4f9442bf60e80 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Sat, 12 Jun 2021 21:32:14 +0900
Subject: net: Introduce net.ipv4.tcp_migrate_req.

This commit adds a new sysctl option: net.ipv4.tcp_migrate_req. If this
option is enabled or eBPF program is attached, we will be able to migrate
child sockets from a listener to another in the same reuseport group after
close() or shutdown() syscalls.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Benjamin Herrenschmidt <benh@amazon.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20210612123224.12525-2-kuniyu@amazon.co.jp
---
 Documentation/networking/ip-sysctl.rst | 25 +++++++++++++++++++++++++
 include/net/netns/ipv4.h               |  1 +
 net/ipv4/sysctl_net_ipv4.c             |  9 +++++++++
 3 files changed, 35 insertions(+)

diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index a5c250044500..b0436d3a4f11 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -761,6 +761,31 @@ tcp_syncookies - INTEGER
 	network connections you can set this knob to 2 to enable
 	unconditionally generation of syncookies.
 
+tcp_migrate_req - BOOLEAN
+	The incoming connection is tied to a specific listening socket when
+	the initial SYN packet is received during the three-way handshake.
+	When a listener is closed, in-flight request sockets during the
+	handshake and established sockets in the accept queue are aborted.
+
+	If the listener has SO_REUSEPORT enabled, other listeners on the
+	same port should have been able to accept such connections. This
+	option makes it possible to migrate such child sockets to another
+	listener after close() or shutdown().
+
+	The BPF_SK_REUSEPORT_SELECT_OR_MIGRATE type of eBPF program should
+	usually be used to define the policy to pick an alive listener.
+	Otherwise, the kernel will randomly pick an alive listener only if
+	this option is enabled.
+
+	Note that migration between listeners with different settings may
+	crash applications. Let's say migration happens from listener A to
+	B, and only B has TCP_SAVE_SYN enabled. B cannot read SYN data from
+	the requests migrated from A. To avoid such a situation, cancel
+	migration by returning SK_DROP in the type of eBPF program, or
+	disable this option.
+
+	Default: 0
+
 tcp_fastopen - INTEGER
 	Enable TCP Fast Open (RFC7413) to send and accept data in the opening
 	SYN packet.
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 746c80cd4257..b8620519eace 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -126,6 +126,7 @@ struct netns_ipv4 {
 	u8 sysctl_tcp_syn_retries;
 	u8 sysctl_tcp_synack_retries;
 	u8 sysctl_tcp_syncookies;
+	u8 sysctl_tcp_migrate_req;
 	int sysctl_tcp_reordering;
 	u8 sysctl_tcp_retries1;
 	u8 sysctl_tcp_retries2;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 4fa77f182dcb..6f1e64d49232 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -960,6 +960,15 @@ static struct ctl_table ipv4_net_table[] = {
 		.proc_handler	= proc_dou8vec_minmax,
 	},
 #endif
+	{
+		.procname	= "tcp_migrate_req",
+		.data		= &init_net.ipv4.sysctl_tcp_migrate_req,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE
+	},
 	{
 		.procname	= "tcp_reordering",
 		.data		= &init_net.ipv4.sysctl_tcp_reordering,
-- 
cgit v1.2.3-70-g09d2


From 5c040eaf5d1753aafe12989ca712175df0b9c436 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Sat, 12 Jun 2021 21:32:15 +0900
Subject: tcp: Add num_closed_socks to struct sock_reuseport.

As noted in the following commit, a closed listener has to hold the
reference to the reuseport group for socket migration. This patch adds a
field (num_closed_socks) to struct sock_reuseport to manage closed sockets
within the same reuseport group. Moreover, this and the following commits
introduce some helper functions to split socks[] into two sections and keep
TCP_LISTEN and TCP_CLOSE sockets in each section. Like a double-ended
queue, we will place TCP_LISTEN sockets from the front and TCP_CLOSE
sockets from the end.

  TCP_LISTEN---------->       <-------TCP_CLOSE
  +---+---+  ---  +---+  ---  +---+  ---  +---+
  | 0 | 1 |  ...  | i |  ...  | j |  ...  | k |
  +---+---+  ---  +---+  ---  +---+  ---  +---+

  i = num_socks - 1
  j = max_socks - num_closed_socks
  k = max_socks - 1

This patch also extends reuseport_add_sock() and reuseport_grow() to
support num_closed_socks.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20210612123224.12525-3-kuniyu@amazon.co.jp
---
 include/net/sock_reuseport.h |  5 +--
 net/core/sock_reuseport.c    | 75 +++++++++++++++++++++++++++++++++-----------
 2 files changed, 60 insertions(+), 20 deletions(-)

diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
index 505f1e18e9bf..0e558ca7afbf 100644
--- a/include/net/sock_reuseport.h
+++ b/include/net/sock_reuseport.h
@@ -13,8 +13,9 @@ extern spinlock_t reuseport_lock;
 struct sock_reuseport {
 	struct rcu_head		rcu;
 
-	u16			max_socks;	/* length of socks */
-	u16			num_socks;	/* elements in socks */
+	u16			max_socks;		/* length of socks */
+	u16			num_socks;		/* elements in socks */
+	u16			num_closed_socks;	/* closed elements in socks */
 	/* The last synq overflow event timestamp of this
 	 * reuse->socks[] group.
 	 */
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index b065f0a103ed..f478c65a281b 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -18,6 +18,49 @@ DEFINE_SPINLOCK(reuseport_lock);
 
 static DEFINE_IDA(reuseport_ida);
 
+static int reuseport_sock_index(struct sock *sk,
+				const struct sock_reuseport *reuse,
+				bool closed)
+{
+	int left, right;
+
+	if (!closed) {
+		left = 0;
+		right = reuse->num_socks;
+	} else {
+		left = reuse->max_socks - reuse->num_closed_socks;
+		right = reuse->max_socks;
+	}
+
+	for (; left < right; left++)
+		if (reuse->socks[left] == sk)
+			return left;
+	return -1;
+}
+
+static void __reuseport_add_sock(struct sock *sk,
+				 struct sock_reuseport *reuse)
+{
+	reuse->socks[reuse->num_socks] = sk;
+	/* paired with smp_rmb() in reuseport_select_sock() */
+	smp_wmb();
+	reuse->num_socks++;
+}
+
+static bool __reuseport_detach_sock(struct sock *sk,
+				    struct sock_reuseport *reuse)
+{
+	int i = reuseport_sock_index(sk, reuse, false);
+
+	if (i == -1)
+		return false;
+
+	reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
+	reuse->num_socks--;
+
+	return true;
+}
+
 static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
 {
 	unsigned int size = sizeof(struct sock_reuseport) +
@@ -72,9 +115,9 @@ int reuseport_alloc(struct sock *sk, bool bind_inany)
 	}
 
 	reuse->reuseport_id = id;
+	reuse->bind_inany = bind_inany;
 	reuse->socks[0] = sk;
 	reuse->num_socks = 1;
-	reuse->bind_inany = bind_inany;
 	rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
 
 out:
@@ -98,6 +141,7 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
 		return NULL;
 
 	more_reuse->num_socks = reuse->num_socks;
+	more_reuse->num_closed_socks = reuse->num_closed_socks;
 	more_reuse->prog = reuse->prog;
 	more_reuse->reuseport_id = reuse->reuseport_id;
 	more_reuse->bind_inany = reuse->bind_inany;
@@ -105,9 +149,13 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
 
 	memcpy(more_reuse->socks, reuse->socks,
 	       reuse->num_socks * sizeof(struct sock *));
+	memcpy(more_reuse->socks +
+	       (more_reuse->max_socks - more_reuse->num_closed_socks),
+	       reuse->socks + (reuse->max_socks - reuse->num_closed_socks),
+	       reuse->num_closed_socks * sizeof(struct sock *));
 	more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts);
 
-	for (i = 0; i < reuse->num_socks; ++i)
+	for (i = 0; i < reuse->max_socks; ++i)
 		rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
 				   more_reuse);
 
@@ -158,7 +206,7 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
 		return -EBUSY;
 	}
 
-	if (reuse->num_socks == reuse->max_socks) {
+	if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) {
 		reuse = reuseport_grow(reuse);
 		if (!reuse) {
 			spin_unlock_bh(&reuseport_lock);
@@ -166,10 +214,7 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
 		}
 	}
 
-	reuse->socks[reuse->num_socks] = sk;
-	/* paired with smp_rmb() in reuseport_select_sock() */
-	smp_wmb();
-	reuse->num_socks++;
+	__reuseport_add_sock(sk, reuse);
 	rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
 
 	spin_unlock_bh(&reuseport_lock);
@@ -183,7 +228,6 @@ EXPORT_SYMBOL(reuseport_add_sock);
 void reuseport_detach_sock(struct sock *sk)
 {
 	struct sock_reuseport *reuse;
-	int i;
 
 	spin_lock_bh(&reuseport_lock);
 	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
@@ -200,16 +244,11 @@ void reuseport_detach_sock(struct sock *sk)
 	bpf_sk_reuseport_detach(sk);
 
 	rcu_assign_pointer(sk->sk_reuseport_cb, NULL);
+	__reuseport_detach_sock(sk, reuse);
+
+	if (reuse->num_socks + reuse->num_closed_socks == 0)
+		call_rcu(&reuse->rcu, reuseport_free_rcu);
 
-	for (i = 0; i < reuse->num_socks; i++) {
-		if (reuse->socks[i] == sk) {
-			reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
-			reuse->num_socks--;
-			if (reuse->num_socks == 0)
-				call_rcu(&reuse->rcu, reuseport_free_rcu);
-			break;
-		}
-	}
 	spin_unlock_bh(&reuseport_lock);
 }
 EXPORT_SYMBOL(reuseport_detach_sock);
@@ -274,7 +313,7 @@ struct sock *reuseport_select_sock(struct sock *sk,
 	prog = rcu_dereference(reuse->prog);
 	socks = READ_ONCE(reuse->num_socks);
 	if (likely(socks)) {
-		/* paired with smp_wmb() in reuseport_add_sock() */
+		/* paired with smp_wmb() in __reuseport_add_sock() */
 		smp_rmb();
 
 		if (!prog || !skb)
-- 
cgit v1.2.3-70-g09d2


From 333bb73f620e1a5f2e0b8df2c0d25300fab36d89 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Sat, 12 Jun 2021 21:32:16 +0900
Subject: tcp: Keep TCP_CLOSE sockets in the reuseport group.

When we close a listening socket, to migrate its connections to another
listener in the same reuseport group, we have to handle two kinds of child
sockets. One is that a listening socket has a reference to, and the other
is not.

The former is the TCP_ESTABLISHED/TCP_SYN_RECV sockets, and they are in the
accept queue of their listening socket. So we can pop them out and push
them into another listener's queue at close() or shutdown() syscalls. On
the other hand, the latter, the TCP_NEW_SYN_RECV socket is during the
three-way handshake and not in the accept queue. Thus, we cannot access
such sockets at close() or shutdown() syscalls. Accordingly, we have to
migrate immature sockets after their listening socket has been closed.

Currently, if their listening socket has been closed, TCP_NEW_SYN_RECV
sockets are freed at receiving the final ACK or retransmitting SYN+ACKs. At
that time, if we could select a new listener from the same reuseport group,
no connection would be aborted. However, we cannot do that because
reuseport_detach_sock() sets NULL to sk_reuseport_cb and forbids access to
the reuseport group from closed sockets.

This patch allows TCP_CLOSE sockets to remain in the reuseport group and
access it while any child socket references them. The point is that
reuseport_detach_sock() was called twice from inet_unhash() and
sk_destruct(). This patch replaces the first reuseport_detach_sock() with
reuseport_stop_listen_sock(), which checks if the reuseport group is
capable of migration. If capable, it decrements num_socks, moves the socket
backwards in socks[] and increments num_closed_socks. When all connections
are migrated, sk_destruct() calls reuseport_detach_sock() to remove the
socket from socks[], decrement num_closed_socks, and set NULL to
sk_reuseport_cb.

By this change, closed or shutdowned sockets can keep sk_reuseport_cb.
Consequently, calling listen() after shutdown() can cause EADDRINUSE or
EBUSY in inet_csk_bind_conflict() or reuseport_add_sock() which expects
such sockets not to have the reuseport group. Therefore, this patch also
loosens such validation rules so that a socket can listen again if it has a
reuseport group with num_closed_socks more than 0.

When such sockets listen again, we handle them in reuseport_resurrect(). If
there is an existing reuseport group (reuseport_add_sock() path), we move
the socket from the old group to the new one and free the old one if
necessary. If there is no existing group (reuseport_alloc() path), we
allocate a new reuseport group, detach sk from the old one, and free it if
necessary, not to break the current shutdown behaviour:

  - we cannot carry over the eBPF prog of shutdowned sockets
  - we cannot attach/detach an eBPF prog to/from listening sockets via
    shutdowned sockets

Note that when the number of sockets gets over U16_MAX, we try to detach a
closed socket randomly to make room for the new listening socket in
reuseport_grow().

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/bpf/20210612123224.12525-4-kuniyu@amazon.co.jp
---
 include/net/sock_reuseport.h    |   1 +
 net/core/sock_reuseport.c       | 182 ++++++++++++++++++++++++++++++++++++++--
 net/ipv4/inet_connection_sock.c |  12 ++-
 net/ipv4/inet_hashtables.c      |   2 +-
 4 files changed, 186 insertions(+), 11 deletions(-)

diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
index 0e558ca7afbf..1333d0cddfbc 100644
--- a/include/net/sock_reuseport.h
+++ b/include/net/sock_reuseport.h
@@ -32,6 +32,7 @@ extern int reuseport_alloc(struct sock *sk, bool bind_inany);
 extern int reuseport_add_sock(struct sock *sk, struct sock *sk2,
 			      bool bind_inany);
 extern void reuseport_detach_sock(struct sock *sk);
+void reuseport_stop_listen_sock(struct sock *sk);
 extern struct sock *reuseport_select_sock(struct sock *sk,
 					  u32 hash,
 					  struct sk_buff *skb,
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index f478c65a281b..41fcd55ab5ae 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -17,6 +17,8 @@
 DEFINE_SPINLOCK(reuseport_lock);
 
 static DEFINE_IDA(reuseport_ida);
+static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
+			       struct sock_reuseport *reuse, bool bind_inany);
 
 static int reuseport_sock_index(struct sock *sk,
 				const struct sock_reuseport *reuse,
@@ -61,6 +63,29 @@ static bool __reuseport_detach_sock(struct sock *sk,
 	return true;
 }
 
+static void __reuseport_add_closed_sock(struct sock *sk,
+					struct sock_reuseport *reuse)
+{
+	reuse->socks[reuse->max_socks - reuse->num_closed_socks - 1] = sk;
+	/* paired with READ_ONCE() in inet_csk_bind_conflict() */
+	WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks + 1);
+}
+
+static bool __reuseport_detach_closed_sock(struct sock *sk,
+					   struct sock_reuseport *reuse)
+{
+	int i = reuseport_sock_index(sk, reuse, true);
+
+	if (i == -1)
+		return false;
+
+	reuse->socks[i] = reuse->socks[reuse->max_socks - reuse->num_closed_socks];
+	/* paired with READ_ONCE() in inet_csk_bind_conflict() */
+	WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks - 1);
+
+	return true;
+}
+
 static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
 {
 	unsigned int size = sizeof(struct sock_reuseport) +
@@ -92,6 +117,12 @@ int reuseport_alloc(struct sock *sk, bool bind_inany)
 	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
 					  lockdep_is_held(&reuseport_lock));
 	if (reuse) {
+		if (reuse->num_closed_socks) {
+			/* sk was shutdown()ed before */
+			ret = reuseport_resurrect(sk, reuse, NULL, bind_inany);
+			goto out;
+		}
+
 		/* Only set reuse->bind_inany if the bind_inany is true.
 		 * Otherwise, it will overwrite the reuse->bind_inany
 		 * which was set by the bind/hash path.
@@ -133,8 +164,23 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
 	u32 more_socks_size, i;
 
 	more_socks_size = reuse->max_socks * 2U;
-	if (more_socks_size > U16_MAX)
+	if (more_socks_size > U16_MAX) {
+		if (reuse->num_closed_socks) {
+			/* Make room by removing a closed sk.
+			 * The child has already been migrated.
+			 * Only reqsk left at this point.
+			 */
+			struct sock *sk;
+
+			sk = reuse->socks[reuse->max_socks - reuse->num_closed_socks];
+			RCU_INIT_POINTER(sk->sk_reuseport_cb, NULL);
+			__reuseport_detach_closed_sock(sk, reuse);
+
+			return reuse;
+		}
+
 		return NULL;
+	}
 
 	more_reuse = __reuseport_alloc(more_socks_size);
 	if (!more_reuse)
@@ -200,7 +246,15 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
 	reuse = rcu_dereference_protected(sk2->sk_reuseport_cb,
 					  lockdep_is_held(&reuseport_lock));
 	old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
-					     lockdep_is_held(&reuseport_lock));
+					      lockdep_is_held(&reuseport_lock));
+	if (old_reuse && old_reuse->num_closed_socks) {
+		/* sk was shutdown()ed before */
+		int err = reuseport_resurrect(sk, old_reuse, reuse, reuse->bind_inany);
+
+		spin_unlock_bh(&reuseport_lock);
+		return err;
+	}
+
 	if (old_reuse && old_reuse->num_socks != 1) {
 		spin_unlock_bh(&reuseport_lock);
 		return -EBUSY;
@@ -225,6 +279,65 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
 }
 EXPORT_SYMBOL(reuseport_add_sock);
 
+static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
+			       struct sock_reuseport *reuse, bool bind_inany)
+{
+	if (old_reuse == reuse) {
+		/* If sk was in the same reuseport group, just pop sk out of
+		 * the closed section and push sk into the listening section.
+		 */
+		__reuseport_detach_closed_sock(sk, old_reuse);
+		__reuseport_add_sock(sk, old_reuse);
+		return 0;
+	}
+
+	if (!reuse) {
+		/* In bind()/listen() path, we cannot carry over the eBPF prog
+		 * for the shutdown()ed socket. In setsockopt() path, we should
+		 * not change the eBPF prog of listening sockets by attaching a
+		 * prog to the shutdown()ed socket. Thus, we will allocate a new
+		 * reuseport group and detach sk from the old group.
+		 */
+		int id;
+
+		reuse = __reuseport_alloc(INIT_SOCKS);
+		if (!reuse)
+			return -ENOMEM;
+
+		id = ida_alloc(&reuseport_ida, GFP_ATOMIC);
+		if (id < 0) {
+			kfree(reuse);
+			return id;
+		}
+
+		reuse->reuseport_id = id;
+		reuse->bind_inany = bind_inany;
+	} else {
+		/* Move sk from the old group to the new one if
+		 * - all the other listeners in the old group were close()d or
+		 *   shutdown()ed, and then sk2 has listen()ed on the same port
+		 * OR
+		 * - sk listen()ed without bind() (or with autobind), was
+		 *   shutdown()ed, and then listen()s on another port which
+		 *   sk2 listen()s on.
+		 */
+		if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) {
+			reuse = reuseport_grow(reuse);
+			if (!reuse)
+				return -ENOMEM;
+		}
+	}
+
+	__reuseport_detach_closed_sock(sk, old_reuse);
+	__reuseport_add_sock(sk, reuse);
+	rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
+
+	if (old_reuse->num_socks + old_reuse->num_closed_socks == 0)
+		call_rcu(&old_reuse->rcu, reuseport_free_rcu);
+
+	return 0;
+}
+
 void reuseport_detach_sock(struct sock *sk)
 {
 	struct sock_reuseport *reuse;
@@ -233,6 +346,10 @@ void reuseport_detach_sock(struct sock *sk)
 	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
 					  lockdep_is_held(&reuseport_lock));
 
+	/* reuseport_grow() has detached a closed sk */
+	if (!reuse)
+		goto out;
+
 	/* Notify the bpf side. The sk may be added to a sockarray
 	 * map. If so, sockarray logic will remove it from the map.
 	 *
@@ -244,15 +361,49 @@ void reuseport_detach_sock(struct sock *sk)
 	bpf_sk_reuseport_detach(sk);
 
 	rcu_assign_pointer(sk->sk_reuseport_cb, NULL);
-	__reuseport_detach_sock(sk, reuse);
+
+	if (!__reuseport_detach_closed_sock(sk, reuse))
+		__reuseport_detach_sock(sk, reuse);
 
 	if (reuse->num_socks + reuse->num_closed_socks == 0)
 		call_rcu(&reuse->rcu, reuseport_free_rcu);
 
+out:
 	spin_unlock_bh(&reuseport_lock);
 }
 EXPORT_SYMBOL(reuseport_detach_sock);
 
+void reuseport_stop_listen_sock(struct sock *sk)
+{
+	if (sk->sk_protocol == IPPROTO_TCP) {
+		struct sock_reuseport *reuse;
+
+		spin_lock_bh(&reuseport_lock);
+
+		reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+						  lockdep_is_held(&reuseport_lock));
+
+		if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req) {
+			/* Migration capable, move sk from the listening section
+			 * to the closed section.
+			 */
+			bpf_sk_reuseport_detach(sk);
+
+			__reuseport_detach_sock(sk, reuse);
+			__reuseport_add_closed_sock(sk, reuse);
+
+			spin_unlock_bh(&reuseport_lock);
+			return;
+		}
+
+		spin_unlock_bh(&reuseport_lock);
+	}
+
+	/* Not capable to do migration, detach immediately */
+	reuseport_detach_sock(sk);
+}
+EXPORT_SYMBOL(reuseport_stop_listen_sock);
+
 static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks,
 				   struct bpf_prog *prog, struct sk_buff *skb,
 				   int hdr_len)
@@ -352,9 +503,13 @@ int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
 	struct sock_reuseport *reuse;
 	struct bpf_prog *old_prog;
 
-	if (sk_unhashed(sk) && sk->sk_reuseport) {
-		int err = reuseport_alloc(sk, false);
+	if (sk_unhashed(sk)) {
+		int err;
 
+		if (!sk->sk_reuseport)
+			return -EINVAL;
+
+		err = reuseport_alloc(sk, false);
 		if (err)
 			return err;
 	} else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
@@ -380,13 +535,24 @@ int reuseport_detach_prog(struct sock *sk)
 	struct sock_reuseport *reuse;
 	struct bpf_prog *old_prog;
 
-	if (!rcu_access_pointer(sk->sk_reuseport_cb))
-		return sk->sk_reuseport ? -ENOENT : -EINVAL;
-
 	old_prog = NULL;
 	spin_lock_bh(&reuseport_lock);
 	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
 					  lockdep_is_held(&reuseport_lock));
+
+	/* reuse must be checked after acquiring the reuseport_lock
+	 * because reuseport_grow() can detach a closed sk.
+	 */
+	if (!reuse) {
+		spin_unlock_bh(&reuseport_lock);
+		return sk->sk_reuseport ? -ENOENT : -EINVAL;
+	}
+
+	if (sk_unhashed(sk) && reuse->num_closed_socks) {
+		spin_unlock_bh(&reuseport_lock);
+		return -ENOENT;
+	}
+
 	old_prog = rcu_replace_pointer(reuse->prog, old_prog,
 				       lockdep_is_held(&reuseport_lock));
 	spin_unlock_bh(&reuseport_lock);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index fd472eae4f5c..fa806e9167ec 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -135,10 +135,18 @@ static int inet_csk_bind_conflict(const struct sock *sk,
 				  bool relax, bool reuseport_ok)
 {
 	struct sock *sk2;
+	bool reuseport_cb_ok;
 	bool reuse = sk->sk_reuse;
 	bool reuseport = !!sk->sk_reuseport;
+	struct sock_reuseport *reuseport_cb;
 	kuid_t uid = sock_i_uid((struct sock *)sk);
 
+	rcu_read_lock();
+	reuseport_cb = rcu_dereference(sk->sk_reuseport_cb);
+	/* paired with WRITE_ONCE() in __reuseport_(add|detach)_closed_sock */
+	reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks);
+	rcu_read_unlock();
+
 	/*
 	 * Unlike other sk lookup places we do not check
 	 * for sk_net here, since _all_ the socks listed
@@ -156,14 +164,14 @@ static int inet_csk_bind_conflict(const struct sock *sk,
 				if ((!relax ||
 				     (!reuseport_ok &&
 				      reuseport && sk2->sk_reuseport &&
-				      !rcu_access_pointer(sk->sk_reuseport_cb) &&
+				      reuseport_cb_ok &&
 				      (sk2->sk_state == TCP_TIME_WAIT ||
 				       uid_eq(uid, sock_i_uid(sk2))))) &&
 				    inet_rcv_saddr_equal(sk, sk2, true))
 					break;
 			} else if (!reuseport_ok ||
 				   !reuseport || !sk2->sk_reuseport ||
-				   rcu_access_pointer(sk->sk_reuseport_cb) ||
+				   !reuseport_cb_ok ||
 				   (sk2->sk_state != TCP_TIME_WAIT &&
 				    !uid_eq(uid, sock_i_uid(sk2)))) {
 				if (inet_rcv_saddr_equal(sk, sk2, true))
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index c96866a53a66..80aeaf9e6e16 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -697,7 +697,7 @@ void inet_unhash(struct sock *sk)
 		goto unlock;
 
 	if (rcu_access_pointer(sk->sk_reuseport_cb))
-		reuseport_detach_sock(sk);
+		reuseport_stop_listen_sock(sk);
 	if (ilb) {
 		inet_unhash2(hashinfo, sk);
 		ilb->count--;
-- 
cgit v1.2.3-70-g09d2


From 1cd62c21572c1df6e7090ea4cabf4cf509616dbb Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Sat, 12 Jun 2021 21:32:17 +0900
Subject: tcp: Add reuseport_migrate_sock() to select a new listener.

reuseport_migrate_sock() does the same check done in
reuseport_listen_stop_sock(). If the reuseport group is capable of
migration, reuseport_migrate_sock() selects a new listener by the child
socket hash and increments the listener's sk_refcnt beforehand. Thus, if we
fail in the migration, we have to decrement it later.

We will support migration by eBPF in the later commits.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/bpf/20210612123224.12525-5-kuniyu@amazon.co.jp
---
 include/net/sock_reuseport.h |  3 ++
 net/core/sock_reuseport.c    | 78 ++++++++++++++++++++++++++++++++++++--------
 2 files changed, 67 insertions(+), 14 deletions(-)

diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
index 1333d0cddfbc..473b0b0fa4ab 100644
--- a/include/net/sock_reuseport.h
+++ b/include/net/sock_reuseport.h
@@ -37,6 +37,9 @@ extern struct sock *reuseport_select_sock(struct sock *sk,
 					  u32 hash,
 					  struct sk_buff *skb,
 					  int hdr_len);
+struct sock *reuseport_migrate_sock(struct sock *sk,
+				    struct sock *migrating_sk,
+				    struct sk_buff *skb);
 extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog);
 extern int reuseport_detach_prog(struct sock *sk);
 
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 41fcd55ab5ae..b239f8cd9d39 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -44,7 +44,7 @@ static void __reuseport_add_sock(struct sock *sk,
 				 struct sock_reuseport *reuse)
 {
 	reuse->socks[reuse->num_socks] = sk;
-	/* paired with smp_rmb() in reuseport_select_sock() */
+	/* paired with smp_rmb() in reuseport_(select|migrate)_sock() */
 	smp_wmb();
 	reuse->num_socks++;
 }
@@ -434,6 +434,23 @@ static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks,
 	return reuse->socks[index];
 }
 
+static struct sock *reuseport_select_sock_by_hash(struct sock_reuseport *reuse,
+						  u32 hash, u16 num_socks)
+{
+	int i, j;
+
+	i = j = reciprocal_scale(hash, num_socks);
+	while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) {
+		i++;
+		if (i >= num_socks)
+			i = 0;
+		if (i == j)
+			return NULL;
+	}
+
+	return reuse->socks[i];
+}
+
 /**
  *  reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
  *  @sk: First socket in the group.
@@ -477,19 +494,8 @@ struct sock *reuseport_select_sock(struct sock *sk,
 
 select_by_hash:
 		/* no bpf or invalid bpf result: fall back to hash usage */
-		if (!sk2) {
-			int i, j;
-
-			i = j = reciprocal_scale(hash, socks);
-			while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) {
-				i++;
-				if (i >= socks)
-					i = 0;
-				if (i == j)
-					goto out;
-			}
-			sk2 = reuse->socks[i];
-		}
+		if (!sk2)
+			sk2 = reuseport_select_sock_by_hash(reuse, hash, socks);
 	}
 
 out:
@@ -498,6 +504,50 @@ out:
 }
 EXPORT_SYMBOL(reuseport_select_sock);
 
+/**
+ *  reuseport_migrate_sock - Select a socket from an SO_REUSEPORT group.
+ *  @sk: close()ed or shutdown()ed socket in the group.
+ *  @migrating_sk: ESTABLISHED/SYN_RECV full socket in the accept queue or
+ *    NEW_SYN_RECV request socket during 3WHS.
+ *  @skb: skb to run through BPF filter.
+ *  Returns a socket (with sk_refcnt +1) that should accept the child socket
+ *  (or NULL on error).
+ */
+struct sock *reuseport_migrate_sock(struct sock *sk,
+				    struct sock *migrating_sk,
+				    struct sk_buff *skb)
+{
+	struct sock_reuseport *reuse;
+	struct sock *nsk = NULL;
+	u16 socks;
+	u32 hash;
+
+	rcu_read_lock();
+
+	reuse = rcu_dereference(sk->sk_reuseport_cb);
+	if (!reuse)
+		goto out;
+
+	socks = READ_ONCE(reuse->num_socks);
+	if (unlikely(!socks))
+		goto out;
+
+	/* paired with smp_wmb() in __reuseport_add_sock() */
+	smp_rmb();
+
+	hash = migrating_sk->sk_hash;
+	if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req)
+		nsk = reuseport_select_sock_by_hash(reuse, hash, socks);
+
+	if (nsk && unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt)))
+		nsk = NULL;
+
+out:
+	rcu_read_unlock();
+	return nsk;
+}
+EXPORT_SYMBOL(reuseport_migrate_sock);
+
 int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
 {
 	struct sock_reuseport *reuse;
-- 
cgit v1.2.3-70-g09d2


From 54b92e84193749c9968aff2dd46e3b0f42643e18 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Sat, 12 Jun 2021 21:32:18 +0900
Subject: tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

When we call close() or shutdown() for listening sockets, each child socket
in the accept queue are freed at inet_csk_listen_stop(). If we can get a
new listener by reuseport_migrate_sock() and clone the request by
inet_reqsk_clone(), we try to add it into the new listener's accept queue
by inet_csk_reqsk_queue_add(). If it fails, we have to call __reqsk_free()
to call sock_put() for its listener and free the cloned request.

After putting the full socket into ehash, tcp_v[46]_syn_recv_sock() sets
NULL to ireq_opt/pktopts in struct inet_request_sock, but ipv6_opt can be
non-NULL. So, we have to set NULL to ipv6_opt of the old request to avoid
double free.

Note that we do not update req->rsk_listener and instead clone the req to
migrate because another path may reference the original request. If we
protected it by RCU, we would need to add rcu_read_lock() in many places.

Suggested-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/netdev/20201209030903.hhow5r53l6fmozjn@kafai-mbp.dhcp.thefacebook.com/
Link: https://lore.kernel.org/bpf/20210612123224.12525-6-kuniyu@amazon.co.jp
---
 net/ipv4/inet_connection_sock.c | 70 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 69 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index fa806e9167ec..08878ef1bc70 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -695,6 +695,52 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req)
 }
 EXPORT_SYMBOL(inet_rtx_syn_ack);
 
+static struct request_sock *inet_reqsk_clone(struct request_sock *req,
+					     struct sock *sk)
+{
+	struct sock *req_sk, *nreq_sk;
+	struct request_sock *nreq;
+
+	nreq = kmem_cache_alloc(req->rsk_ops->slab, GFP_ATOMIC | __GFP_NOWARN);
+	if (!nreq) {
+		/* paired with refcount_inc_not_zero() in reuseport_migrate_sock() */
+		sock_put(sk);
+		return NULL;
+	}
+
+	req_sk = req_to_sk(req);
+	nreq_sk = req_to_sk(nreq);
+
+	memcpy(nreq_sk, req_sk,
+	       offsetof(struct sock, sk_dontcopy_begin));
+	memcpy(&nreq_sk->sk_dontcopy_end, &req_sk->sk_dontcopy_end,
+	       req->rsk_ops->obj_size - offsetof(struct sock, sk_dontcopy_end));
+
+	sk_node_init(&nreq_sk->sk_node);
+	nreq_sk->sk_tx_queue_mapping = req_sk->sk_tx_queue_mapping;
+#ifdef CONFIG_XPS
+	nreq_sk->sk_rx_queue_mapping = req_sk->sk_rx_queue_mapping;
+#endif
+	nreq_sk->sk_incoming_cpu = req_sk->sk_incoming_cpu;
+
+	nreq->rsk_listener = sk;
+
+	/* We need not acquire fastopenq->lock
+	 * because the child socket is locked in inet_csk_listen_stop().
+	 */
+	if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(nreq)->tfo_listener)
+		rcu_assign_pointer(tcp_sk(nreq->sk)->fastopen_rsk, nreq);
+
+	return nreq;
+}
+
+static void reqsk_migrate_reset(struct request_sock *req)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	inet_rsk(req)->ipv6_opt = NULL;
+#endif
+}
+
 /* return true if req was found in the ehash table */
 static bool reqsk_queue_unlink(struct request_sock *req)
 {
@@ -1036,14 +1082,36 @@ void inet_csk_listen_stop(struct sock *sk)
 	 * of the variants now.			--ANK
 	 */
 	while ((req = reqsk_queue_remove(queue, sk)) != NULL) {
-		struct sock *child = req->sk;
+		struct sock *child = req->sk, *nsk;
+		struct request_sock *nreq;
 
 		local_bh_disable();
 		bh_lock_sock(child);
 		WARN_ON(sock_owned_by_user(child));
 		sock_hold(child);
 
+		nsk = reuseport_migrate_sock(sk, child, NULL);
+		if (nsk) {
+			nreq = inet_reqsk_clone(req, nsk);
+			if (nreq) {
+				refcount_set(&nreq->rsk_refcnt, 1);
+
+				if (inet_csk_reqsk_queue_add(nsk, nreq, child)) {
+					reqsk_migrate_reset(req);
+				} else {
+					reqsk_migrate_reset(nreq);
+					__reqsk_free(nreq);
+				}
+
+				/* inet_csk_reqsk_queue_add() has already
+				 * called inet_child_forget() on failure case.
+				 */
+				goto skip_child_forget;
+			}
+		}
+
 		inet_child_forget(sk, req, child);
+skip_child_forget:
 		reqsk_put(req);
 		bh_unlock_sock(child);
 		local_bh_enable();
-- 
cgit v1.2.3-70-g09d2


From c905dee62232db583b50fe214080b98db623151e Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Sat, 12 Jun 2021 21:32:19 +0900
Subject: tcp: Migrate TCP_NEW_SYN_RECV requests at retransmitting SYN+ACKs.

As with the preceding patch, this patch changes reqsk_timer_handler() to
call reuseport_migrate_sock() and inet_reqsk_clone() to migrate in-flight
requests at retransmitting SYN+ACKs. If we can select a new listener and
clone the request, we resume setting the SYN+ACK timer for the new req. If
we can set the timer, we call inet_ehash_insert() to unhash the old req and
put the new req into ehash.

The noteworthy point here is that by unhashing the old req, another CPU
processing it may lose the "own_req" race in tcp_v[46]_syn_recv_sock() and
drop the final ACK packet. However, the new timer will recover this
situation.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20210612123224.12525-7-kuniyu@amazon.co.jp
---
 net/ipv4/inet_connection_sock.c | 75 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 69 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 08878ef1bc70..f4b771e45ac1 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -734,10 +734,22 @@ static struct request_sock *inet_reqsk_clone(struct request_sock *req,
 	return nreq;
 }
 
+static void reqsk_queue_migrated(struct request_sock_queue *queue,
+				 const struct request_sock *req)
+{
+	if (req->num_timeout == 0)
+		atomic_inc(&queue->young);
+	atomic_inc(&queue->qlen);
+}
+
 static void reqsk_migrate_reset(struct request_sock *req)
 {
+	req->saved_syn = NULL;
 #if IS_ENABLED(CONFIG_IPV6)
 	inet_rsk(req)->ipv6_opt = NULL;
+	inet_rsk(req)->pktopts = NULL;
+#else
+	inet_rsk(req)->ireq_opt = NULL;
 #endif
 }
 
@@ -781,15 +793,39 @@ EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put);
 static void reqsk_timer_handler(struct timer_list *t)
 {
 	struct request_sock *req = from_timer(req, t, rsk_timer);
+	struct request_sock *nreq = NULL, *oreq = req;
 	struct sock *sk_listener = req->rsk_listener;
-	struct net *net = sock_net(sk_listener);
-	struct inet_connection_sock *icsk = inet_csk(sk_listener);
-	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
+	struct inet_connection_sock *icsk;
+	struct request_sock_queue *queue;
+	struct net *net;
 	int max_syn_ack_retries, qlen, expire = 0, resend = 0;
 
-	if (inet_sk_state_load(sk_listener) != TCP_LISTEN)
-		goto drop;
+	if (inet_sk_state_load(sk_listener) != TCP_LISTEN) {
+		struct sock *nsk;
+
+		nsk = reuseport_migrate_sock(sk_listener, req_to_sk(req), NULL);
+		if (!nsk)
+			goto drop;
+
+		nreq = inet_reqsk_clone(req, nsk);
+		if (!nreq)
+			goto drop;
+
+		/* The new timer for the cloned req can decrease the 2
+		 * by calling inet_csk_reqsk_queue_drop_and_put(), so
+		 * hold another count to prevent use-after-free and
+		 * call reqsk_put() just before return.
+		 */
+		refcount_set(&nreq->rsk_refcnt, 2 + 1);
+		timer_setup(&nreq->rsk_timer, reqsk_timer_handler, TIMER_PINNED);
+		reqsk_queue_migrated(&inet_csk(nsk)->icsk_accept_queue, req);
+
+		req = nreq;
+		sk_listener = nsk;
+	}
 
+	icsk = inet_csk(sk_listener);
+	net = sock_net(sk_listener);
 	max_syn_ack_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries;
 	/* Normally all the openreqs are young and become mature
 	 * (i.e. converted to established socket) for first timeout.
@@ -808,6 +844,7 @@ static void reqsk_timer_handler(struct timer_list *t)
 	 * embrions; and abort old ones without pity, if old
 	 * ones are about to clog our table.
 	 */
+	queue = &icsk->icsk_accept_queue;
 	qlen = reqsk_queue_len(queue);
 	if ((qlen << 1) > max(8U, READ_ONCE(sk_listener->sk_max_ack_backlog))) {
 		int young = reqsk_queue_len_young(queue) << 1;
@@ -832,10 +869,36 @@ static void reqsk_timer_handler(struct timer_list *t)
 			atomic_dec(&queue->young);
 		timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
 		mod_timer(&req->rsk_timer, jiffies + timeo);
+
+		if (!nreq)
+			return;
+
+		if (!inet_ehash_insert(req_to_sk(nreq), req_to_sk(oreq), NULL)) {
+			/* delete timer */
+			inet_csk_reqsk_queue_drop(sk_listener, nreq);
+			goto drop;
+		}
+
+		reqsk_migrate_reset(oreq);
+		reqsk_queue_removed(&inet_csk(oreq->rsk_listener)->icsk_accept_queue, oreq);
+		reqsk_put(oreq);
+
+		reqsk_put(nreq);
 		return;
 	}
+
 drop:
-	inet_csk_reqsk_queue_drop_and_put(sk_listener, req);
+	/* Even if we can clone the req, we may need not retransmit any more
+	 * SYN+ACKs (nreq->num_timeout > max_syn_ack_retries, etc), or another
+	 * CPU may win the "own_req" race so that inet_ehash_insert() fails.
+	 */
+	if (nreq) {
+		reqsk_migrate_reset(nreq);
+		reqsk_queue_removed(queue, nreq);
+		__reqsk_free(nreq);
+	}
+
+	inet_csk_reqsk_queue_drop_and_put(oreq->rsk_listener, oreq);
 }
 
 static void reqsk_queue_hash_req(struct request_sock *req,
-- 
cgit v1.2.3-70-g09d2


From d4f2c86b2b7e2e606e0868b38c8c6c49cc193a8e Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Sat, 12 Jun 2021 21:32:20 +0900
Subject: tcp: Migrate TCP_NEW_SYN_RECV requests at receiving the final ACK.

This patch also changes the code to call reuseport_migrate_sock() and
inet_reqsk_clone(), but unlike the other cases, we do not call
inet_reqsk_clone() right after reuseport_migrate_sock().

Currently, in the receive path for TCP_NEW_SYN_RECV sockets, its listener
has three kinds of refcnt:

  (A) for listener itself
  (B) carried by reuqest_sock
  (C) sock_hold() in tcp_v[46]_rcv()

While processing the req, (A) may disappear by close(listener). Also, (B)
can disappear by accept(listener) once we put the req into the accept
queue. So, we have to hold another refcnt (C) for the listener to prevent
use-after-free.

For socket migration, we call reuseport_migrate_sock() to select a listener
with (A) and to increment the new listener's refcnt in tcp_v[46]_rcv().
This refcnt corresponds to (C) and is cleaned up later in tcp_v[46]_rcv().
Thus we have to take another refcnt (B) for the newly cloned request_sock.

In inet_csk_complete_hashdance(), we hold the count (B), clone the req, and
try to put the new req into the accept queue. By migrating req after
winning the "own_req" race, we can avoid such a worst situation:

  CPU 1 looks up req1
  CPU 2 looks up req1, unhashes it, then CPU 1 loses the race
  CPU 3 looks up req2, unhashes it, then CPU 2 loses the race
  ...

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20210612123224.12525-8-kuniyu@amazon.co.jp
---
 net/ipv4/inet_connection_sock.c | 34 +++++++++++++++++++++++++++++++---
 net/ipv4/tcp_ipv4.c             | 20 ++++++++++++++------
 net/ipv4/tcp_minisocks.c        |  4 ++--
 net/ipv6/tcp_ipv6.c             | 14 +++++++++++---
 4 files changed, 58 insertions(+), 14 deletions(-)

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index f4b771e45ac1..0eea878edc30 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -1114,12 +1114,40 @@ struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
 					 struct request_sock *req, bool own_req)
 {
 	if (own_req) {
-		inet_csk_reqsk_queue_drop(sk, req);
-		reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
-		if (inet_csk_reqsk_queue_add(sk, req, child))
+		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
+		reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req);
+
+		if (sk != req->rsk_listener) {
+			/* another listening sk has been selected,
+			 * migrate the req to it.
+			 */
+			struct request_sock *nreq;
+
+			/* hold a refcnt for the nreq->rsk_listener
+			 * which is assigned in inet_reqsk_clone()
+			 */
+			sock_hold(sk);
+			nreq = inet_reqsk_clone(req, sk);
+			if (!nreq) {
+				inet_child_forget(sk, req, child);
+				goto child_put;
+			}
+
+			refcount_set(&nreq->rsk_refcnt, 1);
+			if (inet_csk_reqsk_queue_add(sk, nreq, child)) {
+				reqsk_migrate_reset(req);
+				reqsk_put(req);
+				return child;
+			}
+
+			reqsk_migrate_reset(nreq);
+			__reqsk_free(nreq);
+		} else if (inet_csk_reqsk_queue_add(sk, req, child)) {
 			return child;
+		}
 	}
 	/* Too bad, another child took ownership of the request, undo. */
+child_put:
 	bh_unlock_sock(child);
 	sock_put(child);
 	return NULL;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4f5b68a90be9..6cb8e269f1ab 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2002,13 +2002,21 @@ process:
 			goto csum_error;
 		}
 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
-			inet_csk_reqsk_queue_drop_and_put(sk, req);
-			goto lookup;
+			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
+			if (!nsk) {
+				inet_csk_reqsk_queue_drop_and_put(sk, req);
+				goto lookup;
+			}
+			sk = nsk;
+			/* reuseport_migrate_sock() has already held one sk_refcnt
+			 * before returning.
+			 */
+		} else {
+			/* We own a reference on the listener, increase it again
+			 * as we might lose it too soon.
+			 */
+			sock_hold(sk);
 		}
-		/* We own a reference on the listener, increase it again
-		 * as we might lose it too soon.
-		 */
-		sock_hold(sk);
 		refcounted = true;
 		nsk = NULL;
 		if (!tcp_filter(sk, skb)) {
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 7513ba45553d..f258a4c0da71 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -775,8 +775,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 		goto listen_overflow;
 
 	if (own_req && rsk_drop_req(req)) {
-		reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
-		inet_csk_reqsk_queue_drop_and_put(sk, req);
+		reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req);
+		inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req);
 		return child;
 	}
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 4435fa342e7a..4d71464094b3 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1664,10 +1664,18 @@ process:
 			goto csum_error;
 		}
 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
-			inet_csk_reqsk_queue_drop_and_put(sk, req);
-			goto lookup;
+			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
+			if (!nsk) {
+				inet_csk_reqsk_queue_drop_and_put(sk, req);
+				goto lookup;
+			}
+			sk = nsk;
+			/* reuseport_migrate_sock() has already held one sk_refcnt
+			 * before returning.
+			 */
+		} else {
+			sock_hold(sk);
 		}
-		sock_hold(sk);
 		refcounted = true;
 		nsk = NULL;
 		if (!tcp_filter(sk, skb)) {
-- 
cgit v1.2.3-70-g09d2


From e061047684af63f2d4f1338ec73140f6e29eb59f Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Sat, 12 Jun 2021 21:32:21 +0900
Subject: bpf: Support BPF_FUNC_get_socket_cookie() for
 BPF_PROG_TYPE_SK_REUSEPORT.

We will call sock_reuseport.prog for socket migration in the next commit,
so the eBPF program has to know which listener is closing to select a new
listener.

We can currently get a unique ID of each listener in the userspace by
calling bpf_map_lookup_elem() for BPF_MAP_TYPE_REUSEPORT_SOCKARRAY map.

This patch makes the pointer of sk available in sk_reuseport_md so that we
can get the ID by BPF_FUNC_get_socket_cookie() in the eBPF program.

Suggested-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/netdev/20201119001154.kapwihc2plp4f7zc@kafai-mbp.dhcp.thefacebook.com/
Link: https://lore.kernel.org/bpf/20210612123224.12525-9-kuniyu@amazon.co.jp
---
 include/uapi/linux/bpf.h       |  1 +
 net/core/filter.c              | 10 ++++++++++
 tools/include/uapi/linux/bpf.h |  1 +
 3 files changed, 12 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2c1ba70abbf1..f3b72588442b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5416,6 +5416,7 @@ struct sk_reuseport_md {
 	__u32 ip_protocol;	/* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
 	__u32 bind_inany;	/* Is sock bound to an INANY address? */
 	__u32 hash;		/* A hash of the packet 4 tuples */
+	__bpf_md_ptr(struct bpf_sock *, sk);
 };
 
 #define BPF_TAG_SIZE	8
diff --git a/net/core/filter.c b/net/core/filter.c
index caa88955562e..f753ab550525 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -10172,6 +10172,8 @@ sk_reuseport_func_proto(enum bpf_func_id func_id,
 		return &sk_reuseport_load_bytes_proto;
 	case BPF_FUNC_skb_load_bytes_relative:
 		return &sk_reuseport_load_bytes_relative_proto;
+	case BPF_FUNC_get_socket_cookie:
+		return &bpf_get_socket_ptr_cookie_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -10201,6 +10203,10 @@ sk_reuseport_is_valid_access(int off, int size,
 	case offsetof(struct sk_reuseport_md, hash):
 		return size == size_default;
 
+	case offsetof(struct sk_reuseport_md, sk):
+		info->reg_type = PTR_TO_SOCKET;
+		return size == sizeof(__u64);
+
 	/* Fields that allow narrowing */
 	case bpf_ctx_range(struct sk_reuseport_md, eth_protocol):
 		if (size < sizeof_field(struct sk_buff, protocol))
@@ -10273,6 +10279,10 @@ static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
 	case offsetof(struct sk_reuseport_md, bind_inany):
 		SK_REUSEPORT_LOAD_FIELD(bind_inany);
 		break;
+
+	case offsetof(struct sk_reuseport_md, sk):
+		SK_REUSEPORT_LOAD_FIELD(sk);
+		break;
 	}
 
 	return insn - insn_buf;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 2c1ba70abbf1..f3b72588442b 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5416,6 +5416,7 @@ struct sk_reuseport_md {
 	__u32 ip_protocol;	/* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
 	__u32 bind_inany;	/* Is sock bound to an INANY address? */
 	__u32 hash;		/* A hash of the packet 4 tuples */
+	__bpf_md_ptr(struct bpf_sock *, sk);
 };
 
 #define BPF_TAG_SIZE	8
-- 
cgit v1.2.3-70-g09d2


From d5e4ddaeb6ab2c3c7fbb7b247a6d34bb0b18d87e Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Sat, 12 Jun 2021 21:32:22 +0900
Subject: bpf: Support socket migration by eBPF.

This patch introduces a new bpf_attach_type for BPF_PROG_TYPE_SK_REUSEPORT
to check if the attached eBPF program is capable of migrating sockets. When
the eBPF program is attached, we run it for socket migration if the
expected_attach_type is BPF_SK_REUSEPORT_SELECT_OR_MIGRATE or
net.ipv4.tcp_migrate_req is enabled.

Currently, the expected_attach_type is not enforced for the
BPF_PROG_TYPE_SK_REUSEPORT type of program. Thus, this commit follows the
earlier idea in the commit aac3fc320d94 ("bpf: Post-hooks for sys_bind") to
fix up the zero expected_attach_type in bpf_prog_load_fixup_attach_type().

Moreover, this patch adds a new field (migrating_sk) to sk_reuseport_md to
select a new listener based on the child socket. migrating_sk varies
depending on if it is migrating a request in the accept queue or during
3WHS.

  - accept_queue : sock (ESTABLISHED/SYN_RECV)
  - 3WHS         : request_sock (NEW_SYN_RECV)

In the eBPF program, we can select a new listener by
BPF_FUNC_sk_select_reuseport(). Also, we can cancel migration by returning
SK_DROP. This feature is useful when listeners have different settings at
the socket API level or when we want to free resources as soon as possible.

  - SK_PASS with selected_sk, select it as a new listener
  - SK_PASS with selected_sk NULL, fallbacks to the random selection
  - SK_DROP, cancel the migration.

There is a noteworthy point. We select a listening socket in three places,
but we do not have struct skb at closing a listener or retransmitting a
SYN+ACK. On the other hand, some helper functions do not expect skb is NULL
(e.g. skb_header_pointer() in BPF_FUNC_skb_load_bytes(), skb_tail_pointer()
in BPF_FUNC_skb_load_bytes_relative()). So we allocate an empty skb
temporarily before running the eBPF program.

Suggested-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/netdev/20201123003828.xjpjdtk4ygl6tg6h@kafai-mbp.dhcp.thefacebook.com/
Link: https://lore.kernel.org/netdev/20201203042402.6cskdlit5f3mw4ru@kafai-mbp.dhcp.thefacebook.com/
Link: https://lore.kernel.org/netdev/20201209030903.hhow5r53l6fmozjn@kafai-mbp.dhcp.thefacebook.com/
Link: https://lore.kernel.org/bpf/20210612123224.12525-10-kuniyu@amazon.co.jp
---
 include/linux/bpf.h            |  1 +
 include/linux/filter.h         |  2 ++
 include/uapi/linux/bpf.h       | 15 +++++++++++++++
 kernel/bpf/syscall.c           | 13 +++++++++++++
 net/core/filter.c              | 13 ++++++++++++-
 net/core/sock_reuseport.c      | 34 ++++++++++++++++++++++++++++++----
 tools/include/uapi/linux/bpf.h | 15 +++++++++++++++
 7 files changed, 88 insertions(+), 5 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 86dec5001ae2..f309fc1509f2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2048,6 +2048,7 @@ struct sk_reuseport_kern {
 	struct sk_buff *skb;
 	struct sock *sk;
 	struct sock *selected_sk;
+	struct sock *migrating_sk;
 	void *data_end;
 	u32 hash;
 	u32 reuseport_id;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index c5ad7df029ed..688856e0b28a 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -996,11 +996,13 @@ void bpf_warn_invalid_xdp_action(u32 act);
 #ifdef CONFIG_INET
 struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
 				  struct bpf_prog *prog, struct sk_buff *skb,
+				  struct sock *migrating_sk,
 				  u32 hash);
 #else
 static inline struct sock *
 bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
 		     struct bpf_prog *prog, struct sk_buff *skb,
+		     struct sock *migrating_sk,
 		     u32 hash)
 {
 	return NULL;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f3b72588442b..bf9252c7381e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -994,6 +994,8 @@ enum bpf_attach_type {
 	BPF_SK_LOOKUP,
 	BPF_XDP,
 	BPF_SK_SKB_VERDICT,
+	BPF_SK_REUSEPORT_SELECT,
+	BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -5416,7 +5418,20 @@ struct sk_reuseport_md {
 	__u32 ip_protocol;	/* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
 	__u32 bind_inany;	/* Is sock bound to an INANY address? */
 	__u32 hash;		/* A hash of the packet 4 tuples */
+	/* When reuse->migrating_sk is NULL, it is selecting a sk for the
+	 * new incoming connection request (e.g. selecting a listen sk for
+	 * the received SYN in the TCP case).  reuse->sk is one of the sk
+	 * in the reuseport group. The bpf prog can use reuse->sk to learn
+	 * the local listening ip/port without looking into the skb.
+	 *
+	 * When reuse->migrating_sk is not NULL, reuse->sk is closed and
+	 * reuse->migrating_sk is the socket that needs to be migrated
+	 * to another listening socket.  migrating_sk could be a fullsock
+	 * sk that is fully established or a reqsk that is in-the-middle
+	 * of 3-way handshake.
+	 */
 	__bpf_md_ptr(struct bpf_sock *, sk);
+	__bpf_md_ptr(struct bpf_sock *, migrating_sk);
 };
 
 #define BPF_TAG_SIZE	8
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 50457019da27..dbbc5342f221 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1972,6 +1972,11 @@ static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr)
 			attr->expected_attach_type =
 				BPF_CGROUP_INET_SOCK_CREATE;
 		break;
+	case BPF_PROG_TYPE_SK_REUSEPORT:
+		if (!attr->expected_attach_type)
+			attr->expected_attach_type =
+				BPF_SK_REUSEPORT_SELECT;
+		break;
 	}
 }
 
@@ -2055,6 +2060,14 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
 		if (expected_attach_type == BPF_SK_LOOKUP)
 			return 0;
 		return -EINVAL;
+	case BPF_PROG_TYPE_SK_REUSEPORT:
+		switch (expected_attach_type) {
+		case BPF_SK_REUSEPORT_SELECT:
+		case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE:
+			return 0;
+		default:
+			return -EINVAL;
+		}
 	case BPF_PROG_TYPE_SYSCALL:
 	case BPF_PROG_TYPE_EXT:
 		if (expected_attach_type)
diff --git a/net/core/filter.c b/net/core/filter.c
index f753ab550525..5b86e47ef079 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -10044,11 +10044,13 @@ out:
 static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
 				    struct sock_reuseport *reuse,
 				    struct sock *sk, struct sk_buff *skb,
+				    struct sock *migrating_sk,
 				    u32 hash)
 {
 	reuse_kern->skb = skb;
 	reuse_kern->sk = sk;
 	reuse_kern->selected_sk = NULL;
+	reuse_kern->migrating_sk = migrating_sk;
 	reuse_kern->data_end = skb->data + skb_headlen(skb);
 	reuse_kern->hash = hash;
 	reuse_kern->reuseport_id = reuse->reuseport_id;
@@ -10057,12 +10059,13 @@ static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
 
 struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
 				  struct bpf_prog *prog, struct sk_buff *skb,
+				  struct sock *migrating_sk,
 				  u32 hash)
 {
 	struct sk_reuseport_kern reuse_kern;
 	enum sk_action action;
 
-	bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash);
+	bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash);
 	action = BPF_PROG_RUN(prog, &reuse_kern);
 
 	if (action == SK_PASS)
@@ -10207,6 +10210,10 @@ sk_reuseport_is_valid_access(int off, int size,
 		info->reg_type = PTR_TO_SOCKET;
 		return size == sizeof(__u64);
 
+	case offsetof(struct sk_reuseport_md, migrating_sk):
+		info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
+		return size == sizeof(__u64);
+
 	/* Fields that allow narrowing */
 	case bpf_ctx_range(struct sk_reuseport_md, eth_protocol):
 		if (size < sizeof_field(struct sk_buff, protocol))
@@ -10283,6 +10290,10 @@ static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
 	case offsetof(struct sk_reuseport_md, sk):
 		SK_REUSEPORT_LOAD_FIELD(sk);
 		break;
+
+	case offsetof(struct sk_reuseport_md, migrating_sk):
+		SK_REUSEPORT_LOAD_FIELD(migrating_sk);
+		break;
 	}
 
 	return insn - insn_buf;
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index b239f8cd9d39..de5ee3ae86d5 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -377,13 +377,17 @@ void reuseport_stop_listen_sock(struct sock *sk)
 {
 	if (sk->sk_protocol == IPPROTO_TCP) {
 		struct sock_reuseport *reuse;
+		struct bpf_prog *prog;
 
 		spin_lock_bh(&reuseport_lock);
 
 		reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
 						  lockdep_is_held(&reuseport_lock));
+		prog = rcu_dereference_protected(reuse->prog,
+						 lockdep_is_held(&reuseport_lock));
 
-		if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req) {
+		if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req ||
+		    (prog && prog->expected_attach_type == BPF_SK_REUSEPORT_SELECT_OR_MIGRATE)) {
 			/* Migration capable, move sk from the listening section
 			 * to the closed section.
 			 */
@@ -488,7 +492,7 @@ struct sock *reuseport_select_sock(struct sock *sk,
 			goto select_by_hash;
 
 		if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
-			sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash);
+			sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, NULL, hash);
 		else
 			sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len);
 
@@ -519,6 +523,8 @@ struct sock *reuseport_migrate_sock(struct sock *sk,
 {
 	struct sock_reuseport *reuse;
 	struct sock *nsk = NULL;
+	bool allocated = false;
+	struct bpf_prog *prog;
 	u16 socks;
 	u32 hash;
 
@@ -536,10 +542,30 @@ struct sock *reuseport_migrate_sock(struct sock *sk,
 	smp_rmb();
 
 	hash = migrating_sk->sk_hash;
-	if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req)
+	prog = rcu_dereference(reuse->prog);
+	if (!prog || prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE) {
+		if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req)
+			goto select_by_hash;
+		goto out;
+	}
+
+	if (!skb) {
+		skb = alloc_skb(0, GFP_ATOMIC);
+		if (!skb)
+			goto out;
+		allocated = true;
+	}
+
+	nsk = bpf_run_sk_reuseport(reuse, sk, prog, skb, migrating_sk, hash);
+
+	if (allocated)
+		kfree_skb(skb);
+
+select_by_hash:
+	if (!nsk)
 		nsk = reuseport_select_sock_by_hash(reuse, hash, socks);
 
-	if (nsk && unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt)))
+	if (IS_ERR_OR_NULL(nsk) || unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt)))
 		nsk = NULL;
 
 out:
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index f3b72588442b..bf9252c7381e 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -994,6 +994,8 @@ enum bpf_attach_type {
 	BPF_SK_LOOKUP,
 	BPF_XDP,
 	BPF_SK_SKB_VERDICT,
+	BPF_SK_REUSEPORT_SELECT,
+	BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -5416,7 +5418,20 @@ struct sk_reuseport_md {
 	__u32 ip_protocol;	/* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
 	__u32 bind_inany;	/* Is sock bound to an INANY address? */
 	__u32 hash;		/* A hash of the packet 4 tuples */
+	/* When reuse->migrating_sk is NULL, it is selecting a sk for the
+	 * new incoming connection request (e.g. selecting a listen sk for
+	 * the received SYN in the TCP case).  reuse->sk is one of the sk
+	 * in the reuseport group. The bpf prog can use reuse->sk to learn
+	 * the local listening ip/port without looking into the skb.
+	 *
+	 * When reuse->migrating_sk is not NULL, reuse->sk is closed and
+	 * reuse->migrating_sk is the socket that needs to be migrated
+	 * to another listening socket.  migrating_sk could be a fullsock
+	 * sk that is fully established or a reqsk that is in-the-middle
+	 * of 3-way handshake.
+	 */
 	__bpf_md_ptr(struct bpf_sock *, sk);
+	__bpf_md_ptr(struct bpf_sock *, migrating_sk);
 };
 
 #define BPF_TAG_SIZE	8
-- 
cgit v1.2.3-70-g09d2


From 50501271e773c51afe602918915c6beb62ac369f Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Sat, 12 Jun 2021 21:32:23 +0900
Subject: libbpf: Set expected_attach_type for BPF_PROG_TYPE_SK_REUSEPORT.

This commit introduces a new section (sk_reuseport/migrate) and sets
expected_attach_type to two each section in BPF_PROG_TYPE_SK_REUSEPORT
program.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20210612123224.12525-11-kuniyu@amazon.co.jp
---
 tools/lib/bpf/libbpf.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 128715b8794b..f6b91fb0c857 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -9075,7 +9075,10 @@ static struct bpf_link *attach_iter(const struct bpf_sec_def *sec,
 
 static const struct bpf_sec_def section_defs[] = {
 	BPF_PROG_SEC("socket",			BPF_PROG_TYPE_SOCKET_FILTER),
-	BPF_PROG_SEC("sk_reuseport",		BPF_PROG_TYPE_SK_REUSEPORT),
+	BPF_EAPROG_SEC("sk_reuseport/migrate",	BPF_PROG_TYPE_SK_REUSEPORT,
+						BPF_SK_REUSEPORT_SELECT_OR_MIGRATE),
+	BPF_EAPROG_SEC("sk_reuseport",		BPF_PROG_TYPE_SK_REUSEPORT,
+						BPF_SK_REUSEPORT_SELECT),
 	SEC_DEF("kprobe/", KPROBE,
 		.attach_fn = attach_kprobe),
 	BPF_PROG_SEC("uprobe/",			BPF_PROG_TYPE_KPROBE),
-- 
cgit v1.2.3-70-g09d2


From c9d0bdef89a6c943e98c851e8cc10c9c534329e6 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Sat, 12 Jun 2021 21:32:24 +0900
Subject: bpf: Test BPF_SK_REUSEPORT_SELECT_OR_MIGRATE.

This patch adds a test for BPF_SK_REUSEPORT_SELECT_OR_MIGRATE and
removes 'static' from settimeo() in network_helpers.c.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20210612123224.12525-12-kuniyu@amazon.co.jp
---
 tools/testing/selftests/bpf/network_helpers.c      |   2 +-
 tools/testing/selftests/bpf/network_helpers.h      |   1 +
 .../selftests/bpf/prog_tests/migrate_reuseport.c   | 555 +++++++++++++++++++++
 .../selftests/bpf/progs/test_migrate_reuseport.c   | 135 +++++
 4 files changed, 692 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c
 create mode 100644 tools/testing/selftests/bpf/progs/test_migrate_reuseport.c

diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c
index 12ee40284da0..2060bc122c53 100644
--- a/tools/testing/selftests/bpf/network_helpers.c
+++ b/tools/testing/selftests/bpf/network_helpers.c
@@ -40,7 +40,7 @@ struct ipv6_packet pkt_v6 = {
 	.tcp.doff = 5,
 };
 
-static int settimeo(int fd, int timeout_ms)
+int settimeo(int fd, int timeout_ms)
 {
 	struct timeval timeout = { .tv_sec = 3 };
 
diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h
index 7205f8afdba1..5e0d51c07b63 100644
--- a/tools/testing/selftests/bpf/network_helpers.h
+++ b/tools/testing/selftests/bpf/network_helpers.h
@@ -33,6 +33,7 @@ struct ipv6_packet {
 } __packed;
 extern struct ipv6_packet pkt_v6;
 
+int settimeo(int fd, int timeout_ms);
 int start_server(int family, int type, const char *addr, __u16 port,
 		 int timeout_ms);
 int connect_to_fd(int server_fd, int timeout_ms);
diff --git a/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c b/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c
new file mode 100644
index 000000000000..0fa3f750567d
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c
@@ -0,0 +1,555 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Check if we can migrate child sockets.
+ *
+ *   1. call listen() for 4 server sockets.
+ *   2. call connect() for 25 client sockets.
+ *   3. call listen() for 1 server socket. (migration target)
+ *   4. update a map to migrate all child sockets
+ *        to the last server socket (migrate_map[cookie] = 4)
+ *   5. call shutdown() for first 4 server sockets
+ *        and migrate the requests in the accept queue
+ *        to the last server socket.
+ *   6. call listen() for the second server socket.
+ *   7. call shutdown() for the last server
+ *        and migrate the requests in the accept queue
+ *        to the second server socket.
+ *   8. call listen() for the last server.
+ *   9. call shutdown() for the second server
+ *        and migrate the requests in the accept queue
+ *        to the last server socket.
+ *  10. call accept() for the last server socket.
+ *
+ * Author: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
+ */
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "test_progs.h"
+#include "test_migrate_reuseport.skel.h"
+#include "network_helpers.h"
+
+#define IFINDEX_LO 1
+
+#define NR_SERVERS 5
+#define NR_CLIENTS (NR_SERVERS * 5)
+#define MIGRATED_TO (NR_SERVERS - 1)
+
+/* fastopenq->max_qlen and sk->sk_max_ack_backlog */
+#define QLEN (NR_CLIENTS * 5)
+
+#define MSG "Hello World\0"
+#define MSGLEN 12
+
+static struct migrate_reuseport_test_case {
+	const char *name;
+	__s64 servers[NR_SERVERS];
+	__s64 clients[NR_CLIENTS];
+	struct sockaddr_storage addr;
+	socklen_t addrlen;
+	int family;
+	int state;
+	bool drop_ack;
+	bool expire_synack_timer;
+	bool fastopen;
+	struct bpf_link *link;
+} test_cases[] = {
+	{
+		.name = "IPv4 TCP_ESTABLISHED  inet_csk_listen_stop",
+		.family = AF_INET,
+		.state = BPF_TCP_ESTABLISHED,
+		.drop_ack = false,
+		.expire_synack_timer = false,
+		.fastopen = false,
+	},
+	{
+		.name = "IPv4 TCP_SYN_RECV     inet_csk_listen_stop",
+		.family = AF_INET,
+		.state = BPF_TCP_SYN_RECV,
+		.drop_ack = true,
+		.expire_synack_timer = false,
+		.fastopen = true,
+	},
+	{
+		.name = "IPv4 TCP_NEW_SYN_RECV reqsk_timer_handler",
+		.family = AF_INET,
+		.state = BPF_TCP_NEW_SYN_RECV,
+		.drop_ack = true,
+		.expire_synack_timer = true,
+		.fastopen = false,
+	},
+	{
+		.name = "IPv4 TCP_NEW_SYN_RECV inet_csk_complete_hashdance",
+		.family = AF_INET,
+		.state = BPF_TCP_NEW_SYN_RECV,
+		.drop_ack = true,
+		.expire_synack_timer = false,
+		.fastopen = false,
+	},
+	{
+		.name = "IPv6 TCP_ESTABLISHED  inet_csk_listen_stop",
+		.family = AF_INET6,
+		.state = BPF_TCP_ESTABLISHED,
+		.drop_ack = false,
+		.expire_synack_timer = false,
+		.fastopen = false,
+	},
+	{
+		.name = "IPv6 TCP_SYN_RECV     inet_csk_listen_stop",
+		.family = AF_INET6,
+		.state = BPF_TCP_SYN_RECV,
+		.drop_ack = true,
+		.expire_synack_timer = false,
+		.fastopen = true,
+	},
+	{
+		.name = "IPv6 TCP_NEW_SYN_RECV reqsk_timer_handler",
+		.family = AF_INET6,
+		.state = BPF_TCP_NEW_SYN_RECV,
+		.drop_ack = true,
+		.expire_synack_timer = true,
+		.fastopen = false,
+	},
+	{
+		.name = "IPv6 TCP_NEW_SYN_RECV inet_csk_complete_hashdance",
+		.family = AF_INET6,
+		.state = BPF_TCP_NEW_SYN_RECV,
+		.drop_ack = true,
+		.expire_synack_timer = false,
+		.fastopen = false,
+	}
+};
+
+static void init_fds(__s64 fds[], int len)
+{
+	int i;
+
+	for (i = 0; i < len; i++)
+		fds[i] = -1;
+}
+
+static void close_fds(__s64 fds[], int len)
+{
+	int i;
+
+	for (i = 0; i < len; i++) {
+		if (fds[i] != -1) {
+			close(fds[i]);
+			fds[i] = -1;
+		}
+	}
+}
+
+static int setup_fastopen(char *buf, int size, int *saved_len, bool restore)
+{
+	int err = 0, fd, len;
+
+	fd = open("/proc/sys/net/ipv4/tcp_fastopen", O_RDWR);
+	if (!ASSERT_NEQ(fd, -1, "open"))
+		return -1;
+
+	if (restore) {
+		len = write(fd, buf, *saved_len);
+		if (!ASSERT_EQ(len, *saved_len, "write - restore"))
+			err = -1;
+	} else {
+		*saved_len = read(fd, buf, size);
+		if (!ASSERT_GE(*saved_len, 1, "read")) {
+			err = -1;
+			goto close;
+		}
+
+		err = lseek(fd, 0, SEEK_SET);
+		if (!ASSERT_OK(err, "lseek"))
+			goto close;
+
+		/* (TFO_CLIENT_ENABLE | TFO_SERVER_ENABLE |
+		 *  TFO_CLIENT_NO_COOKIE | TFO_SERVER_COOKIE_NOT_REQD)
+		 */
+		len = write(fd, "519", 3);
+		if (!ASSERT_EQ(len, 3, "write - setup"))
+			err = -1;
+	}
+
+close:
+	close(fd);
+
+	return err;
+}
+
+static int drop_ack(struct migrate_reuseport_test_case *test_case,
+		    struct test_migrate_reuseport *skel)
+{
+	if (test_case->family == AF_INET)
+		skel->bss->server_port = ((struct sockaddr_in *)
+					  &test_case->addr)->sin_port;
+	else
+		skel->bss->server_port = ((struct sockaddr_in6 *)
+					  &test_case->addr)->sin6_port;
+
+	test_case->link = bpf_program__attach_xdp(skel->progs.drop_ack,
+						  IFINDEX_LO);
+	if (!ASSERT_OK_PTR(test_case->link, "bpf_program__attach_xdp"))
+		return -1;
+
+	return 0;
+}
+
+static int pass_ack(struct migrate_reuseport_test_case *test_case)
+{
+	int err;
+
+	err = bpf_link__detach(test_case->link);
+	if (!ASSERT_OK(err, "bpf_link__detach"))
+		return -1;
+
+	test_case->link = NULL;
+
+	return 0;
+}
+
+static int start_servers(struct migrate_reuseport_test_case *test_case,
+			 struct test_migrate_reuseport *skel)
+{
+	int i, err, prog_fd, reuseport = 1, qlen = QLEN;
+
+	prog_fd = bpf_program__fd(skel->progs.migrate_reuseport);
+
+	make_sockaddr(test_case->family,
+		      test_case->family == AF_INET ? "127.0.0.1" : "::1", 0,
+		      &test_case->addr, &test_case->addrlen);
+
+	for (i = 0; i < NR_SERVERS; i++) {
+		test_case->servers[i] = socket(test_case->family, SOCK_STREAM,
+					       IPPROTO_TCP);
+		if (!ASSERT_NEQ(test_case->servers[i], -1, "socket"))
+			return -1;
+
+		err = setsockopt(test_case->servers[i], SOL_SOCKET,
+				 SO_REUSEPORT, &reuseport, sizeof(reuseport));
+		if (!ASSERT_OK(err, "setsockopt - SO_REUSEPORT"))
+			return -1;
+
+		err = bind(test_case->servers[i],
+			   (struct sockaddr *)&test_case->addr,
+			   test_case->addrlen);
+		if (!ASSERT_OK(err, "bind"))
+			return -1;
+
+		if (i == 0) {
+			err = setsockopt(test_case->servers[i], SOL_SOCKET,
+					 SO_ATTACH_REUSEPORT_EBPF,
+					 &prog_fd, sizeof(prog_fd));
+			if (!ASSERT_OK(err,
+				       "setsockopt - SO_ATTACH_REUSEPORT_EBPF"))
+				return -1;
+
+			err = getsockname(test_case->servers[i],
+					  (struct sockaddr *)&test_case->addr,
+					  &test_case->addrlen);
+			if (!ASSERT_OK(err, "getsockname"))
+				return -1;
+		}
+
+		if (test_case->fastopen) {
+			err = setsockopt(test_case->servers[i],
+					 SOL_TCP, TCP_FASTOPEN,
+					 &qlen, sizeof(qlen));
+			if (!ASSERT_OK(err, "setsockopt - TCP_FASTOPEN"))
+				return -1;
+		}
+
+		/* All requests will be tied to the first four listeners */
+		if (i != MIGRATED_TO) {
+			err = listen(test_case->servers[i], qlen);
+			if (!ASSERT_OK(err, "listen"))
+				return -1;
+		}
+	}
+
+	return 0;
+}
+
+static int start_clients(struct migrate_reuseport_test_case *test_case)
+{
+	char buf[MSGLEN] = MSG;
+	int i, err;
+
+	for (i = 0; i < NR_CLIENTS; i++) {
+		test_case->clients[i] = socket(test_case->family, SOCK_STREAM,
+					       IPPROTO_TCP);
+		if (!ASSERT_NEQ(test_case->clients[i], -1, "socket"))
+			return -1;
+
+		/* The attached XDP program drops only the final ACK, so
+		 * clients will transition to TCP_ESTABLISHED immediately.
+		 */
+		err = settimeo(test_case->clients[i], 100);
+		if (!ASSERT_OK(err, "settimeo"))
+			return -1;
+
+		if (test_case->fastopen) {
+			int fastopen = 1;
+
+			err = setsockopt(test_case->clients[i], IPPROTO_TCP,
+					 TCP_FASTOPEN_CONNECT, &fastopen,
+					 sizeof(fastopen));
+			if (!ASSERT_OK(err,
+				       "setsockopt - TCP_FASTOPEN_CONNECT"))
+				return -1;
+		}
+
+		err = connect(test_case->clients[i],
+			      (struct sockaddr *)&test_case->addr,
+			      test_case->addrlen);
+		if (!ASSERT_OK(err, "connect"))
+			return -1;
+
+		err = write(test_case->clients[i], buf, MSGLEN);
+		if (!ASSERT_EQ(err, MSGLEN, "write"))
+			return -1;
+	}
+
+	return 0;
+}
+
+static int update_maps(struct migrate_reuseport_test_case *test_case,
+		       struct test_migrate_reuseport *skel)
+{
+	int i, err, migrated_to = MIGRATED_TO;
+	int reuseport_map_fd, migrate_map_fd;
+	__u64 value;
+
+	reuseport_map_fd = bpf_map__fd(skel->maps.reuseport_map);
+	migrate_map_fd = bpf_map__fd(skel->maps.migrate_map);
+
+	for (i = 0; i < NR_SERVERS; i++) {
+		value = (__u64)test_case->servers[i];
+		err = bpf_map_update_elem(reuseport_map_fd, &i, &value,
+					  BPF_NOEXIST);
+		if (!ASSERT_OK(err, "bpf_map_update_elem - reuseport_map"))
+			return -1;
+
+		err = bpf_map_lookup_elem(reuseport_map_fd, &i, &value);
+		if (!ASSERT_OK(err, "bpf_map_lookup_elem - reuseport_map"))
+			return -1;
+
+		err = bpf_map_update_elem(migrate_map_fd, &value, &migrated_to,
+					  BPF_NOEXIST);
+		if (!ASSERT_OK(err, "bpf_map_update_elem - migrate_map"))
+			return -1;
+	}
+
+	return 0;
+}
+
+static int migrate_dance(struct migrate_reuseport_test_case *test_case)
+{
+	int i, err;
+
+	/* Migrate TCP_ESTABLISHED and TCP_SYN_RECV requests
+	 * to the last listener based on eBPF.
+	 */
+	for (i = 0; i < MIGRATED_TO; i++) {
+		err = shutdown(test_case->servers[i], SHUT_RDWR);
+		if (!ASSERT_OK(err, "shutdown"))
+			return -1;
+	}
+
+	/* No dance for TCP_NEW_SYN_RECV to migrate based on eBPF */
+	if (test_case->state == BPF_TCP_NEW_SYN_RECV)
+		return 0;
+
+	/* Note that we use the second listener instead of the
+	 * first one here.
+	 *
+	 * The fist listener is bind()ed with port 0 and,
+	 * SOCK_BINDPORT_LOCK is not set to sk_userlocks, so
+	 * calling listen() again will bind() the first listener
+	 * on a new ephemeral port and detach it from the existing
+	 * reuseport group.  (See: __inet_bind(), tcp_set_state())
+	 *
+	 * OTOH, the second one is bind()ed with a specific port,
+	 * and SOCK_BINDPORT_LOCK is set. Thus, re-listen() will
+	 * resurrect the listener on the existing reuseport group.
+	 */
+	err = listen(test_case->servers[1], QLEN);
+	if (!ASSERT_OK(err, "listen"))
+		return -1;
+
+	/* Migrate from the last listener to the second one.
+	 *
+	 * All listeners were detached out of the reuseport_map,
+	 * so migration will be done by kernel random pick from here.
+	 */
+	err = shutdown(test_case->servers[MIGRATED_TO], SHUT_RDWR);
+	if (!ASSERT_OK(err, "shutdown"))
+		return -1;
+
+	/* Back to the existing reuseport group */
+	err = listen(test_case->servers[MIGRATED_TO], QLEN);
+	if (!ASSERT_OK(err, "listen"))
+		return -1;
+
+	/* Migrate back to the last one from the second one */
+	err = shutdown(test_case->servers[1], SHUT_RDWR);
+	if (!ASSERT_OK(err, "shutdown"))
+		return -1;
+
+	return 0;
+}
+
+static void count_requests(struct migrate_reuseport_test_case *test_case,
+			   struct test_migrate_reuseport *skel)
+{
+	struct sockaddr_storage addr;
+	socklen_t len = sizeof(addr);
+	int err, cnt = 0, client;
+	char buf[MSGLEN];
+
+	err = settimeo(test_case->servers[MIGRATED_TO], 4000);
+	if (!ASSERT_OK(err, "settimeo"))
+		goto out;
+
+	for (; cnt < NR_CLIENTS; cnt++) {
+		client = accept(test_case->servers[MIGRATED_TO],
+				(struct sockaddr *)&addr, &len);
+		if (!ASSERT_NEQ(client, -1, "accept"))
+			goto out;
+
+		memset(buf, 0, MSGLEN);
+		read(client, &buf, MSGLEN);
+		close(client);
+
+		if (!ASSERT_STREQ(buf, MSG, "read"))
+			goto out;
+	}
+
+out:
+	ASSERT_EQ(cnt, NR_CLIENTS, "count in userspace");
+
+	switch (test_case->state) {
+	case BPF_TCP_ESTABLISHED:
+		cnt = skel->bss->migrated_at_close;
+		break;
+	case BPF_TCP_SYN_RECV:
+		cnt = skel->bss->migrated_at_close_fastopen;
+		break;
+	case BPF_TCP_NEW_SYN_RECV:
+		if (test_case->expire_synack_timer)
+			cnt = skel->bss->migrated_at_send_synack;
+		else
+			cnt = skel->bss->migrated_at_recv_ack;
+		break;
+	default:
+		cnt = 0;
+	}
+
+	ASSERT_EQ(cnt, NR_CLIENTS, "count in BPF prog");
+}
+
+static void run_test(struct migrate_reuseport_test_case *test_case,
+		     struct test_migrate_reuseport *skel)
+{
+	int err, saved_len;
+	char buf[16];
+
+	skel->bss->migrated_at_close = 0;
+	skel->bss->migrated_at_close_fastopen = 0;
+	skel->bss->migrated_at_send_synack = 0;
+	skel->bss->migrated_at_recv_ack = 0;
+
+	init_fds(test_case->servers, NR_SERVERS);
+	init_fds(test_case->clients, NR_CLIENTS);
+
+	if (test_case->fastopen) {
+		memset(buf, 0, sizeof(buf));
+
+		err = setup_fastopen(buf, sizeof(buf), &saved_len, false);
+		if (!ASSERT_OK(err, "setup_fastopen - setup"))
+			return;
+	}
+
+	err = start_servers(test_case, skel);
+	if (!ASSERT_OK(err, "start_servers"))
+		goto close_servers;
+
+	if (test_case->drop_ack) {
+		/* Drop the final ACK of the 3-way handshake and stick the
+		 * in-flight requests on TCP_SYN_RECV or TCP_NEW_SYN_RECV.
+		 */
+		err = drop_ack(test_case, skel);
+		if (!ASSERT_OK(err, "drop_ack"))
+			goto close_servers;
+	}
+
+	/* Tie requests to the first four listners */
+	err = start_clients(test_case);
+	if (!ASSERT_OK(err, "start_clients"))
+		goto close_clients;
+
+	err = listen(test_case->servers[MIGRATED_TO], QLEN);
+	if (!ASSERT_OK(err, "listen"))
+		goto close_clients;
+
+	err = update_maps(test_case, skel);
+	if (!ASSERT_OK(err, "fill_maps"))
+		goto close_clients;
+
+	/* Migrate the requests in the accept queue only.
+	 * TCP_NEW_SYN_RECV requests are not migrated at this point.
+	 */
+	err = migrate_dance(test_case);
+	if (!ASSERT_OK(err, "migrate_dance"))
+		goto close_clients;
+
+	if (test_case->expire_synack_timer) {
+		/* Wait for SYN+ACK timers to expire so that
+		 * reqsk_timer_handler() migrates TCP_NEW_SYN_RECV requests.
+		 */
+		sleep(1);
+	}
+
+	if (test_case->link) {
+		/* Resume 3WHS and migrate TCP_NEW_SYN_RECV requests */
+		err = pass_ack(test_case);
+		if (!ASSERT_OK(err, "pass_ack"))
+			goto close_clients;
+	}
+
+	count_requests(test_case, skel);
+
+close_clients:
+	close_fds(test_case->clients, NR_CLIENTS);
+
+	if (test_case->link) {
+		err = pass_ack(test_case);
+		ASSERT_OK(err, "pass_ack - clean up");
+	}
+
+close_servers:
+	close_fds(test_case->servers, NR_SERVERS);
+
+	if (test_case->fastopen) {
+		err = setup_fastopen(buf, sizeof(buf), &saved_len, true);
+		ASSERT_OK(err, "setup_fastopen - restore");
+	}
+}
+
+void test_migrate_reuseport(void)
+{
+	struct test_migrate_reuseport *skel;
+	int i;
+
+	skel = test_migrate_reuseport__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "open_and_load"))
+		return;
+
+	for (i = 0; i < ARRAY_SIZE(test_cases); i++) {
+		test__start_subtest(test_cases[i].name);
+		run_test(&test_cases[i], skel);
+	}
+
+	test_migrate_reuseport__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/test_migrate_reuseport.c b/tools/testing/selftests/bpf/progs/test_migrate_reuseport.c
new file mode 100644
index 000000000000..27df571abf5b
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_migrate_reuseport.c
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Check if we can migrate child sockets.
+ *
+ *   1. If reuse_md->migrating_sk is NULL (SYN packet),
+ *        return SK_PASS without selecting a listener.
+ *   2. If reuse_md->migrating_sk is not NULL (socket migration),
+ *        select a listener (reuseport_map[migrate_map[cookie]])
+ *
+ * Author: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/tcp.h>
+#include <linux/in.h>
+#include <bpf/bpf_endian.h>
+#include <bpf/bpf_helpers.h>
+
+struct {
+	__uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY);
+	__uint(max_entries, 256);
+	__type(key, int);
+	__type(value, __u64);
+} reuseport_map SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(max_entries, 256);
+	__type(key, __u64);
+	__type(value, int);
+} migrate_map SEC(".maps");
+
+int migrated_at_close = 0;
+int migrated_at_close_fastopen = 0;
+int migrated_at_send_synack = 0;
+int migrated_at_recv_ack = 0;
+__be16 server_port;
+
+SEC("xdp")
+int drop_ack(struct xdp_md *xdp)
+{
+	void *data_end = (void *)(long)xdp->data_end;
+	void *data = (void *)(long)xdp->data;
+	struct ethhdr *eth = data;
+	struct tcphdr *tcp = NULL;
+
+	if (eth + 1 > data_end)
+		goto pass;
+
+	switch (bpf_ntohs(eth->h_proto)) {
+	case ETH_P_IP: {
+		struct iphdr *ip = (struct iphdr *)(eth + 1);
+
+		if (ip + 1 > data_end)
+			goto pass;
+
+		if (ip->protocol != IPPROTO_TCP)
+			goto pass;
+
+		tcp = (struct tcphdr *)((void *)ip + ip->ihl * 4);
+		break;
+	}
+	case ETH_P_IPV6: {
+		struct ipv6hdr *ipv6 = (struct ipv6hdr *)(eth + 1);
+
+		if (ipv6 + 1 > data_end)
+			goto pass;
+
+		if (ipv6->nexthdr != IPPROTO_TCP)
+			goto pass;
+
+		tcp = (struct tcphdr *)(ipv6 + 1);
+		break;
+	}
+	default:
+		goto pass;
+	}
+
+	if (tcp + 1 > data_end)
+		goto pass;
+
+	if (tcp->dest != server_port)
+		goto pass;
+
+	if (!tcp->syn && tcp->ack)
+		return XDP_DROP;
+
+pass:
+	return XDP_PASS;
+}
+
+SEC("sk_reuseport/migrate")
+int migrate_reuseport(struct sk_reuseport_md *reuse_md)
+{
+	int *key, flags = 0, state, err;
+	__u64 cookie;
+
+	if (!reuse_md->migrating_sk)
+		return SK_PASS;
+
+	state = reuse_md->migrating_sk->state;
+	cookie = bpf_get_socket_cookie(reuse_md->sk);
+
+	key = bpf_map_lookup_elem(&migrate_map, &cookie);
+	if (!key)
+		return SK_DROP;
+
+	err = bpf_sk_select_reuseport(reuse_md, &reuseport_map, key, flags);
+	if (err)
+		return SK_PASS;
+
+	switch (state) {
+	case BPF_TCP_ESTABLISHED:
+		__sync_fetch_and_add(&migrated_at_close, 1);
+		break;
+	case BPF_TCP_SYN_RECV:
+		__sync_fetch_and_add(&migrated_at_close_fastopen, 1);
+		break;
+	case BPF_TCP_NEW_SYN_RECV:
+		if (!reuse_md->len)
+			__sync_fetch_and_add(&migrated_at_send_synack, 1);
+		else
+			__sync_fetch_and_add(&migrated_at_recv_ack, 1);
+		break;
+	}
+
+	return SK_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3-70-g09d2


From 809ed84de8b3f2fd7b1d06efb94bf98fd318a7d7 Mon Sep 17 00:00:00 2001
From: Daniel Xu <dxu@dxuuu.xyz>
Date: Wed, 16 Jun 2021 14:52:11 -0700
Subject: selftests/bpf: Whitelist test_progs.h from .gitignore

Somehow test_progs.h was being included by the existing rule:

    /test_progs*

This is bad because:

    1) test_progs.h is a checked in file
    2) grep-like tools like ripgrep[0] respect gitignore and
       test_progs.h was being hidden from searches

[0]: https://github.com/BurntSushi/ripgrep

Fixes: 74b5a5968fe8 ("selftests/bpf: Replace test_progs and test_maps w/ general rule")
Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/a46f64944bf678bc652410ca6028d3450f4f7f4b.1623880296.git.dxu@dxuuu.xyz
---
 tools/testing/selftests/bpf/.gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index eae6fc7d3ed8..addcfd8b615e 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -10,6 +10,7 @@ FEATURE-DUMP.libbpf
 fixdep
 test_dev_cgroup
 /test_progs*
+!test_progs.h
 test_verifier_log
 feature
 test_sock
-- 
cgit v1.2.3-70-g09d2


From 712b78c697cd75b62b7707b5a20bcd914aedfdaa Mon Sep 17 00:00:00 2001
From: Shuyi Cheng <chengshuyi@linux.alibaba.com>
Date: Wed, 16 Jun 2021 10:04:36 +0800
Subject: bpf: Fix typo in kernel/bpf/bpf_lsm.c

Fix s/sleeable/sleepable/ typo in a comment.

Signed-off-by: Shuyi Cheng <chengshuyi@linux.alibaba.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/1623809076-97907-1-git-send-email-chengshuyi@linux.alibaba.com
---
 kernel/bpf/bpf_lsm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 5efb2b24012c..99ada85da8bb 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -125,7 +125,7 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 }
 
 /* The set of hooks which are called without pagefaults disabled and are allowed
- * to "sleep" and thus can be used for sleeable BPF programs.
+ * to "sleep" and thus can be used for sleepable BPF programs.
  */
 BTF_SET_START(sleepable_lsm_hooks)
 BTF_ID(func, bpf_lsm_bpf)
-- 
cgit v1.2.3-70-g09d2


From bf067f1c51239ba70be714c513820d2c4a5cd457 Mon Sep 17 00:00:00 2001
From: Wang Hai <wanghai38@huawei.com>
Date: Tue, 15 Jun 2021 21:55:54 +0800
Subject: samples/bpf: Add missing option to xdp_fwd usage

xdp_fwd usage() is missing the introduction of the "-S"
and "-F" options, this patch adds it.

Fixes: d50ecc46d18f ("samples/bpf: Attach XDP programs in driver mode by default")
Signed-off-by: Wang Hai <wanghai38@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Link: https://lore.kernel.org/bpf/20210615135554.29158-1-wanghai38@huawei.com
---
 samples/bpf/xdp_fwd_user.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/samples/bpf/xdp_fwd_user.c b/samples/bpf/xdp_fwd_user.c
index 74a4583d0d86..00061261a8da 100644
--- a/samples/bpf/xdp_fwd_user.c
+++ b/samples/bpf/xdp_fwd_user.c
@@ -67,6 +67,8 @@ static void usage(const char *prog)
 		"usage: %s [OPTS] interface-list\n"
 		"\nOPTS:\n"
 		"    -d    detach program\n"
+		"    -S    use skb-mode\n"
+		"    -F    force loading prog\n"
 		"    -D    direct table lookups (skip fib rules)\n",
 		prog);
 }
-- 
cgit v1.2.3-70-g09d2


From dfdda1a0f4aad476ae25f2840c9426da3b99506d Mon Sep 17 00:00:00 2001
From: Wang Hai <wanghai38@huawei.com>
Date: Tue, 15 Jun 2021 21:57:24 +0800
Subject: samples/bpf: Add missing option to xdp_sample_pkts usage

xdp_sample_pkts usage() is missing the introduction of the
"-S" option, this patch adds it.

Fixes: d50ecc46d18f ("samples/bpf: Attach XDP programs in driver mode by default")
Signed-off-by: Wang Hai <wanghai38@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Link: https://lore.kernel.org/bpf/20210615135724.29528-1-wanghai38@huawei.com
---
 samples/bpf/xdp_sample_pkts_user.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/samples/bpf/xdp_sample_pkts_user.c b/samples/bpf/xdp_sample_pkts_user.c
index 706475e004cb..495e09897bd3 100644
--- a/samples/bpf/xdp_sample_pkts_user.c
+++ b/samples/bpf/xdp_sample_pkts_user.c
@@ -103,7 +103,8 @@ static void usage(const char *prog)
 	fprintf(stderr,
 		"%s: %s [OPTS] <ifname|ifindex>\n\n"
 		"OPTS:\n"
-		"    -F    force loading prog\n",
+		"    -F    force loading prog\n"
+		"    -S    use skb-mode\n",
 		__func__, prog);
 }
 
-- 
cgit v1.2.3-70-g09d2


From 4a638d581a7a3f00f277349903ff419b6790e2ae Mon Sep 17 00:00:00 2001
From: Lorenz Bauer <lmb@cloudflare.com>
Date: Wed, 16 Jun 2021 09:36:35 +0100
Subject: libbpf: Fail compilation if target arch is missing

bpf2go is the Go equivalent of libbpf skeleton. The convention is that
the compiled BPF is checked into the repository to facilitate distributing
BPF as part of Go packages. To make this portable, bpf2go by default
generates both bpfel and bpfeb variants of the C.

Using bpf_tracing.h is inherently non-portable since the fields of
struct pt_regs differ between platforms, so CO-RE can't help us here.
The only way of working around this is to compile for each target
platform independently. bpf2go can't do this by default since there
are too many platforms.

Define the various PT_... macros when no target can be determined and
turn them into compilation failures. This works because bpf2go always
compiles for bpf targets, so the compiler fallback doesn't kick in.
Conditionally define __BPF_MISSING_TARGET so that we can inject a
more appropriate error message at build time. The user can then
choose which platform to target explicitly.

Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210616083635.11434-1-lmb@cloudflare.com
---
 tools/lib/bpf/bpf_tracing.h | 46 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 42 insertions(+), 4 deletions(-)

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index c0f3a26aa582..d6bfbe009296 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -25,26 +25,35 @@
 	#define bpf_target_sparc
 	#define bpf_target_defined
 #else
-	#undef bpf_target_defined
-#endif
 
 /* Fall back to what the compiler says */
-#ifndef bpf_target_defined
 #if defined(__x86_64__)
 	#define bpf_target_x86
+	#define bpf_target_defined
 #elif defined(__s390__)
 	#define bpf_target_s390
+	#define bpf_target_defined
 #elif defined(__arm__)
 	#define bpf_target_arm
+	#define bpf_target_defined
 #elif defined(__aarch64__)
 	#define bpf_target_arm64
+	#define bpf_target_defined
 #elif defined(__mips__)
 	#define bpf_target_mips
+	#define bpf_target_defined
 #elif defined(__powerpc__)
 	#define bpf_target_powerpc
+	#define bpf_target_defined
 #elif defined(__sparc__)
 	#define bpf_target_sparc
+	#define bpf_target_defined
+#endif /* no compiler target */
+
 #endif
+
+#ifndef __BPF_TARGET_MISSING
+#define __BPF_TARGET_MISSING "GCC error \"Must specify a BPF target arch via __TARGET_ARCH_xxx\""
 #endif
 
 #if defined(bpf_target_x86)
@@ -287,7 +296,7 @@ struct pt_regs;
 #elif defined(bpf_target_sparc)
 #define BPF_KPROBE_READ_RET_IP(ip, ctx)		({ (ip) = PT_REGS_RET(ctx); })
 #define BPF_KRETPROBE_READ_RET_IP		BPF_KPROBE_READ_RET_IP
-#else
+#elif defined(bpf_target_defined)
 #define BPF_KPROBE_READ_RET_IP(ip, ctx)					    \
 	({ bpf_probe_read_kernel(&(ip), sizeof(ip), (void *)PT_REGS_RET(ctx)); })
 #define BPF_KRETPROBE_READ_RET_IP(ip, ctx)				    \
@@ -295,6 +304,35 @@ struct pt_regs;
 			  (void *)(PT_REGS_FP(ctx) + sizeof(ip))); })
 #endif
 
+#if !defined(bpf_target_defined)
+
+#define PT_REGS_PARM1(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_PARM2(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_PARM3(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_PARM4(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_PARM5(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_RET(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_FP(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_RC(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_SP(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_IP(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+
+#define PT_REGS_PARM1_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_PARM2_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_PARM3_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_PARM4_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_PARM5_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_RET_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_FP_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_RC_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_SP_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_IP_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+
+#define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define BPF_KRETPROBE_READ_RET_IP(ip, ctx) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+
+#endif /* !defined(bpf_target_defined) */
+
 #ifndef ___bpf_concat
 #define ___bpf_concat(a, b) a ## b
 #endif
-- 
cgit v1.2.3-70-g09d2


From 28131e9d933339a92f78e7ab6429f4aaaa07061c Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 16 Jun 2021 11:25:11 +0200
Subject: bpf: Fix up register-based shifts in interpreter to silence KUBSAN

syzbot reported a shift-out-of-bounds that KUBSAN observed in the
interpreter:

  [...]
  UBSAN: shift-out-of-bounds in kernel/bpf/core.c:1420:2
  shift exponent 255 is too large for 64-bit type 'long long unsigned int'
  CPU: 1 PID: 11097 Comm: syz-executor.4 Not tainted 5.12.0-rc2-syzkaller #0
  Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
  Call Trace:
   __dump_stack lib/dump_stack.c:79 [inline]
   dump_stack+0x141/0x1d7 lib/dump_stack.c:120
   ubsan_epilogue+0xb/0x5a lib/ubsan.c:148
   __ubsan_handle_shift_out_of_bounds.cold+0xb1/0x181 lib/ubsan.c:327
   ___bpf_prog_run.cold+0x19/0x56c kernel/bpf/core.c:1420
   __bpf_prog_run32+0x8f/0xd0 kernel/bpf/core.c:1735
   bpf_dispatcher_nop_func include/linux/bpf.h:644 [inline]
   bpf_prog_run_pin_on_cpu include/linux/filter.h:624 [inline]
   bpf_prog_run_clear_cb include/linux/filter.h:755 [inline]
   run_filter+0x1a1/0x470 net/packet/af_packet.c:2031
   packet_rcv+0x313/0x13e0 net/packet/af_packet.c:2104
   dev_queue_xmit_nit+0x7c2/0xa90 net/core/dev.c:2387
   xmit_one net/core/dev.c:3588 [inline]
   dev_hard_start_xmit+0xad/0x920 net/core/dev.c:3609
   __dev_queue_xmit+0x2121/0x2e00 net/core/dev.c:4182
   __bpf_tx_skb net/core/filter.c:2116 [inline]
   __bpf_redirect_no_mac net/core/filter.c:2141 [inline]
   __bpf_redirect+0x548/0xc80 net/core/filter.c:2164
   ____bpf_clone_redirect net/core/filter.c:2448 [inline]
   bpf_clone_redirect+0x2ae/0x420 net/core/filter.c:2420
   ___bpf_prog_run+0x34e1/0x77d0 kernel/bpf/core.c:1523
   __bpf_prog_run512+0x99/0xe0 kernel/bpf/core.c:1737
   bpf_dispatcher_nop_func include/linux/bpf.h:644 [inline]
   bpf_test_run+0x3ed/0xc50 net/bpf/test_run.c:50
   bpf_prog_test_run_skb+0xabc/0x1c50 net/bpf/test_run.c:582
   bpf_prog_test_run kernel/bpf/syscall.c:3127 [inline]
   __do_sys_bpf+0x1ea9/0x4f00 kernel/bpf/syscall.c:4406
   do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
   entry_SYSCALL_64_after_hwframe+0x44/0xae
  [...]

Generally speaking, KUBSAN reports from the kernel should be fixed.
However, in case of BPF, this particular report caused concerns since
the large shift is not wrong from BPF point of view, just undefined.
In the verifier, K-based shifts that are >= {64,32} (depending on the
bitwidth of the instruction) are already rejected. The register-based
cases were not given their content might not be known at verification
time. Ideas such as verifier instruction rewrite with an additional
AND instruction for the source register were brought up, but regularly
rejected due to the additional runtime overhead they incur.

As Edward Cree rightly put it:

  Shifts by more than insn bitness are legal in the BPF ISA; they are
  implementation-defined behaviour [of the underlying architecture],
  rather than UB, and have been made legal for performance reasons.
  Each of the JIT backends compiles the BPF shift operations to machine
  instructions which produce implementation-defined results in such a
  case; the resulting contents of the register may be arbitrary but
  program behaviour as a whole remains defined.

  Guard checks in the fast path (i.e. affecting JITted code) will thus
  not be accepted.

  The case of division by zero is not truly analogous here, as division
  instructions on many of the JIT-targeted architectures will raise a
  machine exception / fault on division by zero, whereas (to the best
  of my knowledge) none will do so on an out-of-bounds shift.

Given the KUBSAN report only affects the BPF interpreter, but not JITs,
one solution is to add the ANDs with 63 or 31 into ___bpf_prog_run().
That would make the shifts defined, and thus shuts up KUBSAN, and the
compiler would optimize out the AND on any CPU that interprets the shift
amounts modulo the width anyway (e.g., confirmed from disassembly that
on x86-64 and arm64 the generated interpreter code is the same before
and after this fix).

The BPF interpreter is slow path, and most likely compiled out anyway
as distros select BPF_JIT_ALWAYS_ON to avoid speculative execution of
BPF instructions by the interpreter. Given the main argument was to
avoid sacrificing performance, the fact that the AND is optimized away
from compiler for mainstream archs helps as well as a solution moving
forward. Also add a comment on LSH/RSH/ARSH translation for JIT authors
to provide guidance when they see the ___bpf_prog_run() interpreter
code and use it as a model for a new JIT backend.

Reported-by: syzbot+bed360704c521841c85d@syzkaller.appspotmail.com
Reported-by: Kurt Manucredo <fuzzybritches0@gmail.com>
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
Co-developed-by: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: syzbot+bed360704c521841c85d@syzkaller.appspotmail.com
Cc: Edward Cree <ecree.xilinx@gmail.com>
Link: https://lore.kernel.org/bpf/0000000000008f912605bd30d5d7@google.com
Link: https://lore.kernel.org/bpf/bac16d8d-c174-bdc4-91bd-bfa62b410190@gmail.com
---
 kernel/bpf/core.c | 61 +++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 43 insertions(+), 18 deletions(-)

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 5e31ee9f7512..034ad93a1ad7 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1392,29 +1392,54 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
 select_insn:
 	goto *jumptable[insn->code];
 
-	/* ALU */
-#define ALU(OPCODE, OP)			\
-	ALU64_##OPCODE##_X:		\
-		DST = DST OP SRC;	\
-		CONT;			\
-	ALU_##OPCODE##_X:		\
-		DST = (u32) DST OP (u32) SRC;	\
-		CONT;			\
-	ALU64_##OPCODE##_K:		\
-		DST = DST OP IMM;		\
-		CONT;			\
-	ALU_##OPCODE##_K:		\
-		DST = (u32) DST OP (u32) IMM;	\
+	/* Explicitly mask the register-based shift amounts with 63 or 31
+	 * to avoid undefined behavior. Normally this won't affect the
+	 * generated code, for example, in case of native 64 bit archs such
+	 * as x86-64 or arm64, the compiler is optimizing the AND away for
+	 * the interpreter. In case of JITs, each of the JIT backends compiles
+	 * the BPF shift operations to machine instructions which produce
+	 * implementation-defined results in such a case; the resulting
+	 * contents of the register may be arbitrary, but program behaviour
+	 * as a whole remains defined. In other words, in case of JIT backends,
+	 * the AND must /not/ be added to the emitted LSH/RSH/ARSH translation.
+	 */
+	/* ALU (shifts) */
+#define SHT(OPCODE, OP)					\
+	ALU64_##OPCODE##_X:				\
+		DST = DST OP (SRC & 63);		\
+		CONT;					\
+	ALU_##OPCODE##_X:				\
+		DST = (u32) DST OP ((u32) SRC & 31);	\
+		CONT;					\
+	ALU64_##OPCODE##_K:				\
+		DST = DST OP IMM;			\
+		CONT;					\
+	ALU_##OPCODE##_K:				\
+		DST = (u32) DST OP (u32) IMM;		\
+		CONT;
+	/* ALU (rest) */
+#define ALU(OPCODE, OP)					\
+	ALU64_##OPCODE##_X:				\
+		DST = DST OP SRC;			\
+		CONT;					\
+	ALU_##OPCODE##_X:				\
+		DST = (u32) DST OP (u32) SRC;		\
+		CONT;					\
+	ALU64_##OPCODE##_K:				\
+		DST = DST OP IMM;			\
+		CONT;					\
+	ALU_##OPCODE##_K:				\
+		DST = (u32) DST OP (u32) IMM;		\
 		CONT;
-
 	ALU(ADD,  +)
 	ALU(SUB,  -)
 	ALU(AND,  &)
 	ALU(OR,   |)
-	ALU(LSH, <<)
-	ALU(RSH, >>)
 	ALU(XOR,  ^)
 	ALU(MUL,  *)
+	SHT(LSH, <<)
+	SHT(RSH, >>)
+#undef SHT
 #undef ALU
 	ALU_NEG:
 		DST = (u32) -DST;
@@ -1439,13 +1464,13 @@ select_insn:
 		insn++;
 		CONT;
 	ALU_ARSH_X:
-		DST = (u64) (u32) (((s32) DST) >> SRC);
+		DST = (u64) (u32) (((s32) DST) >> (SRC & 31));
 		CONT;
 	ALU_ARSH_K:
 		DST = (u64) (u32) (((s32) DST) >> IMM);
 		CONT;
 	ALU64_ARSH_X:
-		(*(s64 *) &DST) >>= SRC;
+		(*(s64 *) &DST) >>= (SRC & 63);
 		CONT;
 	ALU64_ARSH_K:
 		(*(s64 *) &DST) >>= IMM;
-- 
cgit v1.2.3-70-g09d2


From f20792d425d2efd2680f2855c1e3fec01c2e569e Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 16 Jun 2021 21:14:46 -0700
Subject: selftests/bpf: Fix selftests build with old system-wide headers

migrate_reuseport.c selftest relies on having TCP_FASTOPEN_CONNECT defined in
system-wide netinet/tcp.h. Selftests can use up-to-date uapi/linux/tcp.h, but
that one doesn't have SOL_TCP. So instead of switching everything to uapi
header, add #define for TCP_FASTOPEN_CONNECT to fix the build.

Fixes: c9d0bdef89a6 ("bpf: Test BPF_SK_REUSEPORT_SELECT_OR_MIGRATE.")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Link: https://lore.kernel.org/bpf/20210617041446.425283-1-andrii@kernel.org
---
 tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c b/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c
index 0fa3f750567d..59adb4715394 100644
--- a/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c
+++ b/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c
@@ -30,6 +30,10 @@
 #include "test_migrate_reuseport.skel.h"
 #include "network_helpers.h"
 
+#ifndef TCP_FASTOPEN_CONNECT
+#define TCP_FASTOPEN_CONNECT 30
+#endif
+
 #define IFINDEX_LO 1
 
 #define NR_SERVERS 5
-- 
cgit v1.2.3-70-g09d2