Diffstat (limited to 'tools/perf/bench')
-rw-r--r--	tools/perf/bench/Build        |   1
-rw-r--r--	tools/perf/bench/bench.h      |   2
-rw-r--r--	tools/perf/bench/breakpoint.c | 244
-rw-r--r--	tools/perf/bench/numa.c       | 140
4 files changed, 351 insertions, 36 deletions
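Judging from the usage strings and option tables in the new breakpoint.c below, the added workloads should be invocable as, for example (the option values here are illustrative):

    perf bench breakpoint thread -b 1 -p 2 -t 4    # breakpoints / parallelism / threads per batch
    perf bench breakpoint enable -p 2 -a 1         # passive / active waiter threads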
diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build
index 61d45fcb4057..6b6155a8ad09 100644
--- a/tools/perf/bench/Build
+++ b/tools/perf/bench/Build
@@ -14,6 +14,7 @@ perf-y += kallsyms-parse.o
 perf-y += find-bit-bench.o
 perf-y += inject-buildid.o
 perf-y += evlist-open-close.o
+perf-y += breakpoint.o
 
 perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-asm.o
 perf-$(CONFIG_X86_64) += mem-memset-x86-64-asm.o
diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h
index b3480bc33fe8..6cefb4315d75 100644
--- a/tools/perf/bench/bench.h
+++ b/tools/perf/bench/bench.h
@@ -49,6 +49,8 @@ int bench_synthesize(int argc, const char **argv);
 int bench_kallsyms_parse(int argc, const char **argv);
 int bench_inject_build_id(int argc, const char **argv);
 int bench_evlist_open_close(int argc, const char **argv);
+int bench_breakpoint_thread(int argc, const char **argv);
+int bench_breakpoint_enable(int argc, const char **argv);
 
 #define BENCH_FORMAT_DEFAULT_STR "default"
 #define BENCH_FORMAT_DEFAULT 0
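The core primitive of the new benchmark file below is breakpoint_setup(), which opens an inheritable hardware breakpoint on one watched byte via perf_event_open(2). A minimal standalone sketch of that pattern — not part of the patch, and assuming hardware breakpoint support plus sufficient privileges (perf_event_paranoid) — looks like this:

    /* Sketch only: opens a HW breakpoint on a local byte, then touches it. */
    #include <linux/hw_breakpoint.h>
    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <string.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        struct perf_event_attr attr;
        char watched;                          /* byte the breakpoint watches */
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.type = PERF_TYPE_BREAKPOINT;      /* hardware breakpoint event */
        attr.size = sizeof(attr);
        attr.inherit = 1;                      /* new child threads inherit it */
        attr.exclude_kernel = 1;
        attr.exclude_hv = 1;
        attr.bp_addr = (unsigned long)&watched;
        attr.bp_type = HW_BREAKPOINT_RW;       /* trigger on read or write */
        attr.bp_len = HW_BREAKPOINT_LEN_1;     /* watch one byte */

        /* pid = 0 (this task), cpu = -1 (any CPU), no group, no flags */
        fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
        if (fd == -1) {
            perror("perf_event_open");
            return 1;
        }
        watched = 1;                           /* this store counts one event */
        close(fd);
        return 0;
    }

The benchmark cares about the cost an armed, inherited breakpoint imposes on thread creation and event toggling, not about the counter value, so the sketch never reads the fd.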
diff --git a/tools/perf/bench/breakpoint.c b/tools/perf/bench/breakpoint.c
new file mode 100644
index 000000000000..41385f89ffc7
--- /dev/null
+++ b/tools/perf/bench/breakpoint.c
@@ -0,0 +1,244 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <subcmd/parse-options.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/perf_event.h>
+#include <linux/time64.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <pthread.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <errno.h>
+#include "bench.h"
+#include "futex.h"
+
+struct {
+	unsigned int nbreakpoints;
+	unsigned int nparallel;
+	unsigned int nthreads;
+} thread_params = {
+	.nbreakpoints = 1,
+	.nparallel = 1,
+	.nthreads = 1,
+};
+
+static const struct option thread_options[] = {
+	OPT_UINTEGER('b', "breakpoints", &thread_params.nbreakpoints,
+		"Specify amount of breakpoints"),
+	OPT_UINTEGER('p', "parallelism", &thread_params.nparallel, "Specify amount of parallelism"),
+	OPT_UINTEGER('t', "threads", &thread_params.nthreads, "Specify amount of threads"),
+	OPT_END()
+};
+
+static const char * const thread_usage[] = {
+	"perf bench breakpoint thread <options>",
+	NULL
+};
+
+struct breakpoint {
+	int fd;
+	char watched;
+};
+
+static int breakpoint_setup(void *addr)
+{
+	struct perf_event_attr attr = { .size = 0, };
+
+	attr.type = PERF_TYPE_BREAKPOINT;
+	attr.size = sizeof(attr);
+	attr.inherit = 1;
+	attr.exclude_kernel = 1;
+	attr.exclude_hv = 1;
+	attr.bp_addr = (unsigned long)addr;
+	attr.bp_type = HW_BREAKPOINT_RW;
+	attr.bp_len = HW_BREAKPOINT_LEN_1;
+	return syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
+}
+
+static void *passive_thread(void *arg)
+{
+	unsigned int *done = (unsigned int *)arg;
+
+	while (!__atomic_load_n(done, __ATOMIC_RELAXED))
+		futex_wait(done, 0, NULL, 0);
+	return NULL;
+}
+
+static void *active_thread(void *arg)
+{
+	unsigned int *done = (unsigned int *)arg;
+
+	while (!__atomic_load_n(done, __ATOMIC_RELAXED));
+	return NULL;
+}
+
+static void *breakpoint_thread(void *arg)
+{
+	unsigned int i, done;
+	int *repeat = (int *)arg;
+	pthread_t *threads;
+
+	threads = calloc(thread_params.nthreads, sizeof(threads[0]));
+	if (!threads)
+		exit((perror("calloc"), EXIT_FAILURE));
+
+	while (__atomic_fetch_sub(repeat, 1, __ATOMIC_RELAXED) > 0) {
+		done = 0;
+		for (i = 0; i < thread_params.nthreads; i++) {
+			if (pthread_create(&threads[i], NULL, passive_thread, &done))
+				exit((perror("pthread_create"), EXIT_FAILURE));
+		}
+		__atomic_store_n(&done, 1, __ATOMIC_RELAXED);
+		futex_wake(&done, thread_params.nthreads, 0);
+		for (i = 0; i < thread_params.nthreads; i++)
+			pthread_join(threads[i], NULL);
+	}
+	free(threads);
+	return NULL;
+}
+
+// The benchmark creates nbreakpoints inheritable breakpoints,
+// then starts nparallel threads which create and join bench_repeat batches of nthreads threads.
+int bench_breakpoint_thread(int argc, const char **argv)
+{
+	unsigned int i, result_usec;
+	int repeat = bench_repeat;
+	struct breakpoint *breakpoints;
+	pthread_t *parallel;
+	struct timeval start, stop, diff;
+
+	if (parse_options(argc, argv, thread_options, thread_usage, 0)) {
+		usage_with_options(thread_usage, thread_options);
+		exit(EXIT_FAILURE);
+	}
+	breakpoints = calloc(thread_params.nbreakpoints, sizeof(breakpoints[0]));
+	parallel = calloc(thread_params.nparallel, sizeof(parallel[0]));
+	if (!breakpoints || !parallel)
+		exit((perror("calloc"), EXIT_FAILURE));
+
+	for (i = 0; i < thread_params.nbreakpoints; i++) {
+		breakpoints[i].fd = breakpoint_setup(&breakpoints[i].watched);
+		if (breakpoints[i].fd == -1)
+			exit((perror("perf_event_open"), EXIT_FAILURE));
+	}
+	gettimeofday(&start, NULL);
+	for (i = 0; i < thread_params.nparallel; i++) {
+		if (pthread_create(&parallel[i], NULL, breakpoint_thread, &repeat))
+			exit((perror("pthread_create"), EXIT_FAILURE));
+	}
+	for (i = 0; i < thread_params.nparallel; i++)
+		pthread_join(parallel[i], NULL);
+	gettimeofday(&stop, NULL);
+	timersub(&stop, &start, &diff);
+	for (i = 0; i < thread_params.nbreakpoints; i++)
+		close(breakpoints[i].fd);
+	free(parallel);
+	free(breakpoints);
+	switch (bench_format) {
+	case BENCH_FORMAT_DEFAULT:
+		printf("# Created/joined %d threads with %d breakpoints and %d parallelism\n",
+			bench_repeat, thread_params.nbreakpoints, thread_params.nparallel);
+		printf(" %14s: %lu.%03lu [sec]\n\n", "Total time",
+			(long)diff.tv_sec, (long)(diff.tv_usec / USEC_PER_MSEC));
+		result_usec = diff.tv_sec * USEC_PER_SEC + diff.tv_usec;
+		printf(" %14lf usecs/op\n",
+			(double)result_usec / bench_repeat / thread_params.nthreads);
+		printf(" %14lf usecs/op/cpu\n",
+			(double)result_usec / bench_repeat /
+			thread_params.nthreads * thread_params.nparallel);
+		break;
+	case BENCH_FORMAT_SIMPLE:
+		printf("%lu.%03lu\n", (long)diff.tv_sec, (long)(diff.tv_usec / USEC_PER_MSEC));
+		break;
+	default:
+		fprintf(stderr, "Unknown format: %d\n", bench_format);
+		exit(EXIT_FAILURE);
+	}
+	return 0;
+}
+
+struct {
+	unsigned int npassive;
+	unsigned int nactive;
+} enable_params = {
+	.nactive = 0,
+	.npassive = 0,
+};
+
+static const struct option enable_options[] = {
+	OPT_UINTEGER('p', "passive", &enable_params.npassive, "Specify amount of passive threads"),
+	OPT_UINTEGER('a', "active", &enable_params.nactive, "Specify amount of active threads"),
+	OPT_END()
+};
+
+static const char * const enable_usage[] = {
+	"perf bench breakpoint enable <options>",
+	NULL
+};
+
+// The benchmark creates an inheritable breakpoint,
+// then starts npassive threads that block and nactive threads that actively spin
+// and then disables and enables the breakpoint bench_repeat times.
+int bench_breakpoint_enable(int argc, const char **argv)
+{
+	unsigned int i, nthreads, result_usec, done = 0;
+	char watched;
+	int fd;
+	pthread_t *threads;
+	struct timeval start, stop, diff;
+
+	if (parse_options(argc, argv, enable_options, enable_usage, 0)) {
+		usage_with_options(enable_usage, enable_options);
+		exit(EXIT_FAILURE);
+	}
+	fd = breakpoint_setup(&watched);
+	if (fd == -1)
+		exit((perror("perf_event_open"), EXIT_FAILURE));
+	nthreads = enable_params.npassive + enable_params.nactive;
+	threads = calloc(nthreads, sizeof(threads[0]));
+	if (!threads)
+		exit((perror("calloc"), EXIT_FAILURE));
+
+	for (i = 0; i < nthreads; i++) {
+		if (pthread_create(&threads[i], NULL,
+				i < enable_params.npassive ? passive_thread : active_thread, &done))
+			exit((perror("pthread_create"), EXIT_FAILURE));
+	}
+	usleep(10000); // let the threads block
+	gettimeofday(&start, NULL);
+	for (i = 0; i < bench_repeat; i++) {
+		if (ioctl(fd, PERF_EVENT_IOC_DISABLE, 0))
+			exit((perror("ioctl(PERF_EVENT_IOC_DISABLE)"), EXIT_FAILURE));
+		if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0))
+			exit((perror("ioctl(PERF_EVENT_IOC_ENABLE)"), EXIT_FAILURE));
+	}
+	gettimeofday(&stop, NULL);
+	timersub(&stop, &start, &diff);
+	__atomic_store_n(&done, 1, __ATOMIC_RELAXED);
+	futex_wake(&done, enable_params.npassive, 0);
+	for (i = 0; i < nthreads; i++)
+		pthread_join(threads[i], NULL);
+	free(threads);
+	close(fd);
+	switch (bench_format) {
+	case BENCH_FORMAT_DEFAULT:
+		printf("# Enabled/disabled breakpoint %d time with %d passive and %d active threads\n",
+			bench_repeat, enable_params.npassive, enable_params.nactive);
+		printf(" %14s: %lu.%03lu [sec]\n\n", "Total time",
+			(long)diff.tv_sec, (long)(diff.tv_usec / USEC_PER_MSEC));
+		result_usec = diff.tv_sec * USEC_PER_SEC + diff.tv_usec;
+		printf(" %14lf usecs/op\n", (double)result_usec / bench_repeat);
+		break;
+	case BENCH_FORMAT_SIMPLE:
+		printf("%lu.%03lu\n", (long)diff.tv_sec, (long)(diff.tv_usec / USEC_PER_MSEC));
+		break;
+	default:
+		fprintf(stderr, "Unknown format: %d\n", bench_format);
+		exit(EXIT_FAILURE);
+	}
+	return 0;
+}
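The measured operation in the enable benchmark above is one PERF_EVENT_IOC_DISABLE/PERF_EVENT_IOC_ENABLE round trip per iteration, which gets more expensive as more threads share the inherited event. A sketch of that timing loop in isolation — not from the patch; fd is assumed to come from a perf_event_open() call like the one sketched earlier, and repeats stands in for perf's bench_repeat:

    #include <linux/perf_event.h>
    #include <sys/ioctl.h>
    #include <sys/time.h>

    /* Returns average microseconds per disable+enable pair, or -1 on error. */
    static double toggle_usecs(int fd, int repeats)
    {
        struct timeval start, stop;
        int i;

        gettimeofday(&start, NULL);
        for (i = 0; i < repeats; i++) {
            /* each iteration is one full disable+enable round trip */
            if (ioctl(fd, PERF_EVENT_IOC_DISABLE, 0) ||
                ioctl(fd, PERF_EVENT_IOC_ENABLE, 0))
                return -1.0;
        }
        gettimeofday(&stop, NULL);
        return ((stop.tv_sec - start.tv_sec) * 1e6 +
                (stop.tv_usec - start.tv_usec)) / repeats;
    }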
diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c
index f2640179ada9..20eed1e53f80 100644
--- a/tools/perf/bench/numa.c
+++ b/tools/perf/bench/numa.c
@@ -34,6 +34,7 @@
 #include <linux/numa.h>
 #include <linux/zalloc.h>
 
+#include "../util/header.h"
 #include <numa.h>
 #include <numaif.h>
 
@@ -54,7 +55,7 @@ struct thread_data {
 	int			curr_cpu;
-	cpu_set_t		bind_cpumask;
+	cpu_set_t		*bind_cpumask;
 	int			bind_node;
 	u8			*process_data;
 	int			process_nr;
@@ -266,71 +267,117 @@ static bool node_has_cpus(int node)
 	return ret;
 }
 
-static cpu_set_t bind_to_cpu(int target_cpu)
+static cpu_set_t *bind_to_cpu(int target_cpu)
 {
-	cpu_set_t orig_mask, mask;
-	int ret;
+	int nrcpus = numa_num_possible_cpus();
+	cpu_set_t *orig_mask, *mask;
+	size_t size;
 
-	ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask);
-	BUG_ON(ret);
+	orig_mask = CPU_ALLOC(nrcpus);
+	BUG_ON(!orig_mask);
+	size = CPU_ALLOC_SIZE(nrcpus);
+	CPU_ZERO_S(size, orig_mask);
+
+	if (sched_getaffinity(0, size, orig_mask))
+		goto err_out;
 
-	CPU_ZERO(&mask);
+	mask = CPU_ALLOC(nrcpus);
+	if (!mask)
+		goto err_out;
+
+	CPU_ZERO_S(size, mask);
 
 	if (target_cpu == -1) {
 		int cpu;
 
 		for (cpu = 0; cpu < g->p.nr_cpus; cpu++)
-			CPU_SET(cpu, &mask);
+			CPU_SET_S(cpu, size, mask);
 	} else {
-		BUG_ON(target_cpu < 0 || target_cpu >= g->p.nr_cpus);
-		CPU_SET(target_cpu, &mask);
+		if (target_cpu < 0 || target_cpu >= g->p.nr_cpus)
+			goto err;
+
+		CPU_SET_S(target_cpu, size, mask);
 	}
 
-	ret = sched_setaffinity(0, sizeof(mask), &mask);
-	BUG_ON(ret);
+	if (sched_setaffinity(0, size, mask))
+		goto err;
 
 	return orig_mask;
+
+err:
+	CPU_FREE(mask);
+err_out:
+	CPU_FREE(orig_mask);
+
+	/* BUG_ON due to failure in allocation of orig_mask/mask */
+	BUG_ON(-1);
+	return NULL;
 }
 
-static cpu_set_t bind_to_node(int target_node)
+static cpu_set_t *bind_to_node(int target_node)
 {
-	cpu_set_t orig_mask, mask;
+	int nrcpus = numa_num_possible_cpus();
+	size_t size;
+	cpu_set_t *orig_mask, *mask;
 	int cpu;
-	int ret;
 
-	ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask);
-	BUG_ON(ret);
+	orig_mask = CPU_ALLOC(nrcpus);
+	BUG_ON(!orig_mask);
+	size = CPU_ALLOC_SIZE(nrcpus);
+	CPU_ZERO_S(size, orig_mask);
 
-	CPU_ZERO(&mask);
+	if (sched_getaffinity(0, size, orig_mask))
+		goto err_out;
+
+	mask = CPU_ALLOC(nrcpus);
+	if (!mask)
+		goto err_out;
+
+	CPU_ZERO_S(size, mask);
 
 	if (target_node == NUMA_NO_NODE) {
 		for (cpu = 0; cpu < g->p.nr_cpus; cpu++)
-			CPU_SET(cpu, &mask);
+			CPU_SET_S(cpu, size, mask);
 	} else {
 		struct bitmask *cpumask = numa_allocate_cpumask();
 
-		BUG_ON(!cpumask);
+		if (!cpumask)
+			goto err;
+
 		if (!numa_node_to_cpus(target_node, cpumask)) {
 			for (cpu = 0; cpu < (int)cpumask->size; cpu++) {
 				if (numa_bitmask_isbitset(cpumask, cpu))
-					CPU_SET(cpu, &mask);
+					CPU_SET_S(cpu, size, mask);
 			}
 		}
 		numa_free_cpumask(cpumask);
 	}
 
-	ret = sched_setaffinity(0, sizeof(mask), &mask);
-	BUG_ON(ret);
+	if (sched_setaffinity(0, size, mask))
+		goto err;
 
 	return orig_mask;
+
+err:
+	CPU_FREE(mask);
+err_out:
+	CPU_FREE(orig_mask);
+
+	/* BUG_ON due to failure in allocation of orig_mask/mask */
+	BUG_ON(-1);
+	return NULL;
 }
 
-static void bind_to_cpumask(cpu_set_t mask)
+static void bind_to_cpumask(cpu_set_t *mask)
 {
 	int ret;
+	size_t size = CPU_ALLOC_SIZE(numa_num_possible_cpus());
 
-	ret = sched_setaffinity(0, sizeof(mask), &mask);
-	BUG_ON(ret);
+	ret = sched_setaffinity(0, size, mask);
+	if (ret) {
+		CPU_FREE(mask);
+		BUG_ON(ret);
+	}
 }
 
 static void mempol_restore(void)
@@ -376,7 +423,7 @@ do {							\
 static u8 *alloc_data(ssize_t bytes0, int map_flags,
 		      int init_zero, int init_cpu0, int thp, int init_random)
 {
-	cpu_set_t orig_mask;
+	cpu_set_t *orig_mask = NULL;
 	ssize_t bytes;
 	u8 *buf;
 	int ret;
@@ -434,6 +481,7 @@ static u8 *alloc_data(ssize_t bytes0, int map_flags,
 	/* Restore affinity: */
 	if (init_cpu0) {
 		bind_to_cpumask(orig_mask);
+		CPU_FREE(orig_mask);
 		mempol_restore();
 	}
 
@@ -585,10 +633,16 @@ static int parse_setup_cpu_list(void)
 			return -1;
 		}
 
+		if (is_cpu_online(bind_cpu_0) != 1 || is_cpu_online(bind_cpu_1) != 1) {
+			printf("\nTest not applicable, bind_cpu_0 or bind_cpu_1 is offline\n");
+			return -1;
+		}
+
 		BUG_ON(bind_cpu_0 < 0 || bind_cpu_1 < 0);
 		BUG_ON(bind_cpu_0 > bind_cpu_1);
 
 		for (bind_cpu = bind_cpu_0; bind_cpu <= bind_cpu_1; bind_cpu += step) {
+			size_t size = CPU_ALLOC_SIZE(g->p.nr_cpus);
 			int i;
 
 			for (i = 0; i < mul; i++) {
@@ -608,10 +662,15 @@ static int parse_setup_cpu_list(void)
 					tprintf("%2d", bind_cpu);
 				}
 
-				CPU_ZERO(&td->bind_cpumask);
+				td->bind_cpumask = CPU_ALLOC(g->p.nr_cpus);
+				BUG_ON(!td->bind_cpumask);
+				CPU_ZERO_S(size, td->bind_cpumask);
 				for (cpu = bind_cpu; cpu < bind_cpu+bind_len; cpu++) {
-					BUG_ON(cpu < 0 || cpu >= g->p.nr_cpus);
-					CPU_SET(cpu, &td->bind_cpumask);
+					if (cpu < 0 || cpu >= g->p.nr_cpus) {
+						CPU_FREE(td->bind_cpumask);
+						BUG_ON(-1);
+					}
+					CPU_SET_S(cpu, size, td->bind_cpumask);
 				}
 				t++;
 			}
@@ -752,8 +811,6 @@ static int parse_nodes_opt(const struct option *opt __maybe_unused,
 	return parse_node_list(arg);
 }
 
-#define BIT(x) (1ul << x)
-
 static inline uint32_t lfsr_32(uint32_t lfsr)
 {
 	const uint32_t taps = BIT(1) | BIT(5) | BIT(6) | BIT(31);
@@ -1241,7 +1298,7 @@ static void *worker_thread(void *__tdata)
 		 * by migrating to CPU#0:
 		 */
 		if (first_task && g->p.perturb_secs && (int)(stop.tv_sec - last_perturbance) >= g->p.perturb_secs) {
-			cpu_set_t orig_mask;
+			cpu_set_t *orig_mask;
 			int target_cpu;
 			int this_cpu;
 
@@ -1265,6 +1322,7 @@ static void *worker_thread(void *__tdata)
 				printf(" (injecting perturbalance, moved to CPU#%d)\n", target_cpu);
 
 			bind_to_cpumask(orig_mask);
+			CPU_FREE(orig_mask);
 		}
 
 		if (details >= 3) {
@@ -1398,21 +1456,31 @@ static void init_thread_data(void)
 
 	for (t = 0; t < g->p.nr_tasks; t++) {
 		struct thread_data *td = g->threads + t;
+		size_t cpuset_size = CPU_ALLOC_SIZE(g->p.nr_cpus);
 		int cpu;
 
 		/* Allow all nodes by default: */
 		td->bind_node = NUMA_NO_NODE;
 
 		/* Allow all CPUs by default: */
-		CPU_ZERO(&td->bind_cpumask);
+		td->bind_cpumask = CPU_ALLOC(g->p.nr_cpus);
+		BUG_ON(!td->bind_cpumask);
+		CPU_ZERO_S(cpuset_size, td->bind_cpumask);
 		for (cpu = 0; cpu < g->p.nr_cpus; cpu++)
-			CPU_SET(cpu, &td->bind_cpumask);
+			CPU_SET_S(cpu, cpuset_size, td->bind_cpumask);
 	}
 }
 
 static void deinit_thread_data(void)
 {
 	ssize_t size = sizeof(*g->threads)*g->p.nr_tasks;
+	int t;
+
+	/* Free the bind_cpumask allocated for thread_data */
+	for (t = 0; t < g->p.nr_tasks; t++) {
+		struct thread_data *td = g->threads + t;
+		CPU_FREE(td->bind_cpumask);
+	}
 
 	free_data(g->threads, size);
 }
@@ -1672,7 +1740,7 @@ static int __bench_numa(const char *name)
 				   "GB/sec,", "total-speed",	"GB/sec total speed");
 
 		if (g->p.show_details >= 2) {
-			char tname[14 + 2 * 10 + 1];
+			char tname[14 + 2 * 11 + 1];
 			struct thread_data *td;
 
 			for (p = 0; p < g->p.nr_proc; p++) {
 				for (t = 0; t < g->p.nr_threads; t++) {
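The numa.c side of the patch replaces the fixed-size cpu_set_t, which only covers 1024 CPUs, with dynamically sized sets: every mask is obtained from CPU_ALLOC(), every CPU_* operation switches to its _S variant, and the affinity syscalls take CPU_ALLOC_SIZE() instead of sizeof(cpu_set_t). A self-contained sketch of that pattern — assuming glibc's CPU_ALLOC family; the nrcpus value here is illustrative, where numa.c uses numa_num_possible_cpus():

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
        int nrcpus = 2048;                    /* stand-in for numa_num_possible_cpus() */
        size_t size = CPU_ALLOC_SIZE(nrcpus); /* bytes actually allocated for the set */
        cpu_set_t *mask = CPU_ALLOC(nrcpus);

        if (!mask) {
            perror("CPU_ALLOC");
            return 1;
        }
        CPU_ZERO_S(size, mask);
        CPU_SET_S(0, size, mask);             /* allow CPU 0 only */

        /* the _S calls and syscalls take the allocated size, not sizeof(cpu_set_t) */
        if (sched_setaffinity(0, size, mask)) {
            perror("sched_setaffinity");
            CPU_FREE(mask);
            return 1;
        }
        CPU_FREE(mask);
        return 0;
    }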