diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-12-14 17:13:53 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-12-14 17:13:53 -0800 |
commit | 1ac0884d5474fea8dc6ceabbd0e870d1bf4b7b42 (patch) | |
tree | 5158d582c0f5040a9c8a28fe9051881c5ec7b5eb /tools | |
parent | ff6135959a9150ad45cb92ca38da270903a74343 (diff) | |
parent | c6156e1da633f241e132eaea3b676d674376d770 (diff) |
Merge tag 'core-entry-2020-12-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull core entry/exit updates from Thomas Gleixner:
"A set of updates for entry/exit handling:
- More generalization of entry/exit functionality
- The consolidation work to reclaim TIF flags on x86 and also for
non-x86 specific TIF flags which are solely relevant for syscall
related work and have been moved into their own storage space. The
x86 specific part had to be merged in to avoid a major conflict.
- The TIF_NOTIFY_SIGNAL work which replaces the inefficient signal
delivery mode of task work and results in an impressive performance
improvement for io_uring. The non-x86 consolidation of this is
going to come seperate via Jens.
- The selective syscall redirection facility which provides a clean
and efficient way to support the non-Linux syscalls of WINE by
catching them at syscall entry and redirecting them to the user
space emulation. This can be utilized for other purposes as well
and has been designed carefully to avoid overhead for the regular
fastpath. This includes the core changes and the x86 support code.
- Simplification of the context tracking entry/exit handling for the
users of the generic entry code which guarantee the proper ordering
and protection.
- Preparatory changes to make the generic entry code accomodate S390
specific requirements which are mostly related to their syscall
restart mechanism"
* tag 'core-entry-2020-12-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (36 commits)
entry: Add syscall_exit_to_user_mode_work()
entry: Add exit_to_user_mode() wrapper
entry_Add_enter_from_user_mode_wrapper
entry: Rename exit_to_user_mode()
entry: Rename enter_from_user_mode()
docs: Document Syscall User Dispatch
selftests: Add benchmark for syscall user dispatch
selftests: Add kselftest for syscall user dispatch
entry: Support Syscall User Dispatch on common syscall entry
kernel: Implement selective syscall userspace redirection
signal: Expose SYS_USER_DISPATCH si_code type
x86: vdso: Expose sigreturn address on vdso to the kernel
MAINTAINERS: Add entry for common entry code
entry: Fix boot for !CONFIG_GENERIC_ENTRY
x86: Support HAVE_CONTEXT_TRACKING_OFFSTACK
context_tracking: Only define schedule_user() on !HAVE_CONTEXT_TRACKING_OFFSTACK archs
sched: Detect call to schedule from critical entry code
context_tracking: Don't implement exception_enter/exit() on CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK
context_tracking: Introduce HAVE_CONTEXT_TRACKING_OFFSTACK
x86: Reclaim unused x86 TI flags
...
Diffstat (limited to 'tools')
6 files changed, 524 insertions, 0 deletions
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 2e20e30a6faa..e93f10386e76 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -57,6 +57,7 @@ TARGETS += sparc64 TARGETS += splice TARGETS += static_keys TARGETS += sync +TARGETS += syscall_user_dispatch TARGETS += sysctl TARGETS += tc-testing TARGETS += timens diff --git a/tools/testing/selftests/syscall_user_dispatch/.gitignore b/tools/testing/selftests/syscall_user_dispatch/.gitignore new file mode 100644 index 000000000000..f539615ad5da --- /dev/null +++ b/tools/testing/selftests/syscall_user_dispatch/.gitignore @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only +sud_test +sud_benchmark diff --git a/tools/testing/selftests/syscall_user_dispatch/Makefile b/tools/testing/selftests/syscall_user_dispatch/Makefile new file mode 100644 index 000000000000..03c120270953 --- /dev/null +++ b/tools/testing/selftests/syscall_user_dispatch/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 +top_srcdir = ../../../.. +INSTALL_HDR_PATH = $(top_srcdir)/usr +LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/ + +CFLAGS += -Wall -I$(LINUX_HDR_PATH) + +TEST_GEN_PROGS := sud_test sud_benchmark +include ../lib.mk diff --git a/tools/testing/selftests/syscall_user_dispatch/config b/tools/testing/selftests/syscall_user_dispatch/config new file mode 100644 index 000000000000..039e303e59d7 --- /dev/null +++ b/tools/testing/selftests/syscall_user_dispatch/config @@ -0,0 +1 @@ +CONFIG_GENERIC_ENTRY=y diff --git a/tools/testing/selftests/syscall_user_dispatch/sud_benchmark.c b/tools/testing/selftests/syscall_user_dispatch/sud_benchmark.c new file mode 100644 index 000000000000..6689f1183dbf --- /dev/null +++ b/tools/testing/selftests/syscall_user_dispatch/sud_benchmark.c @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2020 Collabora Ltd. + * + * Benchmark and test syscall user dispatch + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <signal.h> +#include <errno.h> +#include <time.h> +#include <sys/time.h> +#include <unistd.h> +#include <sys/sysinfo.h> +#include <sys/prctl.h> +#include <sys/syscall.h> + +#ifndef PR_SET_SYSCALL_USER_DISPATCH +# define PR_SET_SYSCALL_USER_DISPATCH 59 +# define PR_SYS_DISPATCH_OFF 0 +# define PR_SYS_DISPATCH_ON 1 +#endif + +#ifdef __NR_syscalls +# define MAGIC_SYSCALL_1 (__NR_syscalls + 1) /* Bad Linux syscall number */ +#else +# define MAGIC_SYSCALL_1 (0xff00) /* Bad Linux syscall number */ +#endif + +/* + * To test returning from a sigsys with selector blocked, the test + * requires some per-architecture support (i.e. knowledge about the + * signal trampoline address). On i386, we know it is on the vdso, and + * a small trampoline is open-coded for x86_64. Other architectures + * that have a trampoline in the vdso will support TEST_BLOCKED_RETURN + * out of the box, but don't enable them until they support syscall user + * dispatch. + */ +#if defined(__x86_64__) || defined(__i386__) +#define TEST_BLOCKED_RETURN +#endif + +#ifdef __x86_64__ +void* (syscall_dispatcher_start)(void); +void* (syscall_dispatcher_end)(void); +#else +unsigned long syscall_dispatcher_start = 0; +unsigned long syscall_dispatcher_end = 0; +#endif + +unsigned long trapped_call_count = 0; +unsigned long native_call_count = 0; + +char selector; +#define SYSCALL_BLOCK (selector = PR_SYS_DISPATCH_ON) +#define SYSCALL_UNBLOCK (selector = PR_SYS_DISPATCH_OFF) + +#define CALIBRATION_STEP 100000 +#define CALIBRATE_TO_SECS 5 +int factor; + +static double one_sysinfo_step(void) +{ + struct timespec t1, t2; + int i; + struct sysinfo info; + + clock_gettime(CLOCK_MONOTONIC, &t1); + for (i = 0; i < CALIBRATION_STEP; i++) + sysinfo(&info); + clock_gettime(CLOCK_MONOTONIC, &t2); + return (t2.tv_sec - t1.tv_sec) + 1.0e-9 * (t2.tv_nsec - t1.tv_nsec); +} + +static void calibrate_set(void) +{ + double elapsed = 0; + + printf("Calibrating test set to last ~%d seconds...\n", CALIBRATE_TO_SECS); + + while (elapsed < 1) { + elapsed += one_sysinfo_step(); + factor += CALIBRATE_TO_SECS; + } + + printf("test iterations = %d\n", CALIBRATION_STEP * factor); +} + +static double perf_syscall(void) +{ + unsigned int i; + double partial = 0; + + for (i = 0; i < factor; ++i) + partial += one_sysinfo_step()/(CALIBRATION_STEP*factor); + return partial; +} + +static void handle_sigsys(int sig, siginfo_t *info, void *ucontext) +{ + char buf[1024]; + int len; + + SYSCALL_UNBLOCK; + + /* printf and friends are not signal-safe. */ + len = snprintf(buf, 1024, "Caught sys_%x\n", info->si_syscall); + write(1, buf, len); + + if (info->si_syscall == MAGIC_SYSCALL_1) + trapped_call_count++; + else + native_call_count++; + +#ifdef TEST_BLOCKED_RETURN + SYSCALL_BLOCK; +#endif + +#ifdef __x86_64__ + __asm__ volatile("movq $0xf, %rax"); + __asm__ volatile("leaveq"); + __asm__ volatile("add $0x8, %rsp"); + __asm__ volatile("syscall_dispatcher_start:"); + __asm__ volatile("syscall"); + __asm__ volatile("nop"); /* Landing pad within dispatcher area */ + __asm__ volatile("syscall_dispatcher_end:"); +#endif + +} + +int main(void) +{ + struct sigaction act; + double time1, time2; + int ret; + sigset_t mask; + + memset(&act, 0, sizeof(act)); + sigemptyset(&mask); + + act.sa_sigaction = handle_sigsys; + act.sa_flags = SA_SIGINFO; + act.sa_mask = mask; + + calibrate_set(); + + time1 = perf_syscall(); + printf("Avg syscall time %.0lfns.\n", time1 * 1.0e9); + + ret = sigaction(SIGSYS, &act, NULL); + if (ret) { + perror("Error sigaction:"); + exit(-1); + } + + fprintf(stderr, "Enabling syscall trapping.\n"); + + if (prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, + syscall_dispatcher_start, + (syscall_dispatcher_end - syscall_dispatcher_start + 1), + &selector)) { + perror("prctl failed\n"); + exit(-1); + } + + SYSCALL_BLOCK; + syscall(MAGIC_SYSCALL_1); + +#ifdef TEST_BLOCKED_RETURN + if (selector == PR_SYS_DISPATCH_OFF) { + fprintf(stderr, "Failed to return with selector blocked.\n"); + exit(-1); + } +#endif + + SYSCALL_UNBLOCK; + + if (!trapped_call_count) { + fprintf(stderr, "syscall trapping does not work.\n"); + exit(-1); + } + + time2 = perf_syscall(); + + if (native_call_count) { + perror("syscall trapping intercepted more syscalls than expected\n"); + exit(-1); + } + + printf("trapped_call_count %lu, native_call_count %lu.\n", + trapped_call_count, native_call_count); + printf("Avg syscall time %.0lfns.\n", time2 * 1.0e9); + printf("Interception overhead: %.1lf%% (+%.0lfns).\n", + 100.0 * (time2 / time1 - 1.0), 1.0e9 * (time2 - time1)); + return 0; + +} diff --git a/tools/testing/selftests/syscall_user_dispatch/sud_test.c b/tools/testing/selftests/syscall_user_dispatch/sud_test.c new file mode 100644 index 000000000000..6498b050ef89 --- /dev/null +++ b/tools/testing/selftests/syscall_user_dispatch/sud_test.c @@ -0,0 +1,310 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2020 Collabora Ltd. + * + * Test code for syscall user dispatch + */ + +#define _GNU_SOURCE +#include <sys/prctl.h> +#include <sys/sysinfo.h> +#include <sys/syscall.h> +#include <signal.h> + +#include <asm/unistd.h> +#include "../kselftest_harness.h" + +#ifndef PR_SET_SYSCALL_USER_DISPATCH +# define PR_SET_SYSCALL_USER_DISPATCH 59 +# define PR_SYS_DISPATCH_OFF 0 +# define PR_SYS_DISPATCH_ON 1 +#endif + +#ifndef SYS_USER_DISPATCH +# define SYS_USER_DISPATCH 2 +#endif + +#ifdef __NR_syscalls +# define MAGIC_SYSCALL_1 (__NR_syscalls + 1) /* Bad Linux syscall number */ +#else +# define MAGIC_SYSCALL_1 (0xff00) /* Bad Linux syscall number */ +#endif + +#define SYSCALL_DISPATCH_ON(x) ((x) = 1) +#define SYSCALL_DISPATCH_OFF(x) ((x) = 0) + +/* Test Summary: + * + * - dispatch_trigger_sigsys: Verify if PR_SET_SYSCALL_USER_DISPATCH is + * able to trigger SIGSYS on a syscall. + * + * - bad_selector: Test that a bad selector value triggers SIGSYS with + * si_errno EINVAL. + * + * - bad_prctl_param: Test that the API correctly rejects invalid + * parameters on prctl + * + * - dispatch_and_return: Test that a syscall is selectively dispatched + * to userspace depending on the value of selector. + * + * - disable_dispatch: Test that the PR_SYS_DISPATCH_OFF correctly + * disables the dispatcher + * + * - direct_dispatch_range: Test that a syscall within the allowed range + * can bypass the dispatcher. + */ + +TEST_SIGNAL(dispatch_trigger_sigsys, SIGSYS) +{ + char sel = 0; + struct sysinfo info; + int ret; + + ret = sysinfo(&info); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 0, 0, &sel); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support CONFIG_SYSCALL_USER_DISPATCH"); + } + + SYSCALL_DISPATCH_ON(sel); + + sysinfo(&info); + + EXPECT_FALSE(true) { + TH_LOG("Unreachable!"); + } +} + +TEST(bad_prctl_param) +{ + char sel = 0; + int op; + + /* Invalid op */ + op = -1; + prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0, 0, &sel); + ASSERT_EQ(EINVAL, errno); + + /* PR_SYS_DISPATCH_OFF */ + op = PR_SYS_DISPATCH_OFF; + + /* offset != 0 */ + prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x1, 0x0, 0); + EXPECT_EQ(EINVAL, errno); + + /* len != 0 */ + prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x0, 0xff, 0); + EXPECT_EQ(EINVAL, errno); + + /* sel != NULL */ + prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x0, 0x0, &sel); + EXPECT_EQ(EINVAL, errno); + + /* Valid parameter */ + errno = 0; + prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x0, 0x0, 0x0); + EXPECT_EQ(0, errno); + + /* PR_SYS_DISPATCH_ON */ + op = PR_SYS_DISPATCH_ON; + + /* Dispatcher region is bad (offset > 0 && len == 0) */ + prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x1, 0x0, &sel); + EXPECT_EQ(EINVAL, errno); + prctl(PR_SET_SYSCALL_USER_DISPATCH, op, -1L, 0x0, &sel); + EXPECT_EQ(EINVAL, errno); + + /* Invalid selector */ + prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x0, 0x1, (void *) -1); + ASSERT_EQ(EFAULT, errno); + + /* + * Dispatcher range overflows unsigned long + */ + prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 1, -1L, &sel); + ASSERT_EQ(EINVAL, errno) { + TH_LOG("Should reject bad syscall range"); + } + + /* + * Allowed range overflows usigned long + */ + prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, -1L, 0x1, &sel); + ASSERT_EQ(EINVAL, errno) { + TH_LOG("Should reject bad syscall range"); + } +} + +/* + * Use global selector for handle_sigsys tests, to avoid passing + * selector to signal handler + */ +char glob_sel; +int nr_syscalls_emulated; +int si_code; +int si_errno; + +static void handle_sigsys(int sig, siginfo_t *info, void *ucontext) +{ + si_code = info->si_code; + si_errno = info->si_errno; + + if (info->si_syscall == MAGIC_SYSCALL_1) + nr_syscalls_emulated++; + + /* In preparation for sigreturn. */ + SYSCALL_DISPATCH_OFF(glob_sel); +} + +TEST(dispatch_and_return) +{ + long ret; + struct sigaction act; + sigset_t mask; + + glob_sel = 0; + nr_syscalls_emulated = 0; + si_code = 0; + si_errno = 0; + + memset(&act, 0, sizeof(act)); + sigemptyset(&mask); + + act.sa_sigaction = handle_sigsys; + act.sa_flags = SA_SIGINFO; + act.sa_mask = mask; + + ret = sigaction(SIGSYS, &act, NULL); + ASSERT_EQ(0, ret); + + /* Make sure selector is good prior to prctl. */ + SYSCALL_DISPATCH_OFF(glob_sel); + + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 0, 0, &glob_sel); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support CONFIG_SYSCALL_USER_DISPATCH"); + } + + /* MAGIC_SYSCALL_1 doesn't exist. */ + SYSCALL_DISPATCH_OFF(glob_sel); + ret = syscall(MAGIC_SYSCALL_1); + EXPECT_EQ(-1, ret) { + TH_LOG("Dispatch triggered unexpectedly"); + } + + /* MAGIC_SYSCALL_1 should be emulated. */ + nr_syscalls_emulated = 0; + SYSCALL_DISPATCH_ON(glob_sel); + + ret = syscall(MAGIC_SYSCALL_1); + EXPECT_EQ(MAGIC_SYSCALL_1, ret) { + TH_LOG("Failed to intercept syscall"); + } + EXPECT_EQ(1, nr_syscalls_emulated) { + TH_LOG("Failed to emulate syscall"); + } + ASSERT_EQ(SYS_USER_DISPATCH, si_code) { + TH_LOG("Bad si_code in SIGSYS"); + } + ASSERT_EQ(0, si_errno) { + TH_LOG("Bad si_errno in SIGSYS"); + } +} + +TEST_SIGNAL(bad_selector, SIGSYS) +{ + long ret; + struct sigaction act; + sigset_t mask; + struct sysinfo info; + + glob_sel = 0; + nr_syscalls_emulated = 0; + si_code = 0; + si_errno = 0; + + memset(&act, 0, sizeof(act)); + sigemptyset(&mask); + + act.sa_sigaction = handle_sigsys; + act.sa_flags = SA_SIGINFO; + act.sa_mask = mask; + + ret = sigaction(SIGSYS, &act, NULL); + ASSERT_EQ(0, ret); + + /* Make sure selector is good prior to prctl. */ + SYSCALL_DISPATCH_OFF(glob_sel); + + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 0, 0, &glob_sel); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support CONFIG_SYSCALL_USER_DISPATCH"); + } + + glob_sel = -1; + + sysinfo(&info); + + /* Even though it is ready to catch SIGSYS, the signal is + * supposed to be uncatchable. + */ + + EXPECT_FALSE(true) { + TH_LOG("Unreachable!"); + } +} + +TEST(disable_dispatch) +{ + int ret; + struct sysinfo info; + char sel = 0; + + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 0, 0, &sel); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support CONFIG_SYSCALL_USER_DISPATCH"); + } + + /* MAGIC_SYSCALL_1 doesn't exist. */ + SYSCALL_DISPATCH_OFF(glob_sel); + + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_OFF, 0, 0, 0); + EXPECT_EQ(0, ret) { + TH_LOG("Failed to unset syscall user dispatch"); + } + + /* Shouldn't have any effect... */ + SYSCALL_DISPATCH_ON(glob_sel); + + ret = syscall(__NR_sysinfo, &info); + EXPECT_EQ(0, ret) { + TH_LOG("Dispatch triggered unexpectedly"); + } +} + +TEST(direct_dispatch_range) +{ + int ret = 0; + struct sysinfo info; + char sel = 0; + + /* + * Instead of calculating libc addresses; allow the entire + * memory map and lock the selector. + */ + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 0, -1L, &sel); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support CONFIG_SYSCALL_USER_DISPATCH"); + } + + SYSCALL_DISPATCH_ON(sel); + + ret = sysinfo(&info); + ASSERT_EQ(0, ret) { + TH_LOG("Dispatch triggered unexpectedly"); + } +} + +TEST_HARNESS_MAIN |