From 937ed91c712273131de6d2a02caafd3ee84e0c72 Mon Sep 17 00:00:00 2001 From: Ammar Faizi Date: Sun, 24 Oct 2021 19:28:14 +0200 Subject: tools/nolibc: x86-64: Fix startup code bug Before this patch, the `_start` function looks like this: ``` 0000000000001170 <_start>: 1170: pop %rdi 1171: mov %rsp,%rsi 1174: lea 0x8(%rsi,%rdi,8),%rdx 1179: and $0xfffffffffffffff0,%rsp 117d: sub $0x8,%rsp 1181: call 1000
1186: movzbq %al,%rdi 118a: mov $0x3c,%rax 1191: syscall 1193: hlt 1194: data16 cs nopw 0x0(%rax,%rax,1) 119f: nop ``` Note the "and" to %rsp with $-16, it makes the %rsp be 16-byte aligned, but then there is a "sub" with $0x8 which makes the %rsp no longer 16-byte aligned, then it calls main. That's the bug! What actually the x86-64 System V ABI mandates is that right before the "call", the %rsp must be 16-byte aligned, not after the "call". So the "sub" with $0x8 here breaks the alignment. Remove it. An example where this rule matters is when the callee needs to align its stack at 16-byte for aligned move instruction, like `movdqa` and `movaps`. If the callee can't align its stack properly, it will result in segmentation fault. x86-64 System V ABI also mandates the deepest stack frame should be zero. Just to be safe, let's zero the %rbp on startup as the content of %rbp may be unspecified when the program starts. Now it looks like this: ``` 0000000000001170 <_start>: 1170: pop %rdi 1171: mov %rsp,%rsi 1174: lea 0x8(%rsi,%rdi,8),%rdx 1179: xor %ebp,%ebp # zero the %rbp 117b: and $0xfffffffffffffff0,%rsp # align the %rsp 117f: call 1000
1184: movzbq %al,%rdi 1188: mov $0x3c,%rax 118f: syscall 1191: hlt 1192: data16 cs nopw 0x0(%rax,%rax,1) 119d: nopl (%rax) ``` Cc: Bedirhan KURT Cc: Louvian Lyndal Reported-by: Peter Cordes Signed-off-by: Ammar Faizi [wt: I did this on purpose due to a misunderstanding of the spec, other archs will thus have to be rechecked, particularly i386] Cc: stable@vger.kernel.org Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/nolibc.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'tools/include/nolibc') diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index 3430667b0d24..96b6d56acb57 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -399,14 +399,20 @@ struct stat { }) /* startup code */ +/* + * x86-64 System V ABI mandates: + * 1) %rsp must be 16-byte aligned right before the function call. + * 2) The deepest stack frame should be zero (the %rbp). + * + */ asm(".section .text\n" ".global _start\n" "_start:\n" "pop %rdi\n" // argc (first arg, %rdi) "mov %rsp, %rsi\n" // argv[] (second arg, %rsi) "lea 8(%rsi,%rdi,8),%rdx\n" // then a NULL then envp (third arg, %rdx) - "and $-16, %rsp\n" // x86 ABI : esp must be 16-byte aligned when - "sub $8, %rsp\n" // entering the callee + "xor %ebp, %ebp\n" // zero the stack frame + "and $-16, %rsp\n" // x86 ABI : esp must be 16-byte aligned before call "call main\n" // main() returns the status code, we'll exit with it. "movzb %al, %rdi\n" // retrieve exit code from 8 lower bits "mov $60, %rax\n" // NR_exit == 60 -- cgit v1.2.3-70-g09d2 From ebbe0d8a449d183fa43b42d84fcb248e25303985 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sun, 24 Oct 2021 19:28:15 +0200 Subject: tools/nolibc: i386: fix initial stack alignment After re-checking in the spec and comparing stack offsets with glibc, The last pushed argument must be 16-byte aligned (i.e. aligned before the call) so that in the callee esp+4 is multiple of 16, so the principle is the 32-bit equivalent to what Ammar fixed for x86_64. It's possible that 32-bit code using SSE2 or MMX could have been affected. In addition the frame pointer ought to be zero at the deepest level. Link: https://gitlab.com/x86-psABIs/i386-ABI/-/wikis/Intel386-psABI Cc: Ammar Faizi Cc: stable@vger.kernel.org Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/nolibc.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'tools/include/nolibc') diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index 96b6d56acb57..7f300dc379e7 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -583,13 +583,21 @@ struct sys_stat_struct { }) /* startup code */ +/* + * i386 System V ABI mandates: + * 1) last pushed argument must be 16-byte aligned. + * 2) The deepest stack frame should be set to zero + * + */ asm(".section .text\n" ".global _start\n" "_start:\n" "pop %eax\n" // argc (first arg, %eax) "mov %esp, %ebx\n" // argv[] (second arg, %ebx) "lea 4(%ebx,%eax,4),%ecx\n" // then a NULL then envp (third arg, %ecx) - "and $-16, %esp\n" // x86 ABI : esp must be 16-byte aligned when + "xor %ebp, %ebp\n" // zero the stack frame + "and $-16, %esp\n" // x86 ABI : esp must be 16-byte aligned before + "sub $4, %esp\n" // the call instruction (args are aligned) "push %ecx\n" // push all registers on the stack so that we "push %ebx\n" // support both regparm and plain stack modes "push %eax\n" -- cgit v1.2.3-70-g09d2 From de0244ae40ae91145faaf164a4252347607c3711 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sun, 24 Oct 2021 19:28:16 +0200 Subject: tools/nolibc: fix incorrect truncation of exit code Ammar Faizi reported that our exit code handling is wrong. We truncate it to the lowest 8 bits but the syscall itself is expected to take a regular 32-bit signed integer, not an unsigned char. It's the kernel that later truncates it to the lowest 8 bits. The difference is visible in strace, where the program below used to show exit(255) instead of exit(-1): int main(void) { return -1; } This patch applies the fix to all archs. x86_64, i386, arm64, armv7 and mips were all tested and confirmed to work fine now. Risc-v was not tested but the change is trivial and exactly the same as for other archs. Reported-by: Ammar Faizi Cc: stable@vger.kernel.org Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/nolibc.h | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'tools/include/nolibc') diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index 7f300dc379e7..3e2c6f2ed587 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -414,7 +414,7 @@ asm(".section .text\n" "xor %ebp, %ebp\n" // zero the stack frame "and $-16, %rsp\n" // x86 ABI : esp must be 16-byte aligned before call "call main\n" // main() returns the status code, we'll exit with it. - "movzb %al, %rdi\n" // retrieve exit code from 8 lower bits + "mov %eax, %edi\n" // retrieve exit code (32 bit) "mov $60, %rax\n" // NR_exit == 60 "syscall\n" // really exit "hlt\n" // ensure it does not return @@ -602,9 +602,9 @@ asm(".section .text\n" "push %ebx\n" // support both regparm and plain stack modes "push %eax\n" "call main\n" // main() returns the status code in %eax - "movzbl %al, %ebx\n" // retrieve exit code from lower 8 bits - "movl $1, %eax\n" // NR_exit == 1 - "int $0x80\n" // exit now + "mov %eax, %ebx\n" // retrieve exit code (32-bit int) + "movl $1, %eax\n" // NR_exit == 1 + "int $0x80\n" // exit now "hlt\n" // ensure it does not ""); @@ -788,7 +788,6 @@ asm(".section .text\n" "and %r3, %r1, $-8\n" // AAPCS : sp must be 8-byte aligned in the "mov %sp, %r3\n" // callee, an bl doesn't push (lr=pc) "bl main\n" // main() returns the status code, we'll exit with it. - "and %r0, %r0, $0xff\n" // limit exit code to 8 bits "movs r7, $1\n" // NR_exit == 1 "svc $0x00\n" ""); @@ -985,7 +984,6 @@ asm(".section .text\n" "add x2, x2, x1\n" // + argv "and sp, x1, -16\n" // sp must be 16-byte aligned in the callee "bl main\n" // main() returns the status code, we'll exit with it. - "and x0, x0, 0xff\n" // limit exit code to 8 bits "mov x8, 93\n" // NR_exit == 93 "svc #0\n" ""); @@ -1190,7 +1188,7 @@ asm(".section .text\n" "addiu $sp,$sp,-16\n" // the callee expects to save a0..a3 there! "jal main\n" // main() returns the status code, we'll exit with it. "nop\n" // delayed slot - "and $a0, $v0, 0xff\n" // limit exit code to 8 bits + "move $a0, $v0\n" // retrieve 32-bit exit code from v0 "li $v0, 4001\n" // NR_exit == 4001 "syscall\n" ".end __start\n" @@ -1388,7 +1386,6 @@ asm(".section .text\n" "add a2,a2,a1\n" // + argv "andi sp,a1,-16\n" // sp must be 16-byte aligned "call main\n" // main() returns the status code, we'll exit with it. - "andi a0, a0, 0xff\n" // limit exit code to 8 bits "li a7, 93\n" // NR_exit == 93 "ecall\n" ""); -- cgit v1.2.3-70-g09d2 From bf91666959eeac44fb686e9359e37830944beef2 Mon Sep 17 00:00:00 2001 From: Ammar Faizi Date: Sun, 24 Oct 2021 19:43:21 +0200 Subject: tools/nolibc: x86: Remove `r8`, `r9` and `r10` from the clobber list Linux x86-64 syscall only clobbers rax, rcx and r11 (and "memory"). - rax for the return value. - rcx to save the return address. - r11 to save the rflags. Other registers are preserved. Having r8, r9 and r10 in the syscall clobber list is harmless, but this results in a missed-optimization. As the syscall doesn't clobber r8-r10, GCC should be allowed to reuse their value after the syscall returns to userspace. But since they are in the clobber list, GCC will always miss this opportunity. Remove them from the x86-64 syscall clobber list to help GCC generate better code and fix the comment. See also the x86-64 ABI, section A.2 AMD64 Linux Kernel Conventions, A.2.1 Calling Conventions [1]. Extra note: Some people may think it does not really give a benefit to remove r8, r9 and r10 from the syscall clobber list because the impression of syscall is a C function call, and function call always clobbers those 3. However, that is not the case for nolibc.h, because we have a potential to inline the "syscall" instruction (which its opcode is "0f 05") to the user functions. All syscalls in the nolibc.h are written as a static function with inline ASM and are likely always inline if we use optimization flag, so this is a profit not to have r8, r9 and r10 in the clobber list. Here is the example where this matters. Consider the following C code: ``` #include "tools/include/nolibc/nolibc.h" #define read_abc(a, b, c) __asm__ volatile("nop"::"r"(a),"r"(b),"r"(c)) int main(void) { int a = 0xaa; int b = 0xbb; int c = 0xcc; read_abc(a, b, c); write(1, "test\n", 5); read_abc(a, b, c); return 0; } ``` Compile with: gcc -Os test.c -o test -nostdlib With r8, r9, r10 in the clobber list, GCC generates this: 0000000000001000
: 1000: f3 0f 1e fa endbr64 1004: 41 54 push %r12 1006: 41 bc cc 00 00 00 mov $0xcc,%r12d 100c: 55 push %rbp 100d: bd bb 00 00 00 mov $0xbb,%ebp 1012: 53 push %rbx 1013: bb aa 00 00 00 mov $0xaa,%ebx 1018: 90 nop 1019: b8 01 00 00 00 mov $0x1,%eax 101e: bf 01 00 00 00 mov $0x1,%edi 1023: ba 05 00 00 00 mov $0x5,%edx 1028: 48 8d 35 d1 0f 00 00 lea 0xfd1(%rip),%rsi 102f: 0f 05 syscall 1031: 90 nop 1032: 31 c0 xor %eax,%eax 1034: 5b pop %rbx 1035: 5d pop %rbp 1036: 41 5c pop %r12 1038: c3 ret GCC thinks that syscall will clobber r8, r9, r10. So it spills 0xaa, 0xbb and 0xcc to callee saved registers (r12, rbp and rbx). This is clearly extra memory access and extra stack size for preserving them. But syscall does not actually clobber them, so this is a missed optimization. Now without r8, r9, r10 in the clobber list, GCC generates better code: 0000000000001000
: 1000: f3 0f 1e fa endbr64 1004: 41 b8 aa 00 00 00 mov $0xaa,%r8d 100a: 41 b9 bb 00 00 00 mov $0xbb,%r9d 1010: 41 ba cc 00 00 00 mov $0xcc,%r10d 1016: 90 nop 1017: b8 01 00 00 00 mov $0x1,%eax 101c: bf 01 00 00 00 mov $0x1,%edi 1021: ba 05 00 00 00 mov $0x5,%edx 1026: 48 8d 35 d3 0f 00 00 lea 0xfd3(%rip),%rsi 102d: 0f 05 syscall 102f: 90 nop 1030: 31 c0 xor %eax,%eax 1032: c3 ret Cc: Andy Lutomirski Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: x86@kernel.org Cc: "H. Peter Anvin" Cc: David Laight Acked-by: Andy Lutomirski Signed-off-by: Ammar Faizi Link: https://gitlab.com/x86-psABIs/x86-64-ABI/-/wikis/x86-64-psABI [1] Link: https://lore.kernel.org/lkml/20211011040344.437264-1-ammar.faizi@students.amikom.ac.id/ Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/nolibc.h | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'tools/include/nolibc') diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index 3e2c6f2ed587..f9afe89ec6f2 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -265,12 +265,17 @@ struct stat { * - arguments are in rdi, rsi, rdx, r10, r8, r9 respectively * - the system call is performed by calling the syscall instruction * - syscall return comes in rax - * - rcx and r8..r11 may be clobbered, others are preserved. + * - rcx and r11 are clobbered, others are preserved. * - the arguments are cast to long and assigned into the target registers * which are then simply passed as registers to the asm code, so that we * don't have to experience issues with register constraints. * - the syscall number is always specified last in order to allow to force * some registers before (gcc refuses a %-register at the last position). + * - see also x86-64 ABI section A.2 AMD64 Linux Kernel Conventions, A.2.1 + * Calling Conventions. + * + * Link x86-64 ABI: https://gitlab.com/x86-psABIs/x86-64-ABI/-/wikis/x86-64-psABI + * */ #define my_syscall0(num) \ @@ -280,9 +285,9 @@ struct stat { \ asm volatile ( \ "syscall\n" \ - : "=a" (_ret) \ + : "=a"(_ret) \ : "0"(_num) \ - : "rcx", "r8", "r9", "r10", "r11", "memory", "cc" \ + : "rcx", "r11", "memory", "cc" \ ); \ _ret; \ }) @@ -295,10 +300,10 @@ struct stat { \ asm volatile ( \ "syscall\n" \ - : "=a" (_ret) \ + : "=a"(_ret) \ : "r"(_arg1), \ "0"(_num) \ - : "rcx", "r8", "r9", "r10", "r11", "memory", "cc" \ + : "rcx", "r11", "memory", "cc" \ ); \ _ret; \ }) @@ -312,10 +317,10 @@ struct stat { \ asm volatile ( \ "syscall\n" \ - : "=a" (_ret) \ + : "=a"(_ret) \ : "r"(_arg1), "r"(_arg2), \ "0"(_num) \ - : "rcx", "r8", "r9", "r10", "r11", "memory", "cc" \ + : "rcx", "r11", "memory", "cc" \ ); \ _ret; \ }) @@ -330,10 +335,10 @@ struct stat { \ asm volatile ( \ "syscall\n" \ - : "=a" (_ret) \ + : "=a"(_ret) \ : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ "0"(_num) \ - : "rcx", "r8", "r9", "r10", "r11", "memory", "cc" \ + : "rcx", "r11", "memory", "cc" \ ); \ _ret; \ }) @@ -349,10 +354,10 @@ struct stat { \ asm volatile ( \ "syscall\n" \ - : "=a" (_ret), "=r"(_arg4) \ + : "=a"(_ret) \ : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ "0"(_num) \ - : "rcx", "r8", "r9", "r11", "memory", "cc" \ + : "rcx", "r11", "memory", "cc" \ ); \ _ret; \ }) @@ -369,10 +374,10 @@ struct stat { \ asm volatile ( \ "syscall\n" \ - : "=a" (_ret), "=r"(_arg4), "=r"(_arg5) \ + : "=a"(_ret) \ : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ "0"(_num) \ - : "rcx", "r9", "r11", "memory", "cc" \ + : "rcx", "r11", "memory", "cc" \ ); \ _ret; \ }) @@ -390,7 +395,7 @@ struct stat { \ asm volatile ( \ "syscall\n" \ - : "=a" (_ret), "=r"(_arg4), "=r"(_arg5) \ + : "=a"(_ret) \ : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ "r"(_arg6), "0"(_num) \ : "rcx", "r11", "memory", "cc" \ -- cgit v1.2.3-70-g09d2 From 7bdc0e7a390511cd3df8194003b908f15a6170a5 Mon Sep 17 00:00:00 2001 From: Ammar Faizi Date: Sun, 24 Oct 2021 19:43:22 +0200 Subject: tools/nolibc: x86-64: Use `mov $60,%eax` instead of `mov $60,%rax` Note that mov to 32-bit register will zero extend to 64-bit register. Thus `mov $60,%eax` has the same effect with `mov $60,%rax`. Use the shorter opcode to achieve the same thing. ``` b8 3c 00 00 00 mov $60,%eax (5 bytes) [1] 48 c7 c0 3c 00 00 00 mov $60,%rax (7 bytes) [2] ``` Currently, we use [2]. Change it to [1] for shorter code. Signed-off-by: Ammar Faizi Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/nolibc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/include/nolibc') diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index f9afe89ec6f2..4988866af0b5 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -420,7 +420,7 @@ asm(".section .text\n" "and $-16, %rsp\n" // x86 ABI : esp must be 16-byte aligned before call "call main\n" // main() returns the status code, we'll exit with it. "mov %eax, %edi\n" // retrieve exit code (32 bit) - "mov $60, %rax\n" // NR_exit == 60 + "mov $60, %eax\n" // NR_exit == 60 "syscall\n" // really exit "hlt\n" // ensure it does not return ""); -- cgit v1.2.3-70-g09d2 From b0fe9dec66370cef23e9c281d229f4c44f84cfa2 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Sun, 24 Oct 2021 19:43:23 +0200 Subject: tools/nolibc: Implement gettid() Allow test programs to determine their thread ID. Signed-off-by: Mark Brown Cc: Willy Tarreau Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/nolibc.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'tools/include/nolibc') diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index 4988866af0b5..c1c285fe494a 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -1571,6 +1571,12 @@ pid_t sys_getpid(void) return my_syscall0(__NR_getpid); } +static __attribute__((unused)) +pid_t sys_gettid(void) +{ + return my_syscall0(__NR_gettid); +} + static __attribute__((unused)) int sys_gettimeofday(struct timeval *tv, struct timezone *tz) { @@ -2029,6 +2035,18 @@ pid_t getpid(void) return ret; } +static __attribute__((unused)) +pid_t gettid(void) +{ + pid_t ret = sys_gettid(); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + static __attribute__((unused)) int gettimeofday(struct timeval *tv, struct timezone *tz) { -- cgit v1.2.3-70-g09d2