summaryrefslogtreecommitdiff
path: root/tools
AgeCommit message (Collapse)Author
2024-09-10perf trace: Support collecting 'union's with the BPF augmenterArnaldo Carvalho de Melo
And reuse the BTF based struct pretty printer, with that we can offer initial support for the 'bpf' syscall's second argument, a 'union bpf_attr' pointer. But this is not that satisfactory as the libbpf btf dumper will pretty print _all_ the union, we need to have a way to say that the first arg selects the type for the union member to be pretty printed, something like what pahole does translating the PERF_RECORD_ selector into a name, and using that name to find a matching struct. In the case of 'union bpf_attr' it would map PROG_LOAD to one of the union members, but unfortunately there is no such mapping: root@number:~# pahole bpf_attr union bpf_attr { struct { __u32 map_type; /* 0 4 */ __u32 key_size; /* 4 4 */ __u32 value_size; /* 8 4 */ __u32 max_entries; /* 12 4 */ __u32 map_flags; /* 16 4 */ __u32 inner_map_fd; /* 20 4 */ __u32 numa_node; /* 24 4 */ char map_name[16]; /* 28 16 */ __u32 map_ifindex; /* 44 4 */ __u32 btf_fd; /* 48 4 */ __u32 btf_key_type_id; /* 52 4 */ __u32 btf_value_type_id; /* 56 4 */ __u32 btf_vmlinux_value_type_id; /* 60 4 */ /* --- cacheline 1 boundary (64 bytes) --- */ __u64 map_extra; /* 64 8 */ __s32 value_type_btf_obj_fd; /* 72 4 */ __s32 map_token_fd; /* 76 4 */ }; /* 0 80 */ struct { __u32 map_fd; /* 0 4 */ /* XXX 4 bytes hole, try to pack */ __u64 key; /* 8 8 */ union { __u64 value; /* 16 8 */ __u64 next_key; /* 16 8 */ }; /* 16 8 */ __u64 flags; /* 24 8 */ }; /* 0 32 */ struct { __u64 in_batch; /* 0 8 */ __u64 out_batch; /* 8 8 */ __u64 keys; /* 16 8 */ __u64 values; /* 24 8 */ __u32 count; /* 32 4 */ __u32 map_fd; /* 36 4 */ __u64 elem_flags; /* 40 8 */ __u64 flags; /* 48 8 */ } batch; /* 0 56 */ struct { __u32 prog_type; /* 0 4 */ __u32 insn_cnt; /* 4 4 */ __u64 insns; /* 8 8 */ __u64 license; /* 16 8 */ __u32 log_level; /* 24 4 */ __u32 log_size; /* 28 4 */ __u64 log_buf; /* 32 8 */ __u32 kern_version; /* 40 4 */ __u32 prog_flags; /* 44 4 */ char prog_name[16]; /* 48 16 */ /* --- cacheline 1 boundary (64 bytes) --- */ __u32 prog_ifindex; /* 64 4 */ __u32 expected_attach_type; /* 68 4 */ __u32 prog_btf_fd; /* 72 4 */ __u32 func_info_rec_size; /* 76 4 */ __u64 func_info; /* 80 8 */ __u32 func_info_cnt; /* 88 4 */ __u32 line_info_rec_size; /* 92 4 */ __u64 line_info; /* 96 8 */ __u32 line_info_cnt; /* 104 4 */ __u32 attach_btf_id; /* 108 4 */ union { __u32 attach_prog_fd; /* 112 4 */ __u32 attach_btf_obj_fd; /* 112 4 */ }; /* 112 4 */ __u32 core_relo_cnt; /* 116 4 */ __u64 fd_array; /* 120 8 */ /* --- cacheline 2 boundary (128 bytes) --- */ __u64 core_relos; /* 128 8 */ __u32 core_relo_rec_size; /* 136 4 */ __u32 log_true_size; /* 140 4 */ __s32 prog_token_fd; /* 144 4 */ }; /* 0 152 */ struct { __u64 pathname; /* 0 8 */ __u32 bpf_fd; /* 8 4 */ __u32 file_flags; /* 12 4 */ __s32 path_fd; /* 16 4 */ }; /* 0 24 */ struct { union { __u32 target_fd; /* 0 4 */ __u32 target_ifindex; /* 0 4 */ }; /* 0 4 */ __u32 attach_bpf_fd; /* 4 4 */ __u32 attach_type; /* 8 4 */ __u32 attach_flags; /* 12 4 */ __u32 replace_bpf_fd; /* 16 4 */ union { __u32 relative_fd; /* 20 4 */ __u32 relative_id; /* 20 4 */ }; /* 20 4 */ __u64 expected_revision; /* 24 8 */ }; /* 0 32 */ struct { __u32 prog_fd; /* 0 4 */ __u32 retval; /* 4 4 */ __u32 data_size_in; /* 8 4 */ __u32 data_size_out; /* 12 4 */ __u64 data_in; /* 16 8 */ __u64 data_out; /* 24 8 */ __u32 repeat; /* 32 4 */ __u32 duration; /* 36 4 */ __u32 ctx_size_in; /* 40 4 */ __u32 ctx_size_out; /* 44 4 */ __u64 ctx_in; /* 48 8 */ __u64 ctx_out; /* 56 8 */ /* --- cacheline 1 boundary (64 bytes) --- */ __u32 flags; /* 64 4 */ __u32 cpu; /* 68 4 */ __u32 batch_size; /* 72 4 */ } test; /* 0 80 */ struct { union { __u32 start_id; /* 0 4 */ __u32 prog_id; /* 0 4 */ __u32 map_id; /* 0 4 */ __u32 btf_id; /* 0 4 */ __u32 link_id; /* 0 4 */ }; /* 0 4 */ __u32 next_id; /* 4 4 */ __u32 open_flags; /* 8 4 */ }; /* 0 12 */ struct { __u32 bpf_fd; /* 0 4 */ __u32 info_len; /* 4 4 */ __u64 info; /* 8 8 */ } info; /* 0 16 */ struct { union { __u32 target_fd; /* 0 4 */ __u32 target_ifindex; /* 0 4 */ }; /* 0 4 */ __u32 attach_type; /* 4 4 */ __u32 query_flags; /* 8 4 */ __u32 attach_flags; /* 12 4 */ __u64 prog_ids; /* 16 8 */ union { __u32 prog_cnt; /* 24 4 */ __u32 count; /* 24 4 */ }; /* 24 4 */ /* XXX 4 bytes hole, try to pack */ __u64 prog_attach_flags; /* 32 8 */ __u64 link_ids; /* 40 8 */ __u64 link_attach_flags; /* 48 8 */ __u64 revision; /* 56 8 */ } query; /* 0 64 */ struct { __u64 name; /* 0 8 */ __u32 prog_fd; /* 8 4 */ /* XXX 4 bytes hole, try to pack */ __u64 cookie; /* 16 8 */ } raw_tracepoint; /* 0 24 */ struct { __u64 btf; /* 0 8 */ __u64 btf_log_buf; /* 8 8 */ __u32 btf_size; /* 16 4 */ __u32 btf_log_size; /* 20 4 */ __u32 btf_log_level; /* 24 4 */ __u32 btf_log_true_size; /* 28 4 */ __u32 btf_flags; /* 32 4 */ __s32 btf_token_fd; /* 36 4 */ }; /* 0 40 */ struct { __u32 pid; /* 0 4 */ __u32 fd; /* 4 4 */ __u32 flags; /* 8 4 */ __u32 buf_len; /* 12 4 */ __u64 buf; /* 16 8 */ __u32 prog_id; /* 24 4 */ __u32 fd_type; /* 28 4 */ __u64 probe_offset; /* 32 8 */ __u64 probe_addr; /* 40 8 */ } task_fd_query; /* 0 48 */ struct { union { __u32 prog_fd; /* 0 4 */ __u32 map_fd; /* 0 4 */ }; /* 0 4 */ union { __u32 target_fd; /* 4 4 */ __u32 target_ifindex; /* 4 4 */ }; /* 4 4 */ __u32 attach_type; /* 8 4 */ __u32 flags; /* 12 4 */ union { __u32 target_btf_id; /* 16 4 */ struct { __u64 iter_info; /* 16 8 */ __u32 iter_info_len; /* 24 4 */ }; /* 16 16 */ struct { __u64 bpf_cookie; /* 16 8 */ } perf_event; /* 16 8 */ struct { __u32 flags; /* 16 4 */ __u32 cnt; /* 20 4 */ __u64 syms; /* 24 8 */ __u64 addrs; /* 32 8 */ __u64 cookies; /* 40 8 */ } kprobe_multi; /* 16 32 */ struct { __u32 target_btf_id; /* 16 4 */ /* XXX 4 bytes hole, try to pack */ __u64 cookie; /* 24 8 */ } tracing; /* 16 16 */ struct { __u32 pf; /* 16 4 */ __u32 hooknum; /* 20 4 */ __s32 priority; /* 24 4 */ __u32 flags; /* 28 4 */ } netfilter; /* 16 16 */ struct { union { __u32 relative_fd; /* 16 4 */ __u32 relative_id; /* 16 4 */ }; /* 16 4 */ /* XXX 4 bytes hole, try to pack */ __u64 expected_revision; /* 24 8 */ } tcx; /* 16 16 */ struct { __u64 path; /* 16 8 */ __u64 offsets; /* 24 8 */ __u64 ref_ctr_offsets; /* 32 8 */ __u64 cookies; /* 40 8 */ __u32 cnt; /* 48 4 */ __u32 flags; /* 52 4 */ __u32 pid; /* 56 4 */ } uprobe_multi; /* 16 48 */ struct { union { __u32 relative_fd; /* 16 4 */ __u32 relative_id; /* 16 4 */ }; /* 16 4 */ /* XXX 4 bytes hole, try to pack */ __u64 expected_revision; /* 24 8 */ } netkit; /* 16 16 */ }; /* 16 48 */ } link_create; /* 0 64 */ struct { __u32 link_fd; /* 0 4 */ union { __u32 new_prog_fd; /* 4 4 */ __u32 new_map_fd; /* 4 4 */ }; /* 4 4 */ __u32 flags; /* 8 4 */ union { __u32 old_prog_fd; /* 12 4 */ __u32 old_map_fd; /* 12 4 */ }; /* 12 4 */ } link_update; /* 0 16 */ struct { __u32 link_fd; /* 0 4 */ } link_detach; /* 0 4 */ struct { __u32 type; /* 0 4 */ } enable_stats; /* 0 4 */ struct { __u32 link_fd; /* 0 4 */ __u32 flags; /* 4 4 */ } iter_create; /* 0 8 */ struct { __u32 prog_fd; /* 0 4 */ __u32 map_fd; /* 4 4 */ __u32 flags; /* 8 4 */ } prog_bind_map; /* 0 12 */ struct { __u32 flags; /* 0 4 */ __u32 bpffs_fd; /* 4 4 */ } token_create; /* 0 8 */ }; root@number:~# So this is one case where BTF gets us only that far, not getting all the way to automate the pretty printing of unions designed like 'union bpf_attr', we will need a custom pretty printer for this union, as using the libbpf union BTF dumper is way too verbose: root@number:~# perf trace --max-events 1 -e bpf bpftool map 0.000 ( 0.054 ms): bpftool/3409073 bpf(cmd: PROG_LOAD, uattr: (union bpf_attr){(struct){.map_type = (__u32)1,.key_size = (__u32)2,.value_size = (__u32)2755142048,.max_entries = (__u32)32764,.map_flags = (__u32)150263906,.inner_map_fd = (__u32)21920,},(struct){.map_fd = (__u32)1,.key = (__u64)140723063628192,(union){.value = (__u64)94145833392226,.next_key = (__u64)94145833392226,},},.batch = (struct){.in_batch = (__u64)8589934593,.out_batch = (__u64)140723063628192,.keys = (__u64)94145833392226,},(struct){.prog_type = (__u32)1,.insn_cnt = (__u32)2,.insns = (__u64)140723063628192,.license = (__u64)94145833392226,},(struct){.pathname = (__u64)8589934593,.bpf_fd = (__u32)2755142048,.file_flags = (__u32)32764,.path_fd = (__s32)150263906,},(struct){(union){.target_fd = (__u32)1,.target_ifindex = (__u32)1,},.attach_bpf_fd = (__u32)2,.attach_type = (__u32)2755142048,.attach_flags = (__u32)32764,.replace_bpf_fd = (__u32)150263906,(union){.relative_fd = (__u32)21920,.relative_id = (__u32)21920,},},.test = (struct){.prog_fd = (__u32)1,.retval = (__u32)2,.data_size_in = (__u32)2755142048,.data_size_out = (__u32)32764,.data_in = (__u64)94145833392226,},(struct){(union){.start_id = (__u32)1,.prog_id = (__u32)1,.map_id = (__u32)1,.btf_id = (__u32)1,.link_id = (__u32)1,},.next_id = (__u32)2,.open_flags = (__u32)2755142048,},.info = (struct){.bpf_fd = (__u32)1,.info_len = (__u32)2,.info = (__u64)140723063628192,},.query = (struct){(union){.target_fd = (__u32)1,.target_ifindex = (__u32)1,},.attach_type = (__u32)2,.query_flags = (__u32)2755142048,.attach_flags = (__u32)32764,.prog_ids = (__u64)94145833392226,},.raw_tracepoint = (struct){.name = (__u64)8589934593,.prog_fd = (__u32)2755142048,.cookie = (__u64)94145833392226,},(struct){.btf = (__u64)8589934593,.btf_log_buf = (__u64)140723063628192,.btf_size = (__u32)150263906,.btf_log_size = (__u32)21920,},.task_fd_query = (struct){.pid = (__u32)1,.fd = (__u32)2,.flags = (__u32)2755142048,.buf_len = (__u32)32764,.buf = (__u64)94145833392226,},.link_create = (struct){(union){.prog_fd = (__u32)1,.map_fd = (__u32)1,},(u) = 3 root@number:~# 2: prog_array name hid_jmp_table flags 0x0 key 4B value 4B max_entries 1024 memlock 8440B owner_prog_type tracing owner jited 13: hash_of_maps name cgroup_hash flags 0x0 key 8B value 4B max_entries 2048 memlock 167584B pids systemd(1) 960: array name libbpf_global flags 0x0 key 4B value 32B max_entries 1 memlock 280B 961: array name pid_iter.rodata flags 0x480 key 4B value 4B max_entries 1 memlock 8192B btf_id 1846 frozen pids bpftool(3409073) 962: array name libbpf_det_bind flags 0x0 key 4B value 32B max_entries 1 memlock 280B root@number:~# For simpler unions this may be better than not seeing any payload, so keep it there. Acked-by: Howard Chu <howardchu95@gmail.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alan Maguire <alan.maguire@oracle.com> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Namhyung Kim <namhyung@kernel.org> Link: https://lore.kernel.org/lkml/ZuBLat8cbadILNLA@x1 [ Removed needless parenteses in the if block leading to the trace__btf_scnprintf() call, as per Howard's review comments ] Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-09-10bpftool: Fix undefined behavior in qsort(NULL, 0, ...)Kuan-Wei Chiu
When netfilter has no entry to display, qsort is called with qsort(NULL, 0, ...). This results in undefined behavior, as UBSan reports: net.c:827:2: runtime error: null pointer passed as argument 1, which is declared to never be null Although the C standard does not explicitly state whether calling qsort with a NULL pointer when the size is 0 constitutes undefined behavior, Section 7.1.4 of the C standard (Use of library functions) mentions: "Each of the following statements applies unless explicitly stated otherwise in the detailed descriptions that follow: If an argument to a function has an invalid value (such as a value outside the domain of the function, or a pointer outside the address space of the program, or a null pointer, or a pointer to non-modifiable storage when the corresponding parameter is not const-qualified) or a type (after promotion) not expected by a function with variable number of arguments, the behavior is undefined." To avoid this, add an early return when nf_link_info is NULL to prevent calling qsort with a NULL pointer. Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com> Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Reviewed-by: Quentin Monnet <qmo@kernel.org> Link: https://lore.kernel.org/bpf/20240910150207.3179306-1-visitorckw@gmail.com
2024-09-10libbpf: Fix uretprobe.multi.s programs auto attachmentJiri Olsa
As reported by Andrii we don't currently recognize uretprobe.multi.s programs as return probes due to using (wrong) strcmp function. Using str_has_pfx() instead to match uretprobe.multi prefix. Tests are passing, because the return program was executed as entry program and all counts were incremented properly. Reported-by: Andrii Nakryiko <andrii@kernel.org> Signed-off-by: Jiri Olsa <jolsa@kernel.org> Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Link: https://lore.kernel.org/bpf/20240910125336.3056271-1-jolsa@kernel.org
2024-09-10Merge tag 'linux-cpupower-6.12-rc1-2' of ↵Rafael J. Wysocki
ssh://gitolite.kernel.org/pub/scm/linux/kernel/git/shuah/linux Merge the second round of cpupower utility updates for 6.12-rc1 from Shuah Khan: "This cpupower second update for Linux 6.12-rc1 consists of a fix and a new feature. -- adds missing powercap_set_enabled() stub function -- adds SWIG bindings files for libcpupower SWIG is a tool packaged in Fedora and other distros that can generate bindings from C and C++ code for several languages including Python, Perl, and Go. These bindings allows users to easily write scripts that use and extend libcpupower's functionality. Currently, only Python is provided in the makefile, but additional languages may be added if there is demand. Note that while SWIG itself is GPL v3+ licensed; the resulting output, the bindings code, is permissively licensed + the license of the .o files. Please see the following for more details. - https://swig.org/legal.html. - https://lore.kernel.org/linux-pm/Zqv9BOjxLAgyNP5B@hatbackup" * tag 'linux-cpupower-6.12-rc1-2' of ssh://gitolite.kernel.org/pub/scm/linux/kernel/git/shuah/linux: pm:cpupower: Add error warning when SWIG is not installed MAINTAINERS: Add Maintainers for SWIG Python bindings pm:cpupower: Include test_raw_pylibcpupower.py pm:cpupower: Add SWIG bindings files for libcpupower pm:cpupower: Add missing powercap_set_enabled() stub function
2024-09-10perf trace: Add --force-btf for debuggingHoward Chu
If --force-btf is enabled, prefer btf_dump general pretty printer to perf trace's customized pretty printers. Mostly for debug purposes. Committer testing: diff before/after shows we need several improvements to be able to compare the changes, first we need to cut off/disable mutable data such as pids and timestamps, then what is left are the buffer addresses passed from userspace, returned from kernel space, maybe we can ask 'perf trace' to go on making those reproducible. That would entail a Pointer Address Translation (PAT) like for networking, that would, for simple, reproducible if not for these details, workloads, that we would then use in our regression tests. Enough digression, this is one such diff: openat(dfd: CWD, filename: "/usr/share/locale/locale.alias", flags: RDONLY|CLOEXEC) = 3 -fstat(fd: 3, statbuf: 0x7fff01f212a0) = 0 -read(fd: 3, buf: 0x5596bab2d630, count: 4096) = 2998 -read(fd: 3, buf: 0x5596bab2d630, count: 4096) = 0 +fstat(fd: 3, statbuf: 0x7ffc163cf0e0) = 0 +read(fd: 3, buf: 0x55b4e0631630, count: 4096) = 2998 +read(fd: 3, buf: 0x55b4e0631630, count: 4096) = 0 close(fd: 3) = 0 openat(dfd: CWD, filename: "/usr/share/locale/en_US.UTF-8/LC_MESSAGES/coreutils.mo") = -1 ENOENT (No such file or directory) openat(dfd: CWD, filename: "/usr/share/locale/en_US.utf8/LC_MESSAGES/coreutils.mo") = -1 ENOENT (No such file or directory) @@ -45,7 +45,7 @@ openat(dfd: CWD, filename: "/usr/share/locale/en.UTF-8/LC_MESSAGES/coreutils.mo") = -1 ENOENT (No such file or directory) openat(dfd: CWD, filename: "/usr/share/locale/en.utf8/LC_MESSAGES/coreutils.mo") = -1 ENOENT (No such file or directory) openat(dfd: CWD, filename: "/usr/share/locale/en/LC_MESSAGES/coreutils.mo") = -1 ENOENT (No such file or directory) -{ .tv_sec: 1, .tv_nsec: 0 }, rmtp: 0x7fff01f21990) = 0 +(struct __kernel_timespec){.tv_sec = (__kernel_time64_t)1,}, rmtp: 0x7ffc163cf7d0) = The problem more close to our hands is to make the libbpf BTF pretty printer to have a mode that closely resembles what we're trying to resemble: strace output. Being able to run something with 'perf trace' and with 'strace' and get the exact same output should be of interest of anybody wanting to have strace and 'perf trace' regression tested against each other. That last part is 'perf trace' shot at being something so useful as strace... ;-) Suggested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Signed-off-by: Howard Chu <howardchu95@gmail.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Namhyung Kim <namhyung@kernel.org> Link: https://lore.kernel.org/r/20240824163322.60796-8-howardchu95@gmail.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-09-10perf trace: Collect augmented data using BPFHoward Chu
Include trace_augment.h for TRACE_AUG_MAX_BUF, so that BPF reads TRACE_AUG_MAX_BUF bytes of buffer maximum. Determine what type of argument and how many bytes to read from user space, us ing the value in the beauty_map. This is the relation of parameter type and its corres ponding value in the beauty map, and how many bytes we read eventually: string: 1 -> size of string (till null) struct: size of struct -> size of struct buffer: -1 * (index of paired len) -> value of paired len (maximum: TRACE_AUG_ MAX_BUF) After reading from user space, we output the augmented data using bpf_perf_event_output(). If the struct augmenter, augment_sys_enter() failed, we fall back to using bpf_tail_call(). I have to make the payload 6 times the size of augmented_arg, to pass the BPF verifier. Signed-off-by: Howard Chu <howardchu95@gmail.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Namhyung Kim <namhyung@kernel.org> Link: https://lore.kernel.org/r/20240815013626.935097-10-howardchu95@gmail.com Link: https://lore.kernel.org/r/20240824163322.60796-7-howardchu95@gmail.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-09-10perf trace: Pretty print buffer dataHoward Chu
Define TRACE_AUG_MAX_BUF in trace_augment.h data, which is the maximum buffer size we can augment. BPF will include this header too. Print buffer in a way that's different than just printing a string, we print all the control characters in \digits (such as \0 for null, and \10 for newline, LF). For character that has a bigger value than 127, we print the digits instead of the character itself as well. Committer notes: Simplified the buffer scnprintf to avoid using multiple buffers as discussed in the patch review thread. We can't really all 'buf' args to SCA_BUF as we're collecting so far just on the sys_enter path, so we would be printing the previous 'read' arg buffer contents, not what the kernel puts there. So instead of: static int syscall_fmt__cmp(const void *name, const void *fmtp) @@ -1987,8 +1989,6 @@ syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field - else if (strstr(field->type, "char *") && strstr(field->name, "buf")) - arg->scnprintf = SCA_BUF; Do: static const struct syscall_fmt syscall_fmts[] = { + { .name = "write", .errpid = true, + .arg = { [1] = { .scnprintf = SCA_BUF /* buf */, from_user = true, }, }, }, Signed-off-by: Howard Chu <howardchu95@gmail.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Namhyung Kim <namhyung@kernel.org> Link: https://lore.kernel.org/r/20240815013626.935097-8-howardchu95@gmail.com Link: https://lore.kernel.org/r/20240824163322.60796-6-howardchu95@gmail.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-09-10perf trace: Pretty print struct dataHoward Chu
Change the arg->augmented.args to arg->augmented.args->value to skip the header for customized pretty printers, since we collect data in BPF using the general augment_sys_enter(), which always adds the header. Use btf_dump API to pretty print augmented struct pointer. Prefer existed pretty-printer than btf general pretty-printer. set compact = true and skip_names = true, so that no newline character and argument name are printed. Committer notes: Simplified the btf_dump_snprintf callback to avoid using multiple buffers, as discussed in the thread accessible via the Link tag below. Also made it do: dump_data_opts.skip_names = !arg->trace->show_arg_names; I.e. show the type and struct field names according to that tunable, we probably need another tunable just for this, but for now if the user wants to see syscall names in addition to its value, it makes sense to see the struct field names according to that tunable. Committer testing: The following have explicitely set beautifiers (SCA_FILENAME, SCA_SOCKADDR and SCA_PERF_ATTR), SCA_FILENAME is here just because we have been wiring up the "renameat2" ("renameat" until recently), so it doesn't use the introduced generic fallback (btf_struct_scnprintf(), see the definition of SCA_PERF_ATTR, SCA_SOCKADDR to see the more feature rich beautifiers, that are not using BTF): root@number:~# rm -f 987654 ; touch 123456 ; perf trace -e rename* mv 123456 987654 0.000 ( 0.039 ms): mv/258478 renameat2(olddfd: CWD, oldname: "123456", newdfd: CWD, newname: "987654", flags: NOREPLACE) = 0 root@number:~# perf trace -e connect,sendto ping -c 1 www.google.com 0.000 ( 0.014 ms): ping/258481 connect(fd: 5, uservaddr: { .family: LOCAL, path: /run/systemd/resolve/io.systemd.Resolve }, addrlen: 42) = 0 0.040 ( 0.003 ms): ping/258481 sendto(fd: 5, buff: 0x55bc317a6980, len: 97, flags: DONTWAIT|NOSIGNAL) = 97 18.742 ( 0.020 ms): ping/258481 sendto(fd: 5, buff: 0x7ffc04768df0, len: 20, addr: { .family: NETLINK }, addr_len: 0xc) = 20 PING www.google.com (142.251.129.68) 56(84) bytes of data. 18.783 ( 0.012 ms): ping/258481 connect(fd: 5, uservaddr: { .family: INET6, port: 0, addr: 2800:3f0:4004:810::2004 }, addrlen: 28) = 0 18.797 ( 0.001 ms): ping/258481 connect(fd: 5, uservaddr: { .family: UNSPEC }, addrlen: 16) = 0 18.800 ( 0.004 ms): ping/258481 connect(fd: 5, uservaddr: { .family: INET, port: 0, addr: 142.251.129.68 }, addrlen: 16) = 0 18.815 ( 0.002 ms): ping/258481 connect(fd: 5, uservaddr: { .family: INET, port: 1025, addr: 142.251.129.68 }, addrlen: 16) = 0 18.862 ( 0.023 ms): ping/258481 sendto(fd: 3, buff: 0x55bc317a0ac0, len: 64, addr: { .family: INET, port: 0, addr: 142.251.129.68 }, addr_len: 0x10) = 64 63.330 ( 0.038 ms): ping/258481 connect(fd: 5, uservaddr: { .family: LOCAL, path: /run/systemd/resolve/io.systemd.Resolve }, addrlen: 42) = 0 63.435 ( 0.010 ms): ping/258481 sendto(fd: 5, buff: 0x55bc317a8340, len: 110, flags: DONTWAIT|NOSIGNAL) = 110 64 bytes from rio07s07-in-f4.1e100.net (142.251.129.68): icmp_seq=1 ttl=49 time=44.2 ms --- www.google.com ping statistics --- 1 packets transmitted, 1 received, 0% packet loss, time 0ms rtt min/avg/max/mdev = 44.158/44.158/44.158/0.000 ms root@number:~# perf trace -e perf_event_open perf stat -e instructions,cache-misses,syscalls:sys_enter*sleep* sleep 1.23456789 0.000 ( 0.010 ms): :258487/258487 perf_event_open(attr_uptr: { type: 0 (PERF_TYPE_HARDWARE), config: 0xa00000000, disabled: 1, { bp_len, config2 }: 0x900000000, branch_sample_type: USER|COUNTERS, sample_regs_user: 0x3f1b7ffffffff, sample_stack_user: 258487, clockid: -599052088, sample_regs_intr: 0x60a000003eb, sample_max_stack: 14, sig_data: 120259084288 }, cpu: -1, group_fd: -1, flags: FD_CLOEXEC) = 3 0.016 ( 0.002 ms): :258487/258487 perf_event_open(attr_uptr: { type: 0 (PERF_TYPE_HARDWARE), config: 0x400000000, disabled: 1, { bp_len, config2 }: 0x900000000, branch_sample_type: USER|COUNTERS, sample_regs_user: 0x3f1b7ffffffff, sample_stack_user: 258487, clockid: -599044082, sample_regs_intr: 0x60a000003eb, sample_max_stack: 14, sig_data: 120259084288 }, cpu: -1, group_fd: -1, flags: FD_CLOEXEC) = 4 1.838 ( 0.006 ms): perf/258487 perf_event_open(attr_uptr: { type: 0 (PERF_TYPE_HARDWARE), size: 136, config: 0xa00000001, sample_type: IDENTIFIER, read_format: TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING, disabled: 1, inherit: 1, enable_on_exec: 1, exclude_guest: 1 }, pid: 258488 (perf), cpu: -1, group_fd: -1, flags: FD_CLOEXEC) = 5 1.846 ( 0.002 ms): perf/258487 perf_event_open(attr_uptr: { type: 0 (PERF_TYPE_HARDWARE), size: 136, config: 0x400000001, sample_type: IDENTIFIER, read_format: TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING, disabled: 1, inherit: 1, enable_on_exec: 1, exclude_guest: 1 }, pid: 258488 (perf), cpu: -1, group_fd: -1, flags: FD_CLOEXEC) = 6 1.849 ( 0.002 ms): perf/258487 perf_event_open(attr_uptr: { type: 0 (PERF_TYPE_HARDWARE), size: 136, config: 0xa00000003, sample_type: IDENTIFIER, read_format: TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING, disabled: 1, inherit: 1, enable_on_exec: 1, exclude_guest: 1 }, pid: 258488 (perf), cpu: -1, group_fd: -1, flags: FD_CLOEXEC) = 7 1.851 ( 0.002 ms): perf/258487 perf_event_open(attr_uptr: { type: 0 (PERF_TYPE_HARDWARE), size: 136, config: 0x400000003, sample_type: IDENTIFIER, read_format: TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING, disabled: 1, inherit: 1, enable_on_exec: 1, exclude_guest: 1 }, pid: 258488 (perf), cpu: -1, group_fd: -1, flags: FD_CLOEXEC) = 9 1.853 ( 0.600 ms): perf/258487 perf_event_open(attr_uptr: { type: 2 (tracepoint), size: 136, config: 0x190 (syscalls:sys_enter_nanosleep), sample_type: IDENTIFIER, read_format: TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING, disabled: 1, inherit: 1, enable_on_exec: 1, exclude_guest: 1 }, pid: 258488 (perf), cpu: -1, group_fd: -1, flags: FD_CLOEXEC) = 10 2.456 ( 0.016 ms): perf/258487 perf_event_open(attr_uptr: { type: 2 (tracepoint), size: 136, config: 0x196 (syscalls:sys_enter_clock_nanosleep), sample_type: IDENTIFIER, read_format: TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING, disabled: 1, inherit: 1, enable_on_exec: 1, exclude_guest: 1 }, pid: 258488 (perf), cpu: -1, group_fd: -1, flags: FD_CLOEXEC) = 11 Performance counter stats for 'sleep 1.23456789': 1,402,839 cpu_atom/instructions/ <not counted> cpu_core/instructions/ (0.00%) 11,066 cpu_atom/cache-misses/ <not counted> cpu_core/cache-misses/ (0.00%) 0 syscalls:sys_enter_nanosleep 1 syscalls:sys_enter_clock_nanosleep 1.236246714 seconds time elapsed 0.000000000 seconds user 0.001308000 seconds sys root@number:~# Now if we use it even for the ones we have a specific beautifier in tools/perf/trace/beauty, i.e. use btf_struct_scnprintf() for all structs, by adding the following patch: @@ -2316,7 +2316,7 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, default_scnprintf = sc->arg_fmt[arg.idx].scnprintf; - if (default_scnprintf == NULL || default_scnprintf == SCA_PTR) { + if (1 || (default_scnprintf == NULL || default_scnprintf == SCA_PTR)) { btf_printed = trace__btf_scnprintf(trace, &arg, bf + printed, size - printed, val, field->type); if (btf_printed) { We get: root@number:~# perf trace -e connect,sendto ping -c 1 www.google.com PING www.google.com (142.251.129.68) 56(84) bytes of data. 0.000 ( 0.015 ms): ping/283259 connect(fd: 5, uservaddr: (struct sockaddr){.sa_family = (sa_family_t)1,(union){.sa_data_min = (char[14])['/','r','u','n','/','s','y','s','t','e','m','d','/','r',],},}, addrlen: 42) = 0 0.046 ( 0.004 ms): ping/283259 sendto(fd: 5, buff: 0x559b008ae980, len: 97, flags: DONTWAIT|NOSIGNAL) = 97 0.353 ( 0.012 ms): ping/283259 sendto(fd: 5, buff: 0x7ffc01294960, len: 20, addr: (struct sockaddr){.sa_family = (sa_family_t)16,}, addr_len: 0xc) = 20 0.377 ( 0.006 ms): ping/283259 connect(fd: 5, uservaddr: (struct sockaddr){.sa_family = (sa_family_t)2,}, addrlen: 16) = 0 0.388 ( 0.010 ms): ping/283259 connect(fd: 5, uservaddr: (struct sockaddr){.sa_family = (sa_family_t)10,}, addrlen: 28) = 0 0.402 ( 0.001 ms): ping/283259 connect(fd: 5, uservaddr: (struct sockaddr){.sa_family = (sa_family_t)2,(union){.sa_data_min = (char[14])[4,1,142,251,129,'D',],},}, addrlen: 16) = 0 0.425 ( 0.045 ms): ping/283259 sendto(fd: 3, buff: 0x559b008a8ac0, len: 64, addr: (struct sockaddr){.sa_family = (sa_family_t)2,}, addr_len: 0x10) = 64 64 bytes from rio07s07-in-f4.1e100.net (142.251.129.68): icmp_seq=1 ttl=49 time=44.1 ms --- www.google.com ping statistics --- 1 packets transmitted, 1 received, 0% packet loss, time 0ms rtt min/avg/max/mdev = 44.113/44.113/44.113/0.000 ms 44.849 ( 0.038 ms): ping/283259 connect(fd: 5, uservaddr: (struct sockaddr){.sa_family = (sa_family_t)1,(union){.sa_data_min = (char[14])['/','r','u','n','/','s','y','s','t','e','m','d','/','r',],},}, addrlen: 42) = 0 44.927 ( 0.006 ms): ping/283259 sendto(fd: 5, buff: 0x559b008b03d0, len: 110, flags: DONTWAIT|NOSIGNAL) = 110 root@number:~# Which looks sane, i.e.: 18.800 ( 0.004 ms): ping/258481 connect(fd: 5, uservaddr: { .family: INET, port: 0, addr: 142.251.129.68 }, addrlen: 16) = 0 Becomes: 0.402 ( 0.001 ms): ping/283259 connect(fd: 5, uservaddr: (struct sockaddr){.sa_family = (sa_family_t)2,(union){.sa_data_min = (char[14])[4,1,142,251,129,'D',],},}, addrlen: 16) = 0 And. #define AF_UNIX 1 /* Unix domain sockets */ #define AF_LOCAL 1 /* POSIX name for AF_UNIX */ #define AF_INET 2 /* Internet IP Protocol */ <SNIP> #define AF_INET6 10 /* IP version 6 */ And 'D' == 68, so the preexisting sockaddr BPF collector is working with the new generic BTF pretty printer (btf_struct_scnprintf()), its just that it doesn't know about 'struct sockaddr' besides what is in BTF, i.e. its an array of bytes, not an IPv4 address that needs extra massaging. Ditto for the 'struct perf_event_attr' case: 1.851 ( 0.002 ms): perf/258487 perf_event_open(attr_uptr: { type: 0 (PERF_TYPE_HARDWARE), size: 136, config: 0x400000003, sample_type: IDENTIFIER, read_format: TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING, disabled: 1, inherit: 1, enable_on_exec: 1, exclude_guest: 1 }, pid: 258488 (perf), cpu: -1, group_fd: -1, flags: FD_CLOEXEC) = 9 Becomes: 2.081 ( 0.002 ms): :283304/283304 perf_event_open(attr_uptr: (struct perf_event_attr){.size = (__u32)136,.config = (__u64)17179869187,.sample_type = (__u64)65536,.read_format = (__u64)3,.disabled = (__u64)0x1,.inherit = (__u64)0x1,.enable_on_exec = (__u64)0x1,.exclude_guest = (__u64)0x1,}, pid: 283305 (sleep), cpu: -1, group_fd: -1, flags: FD_CLOEXEC) = 9 hex(17179869187) = 0x400000003, etc. read_format: TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING is enum perf_event_read_format { PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0, PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, and so on. We need to work with the libbpf btf dump api to get one output that matches the 'perf trace'/strace expectations/format, but having this in this current form is already an improvement to 'perf trace', so lets improve from what we have. Signed-off-by: Howard Chu <howardchu95@gmail.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Namhyung Kim <namhyung@kernel.org> Link: https://lore.kernel.org/r/20240815013626.935097-7-howardchu95@gmail.com Link: https://lore.kernel.org/r/20240824163322.60796-5-howardchu95@gmail.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-09-10perf trace: Add trace__bpf_sys_enter_beauty_map() to prepare for fetching ↵Howard Chu
data in BPF Set up beauty_map, load it to BPF, in such format: if argument No.3 is a struct of size 32 bytes (of syscall number 114) beauty_map[114][2] = 32; if argument No.3 is a string (of syscall number 114) beauty_map[114][2] = 1; if argument No.3 is a buffer, its size is indicated by argument No.4 (of syscall number 114) beauty_map[114][2] = -4; /* -1 ~ -6, we'll read this buffer size in BPF */ Committer notes: Moved syscall_arg_fmt__cache_btf_struct() from a ifdef HAVE_LIBBPF_SUPPORT to closer to where it is used, that is ifdef'ed on HAVE_BPF_SKEL and thus breaks the build when building with BUILD_BPF_SKEL=0, as detected using 'make -C tools/perf build-test'. Also add 'struct beauty_map_enter' to tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c as we're using it in this patch, otherwise we get this while trying to build at this point in the original patch series: builtin-trace.c: In function ‘trace__init_syscalls_bpf_prog_array_maps’: builtin-trace.c:3725:58: error: ‘struct <anonymous>’ has no member named ‘beauty_map_enter’ 3725 | int beauty_map_fd = bpf_map__fd(trace->skel->maps.beauty_map_enter); | We also have to take into account syscall_arg_fmt.from_user when telling the kernel what to copy in the sys_enter generic collector, we don't want to collect bogus data in buffers that will only be available to us at sys_exit time, i.e. after the kernel has filled it, so leave this for when we have such a sys_exit based collector. Committer testing: Not wired up yet, so all continues to work, using the existing BPF collector and userspace beautifiers that are augmentation aware: root@number:~# rm -f 987654 ; touch 123456 ; perf trace -e rename* mv 123456 987654 0.000 ( 0.031 ms): mv/20888 renameat2(olddfd: CWD, oldname: "123456", newdfd: CWD, newname: "987654", flags: NOREPLACE) = 0 root@number:~# perf trace -e connect,sendto ping -c 1 www.google.com 0.000 ( 0.014 ms): ping/20892 connect(fd: 5, uservaddr: { .family: LOCAL, path: /run/systemd/resolve/io.systemd.Resolve }, addrlen: 42) = 0 0.040 ( 0.003 ms): ping/20892 sendto(fd: 5, buff: 0x560b4ff17980, len: 97, flags: DONTWAIT|NOSIGNAL) = 97 0.480 ( 0.017 ms): ping/20892 sendto(fd: 5, buff: 0x7ffd82d07150, len: 20, addr: { .family: NETLINK }, addr_len: 0xc) = 20 0.526 ( 0.014 ms): ping/20892 connect(fd: 5, uservaddr: { .family: INET6, port: 0, addr: 2800:3f0:4004:810::2004 }, addrlen: 28) = 0 0.542 ( 0.002 ms): ping/20892 connect(fd: 5, uservaddr: { .family: UNSPEC }, addrlen: 16) = 0 0.544 ( 0.004 ms): ping/20892 connect(fd: 5, uservaddr: { .family: INET, port: 0, addr: 142.251.135.100 }, addrlen: 16) = 0 0.559 ( 0.002 ms): ping/20892 connect(fd: 5, uservaddr: { .family: INET, port: 1025, addr: 142.251.135.100 }, addrlen: 16PING www.google.com (142.251.135.100) 56(84) bytes of data. ) = 0 0.589 ( 0.058 ms): ping/20892 sendto(fd: 3, buff: 0x560b4ff11ac0, len: 64, addr: { .family: INET, port: 0, addr: 142.251.135.100 }, addr_len: 0x10) = 64 45.250 ( 0.029 ms): ping/20892 connect(fd: 5, uservaddr: { .family: LOCAL, path: /run/systemd/resolve/io.systemd.Resolve }, addrlen: 42) = 0 45.344 ( 0.012 ms): ping/20892 sendto(fd: 5, buff: 0x560b4ff19340, len: 111, flags: DONTWAIT|NOSIGNAL) = 111 64 bytes from rio09s08-in-f4.1e100.net (142.251.135.100): icmp_seq=1 ttl=49 time=44.4 ms --- www.google.com ping statistics --- 1 packets transmitted, 1 received, 0% packet loss, time 0ms rtt min/avg/max/mdev = 44.361/44.361/44.361/0.000 ms root@number:~# Signed-off-by: Howard Chu <howardchu95@gmail.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Namhyung Kim <namhyung@kernel.org> Link: https://lore.kernel.org/r/20240815013626.935097-4-howardchu95@gmail.com Link: https://lore.kernel.org/r/20240824163322.60796-3-howardchu95@gmail.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-09-10perf trace: Mark bpf's attr as from_userArnaldo Carvalho de Melo
This one has no specific pretty printer right now, so will be handled by the generic BTF based one later in this patch series. Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-09-10Merge branch 'linus' into timers/coreThomas Gleixner
To update with the latest fixes.
2024-09-10tools/virtio:Fix the wrong format specifierZhu Jun
The unsigned int should use "%u" instead of "%d". Signed-off-by: Zhu Jun <zhujun2@cmss.chinamobile.com> Message-Id: <20240724074108.9530-1-zhujun2@cmss.chinamobile.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Reviewed-by: Eugenio Pérez <eperezma@redhat.com> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
2024-09-09KVM: selftests: Verify single-stepping a fastpath VM-Exit exits to userspaceSean Christopherson
In x86's debug_regs test, change the RDMSR(MISC_ENABLES) in the single-step testcase to a WRMSR(TSC_DEADLINE) in order to verify that KVM honors KVM_GUESTDBG_SINGLESTEP when handling a fastpath VM-Exit. Note, the extra coverage is effectively Intel-only, as KVM only handles TSC_DEADLINE in the fastpath when the timer is emulated via the hypervisor timer, a.k.a. the VMX preemption timer. Link: https://lore.kernel.org/r/20240830044448.130449-1-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com>
2024-09-09selftests/net: integrate packetdrill with ksftWillem de Bruijn
Lay the groundwork to import into kselftests the over 150 packetdrill TCP/IP conformance tests on github.com/google/packetdrill. Florian recently added support for packetdrill tests in nf_conntrack, in commit a8a388c2aae49 ("selftests: netfilter: add packetdrill based conntrack tests"). This patch takes a slightly different approach. It relies on ksft_runner.sh to run every *.pkt file in the directory. Any future imports of packetdrill tests should require no additional coding. Just add the *.pkt files. Initially import only two features/directories from github. One with a single script, and one with two. This was the only reason to pick tcp/inq and tcp/md5. The path replaces the directory hierarchy in github with a flat space of files: $(subst /,_,$(wildcard tcp/**/*.pkt)). This is the most straightforward option to integrate with kselftests. The Linked thread reviewed two ways to maintain the hierarchy: TEST_PROGS_RECURSE and PRESERVE_TEST_DIRS. But both introduce significant changes to kselftest infra and with that risk to existing tests. Implementation notes: - restore alphabetical order when adding the new directory to tools/testing/selftests/Makefile - imported *.pkt files and support verbatim from the github project, except for - update `source ./defaults.sh` path (to adjust for flat dir) - add SPDX headers - remove one author statement - Acknowledgment: drop an e (checkpatch) Tested: make -C tools/testing/selftests \ TARGETS=net/packetdrill \ run_tests make -C tools/testing/selftests \ TARGETS=net/packetdrill \ install INSTALL_PATH=$KSFT_INSTALL_PATH # in virtme-ng ./run_kselftest.sh -c net/packetdrill ./run_kselftest.sh -t net/packetdrill:tcp_inq_client.pkt Link: https://lore.kernel.org/netdev/20240827193417.2792223-1-willemdebruijn.kernel@gmail.com/ Signed-off-by: Willem de Bruijn <willemb@google.com> Link: https://patch.msgid.link/20240905231653.2427327-3-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-09-09selftests: support interpreted scripts with ksft_runner.shWillem de Bruijn
Support testcases that are themselves not executable, but need an interpreter to run them. If a test file is not executable, but an executable file ksft_runner.sh exists in the TARGET dir, kselftest will run ./ksft_runner.sh ./$BASENAME_TEST Packetdrill may add hundreds of packetdrill scripts for testing. These scripts must be passed to the packetdrill process. Have kselftest run each test directly, as it already solves common runner requirements like parallel execution and isolation (netns). A previous RFC added a wrapper in between, which would have to reimplement such functionality. Link: https://lore.kernel.org/netdev/66d4d97a4cac_3df182941a@willemb.c.googlers.com.notmuch/T/ Signed-off-by: Willem de Bruijn <willemb@google.com> Link: https://patch.msgid.link/20240905231653.2427327-2-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-09-09af_unix: Don't return OOB skb in manage_oob().Kuniyuki Iwashima
syzbot reported use-after-free in unix_stream_recv_urg(). [0] The scenario is 1. send(MSG_OOB) 2. recv(MSG_OOB) -> The consumed OOB remains in recv queue 3. send(MSG_OOB) 4. recv() -> manage_oob() returns the next skb of the consumed OOB -> This is also OOB, but unix_sk(sk)->oob_skb is not cleared 5. recv(MSG_OOB) -> unix_sk(sk)->oob_skb is used but already freed The recent commit 8594d9b85c07 ("af_unix: Don't call skb_get() for OOB skb.") uncovered the issue. If the OOB skb is consumed and the next skb is peeked in manage_oob(), we still need to check if the skb is OOB. Let's do so by falling back to the following checks in manage_oob() and add the test case in selftest. Note that we need to add a similar check for SIOCATMARK. [0]: BUG: KASAN: slab-use-after-free in unix_stream_read_actor+0xa6/0xb0 net/unix/af_unix.c:2959 Read of size 4 at addr ffff8880326abcc4 by task syz-executor178/5235 CPU: 0 UID: 0 PID: 5235 Comm: syz-executor178 Not tainted 6.11.0-rc5-syzkaller-00742-gfbdaffe41adc #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/06/2024 Call Trace: <TASK> __dump_stack lib/dump_stack.c:93 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:119 print_address_description mm/kasan/report.c:377 [inline] print_report+0x169/0x550 mm/kasan/report.c:488 kasan_report+0x143/0x180 mm/kasan/report.c:601 unix_stream_read_actor+0xa6/0xb0 net/unix/af_unix.c:2959 unix_stream_recv_urg+0x1df/0x320 net/unix/af_unix.c:2640 unix_stream_read_generic+0x2456/0x2520 net/unix/af_unix.c:2778 unix_stream_recvmsg+0x22b/0x2c0 net/unix/af_unix.c:2996 sock_recvmsg_nosec net/socket.c:1046 [inline] sock_recvmsg+0x22f/0x280 net/socket.c:1068 ____sys_recvmsg+0x1db/0x470 net/socket.c:2816 ___sys_recvmsg net/socket.c:2858 [inline] __sys_recvmsg+0x2f0/0x3e0 net/socket.c:2888 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f5360d6b4e9 Code: 48 83 c4 28 c3 e8 37 17 00 00 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fff29b3a458 EFLAGS: 00000246 ORIG_RAX: 000000000000002f RAX: ffffffffffffffda RBX: 00007fff29b3a638 RCX: 00007f5360d6b4e9 RDX: 0000000000002001 RSI: 0000000020000640 RDI: 0000000000000003 RBP: 00007f5360dde610 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000001 R13: 00007fff29b3a628 R14: 0000000000000001 R15: 0000000000000001 </TASK> Allocated by task 5235: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3f/0x80 mm/kasan/common.c:68 unpoison_slab_object mm/kasan/common.c:312 [inline] __kasan_slab_alloc+0x66/0x80 mm/kasan/common.c:338 kasan_slab_alloc include/linux/kasan.h:201 [inline] slab_post_alloc_hook mm/slub.c:3988 [inline] slab_alloc_node mm/slub.c:4037 [inline] kmem_cache_alloc_node_noprof+0x16b/0x320 mm/slub.c:4080 __alloc_skb+0x1c3/0x440 net/core/skbuff.c:667 alloc_skb include/linux/skbuff.h:1320 [inline] alloc_skb_with_frags+0xc3/0x770 net/core/skbuff.c:6528 sock_alloc_send_pskb+0x91a/0xa60 net/core/sock.c:2815 sock_alloc_send_skb include/net/sock.h:1778 [inline] queue_oob+0x108/0x680 net/unix/af_unix.c:2198 unix_stream_sendmsg+0xd24/0xf80 net/unix/af_unix.c:2351 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x221/0x270 net/socket.c:745 ____sys_sendmsg+0x525/0x7d0 net/socket.c:2597 ___sys_sendmsg net/socket.c:2651 [inline] __sys_sendmsg+0x2b0/0x3a0 net/socket.c:2680 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Freed by task 5235: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3f/0x80 mm/kasan/common.c:68 kasan_save_free_info+0x40/0x50 mm/kasan/generic.c:579 poison_slab_object+0xe0/0x150 mm/kasan/common.c:240 __kasan_slab_free+0x37/0x60 mm/kasan/common.c:256 kasan_slab_free include/linux/kasan.h:184 [inline] slab_free_hook mm/slub.c:2252 [inline] slab_free mm/slub.c:4473 [inline] kmem_cache_free+0x145/0x350 mm/slub.c:4548 unix_stream_read_generic+0x1ef6/0x2520 net/unix/af_unix.c:2917 unix_stream_recvmsg+0x22b/0x2c0 net/unix/af_unix.c:2996 sock_recvmsg_nosec net/socket.c:1046 [inline] sock_recvmsg+0x22f/0x280 net/socket.c:1068 __sys_recvfrom+0x256/0x3e0 net/socket.c:2255 __do_sys_recvfrom net/socket.c:2273 [inline] __se_sys_recvfrom net/socket.c:2269 [inline] __x64_sys_recvfrom+0xde/0x100 net/socket.c:2269 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f The buggy address belongs to the object at ffff8880326abc80 which belongs to the cache skbuff_head_cache of size 240 The buggy address is located 68 bytes inside of freed 240-byte region [ffff8880326abc80, ffff8880326abd70) The buggy address belongs to the physical page: page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x326ab ksm flags: 0xfff00000000000(node=0|zone=1|lastcpupid=0x7ff) page_type: 0xfdffffff(slab) raw: 00fff00000000000 ffff88801eaee780 ffffea0000b7dc80 dead000000000003 raw: 0000000000000000 00000000800c000c 00000001fdffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as allocated page last allocated via order 0, migratetype Unmovable, gfp_mask 0x52cc0(GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP), pid 4686, tgid 4686 (udevadm), ts 32357469485, free_ts 28829011109 set_page_owner include/linux/page_owner.h:32 [inline] post_alloc_hook+0x1f3/0x230 mm/page_alloc.c:1493 prep_new_page mm/page_alloc.c:1501 [inline] get_page_from_freelist+0x2e4c/0x2f10 mm/page_alloc.c:3439 __alloc_pages_noprof+0x256/0x6c0 mm/page_alloc.c:4695 __alloc_pages_node_noprof include/linux/gfp.h:269 [inline] alloc_pages_node_noprof include/linux/gfp.h:296 [inline] alloc_slab_page+0x5f/0x120 mm/slub.c:2321 allocate_slab+0x5a/0x2f0 mm/slub.c:2484 new_slab mm/slub.c:2537 [inline] ___slab_alloc+0xcd1/0x14b0 mm/slub.c:3723 __slab_alloc+0x58/0xa0 mm/slub.c:3813 __slab_alloc_node mm/slub.c:3866 [inline] slab_alloc_node mm/slub.c:4025 [inline] kmem_cache_alloc_node_noprof+0x1fe/0x320 mm/slub.c:4080 __alloc_skb+0x1c3/0x440 net/core/skbuff.c:667 alloc_skb include/linux/skbuff.h:1320 [inline] alloc_uevent_skb+0x74/0x230 lib/kobject_uevent.c:289 uevent_net_broadcast_untagged lib/kobject_uevent.c:326 [inline] kobject_uevent_net_broadcast+0x2fd/0x580 lib/kobject_uevent.c:410 kobject_uevent_env+0x57d/0x8e0 lib/kobject_uevent.c:608 kobject_synth_uevent+0x4ef/0xae0 lib/kobject_uevent.c:207 uevent_store+0x4b/0x70 drivers/base/bus.c:633 kernfs_fop_write_iter+0x3a1/0x500 fs/kernfs/file.c:334 new_sync_write fs/read_write.c:497 [inline] vfs_write+0xa72/0xc90 fs/read_write.c:590 page last free pid 1 tgid 1 stack trace: reset_page_owner include/linux/page_owner.h:25 [inline] free_pages_prepare mm/page_alloc.c:1094 [inline] free_unref_page+0xd22/0xea0 mm/page_alloc.c:2612 kasan_depopulate_vmalloc_pte+0x74/0x90 mm/kasan/shadow.c:408 apply_to_pte_range mm/memory.c:2797 [inline] apply_to_pmd_range mm/memory.c:2841 [inline] apply_to_pud_range mm/memory.c:2877 [inline] apply_to_p4d_range mm/memory.c:2913 [inline] __apply_to_page_range+0x8a8/0xe50 mm/memory.c:2947 kasan_release_vmalloc+0x9a/0xb0 mm/kasan/shadow.c:525 purge_vmap_node+0x3e3/0x770 mm/vmalloc.c:2208 __purge_vmap_area_lazy+0x708/0xae0 mm/vmalloc.c:2290 _vm_unmap_aliases+0x79d/0x840 mm/vmalloc.c:2885 change_page_attr_set_clr+0x2fe/0xdb0 arch/x86/mm/pat/set_memory.c:1881 change_page_attr_set arch/x86/mm/pat/set_memory.c:1922 [inline] set_memory_nx+0xf2/0x130 arch/x86/mm/pat/set_memory.c:2110 free_init_pages arch/x86/mm/init.c:924 [inline] free_kernel_image_pages arch/x86/mm/init.c:943 [inline] free_initmem+0x79/0x110 arch/x86/mm/init.c:970 kernel_init+0x31/0x2b0 init/main.c:1476 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Memory state around the buggy address: ffff8880326abb80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8880326abc00: fb fb fb fb fb fb fc fc fc fc fc fc fc fc fc fc >ffff8880326abc80: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff8880326abd00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fc fc ffff8880326abd80: fc fc fc fc fc fc fc fc fa fb fb fb fb fb fb fb Fixes: 93c99f21db36 ("af_unix: Don't stop recv(MSG_DONTWAIT) if consumed OOB skb is at the head.") Reported-by: syzbot+8811381d455e3e9ec788@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=8811381d455e3e9ec788 Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com> Link: https://patch.msgid.link/20240905193240.17565-5-kuniyu@amazon.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-09-09selftests: mptcp: connect: remove duplicated spaces in TAP outputMatthieu Baerts (NGI0)
It is nice to have a visual alignment in the test output to present the different results, but it makes less sense in the TAP output that is there for computers. It sounds then better to remove the duplicated whitespaces in the TAP output, also because it can cause some issues with TAP parsers expecting only one space around the directive delimiter (#). While at it, change the variable name (result_msg) to something more explicit. Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Link: https://patch.msgid.link/20240906-net-next-mptcp-ksft-subtest-time-v2-5-31d5ee4f3bdf@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-09-09selftests: mptcp: diag: remove trailing whitespaceMatthieu Baerts (NGI0)
It doesn't need to be there, and it can cause some issues with TAP parsers expecting only one space around the directive delimiter (#). Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Link: https://patch.msgid.link/20240906-net-next-mptcp-ksft-subtest-time-v2-4-31d5ee4f3bdf@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-09-09selftests: mptcp: reset the last TS before the first testMatthieu Baerts (NGI0)
Just to slightly improve the precision of the duration of the first test. In mptcp_join.sh, the last append_prev_results is now done as soon as the last test is over: this will add the last result in the list, and get a more precise time for this last test. Reviewed-by: Mat Martineau <martineau@kernel.org> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Link: https://patch.msgid.link/20240906-net-next-mptcp-ksft-subtest-time-v2-3-31d5ee4f3bdf@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-09-09selftests: mptcp: connect: remote time in TAP outputMatthieu Baerts (NGI0)
It is now added by the MPTCP lib automatically, see the parent commit. The time in the TAP output might be slightly different from the one displayed before, but that's OK. Reviewed-by: Mat Martineau <martineau@kernel.org> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Link: https://patch.msgid.link/20240906-net-next-mptcp-ksft-subtest-time-v2-2-31d5ee4f3bdf@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-09-09selftests: mptcp: lib: add time per subtests in TAP outputMatthieu Baerts (NGI0)
It adds 'time=<N>ms' in the diagnostic data of the TAP output, e.g. ok 1 - pm_netlink: defaults addr list # time=9ms This addition is useful to quickly identify which subtests are taking a longer time than the others, or more than expected. Note that there are no specific formats to follow to show this time according to the TAP 13 [1], TAP 14 [2] and KTAP [3] specifications. Let's then define this one here. Link: https://testanything.org/tap-version-13-specification.html [1] Link: https://testanything.org/tap-version-14-specification.html [2] Link: https://docs.kernel.org/dev-tools/ktap.html [3] Reviewed-by: Mat Martineau <martineau@kernel.org> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Link: https://patch.msgid.link/20240906-net-next-mptcp-ksft-subtest-time-v2-1-31d5ee4f3bdf@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-09-09tools/mm: rm thp_swap_allocator_test when make cleanzhangjiao
rm thp_swap_allocator_test when make clean Link: https://lkml.kernel.org/r/20240829042008.6937-1-zhangjiao2@cmss.chinamobile.com Signed-off-by: zhangjiao <zhangjiao2@cmss.chinamobile.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-09-09scx_qmap: Implement highpri boostingTejun Heo
Implement a silly boosting mechanism for nice -20 tasks. The only purpose is demonstrating and testing scx_bpf_dispatch_from_dsq(). The boosting only works within SHARED_DSQ and makes only minor differences with increased dispatch batch (-b). This exercises moving tasks to a user DSQ and all local DSQs from ops.dispatch() and BPF timerfn. v2: - Updated to use scx_bpf_dispatch_from_dsq_set_{slice|vtime}(). - Drop the workaround for the iterated tasks not being trusted by the verifier. The issue is fixed from BPF side. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Daniel Hodges <hodges.daniel.scott@gmail.com> Cc: David Vernet <void@manifault.com> Cc: Changwoo Min <multics69@gmail.com> Cc: Andrea Righi <andrea.righi@linux.dev> Cc: Dan Schatzberg <schatzberg.dan@gmail.com>
2024-09-09sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()Tejun Heo
Once a task is put into a DSQ, the allowed operations are fairly limited. Tasks in the built-in local and global DSQs are executed automatically and, ignoring dequeue, there is only one way a task in a user DSQ can be manipulated - scx_bpf_consume() moves the first task to the dispatching local DSQ. This inflexibility sometimes gets in the way and is an area where multiple feature requests have been made. Implement scx_bpf_dispatch[_vtime]_from_dsq(), which can be called during DSQ iteration and can move the task to any DSQ - local DSQs, global DSQ and user DSQs. The kfuncs can be called from ops.dispatch() and any BPF context which dosen't hold a rq lock including BPF timers and SYSCALL programs. This is an expansion of an earlier patch which only allowed moving into the dispatching local DSQ: http://lkml.kernel.org/r/Zn4Cw4FDTmvXnhaf@slm.duckdns.org v2: Remove @slice and @vtime from scx_bpf_dispatch_from_dsq[_vtime]() as they push scx_bpf_dispatch_from_dsq_vtime() over the kfunc argument count limit and often won't be needed anyway. Instead provide scx_bpf_dispatch_from_dsq_set_{slice|vtime}() kfuncs which can be called only when needed and override the specified parameter for the subsequent dispatch. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Daniel Hodges <hodges.daniel.scott@gmail.com> Cc: David Vernet <void@manifault.com> Cc: Changwoo Min <multics69@gmail.com> Cc: Andrea Righi <andrea.righi@linux.dev> Cc: Dan Schatzberg <schatzberg.dan@gmail.com>
2024-09-09selftests: return failure when timestamps can't be reportedJason Xing
When I was trying to modify the tx timestamping feature, I found that running "./txtimestamp -4 -C -L 127.0.0.1" didn't reflect the error: I succeeded to generate timestamp stored in the skb but later failed to report it to the userspace (which means failed to put css into cmsg). It can happen when someone writes buggy codes in __sock_recv_timestamp(), for example. After adding the check so that running ./txtimestamp will reflect the result correctly like this if there is a bug in the reporting phase: protocol: TCP payload: 10 server port: 9000 family: INET test SND USR: 1725458477 s 667997 us (seq=0, len=0) Failed to report timestamps USR: 1725458477 s 718128 us (seq=0, len=0) Failed to report timestamps USR: 1725458477 s 768273 us (seq=0, len=0) Failed to report timestamps USR: 1725458477 s 818416 us (seq=0, len=0) Failed to report timestamps ... In the future, it will help us detect whether the new coming patch has bugs or not. Signed-off-by: Jason Xing <kernelxing@tencent.com> Reviewed-by: Willem de Bruijn <willemb@google.com> Link: https://patch.msgid.link/20240905160035.62407-1-kerneljasonxing@gmail.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-09-09selftests/mm: relax test to fail after 100 migration failuresDev Jain
It was recently observed at [1] that during the folio unmapping stage of migration, when the PTEs are cleared, a racing thread faulting on that folio may increase the refcount of the folio, sleep on the folio lock (the migration path has the lock), and migration ultimately fails when asserting the actual refcount against the expected. Thereby, the migration selftest fails on shared-anon mappings. The above enforces the fact that migration is a best-effort service, therefore, it is wrong to fail the test for just a single failure; hence, fail the test after 100 consecutive failures (where 100 is still a subjective choice). Note that, this has no effect on the execution time of the test since that is controlled by a timeout. [1] https://lore.kernel.org/all/20240801081657.1386743-1-dev.jain@arm.com/ Link: https://lkml.kernel.org/r/20240830051609.4037834-1-dev.jain@arm.com Signed-off-by: Dev Jain <dev.jain@arm.com> Suggested-by: David Hildenbrand <david@redhat.com> Reviewed-by: Ryan Roberts <ryan.roberts@arm.com> Tested-by: Ryan Roberts <ryan.roberts@arm.com> Cc: Alistair Popple <apopple@nvidia.com> Cc: Aneesh Kumar K.V <aneesh.kumar@kernel.org> Cc: Anshuman Khandual <anshuman.khandual@arm.com> Cc: Baolin Wang <baolin.wang@linux.alibaba.com> Cc: Barry Song <baohua@kernel.org> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Christoph Lameter <cl@gentwo.org> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Gavin Shan <gshan@redhat.com> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Hugh Dickins <hughd@google.com> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Cc: Lance Yang <ioworker0@gmail.com> Cc: Mark Brown <broonie@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Oscar Salvador <osalvador@suse.de> Cc: Shuah Khan <shuah@kernel.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Will Deacon <will@kernel.org> Cc: Yang Shi <yang@os.amperecomputing.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-09-09mm: selftest to verify zero-filled pages are mapped to zeropageAlexander Zhu
When a THP is split, any subpage that is zero-filled will be mapped to the shared zeropage, hence saving memory. Add selftest to verify this by allocating zero-filled THP and comparing RssAnon before and after split. Link: https://lkml.kernel.org/r/20240830100438.3623486-4-usamaarif642@gmail.com Signed-off-by: Alexander Zhu <alexlzhu@fb.com> Signed-off-by: Usama Arif <usamaarif642@gmail.com> Acked-by: Rik van Riel <riel@surriel.com> Cc: Barry Song <baohua@kernel.org> Cc: David Hildenbrand <david@redhat.com> Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Kairui Song <ryncsn@gmail.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Mike Rapoport <rppt@kernel.org> Cc: Nico Pache <npache@redhat.com> Cc: Roman Gushchin <roman.gushchin@linux.dev> Cc: Ryan Roberts <ryan.roberts@arm.com> Cc: Shakeel Butt <shakeel.butt@linux.dev> Cc: Shuang Zhai <zhais@google.com> Cc: Yu Zhao <yuzhao@google.com> Cc: Shuang Zhai <szhai2@cs.rochester.edu> Cc: Hugh Dickins <hughd@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-09-09libbpf: Fix some typos in commentsYusheng Zheng
Fix some spelling errors in the code comments of libbpf: betwen -> between paremeters -> parameters knowning -> knowing definiton -> definition compatiblity -> compatibility overriden -> overridden occured -> occurred proccess -> process managment -> management nessary -> necessary Signed-off-by: Yusheng Zheng <yunwei356@gmail.com> Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Link: https://lore.kernel.org/bpf/20240909225952.30324-1-yunwei356@gmail.com
2024-09-09bpf: Fix error message on kfunc arg type mismatchMaxim Mikityanskiy
When "arg#%d expected pointer to ctx, but got %s" error is printed, both template parts actually point to the type of the argument, therefore, it will also say "but got PTR", regardless of what was the actual register type. Fix the message to print the register type in the second part of the template, change the existing test to adapt to the new format, and add a new test to test the case when arg is a pointer to context, but reg is a scalar. Fixes: 00b85860feb8 ("bpf: Rewrite kfunc argument handling") Signed-off-by: Maxim Mikityanskiy <maxim@isovalent.com> Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com> Link: https://lore.kernel.org/bpf/20240909133909.1315460-1-maxim@isovalent.com
2024-09-09bpftool: Fix typosAndrew Kreimer
Fix typos in documentation. Reported-by: Matthew Wilcox <willy@infradead.org> Reported-by: Quentin Monnet <qmo@kernel.org> Signed-off-by: Andrew Kreimer <algonell@gmail.com> Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Acked-by: Quentin Monnet <qmo@kernel.org> Link: https://lore.kernel.org/bpf/20240909092452.4293-1-algonell@gmail.com
2024-09-09bpftool: Fix undefined behavior caused by shifting into the sign bitKuan-Wei Chiu
Replace shifts of '1' with '1U' in bitwise operations within __show_dev_tc_bpf() to prevent undefined behavior caused by shifting into the sign bit of a signed integer. By using '1U', the operations are explicitly performed on unsigned integers, avoiding potential integer overflow or sign-related issues. Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com> Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Acked-by: Quentin Monnet <qmo@kernel.org> Link: https://lore.kernel.org/bpf/20240908140009.3149781-1-visitorckw@gmail.com
2024-09-09libbpf: Fixed getting wrong return address on arm64 architectureShuyi Cheng
ARM64 has a separate lr register to store the return address, so here you only need to read the lr register to get the return address, no need to dereference it again. Signed-off-by: Shuyi Cheng <chengshuyi@linux.alibaba.com> Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Link: https://lore.kernel.org/bpf/1725787433-77262-1-git-send-email-chengshuyi@linux.alibaba.com
2024-09-09perf trace: Introduce SCA_TIMESPEC_FROM_USER() to set .from_user = trueArnaldo Carvalho de Melo
Paving the way for the generic BPF BTF based syscall arg augmenter. Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Howard Chu <howardchu95@gmail.com> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Namhyung Kim <namhyung@kernel.org> Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-09-09perf trace: Introduce SCA_SOCKADDR_FROM_USER() to set .from_user = trueArnaldo Carvalho de Melo
Paving the way for the generic BPF BTF based syscall arg augmenter. Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Howard Chu <howardchu95@gmail.com> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Namhyung Kim <namhyung@kernel.org> Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-09-09perf trace: Introduce SCA_PERF_ATTR_FROM_USER() to set .from_user = trueArnaldo Carvalho de Melo
Paving the way for the generic BPF BTF based syscall arg augmenter. Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Howard Chu <howardchu95@gmail.com> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Namhyung Kim <namhyung@kernel.org> Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-09-09perf trace: Mark which syscall arguments go from user space to kernel spaceArnaldo Carvalho de Melo
We need to know where to collect it in the BPF augmenters, if in the sys_enter hook or in the sys_exit hook. Start with the SCA_FILENAME one, that is just from user to kernel space. The alternative, better, but takes a bit more time than I have now, is to use the __user information that is already in the syscall args and encoded in BTF via a tag, do it later. Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Howard Chu <howardchu95@gmail.com> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Namhyung Kim <namhyung@kernel.org> Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-09-09perf trace: Use a common encoding for augmented arguments, with size + error ↵Arnaldo Carvalho de Melo
+ payload We were using a more compact format, without explicitely encoding the size and possible error in the payload for an argument. To do it generically, at least as Howard Chu did in his GSoC activities, it is more convenient to use the same model that was being used for string arguments, passing { size, error, payload }. So use that for the non string syscall args we have so far: struct timespec struct perf_event_attr struct sockaddr (this one has even a variable size) With this in place we have the userspace pretty printers: perf_event_attr___scnprintf() syscall_arg__scnprintf_augmented_sockaddr() syscall_arg__scnprintf_augmented_timespec() Ready to have the generic BPF collector in tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c sending its generic payload and thus we'll use them instead of a generic libbpf btf_dump interface that doesn't know about about the sockaddr mux, perf_event_attr non-trivial fields (sample_type, etc), leaving it as a (useful) fallback that prints just basic types until we put in place a more sophisticated pretty printer infrastructure that associates synthesized enums to struct fields using the header scrapers we have in tools/perf/trace/beauty/, some of them in this list: $ ls tools/perf/trace/beauty/*.sh tools/perf/trace/beauty/arch_errno_names.sh tools/perf/trace/beauty/kcmp_type.sh tools/perf/trace/beauty/perf_ioctl.sh tools/perf/trace/beauty/statx_mask.sh tools/perf/trace/beauty/clone.sh tools/perf/trace/beauty/kvm_ioctl.sh tools/perf/trace/beauty/pkey_alloc_access_rights.sh tools/perf/trace/beauty/sync_file_range.sh tools/perf/trace/beauty/drm_ioctl.sh tools/perf/trace/beauty/madvise_behavior.sh tools/perf/trace/beauty/prctl_option.sh tools/perf/trace/beauty/usbdevfs_ioctl.sh tools/perf/trace/beauty/fadvise.sh tools/perf/trace/beauty/mmap_flags.sh tools/perf/trace/beauty/rename_flags.sh tools/perf/trace/beauty/vhost_virtio_ioctl.sh tools/perf/trace/beauty/fs_at_flags.sh tools/perf/trace/beauty/mmap_prot.sh tools/perf/trace/beauty/sndrv_ctl_ioctl.sh tools/perf/trace/beauty/x86_arch_prctl.sh tools/perf/trace/beauty/fsconfig.sh tools/perf/trace/beauty/mount_flags.sh tools/perf/trace/beauty/sndrv_pcm_ioctl.sh tools/perf/trace/beauty/fsmount.sh tools/perf/trace/beauty/move_mount_flags.sh tools/perf/trace/beauty/sockaddr.sh tools/perf/trace/beauty/fspick.sh tools/perf/trace/beauty/mremap_flags.sh tools/perf/trace/beauty/socket.sh $ Testing it: root@number:~# rm -f 987654 ; touch 123456 ; perf trace -e rename* mv 123456 987654 0.000 ( 0.031 ms): mv/1193096 renameat2(olddfd: CWD, oldname: "123456", newdfd: CWD, newname: "987654", flags: NOREPLACE) = 0 root@number:~# perf trace -e *nanosleep sleep 1.2345678901 0.000 (1234.654 ms): sleep/1192697 clock_nanosleep(rqtp: { .tv_sec: 1, .tv_nsec: 234567891 }, rmtp: 0x7ffe1ea80460) = 0 root@number:~# perf trace -e perf_event_open* perf stat -e cpu-clock sleep 1 0.000 ( 0.011 ms): perf/1192701 perf_event_open(attr_uptr: { type: 1 (software), size: 136, config: 0 (PERF_COUNT_SW_CPU_CLOCK), sample_type: IDENTIFIER, read_format: TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING, disabled: 1, inherit: 1, enable_on_exec: 1, exclude_guest: 1 }, pid: 1192702 (perf), cpu: -1, group_fd: -1, flags: FD_CLOEXEC) = 3 Performance counter stats for 'sleep 1': 0.51 msec cpu-clock # 0.001 CPUs utilized 1.001242090 seconds time elapsed 0.000000000 seconds user 0.001010000 seconds sys root@number:~# perf trace -e connect* ping -c 1 bsky.app 0.000 ( 0.130 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: LOCAL, path: /run/systemd/resolve/io.systemd.Resolve }, addrlen: 42) = 0 23.907 ( 0.006 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: INET, port: 0, addr: 3.20.108.158 }, addrlen: 16) = 0 23.915 PING bsky.app (3.20.108.158) 56(84) bytes of data. ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: UNSPEC }, addrlen: 16) = 0 23.917 ( 0.002 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: INET, port: 0, addr: 3.12.170.30 }, addrlen: 16) = 0 23.921 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: UNSPEC }, addrlen: 16) = 0 23.923 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: INET, port: 0, addr: 18.217.70.179 }, addrlen: 16) = 0 23.925 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: UNSPEC }, addrlen: 16) = 0 23.927 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: INET, port: 0, addr: 3.132.20.46 }, addrlen: 16) = 0 23.930 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: UNSPEC }, addrlen: 16) = 0 23.931 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: INET, port: 0, addr: 3.142.89.165 }, addrlen: 16) = 0 23.934 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: UNSPEC }, addrlen: 16) = 0 23.935 ( 0.002 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: INET, port: 0, addr: 18.119.147.159 }, addrlen: 16) = 0 23.938 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: UNSPEC }, addrlen: 16) = 0 23.940 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: INET, port: 0, addr: 3.22.38.164 }, addrlen: 16) = 0 23.942 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: UNSPEC }, addrlen: 16) = 0 23.944 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: INET, port: 0, addr: 3.13.14.133 }, addrlen: 16) = 0 23.956 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: INET, port: 1025, addr: 3.20.108.158 }, addrlen: 16) = 0 ^C --- bsky.app ping statistics --- 1 packets transmitted, 0 received, 100% packet loss, time 0ms root@number:~# Reviewed-by: Ian Rogers <irogers@google.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Howard Chu <howardchu95@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Namhyung Kim <namhyung@kernel.org> Link: https://lore.kernel.org/lkml/CAP-5=fW4=2GoP6foAN6qbrCiUzy0a_TzHbd8rvDsakTPfdzvfg@mail.gmail.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-09-09perf trace augmented_syscalls.bpf: Move the renameat aumenter to renameat2, ↵Arnaldo Carvalho de Melo
temporarily While trying to shape Howard Chu's generic BPF augmenter transition into the codebase I got stuck with the renameat2 syscall. Until I noticed that the attempt at reusing augmenters were making it use the 'openat' syscall augmenter, that collect just one string syscall arg, for the 'renameat2' syscall, that takes two strings. So, for the moment, just to help in this transition period, since 'renameat2' is what is used these days in the 'mv' utility, just make the BPF collector be associated with the more widely used syscall, hopefully the transition to Howard's generic BPF augmenter will cure this, so get this out of the way for now! So now we still have that odd "reuse", but for something we're not testing so won't get in the way anymore: root@number:~# rm -f 987654 ; touch 123456 ; perf trace -vv -e rename* mv 123456 987654 |& grep renameat Reusing "openat" BPF sys_enter augmenter for "renameat" 0.000 ( 0.079 ms): mv/1158612 renameat2(olddfd: CWD, oldname: "123456", newdfd: CWD, newname: "987654", flags: NOREPLACE) = 0 root@number:~# Reviewed-by: Ian Rogers <irogers@google.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Howard Chu <howardchu95@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Namhyung Kim <namhyung@kernel.org> Link: https://lore.kernel.org/lkml/CAP-5=fXjGYs=tpBgETK-P9U-CuXssytk9pSnTXpfphrmmOydWA@mail.gmail.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-09-09cxl/pci: Remove duplicated implementation of waiting for memory_info_validYanfei Xu
commit ce17ad0d5498 ("cxl: Wait Memory_Info_Valid before access memory related info") added another implementation, which is cxl_dvsec_mem_range_valid(), of waiting for memory_info_valid without realizing it duplicated wait_for_valid(). Remove wait_for_valid() and retain cxl_dvsec_mem_range_valid() as the former is hardcoded to check only the Memory_Info_Valid bit of DVSEC range 1, while the latter allows for selection between DVSEC range 1 or 2 via parameter. Suggested-by: Dan Williams <dan.j.williams@intel.com> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> Signed-off-by: Yanfei Xu <yanfei.xu@intel.com> Reviewed-by: Alison Schofield <alison.schofield@intel.com> Link: https://patch.msgid.link/20240828084231.1378789-3-yanfei.xu@intel.com Signed-off-by: Dave Jiang <dave.jiang@intel.com>
2024-09-09Merge tag 'hyperv-fixes-signed-20240908' of ↵Linus Torvalds
git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux Pull hyperv fixes from Wei Liu: - Add a documentation overview of Confidential Computing VM support (Michael Kelley) - Use lapic timer in a TDX VM without paravisor (Dexuan Cui) - Set X86_FEATURE_TSC_KNOWN_FREQ when Hyper-V provides frequency (Michael Kelley) - Fix a kexec crash due to VP assist page corruption (Anirudh Rayabharam) - Python3 compatibility fix for lsvmbus (Anthony Nandaa) - Misc fixes (Rachel Menge, Roman Kisel, zhang jiao, Hongbo Li) * tag 'hyperv-fixes-signed-20240908' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux: hv: vmbus: Constify struct kobj_type and struct attribute_group tools: hv: rm .*.cmd when make clean x86/hyperv: fix kexec crash due to VP assist page corruption Drivers: hv: vmbus: Fix the misplaced function description tools: hv: lsvmbus: change shebang to use python3 x86/hyperv: Set X86_FEATURE_TSC_KNOWN_FREQ when Hyper-V provides frequency Documentation: hyperv: Add overview of Confidential Computing VM support clocksource: hyper-v: Use lapic timer in a TDX VM without paravisor Drivers: hv: Remove deprecated hv_fcopy declarations
2024-09-09Merge 6.11-rc7 into usb-nextGreg Kroah-Hartman
We need the USB fixes in here as well, and this also resolves the merge conflict in: drivers/usb/typec/ucsi/ucsi.c Reported-by: Stephen Rothwell <sfr@canb.auug.org.au> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2024-09-09Merge 6.11-rc7 into char-misc-nextGreg Kroah-Hartman
We need the char-misc fixes in here as well. Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2024-09-09selftests/powerpc: Allow building without static libcMadhavan Srinivasan
Currently exec-target.c is linked statically with libc, which on Fedora at least requires installing an additional package (glibc-static). If that package is not installed the build fails with: CC exec_target /usr/bin/ld: cannot find -lc: No such file or directory collect2: error: ld returned 1 exit status All exec_target.c does is call sys_exit, which can be done easily enough using inline assembly, and removes the requirement for a static libc to be installed. Suggested-by: Michael Ellerman <mpe@ellerman.id.au> Signed-off-by: Madhavan Srinivasan <maddy@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://msgid.link/20240812094152.418586-1-maddy@linux.ibm.com
2024-09-09tools/hv: Add memory allocation check in hv_fcopy_startZhu Jun
Added error handling for memory allocation failures of file_name and path_name. Signed-off-by: Zhu Jun <zhujun2@cmss.chinamobile.com> Reviewed-by: Dexuan Cui <decui@microsoft.com> Tested-by: Saurabh Sengar <ssengar@linux.microsoft.com> Link: https://lore.kernel.org/r/20240906091333.11419-1-zhujun2@cmss.chinamobile.com Signed-off-by: Wei Liu <wei.liu@kernel.org> Message-ID: <20240906091333.11419-1-zhujun2@cmss.chinamobile.com>
2024-09-09Merge branches 'context_tracking.15.08.24a', 'csd.lock.15.08.24a', ↵Neeraj Upadhyay
'nocb.09.09.24a', 'rcutorture.14.08.24a', 'rcustall.09.09.24a', 'srcu.12.08.24a', 'rcu.tasks.14.08.24a', 'rcu_scaling_tests.15.08.24a', 'fixes.12.08.24a' and 'misc.11.08.24a' into next.09.09.24a
2024-09-06libbpf: Workaround (another) -Wmaybe-uninitialized false positiveSam James
We get this with GCC 15 -O3 (at least): ``` libbpf.c: In function ‘bpf_map__init_kern_struct_ops’: libbpf.c:1109:18: error: ‘mod_btf’ may be used uninitialized [-Werror=maybe-uninitialized] 1109 | kern_btf = mod_btf ? mod_btf->btf : obj->btf_vmlinux; | ~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ libbpf.c:1094:28: note: ‘mod_btf’ was declared here 1094 | struct module_btf *mod_btf; | ^~~~~~~ In function ‘find_struct_ops_kern_types’, inlined from ‘bpf_map__init_kern_struct_ops’ at libbpf.c:1102:8: libbpf.c:982:21: error: ‘btf’ may be used uninitialized [-Werror=maybe-uninitialized] 982 | kern_type = btf__type_by_id(btf, kern_type_id); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ libbpf.c: In function ‘bpf_map__init_kern_struct_ops’: libbpf.c:967:21: note: ‘btf’ was declared here 967 | struct btf *btf; | ^~~ ``` This is similar to the other libbpf fix from a few weeks ago for the same modelling-errno issue (fab45b962749184e1a1a57c7c583782b78fad539). Signed-off-by: Sam James <sam@gentoo.org> Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Link: https://bugs.gentoo.org/939106 Link: https://lore.kernel.org/bpf/f6962729197ae7cdf4f6d1512625bd92f2322d31.1725630494.git.sam@gentoo.org
2024-09-06bpftool: Improve btf c dump sorting stabilityMykyta Yatsenko
Existing algorithm for BTF C dump sorting uses only types and names of the structs and unions for ordering. As dump contains structs with the same names but different contents, relative to each other ordering of those structs will be accidental. This patch addresses this problem by introducing a new sorting field that contains hash of the struct/union field names and types to disambiguate comparison of the non-unique named structs. Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com> Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Link: https://lore.kernel.org/bpf/20240906132453.146085-1-mykyta.yatsenko5@gmail.com
2024-09-06Merge tag 'riscv-for-linus-6.11-rc7' of ↵Linus Torvalds
git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux Pull RISC-V fixes from Palmer Dabbelt: - A revert for the mmap() change that ties the allocation range to the hint adress, as what we tried to do ended up regressing on other userspace workloads. - A fix to avoid a kernel memory leak when emulating misaligned accesses from userspace. - A Kconfig fix for toolchain vector detection, which now correctly detects vector support on toolchains where the V extension depends on the M extension. - A fix to avoid failing the linear mapping bootmem bounds check on NOMMU systems. - A fix for early alternatives on relocatable kernels. * tag 'riscv-for-linus-6.11-rc7' of git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux: riscv: Fix RISCV_ALTERNATIVE_EARLY riscv: Do not restrict memory size because of linear mapping on nommu riscv: Fix toolchain vector detection riscv: misaligned: Restrict user access to kernel memory riscv: mm: Do not restrict mmap address based on hint riscv: selftests: Remove mmap hint address checks Revert "RISC-V: mm: Document mmap changes"
2024-09-06selftests/timers: Remove unused NSEC_PER_SEC macrozhang jiao
By reading the code, I found the macro NSEC_PER_SEC is never referenced in the code. Just remove it. Signed-off-by: zhang jiao <zhangjiao2@cmss.chinamobile.com> Reviewed-by: Shuah Khan <skhan@linuxfoundation.org> Acked-by: John Stultz <jstultz@google.com> Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
2024-09-06pm:cpupower: Add error warning when SWIG is not installedJohn B. Wyatt IV
Add error message to better explain to the user when SWIG and python-config is missing from the path. Makefile was cleaned up and unneeded elements were removed. Suggested-by: Shuah Khan <skhan@linuxfoundation.org> Signed-off-by: John B. Wyatt IV <jwyatt@redhat.com> Signed-off-by: John B. Wyatt IV <sageofredondo@gmail.com> Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>