summaryrefslogtreecommitdiff
path: root/samples
diff options
context:
space:
mode:
Diffstat (limited to 'samples')
-rw-r--r--samples/Kconfig31
-rw-r--r--samples/auxdisplay/cfag12864b-example.c16
-rw-r--r--samples/bpf/Makefile172
-rw-r--r--samples/bpf/bpf_insn.h (renamed from samples/bpf/libbpf.h)8
-rw-r--r--samples/bpf/bpf_load.c111
-rw-r--r--samples/bpf/bpf_load.h13
-rw-r--r--samples/bpf/cookie_uid_helper_example.c2
-rw-r--r--samples/bpf/cpustat_user.c2
-rw-r--r--samples/bpf/fds_example.c4
-rw-r--r--samples/bpf/lathist_user.c2
-rw-r--r--samples/bpf/load_sock_ops.c2
-rw-r--r--samples/bpf/lwt_len_hist_user.c2
-rw-r--r--samples/bpf/map_perf_test_user.c2
-rw-r--r--samples/bpf/offwaketime_user.c1
-rw-r--r--samples/bpf/sampleip_user.c1
-rw-r--r--samples/bpf/sock_example.c7
-rw-r--r--samples/bpf/sock_example.h1
-rw-r--r--samples/bpf/sockex1_user.c2
-rw-r--r--samples/bpf/sockex2_user.c2
-rw-r--r--samples/bpf/sockex3_user.c2
-rw-r--r--samples/bpf/spintest_user.c1
-rw-r--r--samples/bpf/syscall_tp_user.c2
-rw-r--r--samples/bpf/task_fd_query_kern.c19
-rw-r--r--samples/bpf/task_fd_query_user.c382
-rw-r--r--samples/bpf/tc_l2_redirect_user.c2
-rw-r--r--samples/bpf/tcbpf2_kern.c596
-rw-r--r--samples/bpf/test_cgrp2_array_pin.c2
-rw-r--r--samples/bpf/test_cgrp2_attach.c3
-rw-r--r--samples/bpf/test_cgrp2_attach2.c3
-rw-r--r--samples/bpf/test_cgrp2_sock.c3
-rw-r--r--samples/bpf/test_cgrp2_sock2.c3
-rw-r--r--samples/bpf/test_current_task_under_cgroup_user.c2
-rw-r--r--samples/bpf/test_lru_dist.c2
-rw-r--r--samples/bpf/test_map_in_map_user.c2
-rw-r--r--samples/bpf/test_overhead_user.c2
-rw-r--r--samples/bpf/test_probe_write_user_user.c2
-rwxr-xr-xsamples/bpf/test_tunnel_bpf.sh319
-rw-r--r--samples/bpf/trace_event_user.c1
-rw-r--r--samples/bpf/trace_output_user.c112
-rw-r--r--samples/bpf/tracex1_user.c2
-rw-r--r--samples/bpf/tracex2_user.c2
-rw-r--r--samples/bpf/tracex3_user.c2
-rw-r--r--samples/bpf/tracex4_user.c2
-rw-r--r--samples/bpf/tracex5_user.c2
-rw-r--r--samples/bpf/tracex6_user.c2
-rw-r--r--samples/bpf/tracex7_user.c2
-rw-r--r--samples/bpf/xdp1_user.c31
-rw-r--r--samples/bpf/xdp_adjust_tail_kern.c152
-rw-r--r--samples/bpf/xdp_adjust_tail_user.c150
-rw-r--r--samples/bpf/xdp_fwd_kern.c138
-rw-r--r--samples/bpf/xdp_fwd_user.c136
-rw-r--r--samples/bpf/xdp_monitor_kern.c49
-rw-r--r--samples/bpf/xdp_monitor_user.c77
-rw-r--r--samples/bpf/xdp_redirect_cpu_user.c2
-rw-r--r--samples/bpf/xdp_redirect_map_user.c2
-rw-r--r--samples/bpf/xdp_redirect_user.c2
-rw-r--r--samples/bpf/xdp_router_ipv4_user.c2
-rw-r--r--samples/bpf/xdp_rxq_info_user.c46
-rw-r--r--samples/bpf/xdp_tx_iptunnel_user.c2
-rw-r--r--samples/bpf/xdpsock.h11
-rw-r--r--samples/bpf/xdpsock_kern.c56
-rw-r--r--samples/bpf/xdpsock_user.c962
-rw-r--r--samples/sockmap/Makefile78
-rw-r--r--samples/sockmap/sockmap_kern.c341
-rwxr-xr-xsamples/sockmap/sockmap_test.sh488
-rw-r--r--samples/sockmap/sockmap_user.c894
-rw-r--r--samples/vfio-mdev/Makefile3
-rw-r--r--samples/vfio-mdev/mbochs.c1406
-rw-r--r--samples/vfio-mdev/mdpy-defs.h22
-rw-r--r--samples/vfio-mdev/mdpy-fb.c232
-rw-r--r--samples/vfio-mdev/mdpy.c807
71 files changed, 4862 insertions, 3082 deletions
diff --git a/samples/Kconfig b/samples/Kconfig
index 3db002b9e1d3..bd133efc1a56 100644
--- a/samples/Kconfig
+++ b/samples/Kconfig
@@ -115,6 +115,37 @@ config SAMPLE_VFIO_MDEV_MTTY
Build a virtual tty sample driver for use as a VFIO
mediated device
+config SAMPLE_VFIO_MDEV_MDPY
+ tristate "Build VFIO mdpy example mediated device sample code -- loadable modules only"
+ depends on VFIO_MDEV_DEVICE && m
+ help
+ Build a virtual display sample driver for use as a VFIO
+ mediated device. It is a simple framebuffer and supports
+ the region display interface (VFIO_GFX_PLANE_TYPE_REGION).
+
+config SAMPLE_VFIO_MDEV_MDPY_FB
+ tristate "Build VFIO mdpy example guest fbdev driver -- loadable module only"
+ depends on FB && m
+ select FB_CFB_FILLRECT
+ select FB_CFB_COPYAREA
+ select FB_CFB_IMAGEBLIT
+ help
+ Guest fbdev driver for the virtual display sample driver.
+
+config SAMPLE_VFIO_MDEV_MBOCHS
+ tristate "Build VFIO mbochs example mediated device sample code -- loadable modules only"
+ depends on VFIO_MDEV_DEVICE && m
+ select DMA_SHARED_BUFFER
+ help
+ Build a virtual display sample driver for use as a VFIO
+ mediated device. It supports the dmabuf display interface
+ (VFIO_GFX_PLANE_TYPE_DMABUF).
+ Emulate enough of qemu stdvga to make bochs-drm.ko happy.
+ That is basically the vram memory bar and the bochs dispi
+ interface vbe registers in the mmio register bar.
+ Specifically it does *not* include any legacy vga stuff.
+ Device looks a lot like "qemu -device secondary-vga".
+
config SAMPLE_STATX
bool "Build example extended-stat using code"
depends on BROKEN
diff --git a/samples/auxdisplay/cfag12864b-example.c b/samples/auxdisplay/cfag12864b-example.c
index e7823ffb1ca0..85571e90191f 100644
--- a/samples/auxdisplay/cfag12864b-example.c
+++ b/samples/auxdisplay/cfag12864b-example.c
@@ -1,25 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Filename: cfag12864b-example.c
* Version: 0.1.0
* Description: cfag12864b LCD userspace example program
- * License: GPLv2
*
* Author: Copyright (C) Miguel Ojeda Sandonis
* Date: 2006-10-31
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
*/
/*
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 092947676143..1303af10e54d 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -1,4 +1,8 @@
# SPDX-License-Identifier: GPL-2.0
+
+BPF_SAMPLES_PATH ?= $(abspath $(srctree)/$(src))
+TOOLS_PATH := $(BPF_SAMPLES_PATH)/../../tools
+
# List of programs to build
hostprogs-y := test_lru_dist
hostprogs-y += sock_example
@@ -44,57 +48,65 @@ hostprogs-y += xdp_monitor
hostprogs-y += xdp_rxq_info
hostprogs-y += syscall_tp
hostprogs-y += cpustat
+hostprogs-y += xdp_adjust_tail
+hostprogs-y += xdpsock
+hostprogs-y += xdp_fwd
+hostprogs-y += task_fd_query
# Libbpf dependencies
-LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o
+LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
+
CGROUP_HELPERS := ../../tools/testing/selftests/bpf/cgroup_helpers.o
+TRACE_HELPERS := ../../tools/testing/selftests/bpf/trace_helpers.o
-test_lru_dist-objs := test_lru_dist.o $(LIBBPF)
-sock_example-objs := sock_example.o $(LIBBPF)
-fds_example-objs := bpf_load.o $(LIBBPF) fds_example.o
-sockex1-objs := bpf_load.o $(LIBBPF) sockex1_user.o
-sockex2-objs := bpf_load.o $(LIBBPF) sockex2_user.o
-sockex3-objs := bpf_load.o $(LIBBPF) sockex3_user.o
-tracex1-objs := bpf_load.o $(LIBBPF) tracex1_user.o
-tracex2-objs := bpf_load.o $(LIBBPF) tracex2_user.o
-tracex3-objs := bpf_load.o $(LIBBPF) tracex3_user.o
-tracex4-objs := bpf_load.o $(LIBBPF) tracex4_user.o
-tracex5-objs := bpf_load.o $(LIBBPF) tracex5_user.o
-tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o
-tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o
-load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o
-test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o
-trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o
-lathist-objs := bpf_load.o $(LIBBPF) lathist_user.o
-offwaketime-objs := bpf_load.o $(LIBBPF) offwaketime_user.o
-spintest-objs := bpf_load.o $(LIBBPF) spintest_user.o
-map_perf_test-objs := bpf_load.o $(LIBBPF) map_perf_test_user.o
-test_overhead-objs := bpf_load.o $(LIBBPF) test_overhead_user.o
-test_cgrp2_array_pin-objs := $(LIBBPF) test_cgrp2_array_pin.o
-test_cgrp2_attach-objs := $(LIBBPF) test_cgrp2_attach.o
-test_cgrp2_attach2-objs := $(LIBBPF) test_cgrp2_attach2.o $(CGROUP_HELPERS)
-test_cgrp2_sock-objs := $(LIBBPF) test_cgrp2_sock.o
-test_cgrp2_sock2-objs := bpf_load.o $(LIBBPF) test_cgrp2_sock2.o
-xdp1-objs := bpf_load.o $(LIBBPF) xdp1_user.o
+fds_example-objs := bpf_load.o fds_example.o
+sockex1-objs := bpf_load.o sockex1_user.o
+sockex2-objs := bpf_load.o sockex2_user.o
+sockex3-objs := bpf_load.o sockex3_user.o
+tracex1-objs := bpf_load.o tracex1_user.o
+tracex2-objs := bpf_load.o tracex2_user.o
+tracex3-objs := bpf_load.o tracex3_user.o
+tracex4-objs := bpf_load.o tracex4_user.o
+tracex5-objs := bpf_load.o tracex5_user.o
+tracex6-objs := bpf_load.o tracex6_user.o
+tracex7-objs := bpf_load.o tracex7_user.o
+load_sock_ops-objs := bpf_load.o load_sock_ops.o
+test_probe_write_user-objs := bpf_load.o test_probe_write_user_user.o
+trace_output-objs := bpf_load.o trace_output_user.o $(TRACE_HELPERS)
+lathist-objs := bpf_load.o lathist_user.o
+offwaketime-objs := bpf_load.o offwaketime_user.o $(TRACE_HELPERS)
+spintest-objs := bpf_load.o spintest_user.o $(TRACE_HELPERS)
+map_perf_test-objs := bpf_load.o map_perf_test_user.o
+test_overhead-objs := bpf_load.o test_overhead_user.o
+test_cgrp2_array_pin-objs := test_cgrp2_array_pin.o
+test_cgrp2_attach-objs := test_cgrp2_attach.o
+test_cgrp2_attach2-objs := test_cgrp2_attach2.o $(CGROUP_HELPERS)
+test_cgrp2_sock-objs := test_cgrp2_sock.o
+test_cgrp2_sock2-objs := bpf_load.o test_cgrp2_sock2.o
+xdp1-objs := xdp1_user.o
# reuse xdp1 source intentionally
-xdp2-objs := bpf_load.o $(LIBBPF) xdp1_user.o
-xdp_router_ipv4-objs := bpf_load.o $(LIBBPF) xdp_router_ipv4_user.o
-test_current_task_under_cgroup-objs := bpf_load.o $(LIBBPF) $(CGROUP_HELPERS) \
+xdp2-objs := xdp1_user.o
+xdp_router_ipv4-objs := bpf_load.o xdp_router_ipv4_user.o
+test_current_task_under_cgroup-objs := bpf_load.o $(CGROUP_HELPERS) \
test_current_task_under_cgroup_user.o
-trace_event-objs := bpf_load.o $(LIBBPF) trace_event_user.o
-sampleip-objs := bpf_load.o $(LIBBPF) sampleip_user.o
-tc_l2_redirect-objs := bpf_load.o $(LIBBPF) tc_l2_redirect_user.o
-lwt_len_hist-objs := bpf_load.o $(LIBBPF) lwt_len_hist_user.o
-xdp_tx_iptunnel-objs := bpf_load.o $(LIBBPF) xdp_tx_iptunnel_user.o
-test_map_in_map-objs := bpf_load.o $(LIBBPF) test_map_in_map_user.o
-per_socket_stats_example-objs := $(LIBBPF) cookie_uid_helper_example.o
-xdp_redirect-objs := bpf_load.o $(LIBBPF) xdp_redirect_user.o
-xdp_redirect_map-objs := bpf_load.o $(LIBBPF) xdp_redirect_map_user.o
-xdp_redirect_cpu-objs := bpf_load.o $(LIBBPF) xdp_redirect_cpu_user.o
-xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o
-xdp_rxq_info-objs := bpf_load.o $(LIBBPF) xdp_rxq_info_user.o
-syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o
-cpustat-objs := bpf_load.o $(LIBBPF) cpustat_user.o
+trace_event-objs := bpf_load.o trace_event_user.o $(TRACE_HELPERS)
+sampleip-objs := bpf_load.o sampleip_user.o $(TRACE_HELPERS)
+tc_l2_redirect-objs := bpf_load.o tc_l2_redirect_user.o
+lwt_len_hist-objs := bpf_load.o lwt_len_hist_user.o
+xdp_tx_iptunnel-objs := bpf_load.o xdp_tx_iptunnel_user.o
+test_map_in_map-objs := bpf_load.o test_map_in_map_user.o
+per_socket_stats_example-objs := cookie_uid_helper_example.o
+xdp_redirect-objs := bpf_load.o xdp_redirect_user.o
+xdp_redirect_map-objs := bpf_load.o xdp_redirect_map_user.o
+xdp_redirect_cpu-objs := bpf_load.o xdp_redirect_cpu_user.o
+xdp_monitor-objs := bpf_load.o xdp_monitor_user.o
+xdp_rxq_info-objs := xdp_rxq_info_user.o
+syscall_tp-objs := bpf_load.o syscall_tp_user.o
+cpustat-objs := bpf_load.o cpustat_user.o
+xdp_adjust_tail-objs := xdp_adjust_tail_user.o
+xdpsock-objs := bpf_load.o xdpsock_user.o
+xdp_fwd-objs := bpf_load.o xdp_fwd_user.o
+task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS)
# Tell kbuild to always build the programs
always := $(hostprogs-y)
@@ -112,7 +124,6 @@ always += sock_flags_kern.o
always += test_probe_write_user_kern.o
always += trace_output_kern.o
always += tcbpf1_kern.o
-always += tcbpf2_kern.o
always += tc_l2_redirect_kern.o
always += lathist_kern.o
always += offwaketime_kern.o
@@ -148,6 +159,10 @@ always += xdp_rxq_info_kern.o
always += xdp2skb_meta_kern.o
always += syscall_tp_kern.o
always += cpustat_kern.o
+always += xdp_adjust_tail_kern.o
+always += xdpsock_kern.o
+always += xdp_fwd_kern.o
+always += task_fd_query_kern.o
HOSTCFLAGS += -I$(objtree)/usr/include
HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -156,43 +171,21 @@ HOSTCFLAGS += -I$(srctree)/tools/lib/ -I$(srctree)/tools/include
HOSTCFLAGS += -I$(srctree)/tools/perf
HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable
-HOSTLOADLIBES_fds_example += -lelf
-HOSTLOADLIBES_sockex1 += -lelf
-HOSTLOADLIBES_sockex2 += -lelf
-HOSTLOADLIBES_sockex3 += -lelf
-HOSTLOADLIBES_tracex1 += -lelf
-HOSTLOADLIBES_tracex2 += -lelf
-HOSTLOADLIBES_tracex3 += -lelf
-HOSTLOADLIBES_tracex4 += -lelf -lrt
-HOSTLOADLIBES_tracex5 += -lelf
-HOSTLOADLIBES_tracex6 += -lelf
-HOSTLOADLIBES_tracex7 += -lelf
-HOSTLOADLIBES_test_cgrp2_sock2 += -lelf
-HOSTLOADLIBES_load_sock_ops += -lelf
-HOSTLOADLIBES_test_probe_write_user += -lelf
-HOSTLOADLIBES_trace_output += -lelf -lrt
-HOSTLOADLIBES_lathist += -lelf
-HOSTLOADLIBES_offwaketime += -lelf
-HOSTLOADLIBES_spintest += -lelf
-HOSTLOADLIBES_map_perf_test += -lelf -lrt
-HOSTLOADLIBES_test_overhead += -lelf -lrt
-HOSTLOADLIBES_xdp1 += -lelf
-HOSTLOADLIBES_xdp2 += -lelf
-HOSTLOADLIBES_xdp_router_ipv4 += -lelf
-HOSTLOADLIBES_test_current_task_under_cgroup += -lelf
-HOSTLOADLIBES_trace_event += -lelf
-HOSTLOADLIBES_sampleip += -lelf
-HOSTLOADLIBES_tc_l2_redirect += -l elf
-HOSTLOADLIBES_lwt_len_hist += -l elf
-HOSTLOADLIBES_xdp_tx_iptunnel += -lelf
-HOSTLOADLIBES_test_map_in_map += -lelf
-HOSTLOADLIBES_xdp_redirect += -lelf
-HOSTLOADLIBES_xdp_redirect_map += -lelf
-HOSTLOADLIBES_xdp_redirect_cpu += -lelf
-HOSTLOADLIBES_xdp_monitor += -lelf
-HOSTLOADLIBES_xdp_rxq_info += -lelf
-HOSTLOADLIBES_syscall_tp += -lelf
-HOSTLOADLIBES_cpustat += -lelf
+HOSTCFLAGS_trace_helpers.o += -I$(srctree)/tools/lib/bpf/
+
+HOSTCFLAGS_trace_output_user.o += -I$(srctree)/tools/lib/bpf/
+HOSTCFLAGS_offwaketime_user.o += -I$(srctree)/tools/lib/bpf/
+HOSTCFLAGS_spintest_user.o += -I$(srctree)/tools/lib/bpf/
+HOSTCFLAGS_trace_event_user.o += -I$(srctree)/tools/lib/bpf/
+HOSTCFLAGS_sampleip_user.o += -I$(srctree)/tools/lib/bpf/
+HOSTCFLAGS_task_fd_query_user.o += -I$(srctree)/tools/lib/bpf/
+
+HOST_LOADLIBES += $(LIBBPF) -lelf
+HOSTLOADLIBES_tracex4 += -lrt
+HOSTLOADLIBES_trace_output += -lrt
+HOSTLOADLIBES_map_perf_test += -lrt
+HOSTLOADLIBES_test_overhead += -lrt
+HOSTLOADLIBES_xdpsock += -pthread
# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
# make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
@@ -206,15 +199,16 @@ CLANG_ARCH_ARGS = -target $(ARCH)
endif
# Trick to allow make to be run from this directory
-all: $(LIBBPF)
- $(MAKE) -C ../../ $(CURDIR)/
+all:
+ $(MAKE) -C ../../ $(CURDIR)/ BPF_SAMPLES_PATH=$(CURDIR)
clean:
$(MAKE) -C ../../ M=$(CURDIR) clean
@rm -f *~
$(LIBBPF): FORCE
- $(MAKE) -C $(dir $@) $(notdir $@)
+# Fix up variables inherited from Kbuild that tools/ build system won't like
+ $(MAKE) -C $(dir $@) RM='rm -rf' LDFLAGS= srctree=$(BPF_SAMPLES_PATH)/../../ O=
$(obj)/syscall_nrs.s: $(src)/syscall_nrs.c
$(call if_changed_dep,cc_s_c)
@@ -245,7 +239,8 @@ verify_target_bpf: verify_cmds
exit 2; \
else true; fi
-$(src)/*.c: verify_target_bpf
+$(BPF_SAMPLES_PATH)/*.c: verify_target_bpf $(LIBBPF)
+$(src)/*.c: verify_target_bpf $(LIBBPF)
$(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h
@@ -253,7 +248,8 @@ $(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h
# But, there is no easy way to fix it, so just exclude it since it is
# useless for BPF samples.
$(obj)/%.o: $(src)/%.c
- $(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) -I$(obj) \
+ @echo " CLANG-bpf " $@
+ $(Q)$(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) -I$(obj) \
-I$(srctree)/tools/testing/selftests/bpf/ \
-D__KERNEL__ -D__BPF_TRACING__ -Wno-unused-value -Wno-pointer-sign \
-D__TARGET_ARCH_$(ARCH) -Wno-compare-distinct-pointer-types \
diff --git a/samples/bpf/libbpf.h b/samples/bpf/bpf_insn.h
index 18bfee5aab6b..20dc5cefec84 100644
--- a/samples/bpf/libbpf.h
+++ b/samples/bpf/bpf_insn.h
@@ -1,9 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* eBPF mini library */
-#ifndef __LIBBPF_H
-#define __LIBBPF_H
-
-#include <bpf/bpf.h>
+/* eBPF instruction mini library */
+#ifndef __BPF_INSN_H
+#define __BPF_INSN_H
struct bpf_insn;
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index bebe4188b4b3..89161c9ed466 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -24,7 +24,7 @@
#include <poll.h>
#include <ctype.h>
#include <assert.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
#include "bpf_load.h"
#include "perf-sys.h"
@@ -145,6 +145,9 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
}
if (is_kprobe || is_kretprobe) {
+ bool need_normal_check = true;
+ const char *event_prefix = "";
+
if (is_kprobe)
event += 7;
else
@@ -158,18 +161,33 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
if (isdigit(*event))
return populate_prog_array(event, fd);
- snprintf(buf, sizeof(buf),
- "echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events",
- is_kprobe ? 'p' : 'r', event, event);
- err = system(buf);
- if (err < 0) {
- printf("failed to create kprobe '%s' error '%s'\n",
- event, strerror(errno));
- return -1;
+#ifdef __x86_64__
+ if (strncmp(event, "sys_", 4) == 0) {
+ snprintf(buf, sizeof(buf),
+ "echo '%c:__x64_%s __x64_%s' >> /sys/kernel/debug/tracing/kprobe_events",
+ is_kprobe ? 'p' : 'r', event, event);
+ err = system(buf);
+ if (err >= 0) {
+ need_normal_check = false;
+ event_prefix = "__x64_";
+ }
+ }
+#endif
+ if (need_normal_check) {
+ snprintf(buf, sizeof(buf),
+ "echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events",
+ is_kprobe ? 'p' : 'r', event, event);
+ err = system(buf);
+ if (err < 0) {
+ printf("failed to create kprobe '%s' error '%s'\n",
+ event, strerror(errno));
+ return -1;
+ }
}
strcpy(buf, DEBUGFS);
strcat(buf, "events/kprobes/");
+ strcat(buf, event_prefix);
strcat(buf, event);
strcat(buf, "/id");
} else if (is_tracepoint) {
@@ -402,7 +420,7 @@ static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx,
/* Keeping compatible with ELF maps section changes
* ------------------------------------------------
- * The program size of struct bpf_map_def is known by loader
+ * The program size of struct bpf_load_map_def is known by loader
* code, but struct stored in ELF file can be different.
*
* Unfortunately sym[i].st_size is zero. To calculate the
@@ -411,7 +429,7 @@ static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx,
* symbols.
*/
map_sz_elf = data_maps->d_size / nr_maps;
- map_sz_copy = sizeof(struct bpf_map_def);
+ map_sz_copy = sizeof(struct bpf_load_map_def);
if (map_sz_elf < map_sz_copy) {
/*
* Backward compat, loading older ELF file with
@@ -430,8 +448,8 @@ static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx,
/* Memcpy relevant part of ELF maps data to loader maps */
for (i = 0; i < nr_maps; i++) {
+ struct bpf_load_map_def *def;
unsigned char *addr, *end;
- struct bpf_map_def *def;
const char *map_name;
size_t offset;
@@ -446,9 +464,9 @@ static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx,
/* Symbol value is offset into ELF maps section data area */
offset = sym[i].st_value;
- def = (struct bpf_map_def *)(data_maps->d_buf + offset);
+ def = (struct bpf_load_map_def *)(data_maps->d_buf + offset);
maps[i].elf_offset = offset;
- memset(&maps[i].def, 0, sizeof(struct bpf_map_def));
+ memset(&maps[i].def, 0, sizeof(struct bpf_load_map_def));
memcpy(&maps[i].def, def, map_sz_copy);
/* Verify no newer features were requested */
@@ -549,7 +567,6 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
if (nr_maps < 0) {
printf("Error: Failed loading ELF maps (errno:%d):%s\n",
nr_maps, strerror(-nr_maps));
- ret = 1;
goto done;
}
if (load_maps(map_data, nr_maps, fixup_map))
@@ -615,7 +632,6 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
}
}
- ret = 0;
done:
close(fd);
return ret;
@@ -650,66 +666,3 @@ void read_trace_pipe(void)
}
}
}
-
-#define MAX_SYMS 300000
-static struct ksym syms[MAX_SYMS];
-static int sym_cnt;
-
-static int ksym_cmp(const void *p1, const void *p2)
-{
- return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr;
-}
-
-int load_kallsyms(void)
-{
- FILE *f = fopen("/proc/kallsyms", "r");
- char func[256], buf[256];
- char symbol;
- void *addr;
- int i = 0;
-
- if (!f)
- return -ENOENT;
-
- while (!feof(f)) {
- if (!fgets(buf, sizeof(buf), f))
- break;
- if (sscanf(buf, "%p %c %s", &addr, &symbol, func) != 3)
- break;
- if (!addr)
- continue;
- syms[i].addr = (long) addr;
- syms[i].name = strdup(func);
- i++;
- }
- sym_cnt = i;
- qsort(syms, sym_cnt, sizeof(struct ksym), ksym_cmp);
- return 0;
-}
-
-struct ksym *ksym_search(long key)
-{
- int start = 0, end = sym_cnt;
- int result;
-
- while (start < end) {
- size_t mid = start + (end - start) / 2;
-
- result = key - syms[mid].addr;
- if (result < 0)
- end = mid;
- else if (result > 0)
- start = mid + 1;
- else
- return &syms[mid];
- }
-
- if (start >= 1 && syms[start - 1].addr < key &&
- key < syms[start].addr)
- /* valid ksym */
- return &syms[start - 1];
-
- /* out of range. return _stext */
- return &syms[0];
-}
-
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
index 453c200b389b..814894a12974 100644
--- a/samples/bpf/bpf_load.h
+++ b/samples/bpf/bpf_load.h
@@ -2,12 +2,12 @@
#ifndef __BPF_LOAD_H
#define __BPF_LOAD_H
-#include "libbpf.h"
+#include <bpf/bpf.h>
#define MAX_MAPS 32
#define MAX_PROGS 32
-struct bpf_map_def {
+struct bpf_load_map_def {
unsigned int type;
unsigned int key_size;
unsigned int value_size;
@@ -21,7 +21,7 @@ struct bpf_map_data {
int fd;
char *name;
size_t elf_offset;
- struct bpf_map_def def;
+ struct bpf_load_map_def def;
};
typedef void (*fixup_map_cb)(struct bpf_map_data *map, int idx);
@@ -54,12 +54,5 @@ int load_bpf_file(char *path);
int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map);
void read_trace_pipe(void);
-struct ksym {
- long addr;
- char *name;
-};
-
-int load_kallsyms(void);
-struct ksym *ksym_search(long key);
int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags);
#endif
diff --git a/samples/bpf/cookie_uid_helper_example.c b/samples/bpf/cookie_uid_helper_example.c
index 8eca27e595ae..deb0e3e0324d 100644
--- a/samples/bpf/cookie_uid_helper_example.c
+++ b/samples/bpf/cookie_uid_helper_example.c
@@ -51,7 +51,7 @@
#include <sys/types.h>
#include <unistd.h>
#include <bpf/bpf.h>
-#include "libbpf.h"
+#include "bpf_insn.h"
#define PORT 8888
diff --git a/samples/bpf/cpustat_user.c b/samples/bpf/cpustat_user.c
index 2b4cd1ae57c5..869a99406dbf 100644
--- a/samples/bpf/cpustat_user.c
+++ b/samples/bpf/cpustat_user.c
@@ -17,7 +17,7 @@
#include <sys/resource.h>
#include <sys/wait.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
#include "bpf_load.h"
#define MAX_CPU 8
diff --git a/samples/bpf/fds_example.c b/samples/bpf/fds_example.c
index e29bd52ff9e8..9854854f05d1 100644
--- a/samples/bpf/fds_example.c
+++ b/samples/bpf/fds_example.c
@@ -12,8 +12,10 @@
#include <sys/types.h>
#include <sys/socket.h>
+#include <bpf/bpf.h>
+
+#include "bpf_insn.h"
#include "bpf_load.h"
-#include "libbpf.h"
#include "sock_example.h"
#define BPF_F_PIN (1 << 0)
diff --git a/samples/bpf/lathist_user.c b/samples/bpf/lathist_user.c
index 6477bad5b4e2..c8e88cc84e61 100644
--- a/samples/bpf/lathist_user.c
+++ b/samples/bpf/lathist_user.c
@@ -10,7 +10,7 @@
#include <stdlib.h>
#include <signal.h>
#include <linux/bpf.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
#include "bpf_load.h"
#define MAX_ENTRIES 20
diff --git a/samples/bpf/load_sock_ops.c b/samples/bpf/load_sock_ops.c
index e5da6cf71a3e..8ecb41ea0c03 100644
--- a/samples/bpf/load_sock_ops.c
+++ b/samples/bpf/load_sock_ops.c
@@ -8,7 +8,7 @@
#include <stdlib.h>
#include <string.h>
#include <linux/bpf.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
#include "bpf_load.h"
#include <unistd.h>
#include <errno.h>
diff --git a/samples/bpf/lwt_len_hist_user.c b/samples/bpf/lwt_len_hist_user.c
index 7fcb94c09112..587b68b1f8dd 100644
--- a/samples/bpf/lwt_len_hist_user.c
+++ b/samples/bpf/lwt_len_hist_user.c
@@ -9,7 +9,7 @@
#include <errno.h>
#include <arpa/inet.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
#include "bpf_util.h"
#define MAX_INDEX 64
diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c
index 519d9af4b04a..38b7b1a96cc2 100644
--- a/samples/bpf/map_perf_test_user.c
+++ b/samples/bpf/map_perf_test_user.c
@@ -21,7 +21,7 @@
#include <arpa/inet.h>
#include <errno.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
#include "bpf_load.h"
#define TEST_BIT(t) (1U << (t))
diff --git a/samples/bpf/offwaketime_user.c b/samples/bpf/offwaketime_user.c
index 512f87a5fd20..f06063af9fcb 100644
--- a/samples/bpf/offwaketime_user.c
+++ b/samples/bpf/offwaketime_user.c
@@ -17,6 +17,7 @@
#include <sys/resource.h>
#include "libbpf.h"
#include "bpf_load.h"
+#include "trace_helpers.h"
#define PRINT_RAW_ADDR 0
diff --git a/samples/bpf/sampleip_user.c b/samples/bpf/sampleip_user.c
index 4ed690b907ff..60c2b73d1b4d 100644
--- a/samples/bpf/sampleip_user.c
+++ b/samples/bpf/sampleip_user.c
@@ -22,6 +22,7 @@
#include "libbpf.h"
#include "bpf_load.h"
#include "perf-sys.h"
+#include "trace_helpers.h"
#define DEFAULT_FREQ 99
#define DEFAULT_SECS 5
diff --git a/samples/bpf/sock_example.c b/samples/bpf/sock_example.c
index 6fc6e193ef1b..60ec467c78ab 100644
--- a/samples/bpf/sock_example.c
+++ b/samples/bpf/sock_example.c
@@ -9,10 +9,10 @@
* if (value)
* (*(u64*)value) += 1;
*
- * - attaches this program to eth0 raw socket
+ * - attaches this program to loopback interface "lo" raw socket
*
* - every second user space reads map[tcp], map[udp], map[icmp] to see
- * how many packets of given protocol were seen on eth0
+ * how many packets of given protocol were seen on "lo"
*/
#include <stdio.h>
#include <unistd.h>
@@ -26,7 +26,8 @@
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <stddef.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
+#include "bpf_insn.h"
#include "sock_example.h"
char bpf_log_buf[BPF_LOG_BUF_SIZE];
diff --git a/samples/bpf/sock_example.h b/samples/bpf/sock_example.h
index 772d5dad8465..a27d7579bc73 100644
--- a/samples/bpf/sock_example.h
+++ b/samples/bpf/sock_example.h
@@ -9,7 +9,6 @@
#include <net/if.h>
#include <linux/if_packet.h>
#include <arpa/inet.h>
-#include "libbpf.h"
static inline int open_raw_sock(const char *name)
{
diff --git a/samples/bpf/sockex1_user.c b/samples/bpf/sockex1_user.c
index 2be935c2627d..93ec01c56104 100644
--- a/samples/bpf/sockex1_user.c
+++ b/samples/bpf/sockex1_user.c
@@ -2,7 +2,7 @@
#include <stdio.h>
#include <assert.h>
#include <linux/bpf.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
#include "bpf_load.h"
#include "sock_example.h"
#include <unistd.h>
diff --git a/samples/bpf/sockex2_user.c b/samples/bpf/sockex2_user.c
index 44fe0805b087..1d5c6e9a6d27 100644
--- a/samples/bpf/sockex2_user.c
+++ b/samples/bpf/sockex2_user.c
@@ -2,7 +2,7 @@
#include <stdio.h>
#include <assert.h>
#include <linux/bpf.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
#include "bpf_load.h"
#include "sock_example.h"
#include <unistd.h>
diff --git a/samples/bpf/sockex3_user.c b/samples/bpf/sockex3_user.c
index 495ee02e2fb7..5ba3ae9d180b 100644
--- a/samples/bpf/sockex3_user.c
+++ b/samples/bpf/sockex3_user.c
@@ -2,7 +2,7 @@
#include <stdio.h>
#include <assert.h>
#include <linux/bpf.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
#include "bpf_load.h"
#include "sock_example.h"
#include <unistd.h>
diff --git a/samples/bpf/spintest_user.c b/samples/bpf/spintest_user.c
index 3d736219a31c..8d3e9cfa1909 100644
--- a/samples/bpf/spintest_user.c
+++ b/samples/bpf/spintest_user.c
@@ -7,6 +7,7 @@
#include <sys/resource.h>
#include "libbpf.h"
#include "bpf_load.h"
+#include "trace_helpers.h"
int main(int ac, char **argv)
{
diff --git a/samples/bpf/syscall_tp_user.c b/samples/bpf/syscall_tp_user.c
index 9169d3207f18..1a1d0059a277 100644
--- a/samples/bpf/syscall_tp_user.c
+++ b/samples/bpf/syscall_tp_user.c
@@ -16,7 +16,7 @@
#include <assert.h>
#include <stdbool.h>
#include <sys/resource.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
#include "bpf_load.h"
/* This program verifies bpf attachment to tracepoint sys_enter_* and sys_exit_*.
diff --git a/samples/bpf/task_fd_query_kern.c b/samples/bpf/task_fd_query_kern.c
new file mode 100644
index 000000000000..f4b0a9ea674d
--- /dev/null
+++ b/samples/bpf/task_fd_query_kern.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/version.h>
+#include <linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+SEC("kprobe/blk_start_request")
+int bpf_prog1(struct pt_regs *ctx)
+{
+ return 0;
+}
+
+SEC("kretprobe/blk_account_io_completion")
+int bpf_prog2(struct pt_regs *ctx)
+{
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/task_fd_query_user.c b/samples/bpf/task_fd_query_user.c
new file mode 100644
index 000000000000..8381d792f138
--- /dev/null
+++ b/samples/bpf/task_fd_query_user.c
@@ -0,0 +1,382 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <string.h>
+#include <stdint.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/bpf.h>
+#include <sys/ioctl.h>
+#include <sys/resource.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "libbpf.h"
+#include "bpf_load.h"
+#include "bpf_util.h"
+#include "perf-sys.h"
+#include "trace_helpers.h"
+
+/*
+ * If @condition is true, print the calling function's name followed by the
+ * errno description and return -1 from the calling function.
+ *
+ * errno is captured right after @condition is evaluated and restored just
+ * before perror(): printf() is allowed to modify errno, which would
+ * otherwise make perror() describe the wrong error.
+ */
+#define CHECK_PERROR_RET(condition) do { \
+	if (condition) { \
+		int saved_errno = errno; \
+ \
+		printf("FAIL: %s:\n", __func__); \
+		errno = saved_errno; \
+		perror(" "); \
+		return -1; \
+	} \
+} while (0)
+
+/* Return -1 from the calling function as soon as @condition is true. */
+#define CHECK_AND_RET(condition) do { \
+	if (condition) \
+		return -1; \
+} while (0)
+
+/*
+ * Convert a pointer to the __u64 representation stored in
+ * perf_event_attr fields such as config1.
+ */
+static __u64 ptr_to_u64(void *ptr)
+{
+	unsigned long addr = (unsigned long)ptr;
+
+	return addr;
+}
+
+#define PMU_TYPE_FILE "/sys/bus/event_source/devices/%s/type"
+/*
+ * Read the PMU type id of @event_type (callers in this file pass "kprobe"
+ * or "uprobe") from /sys/bus/event_source/devices/<event_type>/type.
+ *
+ * Returns the parsed id, or -1 after printing a diagnostic when the path
+ * does not fit, the file cannot be opened/read, or it holds no number.
+ */
+static int bpf_find_probe_type(const char *event_type)
+{
+	char buf[256];
+	int fd, ret, saved_errno;
+	ssize_t bytes;
+	char *end;
+	long val;
+
+	ret = snprintf(buf, sizeof(buf), PMU_TYPE_FILE, event_type);
+	CHECK_PERROR_RET(ret < 0 || (size_t)ret >= sizeof(buf));
+
+	fd = open(buf, O_RDONLY);
+	CHECK_PERROR_RET(fd < 0);
+
+	/* keep one byte free: strtol() needs a NUL-terminated string */
+	bytes = read(fd, buf, sizeof(buf) - 1);
+	/* close() may overwrite errno, so preserve read()'s value */
+	saved_errno = errno;
+	close(fd);
+	errno = saved_errno;
+	CHECK_PERROR_RET(bytes < 0);
+	buf[bytes] = '\0';
+
+	errno = 0;
+	val = strtol(buf, &end, 10);
+	if (!errno && end == buf)
+		errno = EINVAL;	/* file contained no digits */
+	CHECK_PERROR_RET(errno);
+	return (int)val;
+}
+
+#define PMU_RETPROBE_FILE "/sys/bus/event_source/devices/%s/format/retprobe"
+/*
+ * Read which bit of perf_event_attr.config selects a return probe for
+ * @event_type, by parsing the "config:<bit>" text found in
+ * /sys/bus/event_source/devices/<event_type>/format/retprobe.
+ *
+ * Returns the bit number, or -1 after printing a diagnostic when the path
+ * does not fit, the file cannot be opened/read, or its content does not
+ * have the expected "config:<number>" form.
+ */
+static int bpf_get_retprobe_bit(const char *event_type)
+{
+	static const char prefix[] = "config:";
+	const size_t prefix_len = sizeof(prefix) - 1;
+	char buf[256];
+	int fd, ret, saved_errno;
+	bool malformed;
+	ssize_t bytes;
+	char *end;
+	long val;
+
+	ret = snprintf(buf, sizeof(buf), PMU_RETPROBE_FILE, event_type);
+	CHECK_PERROR_RET(ret < 0 || (size_t)ret >= sizeof(buf));
+
+	fd = open(buf, O_RDONLY);
+	CHECK_PERROR_RET(fd < 0);
+
+	/* keep one byte free so the data can be NUL-terminated below */
+	bytes = read(fd, buf, sizeof(buf) - 1);
+	/* close() may overwrite errno, so preserve read()'s value */
+	saved_errno = errno;
+	close(fd);
+	errno = saved_errno;
+	CHECK_PERROR_RET(bytes < 0);
+	buf[bytes] = '\0';
+
+	/* verify the prefix instead of blindly skipping over it */
+	malformed = strncmp(buf, prefix, prefix_len) != 0;
+	if (malformed)
+		errno = EINVAL;
+	CHECK_PERROR_RET(malformed);
+
+	errno = 0;
+	val = strtol(buf + prefix_len, &end, 10);
+	if (!errno && end == buf + prefix_len)
+		errno = EINVAL;	/* nothing numeric after the prefix */
+	CHECK_PERROR_RET(errno);
+	return (int)val;
+}
+
+/*
+ * Query event_fd[@prog_fd_idx] with bpf_task_fd_query() and check that it
+ * is reported as a probe on @fn_name with fd type @expected_fd_type and
+ * with both probe offset and probe address equal to zero.
+ *
+ * event_fd[] is defined outside this file (presumably filled in by
+ * load_bpf_file() -- confirm against bpf_load.c).
+ *
+ * Returns 0 on success, -1 after printing a diagnostic on any mismatch or
+ * query failure.
+ */
+static int test_debug_fs_kprobe(int prog_fd_idx, const char *fn_name,
+				__u32 expected_fd_type)
+{
+	__u64 probe_offset, probe_addr;
+	__u32 len, prog_id, fd_type;
+	char buf[256];
+	int err, saved_errno;
+
+	len = sizeof(buf);
+	err = bpf_task_fd_query(getpid(), event_fd[prog_fd_idx], 0, buf, &len,
+				&prog_id, &fd_type, &probe_offset,
+				&probe_addr);
+	if (err < 0) {
+		/* printf() may modify errno; keep it intact for perror() */
+		saved_errno = errno;
+		printf("FAIL: %s, for event_fd idx %d, fn_name %s\n",
+		       __func__, prog_fd_idx, fn_name);
+		errno = saved_errno;
+		perror(" :");
+		return -1;
+	}
+	if (strcmp(buf, fn_name) != 0 ||
+	    fd_type != expected_fd_type ||
+	    probe_offset != 0x0 || probe_addr != 0x0) {
+		/* name the API that was actually called above */
+		printf("FAIL: bpf_task_fd_query(event_fd[%d]):\n",
+		       prog_fd_idx);
+		printf("buf: %s, fd_type: %u, probe_offset: 0x%llx,"
+		       " probe_addr: 0x%llx\n",
+		       buf, fd_type, probe_offset, probe_addr);
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * Open a perf event of PMU type @event_type, attach prog_fd[0] to it and
+ * query the resulting fd with bpf_task_fd_query().
+ *
+ * When @name is non-NULL the event is described by @name and @offset
+ * (config1/config2); otherwise by the raw address @addr (config2).
+ * @is_return sets the PMU's retprobe bit in attr.config.  The query
+ * results are stored through @buf, @buf_len, @prog_id, @fd_type,
+ * @probe_offset and @probe_addr.
+ *
+ * The perf event fd is closed before returning, on success and on failure
+ * alike, so repeated calls do not accumulate open descriptors.
+ *
+ * Returns 0 on success, -1 after printing a diagnostic on failure; errno
+ * from the failing call is preserved for the caller.
+ */
+static int test_nondebug_fs_kuprobe_common(const char *event_type,
+		const char *name, __u64 offset, __u64 addr, bool is_return,
+		char *buf, __u32 *buf_len, __u32 *prog_id, __u32 *fd_type,
+		__u64 *probe_offset, __u64 *probe_addr)
+{
+	int is_return_bit = bpf_get_retprobe_bit(event_type);
+	int type = bpf_find_probe_type(event_type);
+	struct perf_event_attr attr = {};
+	int fd, err, saved_errno;
+
+	/* a bit index above 63 cannot be represented in the 64-bit config */
+	if (type < 0 || is_return_bit < 0 || is_return_bit > 63) {
+		printf("FAIL: %s incorrect type (%d) or is_return_bit (%d)\n",
+		       __func__, type, is_return_bit);
+		return -1;
+	}
+
+	attr.sample_period = 1;
+	attr.wakeup_events = 1;
+	if (is_return)
+		/* 64-bit shift: attr.config is a 64-bit field */
+		attr.config |= 1ULL << is_return_bit;
+
+	if (name) {
+		attr.config1 = ptr_to_u64((void *)name);
+		attr.config2 = offset;
+	} else {
+		attr.config1 = 0;
+		attr.config2 = addr;
+	}
+	attr.size = sizeof(attr);
+	attr.type = type;
+
+	fd = sys_perf_event_open(&attr, -1, 0, -1, 0);
+	CHECK_PERROR_RET(fd < 0);
+
+	err = ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
+	if (err >= 0)
+		err = ioctl(fd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]);
+	if (err >= 0)
+		err = bpf_task_fd_query(getpid(), fd, 0, buf, buf_len,
+					prog_id, fd_type, probe_offset,
+					probe_addr);
+
+	/* release the event without losing the errno of a failed step */
+	saved_errno = errno;
+	close(fd);
+	errno = saved_errno;
+	CHECK_PERROR_RET(err < 0);
+
+	return 0;
+}
+
+/*
+ * Run test_nondebug_fs_kuprobe_common() for the given probe description
+ * and validate what the query reported:
+ *   - the fd type must be @expected_ret_fd_type for a return probe and
+ *     @expected_fd_type otherwise;
+ *   - with a @name, @buf must equal @name and the reported offset must
+ *     equal @offset;
+ *   - without a @name, the length written back must be 0 and the reported
+ *     address must equal @addr.
+ *
+ * Returns 0 when everything matches, -1 after printing a diagnostic.
+ */
+static int test_nondebug_fs_probe(const char *event_type, const char *name,
+				  __u64 offset, __u64 addr, bool is_return,
+				  __u32 expected_fd_type,
+				  __u32 expected_ret_fd_type,
+				  char *buf, __u32 buf_len)
+{
+	__u32 want_fd_type = is_return ? expected_ret_fd_type :
+					 expected_fd_type;
+	__u64 got_offset, got_addr;
+	__u32 id, got_fd_type;
+
+	if (test_nondebug_fs_kuprobe_common(event_type, name, offset, addr,
+					    is_return, buf, &buf_len, &id,
+					    &got_fd_type, &got_offset,
+					    &got_addr) < 0) {
+		printf("FAIL: %s, "
+		       "for name %s, offset 0x%llx, addr 0x%llx, is_return %d\n",
+		       __func__, name ? name : "", offset, addr, is_return);
+		perror(" :");
+		return -1;
+	}
+
+	if (got_fd_type != want_fd_type) {
+		printf("FAIL: %s, incorrect fd_type %u\n",
+		       __func__, got_fd_type);
+		return -1;
+	}
+
+	/* address-based probe: no name is expected back */
+	if (!name) {
+		if (buf_len != 0) {
+			printf("FAIL: %s, incorrect buf %p\n",
+			       __func__, buf);
+			return -1;
+		}
+		if (got_addr != addr) {
+			printf("FAIL: %s, incorrect probe_addr 0x%llx\n",
+			       __func__, got_addr);
+			return -1;
+		}
+		return 0;
+	}
+
+	/* name-based probe: name and offset must round-trip */
+	if (strcmp(name, buf) != 0) {
+		printf("FAIL: %s, incorrect buf %s\n", __func__, buf);
+		return -1;
+	}
+	if (got_offset != offset) {
+		printf("FAIL: %s, incorrect probe_offset 0x%llx\n",
+		       __func__, got_offset);
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * Create a uprobe (or uretprobe when @is_return) on @binary_path at
+ * @offset by writing to /sys/kernel/debug/tracing/uprobe_events, open the
+ * resulting tracepoint as a perf event, attach prog_fd[0] to it and check
+ * that bpf_task_fd_query() reports the expected fd type, path and offset.
+ *
+ * Every descriptor opened here is closed on all exit paths.
+ *
+ * Returns 0 on success, -1 after printing a diagnostic on failure.
+ */
+static int test_debug_fs_uprobe(char *binary_path, long offset, bool is_return)
+{
+	const char *event_type = "uprobe";
+	struct perf_event_attr attr = {};
+	char buf[256], event_alias[256];
+	__u64 probe_offset, probe_addr;
+	__u32 len, prog_id, fd_type;
+	int kfd = -1, efd = -1, pfd = -1;
+	int res, saved_errno, ret = -1;
+	ssize_t bytes;
+
+	res = snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events",
+		       event_type);
+	if (res < 0 || (size_t)res >= sizeof(buf))
+		goto out_perror;
+	kfd = open(buf, O_WRONLY | O_APPEND, 0);
+	if (kfd < 0)
+		goto out_perror;
+
+	res = snprintf(event_alias, sizeof(event_alias), "test_%d", getpid());
+	if (res < 0 || (size_t)res >= sizeof(event_alias))
+		goto out_perror;
+
+	res = snprintf(buf, sizeof(buf), "%c:%ss/%s %s:0x%lx",
+		       is_return ? 'r' : 'p', event_type, event_alias,
+		       binary_path, offset);
+	if (res < 0 || (size_t)res >= sizeof(buf))
+		goto out_perror;
+	if (write(kfd, buf, strlen(buf)) < 0)
+		goto out_perror;
+
+	close(kfd);
+	kfd = -1;
+
+	res = snprintf(buf, sizeof(buf),
+		       "/sys/kernel/debug/tracing/events/%ss/%s/id",
+		       event_type, event_alias);
+	if (res < 0 || (size_t)res >= sizeof(buf))
+		goto out_perror;
+	efd = open(buf, O_RDONLY, 0);
+	if (efd < 0)
+		goto out_perror;
+
+	bytes = read(efd, buf, sizeof(buf));
+	if (bytes <= 0 || (size_t)bytes >= sizeof(buf))
+		goto out_perror;
+	buf[bytes] = '\0';
+	close(efd);
+	efd = -1;
+
+	attr.config = strtol(buf, NULL, 0);
+	attr.type = PERF_TYPE_TRACEPOINT;
+	attr.sample_period = 1;
+	attr.wakeup_events = 1;
+	pfd = sys_perf_event_open(&attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC);
+	if (pfd < 0)
+		goto out_perror;
+	if (ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) < 0)
+		goto out_perror;
+	if (ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0) < 0)
+		goto out_perror;
+
+	len = sizeof(buf);
+	if (bpf_task_fd_query(getpid(), pfd, 0, buf, &len,
+			      &prog_id, &fd_type, &probe_offset,
+			      &probe_addr) < 0) {
+		/* printf() may modify errno; keep it intact for perror() */
+		saved_errno = errno;
+		printf("FAIL: %s, binary_path %s\n", __func__, binary_path);
+		errno = saved_errno;
+		perror(" :");
+		goto out;
+	}
+	if ((is_return && fd_type != BPF_FD_TYPE_URETPROBE) ||
+	    (!is_return && fd_type != BPF_FD_TYPE_UPROBE)) {
+		printf("FAIL: %s, incorrect fd_type %u\n", __func__,
+		       fd_type);
+		goto out;
+	}
+	if (strcmp(binary_path, buf) != 0) {
+		printf("FAIL: %s, incorrect buf %s\n", __func__, buf);
+		goto out;
+	}
+	if (probe_offset != offset) {
+		printf("FAIL: %s, incorrect probe_offset 0x%llx\n", __func__,
+		       probe_offset);
+		goto out;
+	}
+
+	ret = 0;
+	goto out;
+
+out_perror:
+	/* same report CHECK_PERROR_RET() prints, but falls through to cleanup */
+	saved_errno = errno;
+	printf("FAIL: %s:\n", __func__);
+	errno = saved_errno;
+	perror(" ");
+out:
+	if (pfd >= 0)
+		close(pfd);
+	if (efd >= 0)
+		close(efd);
+	if (kfd >= 0)
+		close(kfd);
+	return ret;
+}
+
+/*
+ * Load "<argv[0]>_kern.o" and run the bpf_task_fd_query() checks in order:
+ * the two probes from the *_kern.c file, perf-PMU based kprobes on
+ * bpf_check (by name and by address, entry and return), perf-PMU based
+ * uprobes on this binary, and finally debugfs-created uprobes.
+ *
+ * Returns 0 when every check passes, 1 when setup fails and -1 as soon as
+ * one check fails.
+ */
+int main(int argc, char **argv)
+{
+	struct rlimit r = {1024*1024, RLIM_INFINITY};
+	extern char __executable_start;
+	char filename[256], buf[256];
+	__u64 uprobe_file_offset;
+	int len;
+
+	len = snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+	if (len < 0 || (size_t)len >= sizeof(filename)) {
+		printf("object file name for %s is too long\n", argv[0]);
+		return 1;
+	}
+	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+		perror("setrlimit(RLIMIT_MEMLOCK)");
+		return 1;
+	}
+
+	if (load_kallsyms()) {
+		printf("failed to process /proc/kallsyms\n");
+		return 1;
+	}
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	/* test two functions in the corresponding *_kern.c file */
+	CHECK_AND_RET(test_debug_fs_kprobe(0, "blk_start_request",
+					   BPF_FD_TYPE_KPROBE));
+	CHECK_AND_RET(test_debug_fs_kprobe(1, "blk_account_io_completion",
+					   BPF_FD_TYPE_KRETPROBE));
+
+	/* test nondebug fs kprobe */
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x0, 0x0,
+					     false, BPF_FD_TYPE_KPROBE,
+					     BPF_FD_TYPE_KRETPROBE,
+					     buf, sizeof(buf)));
+#ifdef __x86_64__
+	/* set a kprobe on "bpf_check + 0x5", which is x64 specific */
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x5, 0x0,
+					     false, BPF_FD_TYPE_KPROBE,
+					     BPF_FD_TYPE_KRETPROBE,
+					     buf, sizeof(buf)));
+#endif
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x0, 0x0,
+					     true, BPF_FD_TYPE_KPROBE,
+					     BPF_FD_TYPE_KRETPROBE,
+					     buf, sizeof(buf)));
+	/* address-based probes, each with and without a result buffer */
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0,
+					     ksym_get_addr("bpf_check"), false,
+					     BPF_FD_TYPE_KPROBE,
+					     BPF_FD_TYPE_KRETPROBE,
+					     buf, sizeof(buf)));
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0,
+					     ksym_get_addr("bpf_check"), false,
+					     BPF_FD_TYPE_KPROBE,
+					     BPF_FD_TYPE_KRETPROBE,
+					     NULL, 0));
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0,
+					     ksym_get_addr("bpf_check"), true,
+					     BPF_FD_TYPE_KPROBE,
+					     BPF_FD_TYPE_KRETPROBE,
+					     buf, sizeof(buf)));
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0,
+					     ksym_get_addr("bpf_check"), true,
+					     BPF_FD_TYPE_KPROBE,
+					     BPF_FD_TYPE_KRETPROBE,
+					     NULL, 0));
+
+	/* test nondebug fs uprobe */
+	/* the calculation of uprobe file offset is based on gcc 7.3.1 on x64
+	 * and the default linker script, which defines __executable_start as
+	 * the start of the .text section. The calculation could be different
+	 * on different systems with different compilers. The right way is
+	 * to parse the ELF file. We took a shortcut here.
+	 */
+	uprobe_file_offset = (__u64)(unsigned long)main -
+			     (__u64)(unsigned long)&__executable_start;
+	CHECK_AND_RET(test_nondebug_fs_probe("uprobe", argv[0],
+					     uprobe_file_offset, 0x0, false,
+					     BPF_FD_TYPE_UPROBE,
+					     BPF_FD_TYPE_URETPROBE,
+					     buf, sizeof(buf)));
+	CHECK_AND_RET(test_nondebug_fs_probe("uprobe", argv[0],
+					     uprobe_file_offset, 0x0, true,
+					     BPF_FD_TYPE_UPROBE,
+					     BPF_FD_TYPE_URETPROBE,
+					     buf, sizeof(buf)));
+
+	/* test debug fs uprobe */
+	CHECK_AND_RET(test_debug_fs_uprobe(argv[0], uprobe_file_offset,
+					   false));
+	CHECK_AND_RET(test_debug_fs_uprobe(argv[0], uprobe_file_offset,
+					   true));
+
+	return 0;
+}
diff --git a/samples/bpf/tc_l2_redirect_user.c b/samples/bpf/tc_l2_redirect_user.c
index 28995a776560..7ec45c3e8f56 100644
--- a/samples/bpf/tc_l2_redirect_user.c
+++ b/samples/bpf/tc_l2_redirect_user.c
@@ -13,7 +13,7 @@
#include <string.h>
#include <errno.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
static void usage(void)
{
diff --git a/samples/bpf/tcbpf2_kern.c b/samples/bpf/tcbpf2_kern.c
deleted file mode 100644
index 9a8db7bd6db4..000000000000
--- a/samples/bpf/tcbpf2_kern.c
+++ /dev/null
@@ -1,596 +0,0 @@
-/* Copyright (c) 2016 VMware
- * Copyright (c) 2016 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- */
-#define KBUILD_MODNAME "foo"
-#include <uapi/linux/bpf.h>
-#include <uapi/linux/if_ether.h>
-#include <uapi/linux/if_packet.h>
-#include <uapi/linux/ip.h>
-#include <uapi/linux/ipv6.h>
-#include <uapi/linux/in.h>
-#include <uapi/linux/tcp.h>
-#include <uapi/linux/filter.h>
-#include <uapi/linux/pkt_cls.h>
-#include <uapi/linux/erspan.h>
-#include <net/ipv6.h>
-#include "bpf_helpers.h"
-#include "bpf_endian.h"
-
-#define _htonl __builtin_bswap32
-#define ERROR(ret) do {\
- char fmt[] = "ERROR line:%d ret:%d\n";\
- bpf_trace_printk(fmt, sizeof(fmt), __LINE__, ret); \
- } while(0)
-
-struct geneve_opt {
- __be16 opt_class;
- u8 type;
- u8 length:5;
- u8 r3:1;
- u8 r2:1;
- u8 r1:1;
- u8 opt_data[8]; /* hard-coded to 8 byte */
-};
-
-struct vxlan_metadata {
- u32 gbp;
-};
-
-SEC("gre_set_tunnel")
-int _gre_set_tunnel(struct __sk_buff *skb)
-{
- int ret;
- struct bpf_tunnel_key key;
-
- __builtin_memset(&key, 0x0, sizeof(key));
- key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */
- key.tunnel_id = 2;
- key.tunnel_tos = 0;
- key.tunnel_ttl = 64;
-
- ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
- BPF_F_ZERO_CSUM_TX | BPF_F_SEQ_NUMBER);
- if (ret < 0) {
- ERROR(ret);
- return TC_ACT_SHOT;
- }
-
- return TC_ACT_OK;
-}
-
-SEC("gre_get_tunnel")
-int _gre_get_tunnel(struct __sk_buff *skb)
-{
- int ret;
- struct bpf_tunnel_key key;
- char fmt[] = "key %d remote ip 0x%x\n";
-
- ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
- if (ret < 0) {
- ERROR(ret);
- return TC_ACT_SHOT;
- }
-
- bpf_trace_printk(fmt, sizeof(fmt), key.tunnel_id, key.remote_ipv4);
- return TC_ACT_OK;
-}
-
-SEC("ip6gretap_set_tunnel")
-int _ip6gretap_set_tunnel(struct __sk_buff *skb)
-{
- struct bpf_tunnel_key key;
- int ret;
-
- __builtin_memset(&key, 0x0, sizeof(key));
- key.remote_ipv6[3] = _htonl(0x11); /* ::11 */
- key.tunnel_id = 2;
- key.tunnel_tos = 0;
- key.tunnel_ttl = 64;
- key.tunnel_label = 0xabcde;
-
- ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
- BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
- BPF_F_SEQ_NUMBER);
- if (ret < 0) {
- ERROR(ret);
- return TC_ACT_SHOT;
- }
-
- return TC_ACT_OK;
-}
-
-SEC("ip6gretap_get_tunnel")
-int _ip6gretap_get_tunnel(struct __sk_buff *skb)
-{
- char fmt[] = "key %d remote ip6 ::%x label %x\n";
- struct bpf_tunnel_key key;
- int ret;
-
- ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key),
- BPF_F_TUNINFO_IPV6);
- if (ret < 0) {
- ERROR(ret);
- return TC_ACT_SHOT;
- }
-
- bpf_trace_printk(fmt, sizeof(fmt),
- key.tunnel_id, key.remote_ipv6[3], key.tunnel_label);
-
- return TC_ACT_OK;
-}
-
-SEC("erspan_set_tunnel")
-int _erspan_set_tunnel(struct __sk_buff *skb)
-{
- struct bpf_tunnel_key key;
- struct erspan_metadata md;
- int ret;
-
- __builtin_memset(&key, 0x0, sizeof(key));
- key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */
- key.tunnel_id = 2;
- key.tunnel_tos = 0;
- key.tunnel_ttl = 64;
-
- ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_ZERO_CSUM_TX);
- if (ret < 0) {
- ERROR(ret);
- return TC_ACT_SHOT;
- }
-
- __builtin_memset(&md, 0, sizeof(md));
-#ifdef ERSPAN_V1
- md.version = 1;
- md.u.index = bpf_htonl(123);
-#else
- u8 direction = 1;
- u8 hwid = 7;
-
- md.version = 2;
- md.u.md2.dir = direction;
- md.u.md2.hwid = hwid & 0xf;
- md.u.md2.hwid_upper = (hwid >> 4) & 0x3;
-#endif
-
- ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md));
- if (ret < 0) {
- ERROR(ret);
- return TC_ACT_SHOT;
- }
-
- return TC_ACT_OK;
-}
-
-SEC("erspan_get_tunnel")
-int _erspan_get_tunnel(struct __sk_buff *skb)
-{
- char fmt[] = "key %d remote ip 0x%x erspan version %d\n";
- struct bpf_tunnel_key key;
- struct erspan_metadata md;
- u32 index;
- int ret;
-
- ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
- if (ret < 0) {
- ERROR(ret);
- return TC_ACT_SHOT;
- }
-
- ret = bpf_skb_get_tunnel_opt(skb, &md, sizeof(md));
- if (ret < 0) {
- ERROR(ret);
- return TC_ACT_SHOT;
- }
-
- bpf_trace_printk(fmt, sizeof(fmt),
- key.tunnel_id, key.remote_ipv4, md.version);
-
-#ifdef ERSPAN_V1
- char fmt2[] = "\tindex %x\n";
-
- index = bpf_ntohl(md.u.index);
- bpf_trace_printk(fmt2, sizeof(fmt2), index);
-#else
- char fmt2[] = "\tdirection %d hwid %x timestamp %u\n";
-
- bpf_trace_printk(fmt2, sizeof(fmt2),
- md.u.md2.dir,
- (md.u.md2.hwid_upper << 4) + md.u.md2.hwid,
- bpf_ntohl(md.u.md2.timestamp));
-#endif
-
- return TC_ACT_OK;
-}
-
-SEC("ip4ip6erspan_set_tunnel")
-int _ip4ip6erspan_set_tunnel(struct __sk_buff *skb)
-{
- struct bpf_tunnel_key key;
- struct erspan_metadata md;
- int ret;
-
- __builtin_memset(&key, 0x0, sizeof(key));
- key.remote_ipv6[3] = _htonl(0x11);
- key.tunnel_id = 2;
- key.tunnel_tos = 0;
- key.tunnel_ttl = 64;
-
- ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
- BPF_F_TUNINFO_IPV6);
- if (ret < 0) {
- ERROR(ret);
- return TC_ACT_SHOT;
- }
-
- __builtin_memset(&md, 0, sizeof(md));
-
-#ifdef ERSPAN_V1
- md.u.index = htonl(123);
- md.version = 1;
-#else
- u8 direction = 0;
- u8 hwid = 17;
-
- md.version = 2;
- md.u.md2.dir = direction;
- md.u.md2.hwid = hwid & 0xf;
- md.u.md2.hwid_upper = (hwid >> 4) & 0x3;
-#endif
-
- ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md));
- if (ret < 0) {
- ERROR(ret);
- return TC_ACT_SHOT;
- }
-
- return TC_ACT_OK;
-}
-
-SEC("ip4ip6erspan_get_tunnel")
-int _ip4ip6erspan_get_tunnel(struct __sk_buff *skb)
-{
- char fmt[] = "ip6erspan get key %d remote ip6 ::%x erspan version %d\n";
- struct bpf_tunnel_key key;
- struct erspan_metadata md;
- u32 index;
- int ret;
-
- ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), BPF_F_TUNINFO_IPV6);
- if (ret < 0) {
- ERROR(ret);
- return TC_ACT_SHOT;
- }
-
- ret = bpf_skb_get_tunnel_opt(skb, &md, sizeof(md));
- if (ret < 0) {
- ERROR(ret);
- return TC_ACT_SHOT;
- }
-
- bpf_trace_printk(fmt, sizeof(fmt),
- key.tunnel_id, key.remote_ipv4, md.version);
-
-#ifdef ERSPAN_V1
- char fmt2[] = "\tindex %x\n";
-
- index = bpf_ntohl(md.u.index);
- bpf_trace_printk(fmt2, sizeof(fmt2), index);
-#else
- char fmt2[] = "\tdirection %d hwid %x timestamp %u\n";
-
- bpf_trace_printk(fmt2, sizeof(fmt2),
- md.u.md2.dir,
- (md.u.md2.hwid_upper << 4) + md.u.md2.hwid,
- bpf_ntohl(md.u.md2.timestamp));
-#endif
-
- return TC_ACT_OK;
-}
-
-SEC("vxlan_set_tunnel")
-int _vxlan_set_tunnel(struct __sk_buff *skb)
-{
- int ret;
- struct bpf_tunnel_key key;
- struct vxlan_metadata md;
-
- __builtin_memset(&key, 0x0, sizeof(key));
- key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */
- key.tunnel_id = 2;
- key.tunnel_tos = 0;
- key.tunnel_ttl = 64;
-
- ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_ZERO_CSUM_TX);
- if (ret < 0) {
- ERROR(ret);
- return TC_ACT_SHOT;
- }
-
- md.gbp = 0x800FF; /* Set VXLAN Group Policy extension */
- ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md));
- if (ret < 0) {
- ERROR(ret);
- return TC_ACT_SHOT;
- }
-
- return TC_ACT_OK;
-}
-
-SEC("vxlan_get_tunnel")
-int _vxlan_get_tunnel(struct __sk_buff *skb)
-{
- int ret;
- struct bpf_tunnel_key key;
- struct vxlan_metadata md;
- char fmt[] = "key %d remote ip 0x%x vxlan gbp 0x%x\n";
-
- ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
- if (ret < 0) {
- ERROR(ret);
- return TC_ACT_SHOT;
- }
-
- ret = bpf_skb_get_tunnel_opt(skb, &md, sizeof(md));
- if (ret < 0) {
- ERROR(ret);
- return TC_ACT_SHOT;
- }
-
- bpf_trace_printk(fmt, sizeof(fmt),
- key.tunnel_id, key.remote_ipv4, md.gbp);
-
- return TC_ACT_OK;
-}
-
-SEC("geneve_set_tunnel")
-int _geneve_set_tunnel(struct __sk_buff *skb)
-{
- int ret, ret2;
- struct bpf_tunnel_key key;
- struct geneve_opt gopt;
-
- __builtin_memset(&key, 0x0, sizeof(key));
- key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */
- key.tunnel_id = 2;
- key.tunnel_tos = 0;
- key.tunnel_ttl = 64;
-
- __builtin_memset(&gopt, 0x0, sizeof(gopt));
- gopt.opt_class = 0x102; /* Open Virtual Networking (OVN) */
- gopt.type = 0x08;
- gopt.r1 = 0;
- gopt.r2 = 0;
- gopt.r3 = 0;
- gopt.length = 2; /* 4-byte multiple */
- *(int *) &gopt.opt_data = 0xdeadbeef;
-
- ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_ZERO_CSUM_TX);
- if (ret < 0) {
- ERROR(ret);
- return TC_ACT_SHOT;
- }
-
- ret = bpf_skb_set_tunnel_opt(skb, &gopt, sizeof(gopt));
- if (ret < 0) {
- ERROR(ret);
- return TC_ACT_SHOT;
- }
-
- return TC_ACT_OK;
-}
-
-SEC("geneve_get_tunnel")
-int _geneve_get_tunnel(struct __sk_buff *skb)
-{
- int ret;
- struct bpf_tunnel_key key;
- struct geneve_opt gopt;
- char fmt[] = "key %d remote ip 0x%x geneve class 0x%x\n";
-
- ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
- if (ret < 0) {
- ERROR(ret);
- return TC_ACT_SHOT;
- }
-
- ret = bpf_skb_get_tunnel_opt(skb, &gopt, sizeof(gopt));
- if (ret < 0) {
- ERROR(ret);
- return TC_ACT_SHOT;
- }
-
- bpf_trace_printk(fmt, sizeof(fmt),
- key.tunnel_id, key.remote_ipv4, gopt.opt_class);
- return TC_ACT_OK;
-}
-
-SEC("ipip_set_tunnel")
-int _ipip_set_tunnel(struct __sk_buff *skb)
-{
- struct bpf_tunnel_key key = {};
- void *data = (void *)(long)skb->data;
- struct iphdr *iph = data;
- struct tcphdr *tcp = data + sizeof(*iph);
- void *data_end = (void *)(long)skb->data_end;
- int ret;
-
- /* single length check */
- if (data + sizeof(*iph) + sizeof(*tcp) > data_end) {
- ERROR(1);
- return TC_ACT_SHOT;
- }
-
- key.tunnel_ttl = 64;
- if (iph->protocol == IPPROTO_ICMP) {
- key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */
- } else {
- if (iph->protocol != IPPROTO_TCP || iph->ihl != 5)
- return TC_ACT_SHOT;
-
- if (tcp->dest == htons(5200))
- key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */
- else if (tcp->dest == htons(5201))
- key.remote_ipv4 = 0xac100165; /* 172.16.1.101 */
- else
- return TC_ACT_SHOT;
- }
-
- ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0);
- if (ret < 0) {
- ERROR(ret);
- return TC_ACT_SHOT;
- }
-
- return TC_ACT_OK;
-}
-
-SEC("ipip_get_tunnel")
-int _ipip_get_tunnel(struct __sk_buff *skb)
-{
- int ret;
- struct bpf_tunnel_key key;
- char fmt[] = "remote ip 0x%x\n";
-
- ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
- if (ret < 0) {
- ERROR(ret);
- return TC_ACT_SHOT;
- }
-
- bpf_trace_printk(fmt, sizeof(fmt), key.remote_ipv4);
- return TC_ACT_OK;
-}
-
-SEC("ipip6_set_tunnel")
-int _ipip6_set_tunnel(struct __sk_buff *skb)
-{
- struct bpf_tunnel_key key = {};
- void *data = (void *)(long)skb->data;
- struct iphdr *iph = data;
- struct tcphdr *tcp = data + sizeof(*iph);
- void *data_end = (void *)(long)skb->data_end;
- int ret;
-
- /* single length check */
- if (data + sizeof(*iph) + sizeof(*tcp) > data_end) {
- ERROR(1);
- return TC_ACT_SHOT;
- }
-
- key.remote_ipv6[0] = _htonl(0x2401db00);
- key.tunnel_ttl = 64;
-
- if (iph->protocol == IPPROTO_ICMP) {
- key.remote_ipv6[3] = _htonl(1);
- } else {
- if (iph->protocol != IPPROTO_TCP || iph->ihl != 5) {
- ERROR(iph->protocol);
- return TC_ACT_SHOT;
- }
-
- if (tcp->dest == htons(5200)) {
- key.remote_ipv6[3] = _htonl(1);
- } else if (tcp->dest == htons(5201)) {
- key.remote_ipv6[3] = _htonl(2);
- } else {
- ERROR(tcp->dest);
- return TC_ACT_SHOT;
- }
- }
-
- ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_TUNINFO_IPV6);
- if (ret < 0) {
- ERROR(ret);
- return TC_ACT_SHOT;
- }
-
- return TC_ACT_OK;
-}
-
-SEC("ipip6_get_tunnel")
-int _ipip6_get_tunnel(struct __sk_buff *skb)
-{
- int ret;
- struct bpf_tunnel_key key;
- char fmt[] = "remote ip6 %x::%x\n";
-
- ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), BPF_F_TUNINFO_IPV6);
- if (ret < 0) {
- ERROR(ret);
- return TC_ACT_SHOT;
- }
-
- bpf_trace_printk(fmt, sizeof(fmt), _htonl(key.remote_ipv6[0]),
- _htonl(key.remote_ipv6[3]));
- return TC_ACT_OK;
-}
-
-SEC("ip6ip6_set_tunnel")
-int _ip6ip6_set_tunnel(struct __sk_buff *skb)
-{
- struct bpf_tunnel_key key = {};
- void *data = (void *)(long)skb->data;
- struct ipv6hdr *iph = data;
- struct tcphdr *tcp = data + sizeof(*iph);
- void *data_end = (void *)(long)skb->data_end;
- int ret;
-
- /* single length check */
- if (data + sizeof(*iph) + sizeof(*tcp) > data_end) {
- ERROR(1);
- return TC_ACT_SHOT;
- }
-
- key.remote_ipv6[0] = _htonl(0x2401db00);
- key.tunnel_ttl = 64;
-
- if (iph->nexthdr == NEXTHDR_ICMP) {
- key.remote_ipv6[3] = _htonl(1);
- } else {
- if (iph->nexthdr != NEXTHDR_TCP) {
- ERROR(iph->nexthdr);
- return TC_ACT_SHOT;
- }
-
- if (tcp->dest == htons(5200)) {
- key.remote_ipv6[3] = _htonl(1);
- } else if (tcp->dest == htons(5201)) {
- key.remote_ipv6[3] = _htonl(2);
- } else {
- ERROR(tcp->dest);
- return TC_ACT_SHOT;
- }
- }
-
- ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_TUNINFO_IPV6);
- if (ret < 0) {
- ERROR(ret);
- return TC_ACT_SHOT;
- }
-
- return TC_ACT_OK;
-}
-
-SEC("ip6ip6_get_tunnel")
-int _ip6ip6_get_tunnel(struct __sk_buff *skb)
-{
- int ret;
- struct bpf_tunnel_key key;
- char fmt[] = "remote ip6 %x::%x\n";
-
- ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), BPF_F_TUNINFO_IPV6);
- if (ret < 0) {
- ERROR(ret);
- return TC_ACT_SHOT;
- }
-
- bpf_trace_printk(fmt, sizeof(fmt), _htonl(key.remote_ipv6[0]),
- _htonl(key.remote_ipv6[3]));
- return TC_ACT_OK;
-}
-
-char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/test_cgrp2_array_pin.c b/samples/bpf/test_cgrp2_array_pin.c
index 8a1b8b5d8def..242184292f59 100644
--- a/samples/bpf/test_cgrp2_array_pin.c
+++ b/samples/bpf/test_cgrp2_array_pin.c
@@ -14,7 +14,7 @@
#include <errno.h>
#include <fcntl.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
static void usage(void)
{
diff --git a/samples/bpf/test_cgrp2_attach.c b/samples/bpf/test_cgrp2_attach.c
index 4bfcaf93fcf3..20fbd1241db3 100644
--- a/samples/bpf/test_cgrp2_attach.c
+++ b/samples/bpf/test_cgrp2_attach.c
@@ -28,8 +28,9 @@
#include <fcntl.h>
#include <linux/bpf.h>
+#include <bpf/bpf.h>
-#include "libbpf.h"
+#include "bpf_insn.h"
enum {
MAP_KEY_PACKETS,
diff --git a/samples/bpf/test_cgrp2_attach2.c b/samples/bpf/test_cgrp2_attach2.c
index 1af412ec6007..b453e6a161be 100644
--- a/samples/bpf/test_cgrp2_attach2.c
+++ b/samples/bpf/test_cgrp2_attach2.c
@@ -24,8 +24,9 @@
#include <unistd.h>
#include <linux/bpf.h>
+#include <bpf/bpf.h>
-#include "libbpf.h"
+#include "bpf_insn.h"
#include "cgroup_helpers.h"
#define FOO "/foo"
diff --git a/samples/bpf/test_cgrp2_sock.c b/samples/bpf/test_cgrp2_sock.c
index e79594dd629b..b0811da5a00f 100644
--- a/samples/bpf/test_cgrp2_sock.c
+++ b/samples/bpf/test_cgrp2_sock.c
@@ -21,8 +21,9 @@
#include <net/if.h>
#include <inttypes.h>
#include <linux/bpf.h>
+#include <bpf/bpf.h>
-#include "libbpf.h"
+#include "bpf_insn.h"
char bpf_log_buf[BPF_LOG_BUF_SIZE];
diff --git a/samples/bpf/test_cgrp2_sock2.c b/samples/bpf/test_cgrp2_sock2.c
index e53f1f6f0867..3b5be2364975 100644
--- a/samples/bpf/test_cgrp2_sock2.c
+++ b/samples/bpf/test_cgrp2_sock2.c
@@ -19,8 +19,9 @@
#include <fcntl.h>
#include <net/if.h>
#include <linux/bpf.h>
+#include <bpf/bpf.h>
-#include "libbpf.h"
+#include "bpf_insn.h"
#include "bpf_load.h"
static int usage(const char *argv0)
diff --git a/samples/bpf/test_current_task_under_cgroup_user.c b/samples/bpf/test_current_task_under_cgroup_user.c
index 65b5fb51c1db..4be4874ca2bc 100644
--- a/samples/bpf/test_current_task_under_cgroup_user.c
+++ b/samples/bpf/test_current_task_under_cgroup_user.c
@@ -9,7 +9,7 @@
#include <stdio.h>
#include <linux/bpf.h>
#include <unistd.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
#include "bpf_load.h"
#include <linux/bpf.h>
#include "cgroup_helpers.h"
diff --git a/samples/bpf/test_lru_dist.c b/samples/bpf/test_lru_dist.c
index 73c357142268..eec3e2509ce8 100644
--- a/samples/bpf/test_lru_dist.c
+++ b/samples/bpf/test_lru_dist.c
@@ -21,7 +21,7 @@
#include <stdlib.h>
#include <time.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
#include "bpf_util.h"
#define min(a, b) ((a) < (b) ? (a) : (b))
diff --git a/samples/bpf/test_map_in_map_user.c b/samples/bpf/test_map_in_map_user.c
index 1aca18539d8d..e308858f7bcf 100644
--- a/samples/bpf/test_map_in_map_user.c
+++ b/samples/bpf/test_map_in_map_user.c
@@ -13,7 +13,7 @@
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
#include "bpf_load.h"
#define PORT_A (map_fd[0])
diff --git a/samples/bpf/test_overhead_user.c b/samples/bpf/test_overhead_user.c
index e1d35e07a10e..6caf47afa635 100644
--- a/samples/bpf/test_overhead_user.c
+++ b/samples/bpf/test_overhead_user.c
@@ -19,7 +19,7 @@
#include <string.h>
#include <time.h>
#include <sys/resource.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
#include "bpf_load.h"
#define MAX_CNT 1000000
diff --git a/samples/bpf/test_probe_write_user_user.c b/samples/bpf/test_probe_write_user_user.c
index bf8e3a9f3067..045eb5e30f54 100644
--- a/samples/bpf/test_probe_write_user_user.c
+++ b/samples/bpf/test_probe_write_user_user.c
@@ -3,7 +3,7 @@
#include <assert.h>
#include <linux/bpf.h>
#include <unistd.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
#include "bpf_load.h"
#include <sys/socket.h>
#include <string.h>
diff --git a/samples/bpf/test_tunnel_bpf.sh b/samples/bpf/test_tunnel_bpf.sh
deleted file mode 100755
index c265863ccdf9..000000000000
--- a/samples/bpf/test_tunnel_bpf.sh
+++ /dev/null
@@ -1,319 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-# In Namespace 0 (at_ns0) using native tunnel
-# Overlay IP: 10.1.1.100
-# local 192.16.1.100 remote 192.16.1.200
-# veth0 IP: 172.16.1.100, tunnel dev <type>00
-
-# Out of Namespace using BPF set/get on lwtunnel
-# Overlay IP: 10.1.1.200
-# local 172.16.1.200 remote 172.16.1.100
-# veth1 IP: 172.16.1.200, tunnel dev <type>11
-
-function config_device {
- ip netns add at_ns0
- ip link add veth0 type veth peer name veth1
- ip link set veth0 netns at_ns0
- ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0
- ip netns exec at_ns0 ip link set dev veth0 up
- ip link set dev veth1 up mtu 1500
- ip addr add dev veth1 172.16.1.200/24
-}
-
-function add_gre_tunnel {
- # in namespace
- ip netns exec at_ns0 \
- ip link add dev $DEV_NS type $TYPE seq key 2 \
- local 172.16.1.100 remote 172.16.1.200
- ip netns exec at_ns0 ip link set dev $DEV_NS up
- ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
-
- # out of namespace
- ip link add dev $DEV type $TYPE key 2 external
- ip link set dev $DEV up
- ip addr add dev $DEV 10.1.1.200/24
-}
-
-function add_ip6gretap_tunnel {
-
- # assign ipv6 address
- ip netns exec at_ns0 ip addr add ::11/96 dev veth0
- ip netns exec at_ns0 ip link set dev veth0 up
- ip addr add dev veth1 ::22/96
- ip link set dev veth1 up
-
- # in namespace
- ip netns exec at_ns0 \
- ip link add dev $DEV_NS type $TYPE seq flowlabel 0xbcdef key 2 \
- local ::11 remote ::22
-
- ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
- ip netns exec at_ns0 ip addr add dev $DEV_NS fc80::100/96
- ip netns exec at_ns0 ip link set dev $DEV_NS up
-
- # out of namespace
- ip link add dev $DEV type $TYPE external
- ip addr add dev $DEV 10.1.1.200/24
- ip addr add dev $DEV fc80::200/24
- ip link set dev $DEV up
-}
-
-function add_erspan_tunnel {
- # in namespace
- if [ "$1" == "v1" ]; then
- ip netns exec at_ns0 \
- ip link add dev $DEV_NS type $TYPE seq key 2 \
- local 172.16.1.100 remote 172.16.1.200 \
- erspan_ver 1 erspan 123
- else
- ip netns exec at_ns0 \
- ip link add dev $DEV_NS type $TYPE seq key 2 \
- local 172.16.1.100 remote 172.16.1.200 \
- erspan_ver 2 erspan_dir egress erspan_hwid 3
- fi
- ip netns exec at_ns0 ip link set dev $DEV_NS up
- ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
-
- # out of namespace
- ip link add dev $DEV type $TYPE external
- ip link set dev $DEV up
- ip addr add dev $DEV 10.1.1.200/24
-}
-
-function add_ip6erspan_tunnel {
-
- # assign ipv6 address
- ip netns exec at_ns0 ip addr add ::11/96 dev veth0
- ip netns exec at_ns0 ip link set dev veth0 up
- ip addr add dev veth1 ::22/96
- ip link set dev veth1 up
-
- # in namespace
- if [ "$1" == "v1" ]; then
- ip netns exec at_ns0 \
- ip link add dev $DEV_NS type $TYPE seq key 2 \
- local ::11 remote ::22 \
- erspan_ver 1 erspan 123
- else
- ip netns exec at_ns0 \
- ip link add dev $DEV_NS type $TYPE seq key 2 \
- local ::11 remote ::22 \
- erspan_ver 2 erspan_dir egress erspan_hwid 7
- fi
- ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
- ip netns exec at_ns0 ip link set dev $DEV_NS up
-
- # out of namespace
- ip link add dev $DEV type $TYPE external
- ip addr add dev $DEV 10.1.1.200/24
- ip link set dev $DEV up
-}
-
-function add_vxlan_tunnel {
- # Set static ARP entry here because iptables set-mark works
- # on L3 packet, as a result not applying to ARP packets,
- # causing errors at get_tunnel_{key/opt}.
-
- # in namespace
- ip netns exec at_ns0 \
- ip link add dev $DEV_NS type $TYPE id 2 dstport 4789 gbp remote 172.16.1.200
- ip netns exec at_ns0 ip link set dev $DEV_NS address 52:54:00:d9:01:00 up
- ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
- ip netns exec at_ns0 arp -s 10.1.1.200 52:54:00:d9:02:00
- ip netns exec at_ns0 iptables -A OUTPUT -j MARK --set-mark 0x800FF
-
- # out of namespace
- ip link add dev $DEV type $TYPE external gbp dstport 4789
- ip link set dev $DEV address 52:54:00:d9:02:00 up
- ip addr add dev $DEV 10.1.1.200/24
- arp -s 10.1.1.100 52:54:00:d9:01:00
-}
-
-function add_geneve_tunnel {
- # in namespace
- ip netns exec at_ns0 \
- ip link add dev $DEV_NS type $TYPE id 2 dstport 6081 remote 172.16.1.200
- ip netns exec at_ns0 ip link set dev $DEV_NS up
- ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
-
- # out of namespace
- ip link add dev $DEV type $TYPE dstport 6081 external
- ip link set dev $DEV up
- ip addr add dev $DEV 10.1.1.200/24
-}
-
-function add_ipip_tunnel {
- # in namespace
- ip netns exec at_ns0 \
- ip link add dev $DEV_NS type $TYPE local 172.16.1.100 remote 172.16.1.200
- ip netns exec at_ns0 ip link set dev $DEV_NS up
- ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
-
- # out of namespace
- ip link add dev $DEV type $TYPE external
- ip link set dev $DEV up
- ip addr add dev $DEV 10.1.1.200/24
-}
-
-function attach_bpf {
- DEV=$1
- SET_TUNNEL=$2
- GET_TUNNEL=$3
- tc qdisc add dev $DEV clsact
- tc filter add dev $DEV egress bpf da obj tcbpf2_kern.o sec $SET_TUNNEL
- tc filter add dev $DEV ingress bpf da obj tcbpf2_kern.o sec $GET_TUNNEL
-}
-
-function test_gre {
- TYPE=gretap
- DEV_NS=gretap00
- DEV=gretap11
- config_device
- add_gre_tunnel
- attach_bpf $DEV gre_set_tunnel gre_get_tunnel
- ping -c 1 10.1.1.100
- ip netns exec at_ns0 ping -c 1 10.1.1.200
- cleanup
-}
-
-function test_ip6gre {
- TYPE=ip6gre
- DEV_NS=ip6gre00
- DEV=ip6gre11
- config_device
- # reuse the ip6gretap function
- add_ip6gretap_tunnel
- attach_bpf $DEV ip6gretap_set_tunnel ip6gretap_get_tunnel
- # underlay
- ping6 -c 4 ::11
- # overlay: ipv4 over ipv6
- ip netns exec at_ns0 ping -c 1 10.1.1.200
- ping -c 1 10.1.1.100
- # overlay: ipv6 over ipv6
- ip netns exec at_ns0 ping6 -c 1 fc80::200
- cleanup
-}
-
-function test_ip6gretap {
- TYPE=ip6gretap
- DEV_NS=ip6gretap00
- DEV=ip6gretap11
- config_device
- add_ip6gretap_tunnel
- attach_bpf $DEV ip6gretap_set_tunnel ip6gretap_get_tunnel
- # underlay
- ping6 -c 4 ::11
- # overlay: ipv4 over ipv6
- ip netns exec at_ns0 ping -i .2 -c 1 10.1.1.200
- ping -c 1 10.1.1.100
- # overlay: ipv6 over ipv6
- ip netns exec at_ns0 ping6 -c 1 fc80::200
- cleanup
-}
-
-function test_erspan {
- TYPE=erspan
- DEV_NS=erspan00
- DEV=erspan11
- config_device
- add_erspan_tunnel $1
- attach_bpf $DEV erspan_set_tunnel erspan_get_tunnel
- ping -c 1 10.1.1.100
- ip netns exec at_ns0 ping -c 1 10.1.1.200
- cleanup
-}
-
-function test_ip6erspan {
- TYPE=ip6erspan
- DEV_NS=ip6erspan00
- DEV=ip6erspan11
- config_device
- add_ip6erspan_tunnel $1
- attach_bpf $DEV ip4ip6erspan_set_tunnel ip4ip6erspan_get_tunnel
- ping6 -c 3 ::11
- ip netns exec at_ns0 ping -c 1 10.1.1.200
- cleanup
-}
-
-function test_vxlan {
- TYPE=vxlan
- DEV_NS=vxlan00
- DEV=vxlan11
- config_device
- add_vxlan_tunnel
- attach_bpf $DEV vxlan_set_tunnel vxlan_get_tunnel
- ping -c 1 10.1.1.100
- ip netns exec at_ns0 ping -c 1 10.1.1.200
- cleanup
-}
-
-function test_geneve {
- TYPE=geneve
- DEV_NS=geneve00
- DEV=geneve11
- config_device
- add_geneve_tunnel
- attach_bpf $DEV geneve_set_tunnel geneve_get_tunnel
- ping -c 1 10.1.1.100
- ip netns exec at_ns0 ping -c 1 10.1.1.200
- cleanup
-}
-
-function test_ipip {
- TYPE=ipip
- DEV_NS=ipip00
- DEV=ipip11
- config_device
- tcpdump -nei veth1 &
- cat /sys/kernel/debug/tracing/trace_pipe &
- add_ipip_tunnel
- ethtool -K veth1 gso off gro off rx off tx off
- ip link set dev veth1 mtu 1500
- attach_bpf $DEV ipip_set_tunnel ipip_get_tunnel
- ping -c 1 10.1.1.100
- ip netns exec at_ns0 ping -c 1 10.1.1.200
- ip netns exec at_ns0 iperf -sD -p 5200 > /dev/null
- sleep 0.2
- iperf -c 10.1.1.100 -n 5k -p 5200
- cleanup
-}
-
-function cleanup {
- set +ex
- pkill iperf
- ip netns delete at_ns0
- ip link del veth1
- ip link del ipip11
- ip link del gretap11
- ip link del ip6gre11
- ip link del ip6gretap11
- ip link del vxlan11
- ip link del geneve11
- ip link del erspan11
- ip link del ip6erspan11
- pkill tcpdump
- pkill cat
- set -ex
-}
-
-trap cleanup 0 2 3 6 9
-cleanup
-echo "Testing GRE tunnel..."
-test_gre
-echo "Testing IP6GRE tunnel..."
-test_ip6gre
-echo "Testing IP6GRETAP tunnel..."
-test_ip6gretap
-echo "Testing ERSPAN tunnel..."
-test_erspan v1
-test_erspan v2
-echo "Testing IP6ERSPAN tunnel..."
-test_ip6erspan v1
-test_ip6erspan v2
-echo "Testing VXLAN tunnel..."
-test_vxlan
-echo "Testing GENEVE tunnel..."
-test_geneve
-echo "Testing IPIP tunnel..."
-test_ipip
-echo "*** PASS ***"
diff --git a/samples/bpf/trace_event_user.c b/samples/bpf/trace_event_user.c
index 56f7a259a7c9..1fa1becfa641 100644
--- a/samples/bpf/trace_event_user.c
+++ b/samples/bpf/trace_event_user.c
@@ -21,6 +21,7 @@
#include "libbpf.h"
#include "bpf_load.h"
#include "perf-sys.h"
+#include "trace_helpers.h"
#define SAMPLE_FREQ 50
diff --git a/samples/bpf/trace_output_user.c b/samples/bpf/trace_output_user.c
index ccca1e348017..4837d73edefe 100644
--- a/samples/bpf/trace_output_user.c
+++ b/samples/bpf/trace_output_user.c
@@ -18,103 +18,13 @@
#include <sys/mman.h>
#include <time.h>
#include <signal.h>
-#include "libbpf.h"
+#include <libbpf.h>
#include "bpf_load.h"
#include "perf-sys.h"
+#include "trace_helpers.h"
static int pmu_fd;
-int page_size;
-int page_cnt = 8;
-volatile struct perf_event_mmap_page *header;
-
-typedef void (*print_fn)(void *data, int size);
-
-static int perf_event_mmap(int fd)
-{
- void *base;
- int mmap_size;
-
- page_size = getpagesize();
- mmap_size = page_size * (page_cnt + 1);
-
- base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
- if (base == MAP_FAILED) {
- printf("mmap err\n");
- return -1;
- }
-
- header = base;
- return 0;
-}
-
-static int perf_event_poll(int fd)
-{
- struct pollfd pfd = { .fd = fd, .events = POLLIN };
-
- return poll(&pfd, 1, 1000);
-}
-
-struct perf_event_sample {
- struct perf_event_header header;
- __u32 size;
- char data[];
-};
-
-static void perf_event_read(print_fn fn)
-{
- __u64 data_tail = header->data_tail;
- __u64 data_head = header->data_head;
- __u64 buffer_size = page_cnt * page_size;
- void *base, *begin, *end;
- char buf[256];
-
- asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */
- if (data_head == data_tail)
- return;
-
- base = ((char *)header) + page_size;
-
- begin = base + data_tail % buffer_size;
- end = base + data_head % buffer_size;
-
- while (begin != end) {
- struct perf_event_sample *e;
-
- e = begin;
- if (begin + e->header.size > base + buffer_size) {
- long len = base + buffer_size - begin;
-
- assert(len < e->header.size);
- memcpy(buf, begin, len);
- memcpy(buf + len, base, e->header.size - len);
- e = (void *) buf;
- begin = base + e->header.size - len;
- } else if (begin + e->header.size == base + buffer_size) {
- begin = base;
- } else {
- begin += e->header.size;
- }
-
- if (e->header.type == PERF_RECORD_SAMPLE) {
- fn(e->data, e->size);
- } else if (e->header.type == PERF_RECORD_LOST) {
- struct {
- struct perf_event_header header;
- __u64 id;
- __u64 lost;
- } *lost = (void *) e;
- printf("lost %lld events\n", lost->lost);
- } else {
- printf("unknown event type=%d size=%d\n",
- e->header.type, e->header.size);
- }
- }
-
- __sync_synchronize(); /* smp_mb() */
- header->data_tail = data_head;
-}
-
static __u64 time_get_ns(void)
{
struct timespec ts;
@@ -127,7 +37,7 @@ static __u64 start_time;
#define MAX_CNT 100000ll
-static void print_bpf_output(void *data, int size)
+static int print_bpf_output(void *data, int size)
{
static __u64 cnt;
struct {
@@ -138,7 +48,7 @@ static void print_bpf_output(void *data, int size)
if (e->cookie != 0x12345678) {
printf("BUG pid %llx cookie %llx sized %d\n",
e->pid, e->cookie, size);
- kill(0, SIGINT);
+ return LIBBPF_PERF_EVENT_ERROR;
}
cnt++;
@@ -146,8 +56,10 @@ static void print_bpf_output(void *data, int size)
if (cnt == MAX_CNT) {
printf("recv %lld events per sec\n",
MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
- kill(0, SIGINT);
+ return LIBBPF_PERF_EVENT_DONE;
}
+
+ return LIBBPF_PERF_EVENT_CONT;
}
static void test_bpf_perf_event(void)
@@ -170,6 +82,7 @@ int main(int argc, char **argv)
{
char filename[256];
FILE *f;
+ int ret;
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
@@ -187,10 +100,7 @@ int main(int argc, char **argv)
(void) f;
start_time = time_get_ns();
- for (;;) {
- perf_event_poll(pmu_fd);
- perf_event_read(print_bpf_output);
- }
-
- return 0;
+ ret = perf_event_poller(pmu_fd, print_bpf_output);
+ kill(0, SIGINT);
+ return ret;
}
diff --git a/samples/bpf/tracex1_user.c b/samples/bpf/tracex1_user.c
index 3dcb475fb135..af8c20608ab5 100644
--- a/samples/bpf/tracex1_user.c
+++ b/samples/bpf/tracex1_user.c
@@ -2,7 +2,7 @@
#include <stdio.h>
#include <linux/bpf.h>
#include <unistd.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
#include "bpf_load.h"
int main(int ac, char **argv)
diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c
index efb5e61918df..1a81e6a5c2ea 100644
--- a/samples/bpf/tracex2_user.c
+++ b/samples/bpf/tracex2_user.c
@@ -7,7 +7,7 @@
#include <string.h>
#include <sys/resource.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
#include "bpf_load.h"
#include "bpf_util.h"
diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c
index fe372239d505..6c6b10f4c3ee 100644
--- a/samples/bpf/tracex3_user.c
+++ b/samples/bpf/tracex3_user.c
@@ -13,7 +13,7 @@
#include <linux/bpf.h>
#include <sys/resource.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
#include "bpf_load.h"
#include "bpf_util.h"
diff --git a/samples/bpf/tracex4_user.c b/samples/bpf/tracex4_user.c
index 22c644f1f4c3..14625c898e43 100644
--- a/samples/bpf/tracex4_user.c
+++ b/samples/bpf/tracex4_user.c
@@ -14,7 +14,7 @@
#include <linux/bpf.h>
#include <sys/resource.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
#include "bpf_load.h"
struct pair {
diff --git a/samples/bpf/tracex5_user.c b/samples/bpf/tracex5_user.c
index 4e2774b731f0..c4ab91c89494 100644
--- a/samples/bpf/tracex5_user.c
+++ b/samples/bpf/tracex5_user.c
@@ -5,7 +5,7 @@
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <sys/prctl.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
#include "bpf_load.h"
#include <sys/resource.h>
diff --git a/samples/bpf/tracex6_user.c b/samples/bpf/tracex6_user.c
index 89ab8d408474..4bb3c830adb2 100644
--- a/samples/bpf/tracex6_user.c
+++ b/samples/bpf/tracex6_user.c
@@ -16,7 +16,7 @@
#include <unistd.h>
#include "bpf_load.h"
-#include "libbpf.h"
+#include <bpf/bpf.h>
#include "perf-sys.h"
#define SAMPLE_PERIOD 0x7fffffffffffffffULL
diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c
index 8a52ac492e8b..ea6dae78f0df 100644
--- a/samples/bpf/tracex7_user.c
+++ b/samples/bpf/tracex7_user.c
@@ -3,7 +3,7 @@
#include <stdio.h>
#include <linux/bpf.h>
#include <unistd.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
#include "bpf_load.h"
int main(int argc, char **argv)
diff --git a/samples/bpf/xdp1_user.c b/samples/bpf/xdp1_user.c
index b901ee2b3336..b02c531510ed 100644
--- a/samples/bpf/xdp1_user.c
+++ b/samples/bpf/xdp1_user.c
@@ -16,9 +16,9 @@
#include <libgen.h>
#include <sys/resource.h>
-#include "bpf_load.h"
#include "bpf_util.h"
-#include "libbpf.h"
+#include "bpf/bpf.h"
+#include "bpf/libbpf.h"
static int ifindex;
static __u32 xdp_flags;
@@ -31,7 +31,7 @@ static void int_exit(int sig)
/* simple per-protocol drop counter
*/
-static void poll_stats(int interval)
+static void poll_stats(int map_fd, int interval)
{
unsigned int nr_cpus = bpf_num_possible_cpus();
const unsigned int nr_keys = 256;
@@ -47,7 +47,7 @@ static void poll_stats(int interval)
for (key = 0; key < nr_keys; key++) {
__u64 sum = 0;
- assert(bpf_map_lookup_elem(map_fd[0], &key, values) == 0);
+ assert(bpf_map_lookup_elem(map_fd, &key, values) == 0);
for (i = 0; i < nr_cpus; i++)
sum += (values[i] - prev[key][i]);
if (sum)
@@ -71,9 +71,14 @@ static void usage(const char *prog)
int main(int argc, char **argv)
{
struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ struct bpf_prog_load_attr prog_load_attr = {
+ .prog_type = BPF_PROG_TYPE_XDP,
+ };
const char *optstr = "SN";
+ int prog_fd, map_fd, opt;
+ struct bpf_object *obj;
+ struct bpf_map *map;
char filename[256];
- int opt;
while ((opt = getopt(argc, argv, optstr)) != -1) {
switch (opt) {
@@ -102,13 +107,19 @@ int main(int argc, char **argv)
ifindex = strtoul(argv[optind], NULL, 0);
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ prog_load_attr.file = filename;
- if (load_bpf_file(filename)) {
- printf("%s", bpf_log_buf);
+ if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
+ return 1;
+
+ map = bpf_map__next(NULL, obj);
+ if (!map) {
+ printf("finding a map in obj file failed\n");
return 1;
}
+ map_fd = bpf_map__fd(map);
- if (!prog_fd[0]) {
+ if (!prog_fd) {
printf("load_bpf_file: %s\n", strerror(errno));
return 1;
}
@@ -116,12 +127,12 @@ int main(int argc, char **argv)
signal(SIGINT, int_exit);
signal(SIGTERM, int_exit);
- if (bpf_set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) {
+ if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) {
printf("link set xdp fd failed\n");
return 1;
}
- poll_stats(2);
+ poll_stats(map_fd, 2);
return 0;
}
diff --git a/samples/bpf/xdp_adjust_tail_kern.c b/samples/bpf/xdp_adjust_tail_kern.c
new file mode 100644
index 000000000000..411fdb21f8bc
--- /dev/null
+++ b/samples/bpf/xdp_adjust_tail_kern.c
@@ -0,0 +1,152 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (c) 2018 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program shows how to use bpf_xdp_adjust_tail() by
+ * generating ICMPv4 "packet too big" (unreachable / DF bit set, frag needed,
+ * to be more precise in case of v4) when receiving packets bigger than
+ * 600 bytes.
+ */
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include "bpf_helpers.h"
+
+#define DEFAULT_TTL 64
+#define MAX_PCKT_SIZE 600
+#define ICMP_TOOBIG_SIZE 98
+#define ICMP_TOOBIG_PAYLOAD_SIZE 92
+
+struct bpf_map_def SEC("maps") icmpcnt = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(__u32),
+ .value_size = sizeof(__u64),
+ .max_entries = 1, /* single slot (key 0): incremented by count_icmp() for each ICMP reply built */
+};
+
+/* Bump the "ICMP too big sent" counter kept in slot 0 of the icmpcnt map. */
+static __always_inline void count_icmp(void)
+{
+	/* icmpcnt declares key_size = sizeof(__u32), so the key must be 32-bit */
+	u32 key = 0;
+	u64 *icmp_count;
+
+	icmp_count = bpf_map_lookup_elem(&icmpcnt, &key);
+	if (icmp_count)
+		*icmp_count += 1;
+}
+
+/*
+ * Fill in the Ethernet header at @data from @orig_eth with the source and
+ * destination MAC addresses exchanged; the EtherType is copied unchanged.
+ */
+static __always_inline void swap_mac(void *data, struct ethhdr *orig_eth)
+{
+	struct ethhdr *reply = data;
+
+	memcpy(reply->h_source, orig_eth->h_dest, ETH_ALEN);
+	memcpy(reply->h_dest, orig_eth->h_source, ETH_ALEN);
+	reply->h_proto = orig_eth->h_proto;
+}
+
+/*
+ * Fold a 32-bit one's-complement accumulator down to 16 bits and return its
+ * complement, i.e. the value to store in a checksum field.
+ */
+static __always_inline __u16 csum_fold_helper(__u32 csum)
+{
+	/* The first fold can itself carry into bit 16 (0x0001ffff -> 0x10000) */
+	csum = (csum & 0xffff) + (csum >> 16);
+	/* The second fold absorbs that carry; the sum now fits in 16 bits */
+	csum = (csum & 0xffff) + (csum >> 16);
+	return ~csum;
+}
+
+/*
+ * Checksum @data_size bytes starting at @data_start, seeded with *@csum, and
+ * store the folded, complemented result back into *@csum.
+ */
+static __always_inline void ipv4_csum(void *data_start, int data_size,
+				      __u32 *csum)
+{
+	__u32 sum = bpf_csum_diff(0, 0, data_start, data_size, *csum);
+
+	*csum = csum_fold_helper(sum);
+}
+
+static __always_inline int send_icmp4_too_big(struct xdp_md *xdp) /* rewrite the (already trimmed) frame into an ICMP "frag needed" reply; returns an XDP verdict */
+{
+ int headroom = (int)sizeof(struct iphdr) + (int)sizeof(struct icmphdr); /* room for the new IPv4 + ICMP headers */
+
+ if (bpf_xdp_adjust_head(xdp, 0 - headroom)) /* negative delta: request headroom bytes in front of the packet */
+ return XDP_DROP;
+ void *data = (void *)(long)xdp->data; /* pointers are loaded only after the head adjustment */
+ void *data_end = (void *)(long)xdp->data_end;
+
+ if (data + (ICMP_TOOBIG_SIZE + headroom) > data_end) /* bounds check covering every byte touched below */
+ return XDP_DROP;
+
+ struct iphdr *iph, *orig_iph;
+ struct icmphdr *icmp_hdr;
+ struct ethhdr *orig_eth;
+ __u32 csum = 0;
+ __u64 off = 0;
+
+ orig_eth = data + headroom; /* the original Ethernet header now sits headroom bytes in */
+ swap_mac(data, orig_eth); /* new Ethernet header at the front, MAC addresses exchanged */
+ off += sizeof(struct ethhdr);
+ iph = data + off; /* new IPv4 header */
+ off += sizeof(struct iphdr);
+ icmp_hdr = data + off; /* new ICMP header */
+ off += sizeof(struct icmphdr);
+ orig_iph = data + off; /* offset equals headroom + sizeof(struct ethhdr): the original IPv4 header */
+ icmp_hdr->type = ICMP_DEST_UNREACH;
+ icmp_hdr->code = ICMP_FRAG_NEEDED;
+ icmp_hdr->un.frag.mtu = htons(MAX_PCKT_SIZE-sizeof(struct ethhdr)); /* advertised MTU excludes the Ethernet header */
+ icmp_hdr->checksum = 0; /* field must be zero while the checksum is computed */
+ ipv4_csum(icmp_hdr, ICMP_TOOBIG_PAYLOAD_SIZE, &csum);
+ icmp_hdr->checksum = csum;
+ iph->ttl = DEFAULT_TTL;
+ iph->daddr = orig_iph->saddr; /* reply goes back to the original sender */
+ iph->saddr = orig_iph->daddr;
+ iph->version = 4;
+ iph->ihl = 5;
+ iph->protocol = IPPROTO_ICMP;
+ iph->tos = 0;
+ iph->tot_len = htons(
+ ICMP_TOOBIG_SIZE + headroom - sizeof(struct ethhdr));
+ iph->check = 0;
+ csum = 0; /* reset the accumulator before the IPv4 header checksum */
+ ipv4_csum(iph, sizeof(struct iphdr), &csum);
+ iph->check = csum;
+ count_icmp();
+ return XDP_TX;
+}
+
+
+/*
+ * Frames larger than MAX_PCKT_SIZE are cut down to ICMP_TOOBIG_SIZE bytes and
+ * answered with an ICMP error; everything else, including frames that could
+ * not be trimmed, is passed on.
+ */
+static __always_inline int handle_ipv4(struct xdp_md *xdp)
+{
+	void *pkt_end = (void *)(long)xdp->data_end;
+	void *pkt = (void *)(long)xdp->data;
+	int pckt_size = pkt_end - pkt;
+	int trim;
+
+	if (pckt_size <= MAX_PCKT_SIZE)
+		return XDP_PASS;
+
+	trim = pckt_size - ICMP_TOOBIG_SIZE;
+	if (bpf_xdp_adjust_tail(xdp, 0 - trim))
+		return XDP_PASS;
+
+	return send_icmp4_too_big(xdp);
+}
+
+/* XDP entry point: IPv4 frames go to handle_ipv4(), the rest is passed on. */
+SEC("xdp_icmp")
+int _xdp_icmp(struct xdp_md *xdp)
+{
+	void *data_end = (void *)(long)xdp->data_end;
+	struct ethhdr *eth = (void *)(long)xdp->data;
+
+	/* Drop frames too short to hold an Ethernet header */
+	if (eth + 1 > data_end)
+		return XDP_DROP;
+
+	if (eth->h_proto != htons(ETH_P_IP))
+		return XDP_PASS;
+
+	return handle_ipv4(xdp);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp_adjust_tail_user.c b/samples/bpf/xdp_adjust_tail_user.c
new file mode 100644
index 000000000000..3042ce37dae8
--- /dev/null
+++ b/samples/bpf/xdp_adjust_tail_user.c
@@ -0,0 +1,150 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (c) 2018 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/bpf.h>
+#include <linux/if_link.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/resource.h>
+#include <arpa/inet.h>
+#include <netinet/ether.h>
+#include <unistd.h>
+#include <time.h>
+#include "bpf/bpf.h"
+#include "bpf/libbpf.h"
+
+#define STATS_INTERVAL_S 2U
+
+static int ifindex = -1;
+static __u32 xdp_flags;
+
+/*
+ * Signal handler: if an interface index was set, call bpf_set_link_xdp_fd()
+ * with fd -1 on it, then exit with status 0.
+ */
+static void int_exit(int sig)
+{
+	if (ifindex >= 0)
+		bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
+
+	exit(0);
+}
+
+/* Simple "icmp packet too big sent" counter.
+ *
+ * Every STATS_INTERVAL_S seconds, read slot 0 of the map behind @map_fd and
+ * print it.  Runs forever when @kill_after_s is 0, otherwise until more than
+ * @kill_after_s seconds have elapsed since the call.
+ */
+static void poll_stats(unsigned int map_fd, unsigned int kill_after_s)
+{
+	time_t started_at = time(NULL);
+	__u64 value = 0;
+	int key = 0;
+	int err;
+
+	while (!kill_after_s || time(NULL) - started_at <= kill_after_s) {
+		sleep(STATS_INTERVAL_S);
+
+		/* Keep the lookup outside assert() so it survives -DNDEBUG */
+		err = bpf_map_lookup_elem(map_fd, &key, &value);
+		assert(err == 0);
+		(void)err;
+
+		printf("icmp \"packet too big\" sent: %10llu pkts\n", value);
+	}
+}
+
+static void usage(const char *cmd)
+{ /* Print a short description and the option summary to stdout; cmd is shown as the program name. */
+ printf("Start a XDP prog which send ICMP \"packet too big\" \n"
+ "messages if ingress packet is bigger then MAX_SIZE bytes\n");
+ printf("Usage: %s [...]\n", cmd);
+ printf(" -i <ifindex> Interface Index\n");
+ printf(" -T <stop-after-X-seconds> Default: 0 (forever)\n");
+ printf(" -S use skb-mode\n");
+ printf(" -N enforce native mode\n");
+ printf(" -h Display this help\n");
+}
+
+int main(int argc, char **argv)
+{ /* Parse options, load <argv[0]>_kern.o, attach it to the given ifindex, poll the counter map, then detach. */
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ struct bpf_prog_load_attr prog_load_attr = {
+ .prog_type = BPF_PROG_TYPE_XDP,
+ };
+ unsigned char opt_flags[256] = {};
+ unsigned int kill_after_s = 0;
+ const char *optstr = "i:T:SNh";
+ int i, prog_fd, map_fd, opt;
+ struct bpf_object *obj;
+ struct bpf_map *map;
+ char filename[256];
+
+ for (i = 0; i < strlen(optstr); i++) /* flag every lower-case option except 'h' as mandatory (here only 'i') */
+ if (optstr[i] != 'h' && 'a' <= optstr[i] && optstr[i] <= 'z')
+ opt_flags[(unsigned char)optstr[i]] = 1;
+
+ while ((opt = getopt(argc, argv, optstr)) != -1) {
+
+ switch (opt) {
+ case 'i':
+ ifindex = atoi(optarg);
+ break;
+ case 'T':
+ kill_after_s = atoi(optarg);
+ break;
+ case 'S':
+ xdp_flags |= XDP_FLAGS_SKB_MODE;
+ break;
+ case 'N':
+ xdp_flags |= XDP_FLAGS_DRV_MODE;
+ break;
+ default:
+ usage(argv[0]); /* also reached for -h and for unknown options */
+ return 1;
+ }
+ opt_flags[opt] = 0; /* option seen: no longer counted as missing */
+ }
+
+ for (i = 0; i < strlen(optstr); i++) {
+ if (opt_flags[(unsigned int)optstr[i]]) { /* a mandatory option was never supplied */
+ fprintf(stderr, "Missing argument -%c\n", optstr[i]);
+ usage(argv[0]);
+ return 1;
+ }
+ }
+
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)");
+ return 1;
+ }
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); /* object file name is derived from the program's own name */
+ prog_load_attr.file = filename;
+
+ if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
+ return 1;
+
+ map = bpf_map__next(NULL, obj); /* first map in the object */
+ if (!map) {
+ printf("finding a map in obj file failed\n");
+ return 1;
+ }
+ map_fd = bpf_map__fd(map);
+
+ if (!prog_fd) { /* NOTE(review): check and message look like leftovers from the load_bpf_file()-based samples; confirm they can still trigger */
+ printf("load_bpf_file: %s\n", strerror(errno));
+ return 1;
+ }
+
+ signal(SIGINT, int_exit); /* int_exit() detaches the program before exiting */
+ signal(SIGTERM, int_exit);
+
+ if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) {
+ printf("link set xdp fd failed\n");
+ return 1;
+ }
+
+ poll_stats(map_fd, kill_after_s); /* returns only when a -T timeout was given */
+
+ bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); /* fd -1: detach on the normal exit path too */
+
+ return 0;
+}
diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c
new file mode 100644
index 000000000000..6673cdb9f55c
--- /dev/null
+++ b/samples/bpf/xdp_fwd_kern.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017-18 David Ahern <dsahern@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+
+#include "bpf_helpers.h"
+
+#define IPV6_FLOWINFO_MASK cpu_to_be32(0x0FFFFFFF)
+
+struct bpf_map_def SEC("maps") tx_port = {
+ .type = BPF_MAP_TYPE_DEVMAP,
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .max_entries = 64, /* the user-space loader fills keys 1..63 with key == value */
+};
+
+/* from include/net/ip.h */
+static __always_inline int ip_decrease_ttl(struct iphdr *iph) /* decrement the TTL, patch the header checksum to match, return the new TTL */
+{
+ u32 check = (__force u32)iph->check;
+
+ check += (__force u32)htons(0x0100); /* TTL is the high byte of its 16-bit header word, so TTL-1 is compensated by adding 0x0100 to the checksum */
+ iph->check = (__force __sum16)(check + (check >= 0xFFFF)); /* fold the end-around carry back in */
+ return --iph->ttl;
+}
+
+static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags) /* FIB-lookup based forwarding for IPv4/IPv6; flags is handed to bpf_fib_lookup() unchanged */
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct bpf_fib_lookup fib_params;
+ struct ethhdr *eth = data;
+ struct ipv6hdr *ip6h;
+ struct iphdr *iph;
+ int out_index;
+ u16 h_proto;
+ u64 nh_off;
+
+ nh_off = sizeof(*eth); /* offset of the network header */
+ if (data + nh_off > data_end)
+ return XDP_DROP;
+
+ __builtin_memset(&fib_params, 0, sizeof(fib_params)); /* start from all-zero lookup parameters */
+
+ h_proto = eth->h_proto;
+ if (h_proto == htons(ETH_P_IP)) {
+ iph = data + nh_off;
+
+ if (iph + 1 > data_end)
+ return XDP_DROP;
+
+ if (iph->ttl <= 1) /* TTL would not survive the decrement: not forwarded here */
+ return XDP_PASS;
+
+ fib_params.family = AF_INET;
+ fib_params.tos = iph->tos;
+ fib_params.l4_protocol = iph->protocol;
+ fib_params.sport = 0;
+ fib_params.dport = 0;
+ fib_params.tot_len = ntohs(iph->tot_len);
+ fib_params.ipv4_src = iph->saddr;
+ fib_params.ipv4_dst = iph->daddr;
+ } else if (h_proto == htons(ETH_P_IPV6)) {
+ struct in6_addr *src = (struct in6_addr *) fib_params.ipv6_src; /* typed views of the address fields, used for the struct copies below */
+ struct in6_addr *dst = (struct in6_addr *) fib_params.ipv6_dst;
+
+ ip6h = data + nh_off;
+ if (ip6h + 1 > data_end)
+ return XDP_DROP;
+
+ if (ip6h->hop_limit <= 1) /* same rule as the IPv4 TTL check */
+ return XDP_PASS;
+
+ fib_params.family = AF_INET6;
+ fib_params.flowinfo = *(__be32 *)ip6h & IPV6_FLOWINFO_MASK; /* first 32-bit word of the header, masked with IPV6_FLOWINFO_MASK */
+ fib_params.l4_protocol = ip6h->nexthdr;
+ fib_params.sport = 0;
+ fib_params.dport = 0;
+ fib_params.tot_len = ntohs(ip6h->payload_len);
+ *src = ip6h->saddr;
+ *dst = ip6h->daddr;
+ } else {
+ return XDP_PASS; /* not IPv4/IPv6: not handled here */
+ }
+
+ fib_params.ifindex = ctx->ingress_ifindex;
+
+ out_index = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags);
+
+ /* verify egress index has xdp support
+ * TO-DO bpf_map_lookup_elem(&tx_port, &key) fails with
+ * cannot pass map_type 14 into func bpf_map_lookup_elem#1:
+ * NOTE: without verification that egress index supports XDP
+ * forwarding packets are dropped.
+ */
+ if (out_index > 0) { /* a positive result is used below as the tx_port key */
+ if (h_proto == htons(ETH_P_IP))
+ ip_decrease_ttl(iph);
+ else if (h_proto == htons(ETH_P_IPV6))
+ ip6h->hop_limit--;
+
+ memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN); /* rewrite the MAC addresses from the fields the lookup filled in */
+ memcpy(eth->h_source, fib_params.smac, ETH_ALEN);
+ return bpf_redirect_map(&tx_port, out_index, 0);
+ }
+
+ return XDP_PASS;
+}
+
+SEC("xdp_fwd")
+int xdp_fwd_prog(struct xdp_md *ctx)
+{
+ return xdp_fwd_flags(ctx, 0); /* forwarding with no lookup flags set */
+}
+
+SEC("xdp_fwd_direct")
+int xdp_fwd_direct_prog(struct xdp_md *ctx)
+{
+ return xdp_fwd_flags(ctx, BPF_FIB_LOOKUP_DIRECT); /* presumably the program the loader's -D option selects ("direct table lookups") - confirm section order */
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp_fwd_user.c b/samples/bpf/xdp_fwd_user.c
new file mode 100644
index 000000000000..a87a2048ed32
--- /dev/null
+++ b/samples/bpf/xdp_fwd_user.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017-18 David Ahern <dsahern@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/bpf.h>
+#include <linux/if_link.h>
+#include <linux/limits.h>
+#include <net/if.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <libgen.h>
+
+#include "bpf_load.h"
+#include "bpf_util.h"
+#include <bpf/bpf.h>
+
+
+/*
+ * Install program @fd on interface index @idx; @name is used only in the
+ * error message.  Returns the bpf_set_link_xdp_fd() result.
+ */
+static int do_attach(int idx, int fd, const char *name)
+{
+	int rc = bpf_set_link_xdp_fd(idx, fd, 0);
+
+	if (rc < 0)
+		printf("ERROR: failed to attach program to %s\n", name);
+	return rc;
+}
+
+/*
+ * Call bpf_set_link_xdp_fd() with fd -1 on interface index @idx; @name is
+ * used only in the error message.  Returns that call's result.
+ */
+static int do_detach(int idx, const char *name)
+{
+	int rc = bpf_set_link_xdp_fd(idx, -1, 0);
+
+	if (rc < 0)
+		printf("ERROR: failed to detach program from %s\n", name);
+	return rc;
+}
+
+static void usage(const char *prog)
+{ /* Print the command-line synopsis to stderr; prog is shown as the program name. */
+ fprintf(stderr,
+ "usage: %s [OPTS] interface-list\n"
+ "\nOPTS:\n"
+ " -d detach program\n"
+ " -D direct table lookups (skip fib rules)\n",
+ prog);
+}
+
+/*
+ * Attach the forwarding program to (or, with -d, detach it from) every
+ * interface given on the command line, by name or by index.
+ *
+ * Returns 1 on usage, load or argument errors; otherwise 0, or the last
+ * non-zero attach/detach result.
+ */
+int main(int argc, char **argv)
+{
+	char filename[PATH_MAX];
+	int opt, i, idx, err;
+	int prog_id = 0;
+	int attach = 1;
+	int ret = 0;
+
+	while ((opt = getopt(argc, argv, ":dD")) != -1) {
+		switch (opt) {
+		case 'd':
+			attach = 0;
+			break;
+		case 'D':
+			/* use prog_fd[1] instead of prog_fd[0] */
+			prog_id = 1;
+			break;
+		default:
+			usage(basename(argv[0]));
+			return 1;
+		}
+	}
+
+	if (optind == argc) {
+		usage(basename(argv[0]));
+		return 1;
+	}
+
+	if (attach) {
+		snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+		/*
+		 * access() takes F_OK/R_OK/W_OK/X_OK, not open(2) flags:
+		 * O_RDONLY has the same value as F_OK on Linux and so only
+		 * tested that the file exists.
+		 */
+		if (access(filename, R_OK) < 0) {
+			printf("error accessing file %s: %s\n",
+				filename, strerror(errno));
+			return 1;
+		}
+
+		if (load_bpf_file(filename)) {
+			printf("%s", bpf_log_buf);
+			return 1;
+		}
+
+		if (!prog_fd[prog_id]) {
+			printf("load_bpf_file: %s\n", strerror(errno));
+			return 1;
+		}
+	}
+	if (attach) {
+		/* populate map slots 1..63 with key == value */
+		for (i = 1; i < 64; ++i)
+			bpf_map_update_elem(map_fd[0], &i, &i, 0);
+	}
+
+	for (i = optind; i < argc; ++i) {
+		/* accept an interface name first, then a numeric index */
+		idx = if_nametoindex(argv[i]);
+		if (!idx)
+			idx = strtoul(argv[i], NULL, 0);
+
+		if (!idx) {
+			fprintf(stderr, "Invalid arg\n");
+			return 1;
+		}
+		if (!attach) {
+			err = do_detach(idx, argv[i]);
+			if (err)
+				ret = err;
+		} else {
+			err = do_attach(idx, prog_fd[prog_id], argv[i]);
+			if (err)
+				ret = err;
+		}
+	}
+
+	return ret;
+}
diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c
index 211db8ded0de..ad10fe700d7d 100644
--- a/samples/bpf/xdp_monitor_kern.c
+++ b/samples/bpf/xdp_monitor_kern.c
@@ -125,6 +125,7 @@ struct datarec {
u64 processed;
u64 dropped;
u64 info;
+ u64 err;
};
#define MAX_CPUS 64
@@ -208,3 +209,51 @@ int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
return 0;
}
+
+/* Single-entry per-CPU array of struct datarec counters, updated by
+ * trace_xdp_devmap_xmit() on every xdp_devmap_xmit tracepoint event.
+ */
+struct bpf_map_def SEC("maps") devmap_xmit_cnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(struct datarec),
+ .max_entries = 1,
+};
+
+/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_devmap_xmit/format
+ * Code in: kernel/include/trace/events/xdp.h
+ *
+ * The per-field comments give the offset/size/signedness each member is
+ * expected to have; keep the member order and types in step with them.
+ */
+struct devmap_xmit_ctx {
+ u64 __pad; // First 8 bytes are not accessible by bpf code
+ int map_id; // offset:8; size:4; signed:1;
+ u32 act; // offset:12; size:4; signed:0;
+ u32 map_index; // offset:16; size:4; signed:0;
+ int drops; // offset:20; size:4; signed:1;
+ int sent; // offset:24; size:4; signed:1;
+ int from_ifindex; // offset:28; size:4; signed:1;
+ int to_ifindex; // offset:32; size:4; signed:1;
+ int err; // offset:36; size:4; signed:1;
+};
+
+/* Fold one xdp_devmap_xmit tracepoint event into devmap_xmit_cnt[0]. */
+SEC("tracepoint/xdp/xdp_devmap_xmit")
+int trace_xdp_devmap_xmit(struct devmap_xmit_ctx *ctx)
+{
+	u32 slot = 0;
+	struct datarec *stats = bpf_map_lookup_elem(&devmap_xmit_cnt, &slot);
+
+	if (!stats)
+		return 0;
+
+	stats->processed += ctx->sent;
+	stats->dropped += ctx->drops;
+
+	/* One event per bulk; userspace divides to get the average bulk size */
+	stats->info++;
+
+	/* Error case: no frames were sent */
+	if (ctx->err)
+		stats->err += 1;
+
+	/* API error: the driver's ndo_xdp_xmit sent more than count */
+	if (ctx->drops < 0)
+		stats->err += 1;
+
+	return 1;
+}
diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c
index eec14520d513..dd558cbb2309 100644
--- a/samples/bpf/xdp_monitor_user.c
+++ b/samples/bpf/xdp_monitor_user.c
@@ -26,7 +26,7 @@ static const char *__doc_err_only__=
#include <net/if.h>
#include <time.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
#include "bpf_load.h"
#include "bpf_util.h"
@@ -58,7 +58,7 @@ static void usage(char *argv[])
printf(" flag (internal value:%d)",
*long_options[i].flag);
else
- printf("(internal short-option: -%c)",
+ printf("short-option: -%c",
long_options[i].val);
printf("\n");
}
@@ -117,6 +117,7 @@ struct datarec {
__u64 processed;
__u64 dropped;
__u64 info;
+ __u64 err;
};
#define MAX_CPUS 64
@@ -141,6 +142,7 @@ struct stats_record {
struct record_u64 xdp_exception[XDP_ACTION_MAX];
struct record xdp_cpumap_kthread;
struct record xdp_cpumap_enqueue[MAX_CPUS];
+ struct record xdp_devmap_xmit;
};
static bool map_collect_record(int fd, __u32 key, struct record *rec)
@@ -151,6 +153,7 @@ static bool map_collect_record(int fd, __u32 key, struct record *rec)
__u64 sum_processed = 0;
__u64 sum_dropped = 0;
__u64 sum_info = 0;
+ __u64 sum_err = 0;
int i;
if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
@@ -169,10 +172,13 @@ static bool map_collect_record(int fd, __u32 key, struct record *rec)
sum_dropped += values[i].dropped;
rec->cpu[i].info = values[i].info;
sum_info += values[i].info;
+ rec->cpu[i].err = values[i].err;
+ sum_err += values[i].err;
}
rec->total.processed = sum_processed;
rec->total.dropped = sum_dropped;
rec->total.info = sum_info;
+ rec->total.err = sum_err;
return true;
}
@@ -273,6 +279,18 @@ static double calc_info(struct datarec *r, struct datarec *p, double period)
return pps;
}
+/*
+ * Rate of error events per unit of @period between the previous sample @p
+ * and the current sample @r.  Yields 0 unless @period is greater than 0.
+ */
+static double calc_err(struct datarec *r, struct datarec *p, double period)
+{
+	__u64 delta;
+
+	if (!(period > 0))
+		return 0;
+
+	delta = r->err - p->err;
+	return delta / period;
+}
+
static void stats_print(struct stats_record *stats_rec,
struct stats_record *stats_prev,
bool err_only)
@@ -330,7 +348,7 @@ static void stats_print(struct stats_record *stats_rec,
pps = calc_pps_u64(r, p, t);
if (pps > 0)
printf(fmt1, "Exception", i,
- 0.0, pps, err2str(rec_i));
+ 0.0, pps, action2str(rec_i));
}
pps = calc_pps_u64(&rec->total, &prev->total, t);
if (pps > 0)
@@ -397,7 +415,7 @@ static void stats_print(struct stats_record *stats_rec,
info = calc_info(r, p, t);
if (info > 0)
i_str = "sched";
- if (pps > 0)
+ if (pps > 0 || drop > 0)
printf(fmt1, "cpumap-kthread",
i, pps, drop, info, i_str);
}
@@ -409,6 +427,50 @@ static void stats_print(struct stats_record *stats_rec,
printf(fmt2, "cpumap-kthread", "total", pps, drop, info, i_str);
}
+	/* devmap ndo_xdp_xmit stats: one row per CPU with traffic, then a total */
+	{
+		char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.2f %s %s\n";
+		char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.2f %s %s\n";
+		struct record *rec, *prev;
+		double drop, info, err;
+		char *i_str = "";
+		char *err_str = "";
+
+		rec = &stats_rec->xdp_devmap_xmit;
+		prev = &stats_prev->xdp_devmap_xmit;
+		t = calc_period(rec, prev);
+		for (i = 0; i < nr_cpus; i++) {
+			struct datarec *r = &rec->cpu[i];
+			struct datarec *p = &prev->cpu[i];
+
+			/*
+			 * The labels describe this row only; reset them so a
+			 * "drv-err"/"bulk-average" tag set for an earlier CPU
+			 * is not printed for a CPU it does not apply to.
+			 */
+			i_str = "";
+			err_str = "";
+
+			pps = calc_pps(r, p, t);
+			drop = calc_drop(r, p, t);
+			info = calc_info(r, p, t);
+			err = calc_err(r, p, t);
+			if (info > 0) {
+				i_str = "bulk-average";
+				info = (pps+drop) / info; /* calc avg bulk */
+			}
+			if (err > 0)
+				err_str = "drv-err";
+			if (pps > 0 || drop > 0)
+				printf(fmt1, "devmap-xmit",
+				       i, pps, drop, info, i_str, err_str);
+		}
+
+		/* The total row derives its labels from the summed counters */
+		i_str = "";
+		err_str = "";
+
+		pps = calc_pps(&rec->total, &prev->total, t);
+		drop = calc_drop(&rec->total, &prev->total, t);
+		info = calc_info(&rec->total, &prev->total, t);
+		err = calc_err(&rec->total, &prev->total, t);
+		if (info > 0) {
+			i_str = "bulk-average";
+			info = (pps+drop) / info; /* calc avg bulk */
+		}
+		if (err > 0)
+			err_str = "drv-err";
+		printf(fmt2, "devmap-xmit", "total", pps, drop,
+		       info, i_str, err_str);
+	}
+
printf("\n");
}
@@ -437,6 +499,9 @@ static bool stats_collect(struct stats_record *rec)
fd = map_data[3].fd; /* map3: cpumap_kthread_cnt */
map_collect_record(fd, 0, &rec->xdp_cpumap_kthread);
+ fd = map_data[4].fd; /* map4: devmap_xmit_cnt */
+ map_collect_record(fd, 0, &rec->xdp_devmap_xmit);
+
return true;
}
@@ -480,6 +545,7 @@ static struct stats_record *alloc_stats_record(void)
rec_sz = sizeof(struct datarec);
rec->xdp_cpumap_kthread.cpu = alloc_rec_per_cpu(rec_sz);
+ rec->xdp_devmap_xmit.cpu = alloc_rec_per_cpu(rec_sz);
for (i = 0; i < MAX_CPUS; i++)
rec->xdp_cpumap_enqueue[i].cpu = alloc_rec_per_cpu(rec_sz);
@@ -498,6 +564,7 @@ static void free_stats_record(struct stats_record *r)
free(r->xdp_exception[i].cpu);
free(r->xdp_cpumap_kthread.cpu);
+ free(r->xdp_devmap_xmit.cpu);
for (i = 0; i < MAX_CPUS; i++)
free(r->xdp_cpumap_enqueue[i].cpu);
@@ -594,7 +661,7 @@ int main(int argc, char **argv)
snprintf(bpf_obj_file, sizeof(bpf_obj_file), "%s_kern.o", argv[0]);
/* Parse commands line args */
- while ((opt = getopt_long(argc, argv, "h",
+ while ((opt = getopt_long(argc, argv, "hDSs:",
long_options, &longindex)) != -1) {
switch (opt) {
case 'D':
diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c
index 23744a8aaf21..f6efaefd485b 100644
--- a/samples/bpf/xdp_redirect_cpu_user.c
+++ b/samples/bpf/xdp_redirect_cpu_user.c
@@ -28,7 +28,7 @@ static const char *__doc__ =
* use bpf/libbpf.h), but cannot as (currently) needed for XDP
* attaching to a device via bpf_set_link_xdp_fd()
*/
-#include "libbpf.h"
+#include <bpf/bpf.h>
#include "bpf_load.h"
#include "bpf_util.h"
diff --git a/samples/bpf/xdp_redirect_map_user.c b/samples/bpf/xdp_redirect_map_user.c
index 7eae07d7293e..4445e76854b5 100644
--- a/samples/bpf/xdp_redirect_map_user.c
+++ b/samples/bpf/xdp_redirect_map_user.c
@@ -24,7 +24,7 @@
#include "bpf_load.h"
#include "bpf_util.h"
-#include "libbpf.h"
+#include <bpf/bpf.h>
static int ifindex_in;
static int ifindex_out;
diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c
index b701b5c21342..81a69e36cb78 100644
--- a/samples/bpf/xdp_redirect_user.c
+++ b/samples/bpf/xdp_redirect_user.c
@@ -24,7 +24,7 @@
#include "bpf_load.h"
#include "bpf_util.h"
-#include "libbpf.h"
+#include <bpf/bpf.h>
static int ifindex_in;
static int ifindex_out;
diff --git a/samples/bpf/xdp_router_ipv4_user.c b/samples/bpf/xdp_router_ipv4_user.c
index 6296741c1fbd..b2b4dfa776c8 100644
--- a/samples/bpf/xdp_router_ipv4_user.c
+++ b/samples/bpf/xdp_router_ipv4_user.c
@@ -16,7 +16,7 @@
#include <sys/socket.h>
#include <unistd.h>
#include "bpf_load.h"
-#include "libbpf.h"
+#include <bpf/bpf.h>
#include <arpa/inet.h>
#include <fcntl.h>
#include <poll.h>
diff --git a/samples/bpf/xdp_rxq_info_user.c b/samples/bpf/xdp_rxq_info_user.c
index 478d95412de4..e4e9ba52bff0 100644
--- a/samples/bpf/xdp_rxq_info_user.c
+++ b/samples/bpf/xdp_rxq_info_user.c
@@ -22,8 +22,8 @@ static const char *__doc__ = " XDP RX-queue info extract example\n\n"
#include <arpa/inet.h>
#include <linux/if_link.h>
-#include "libbpf.h"
-#include "bpf_load.h"
+#include "bpf/bpf.h"
+#include "bpf/libbpf.h"
#include "bpf_util.h"
static int ifindex = -1;
@@ -32,6 +32,9 @@ static char *ifname;
static __u32 xdp_flags;
+static struct bpf_map *stats_global_map;
+static struct bpf_map *rx_queue_index_map;
+
/* Exit return codes */
#define EXIT_OK 0
#define EXIT_FAIL 1
@@ -174,7 +177,7 @@ static struct datarec *alloc_record_per_cpu(void)
static struct record *alloc_record_per_rxq(void)
{
- unsigned int nr_rxqs = map_data[2].def.max_entries;
+ unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries;
struct record *array;
size_t size;
@@ -190,7 +193,7 @@ static struct record *alloc_record_per_rxq(void)
static struct stats_record *alloc_stats_record(void)
{
- unsigned int nr_rxqs = map_data[2].def.max_entries;
+ unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries;
struct stats_record *rec;
int i;
@@ -210,7 +213,7 @@ static struct stats_record *alloc_stats_record(void)
static void free_stats_record(struct stats_record *r)
{
- unsigned int nr_rxqs = map_data[2].def.max_entries;
+ unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries;
int i;
for (i = 0; i < nr_rxqs; i++)
@@ -254,11 +257,11 @@ static void stats_collect(struct stats_record *rec)
{
int fd, i, max_rxqs;
- fd = map_data[1].fd; /* map: stats_global_map */
+ fd = bpf_map__fd(stats_global_map);
map_collect_percpu(fd, 0, &rec->stats);
- fd = map_data[2].fd; /* map: rx_queue_index_map */
- max_rxqs = map_data[2].def.max_entries;
+ fd = bpf_map__fd(rx_queue_index_map);
+ max_rxqs = bpf_map__def(rx_queue_index_map)->max_entries;
for (i = 0; i < max_rxqs; i++)
map_collect_percpu(fd, i, &rec->rxq[i]);
}
@@ -304,8 +307,8 @@ static void stats_print(struct stats_record *stats_rec,
struct stats_record *stats_prev,
int action)
{
+ unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries;
unsigned int nr_cpus = bpf_num_possible_cpus();
- unsigned int nr_rxqs = map_data[2].def.max_entries;
double pps = 0, err = 0;
struct record *rec, *prev;
double t;
@@ -419,31 +422,44 @@ static void stats_poll(int interval, int action)
int main(int argc, char **argv)
{
struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY};
+ struct bpf_prog_load_attr prog_load_attr = {
+ .prog_type = BPF_PROG_TYPE_XDP,
+ };
+ int prog_fd, map_fd, opt, err;
bool use_separators = true;
struct config cfg = { 0 };
+ struct bpf_object *obj;
+ struct bpf_map *map;
char filename[256];
int longindex = 0;
int interval = 2;
__u32 key = 0;
- int opt, err;
char action_str_buf[XDP_ACTION_MAX_STRLEN + 1 /* for \0 */] = { 0 };
int action = XDP_PASS; /* Default action */
char *action_str = NULL;
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ prog_load_attr.file = filename;
if (setrlimit(RLIMIT_MEMLOCK, &r)) {
perror("setrlimit(RLIMIT_MEMLOCK)");
return 1;
}
- if (load_bpf_file(filename)) {
- fprintf(stderr, "ERR in load_bpf_file(): %s", bpf_log_buf);
+ if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
+ return EXIT_FAIL;
+
+ map = bpf_map__next(NULL, obj);
+ stats_global_map = bpf_map__next(map, obj);
+ rx_queue_index_map = bpf_map__next(stats_global_map, obj);
+ if (!map || !stats_global_map || !rx_queue_index_map) {
+ printf("finding a map in obj file failed\n");
return EXIT_FAIL;
}
+ map_fd = bpf_map__fd(map);
- if (!prog_fd[0]) {
+ if (!prog_fd) {
fprintf(stderr, "ERR: load_bpf_file: %s\n", strerror(errno));
return EXIT_FAIL;
}
@@ -512,7 +528,7 @@ int main(int argc, char **argv)
setlocale(LC_NUMERIC, "en_US");
/* User-side setup ifindex in config_map */
- err = bpf_map_update_elem(map_fd[0], &key, &cfg, 0);
+ err = bpf_map_update_elem(map_fd, &key, &cfg, 0);
if (err) {
fprintf(stderr, "Store config failed (err:%d)\n", err);
exit(EXIT_FAIL_BPF);
@@ -521,7 +537,7 @@ int main(int argc, char **argv)
/* Remove XDP program when program is interrupted */
signal(SIGINT, int_exit);
- if (bpf_set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) {
+ if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) {
fprintf(stderr, "link set xdp fd failed\n");
return EXIT_FAIL_XDP;
}
diff --git a/samples/bpf/xdp_tx_iptunnel_user.c b/samples/bpf/xdp_tx_iptunnel_user.c
index f0a787268a87..a4ccc33adac0 100644
--- a/samples/bpf/xdp_tx_iptunnel_user.c
+++ b/samples/bpf/xdp_tx_iptunnel_user.c
@@ -18,7 +18,7 @@
#include <unistd.h>
#include <time.h>
#include "bpf_load.h"
-#include "libbpf.h"
+#include <bpf/bpf.h>
#include "bpf_util.h"
#include "xdp_tx_iptunnel_common.h"
diff --git a/samples/bpf/xdpsock.h b/samples/bpf/xdpsock.h
new file mode 100644
index 000000000000..533ab81adfa1
--- /dev/null
+++ b/samples/bpf/xdpsock.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef XDPSOCK_H_
+#define XDPSOCK_H_
+
+/* Power-of-2 number of sockets.  xdpsock_kern.c wraps its round-robin
+ * index with "& (MAX_SOCKS - 1)", which is why it must be a power of two.
+ */
+#define MAX_SOCKS 4
+
+/* Round-robin receive.  Tested with "#if RR_LB": non-zero makes the sample
+ * create MAX_SOCKS sockets and spread packets over them; 0 uses socket 0 only.
+ */
+#define RR_LB 0
+
+#endif /* XDPSOCK_H_ */
diff --git a/samples/bpf/xdpsock_kern.c b/samples/bpf/xdpsock_kern.c
new file mode 100644
index 000000000000..d8806c41362e
--- /dev/null
+++ b/samples/bpf/xdpsock_kern.c
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+#include "xdpsock.h"
+
+/* One-entry array; slot 0 holds the queue id that xdp_sock_prog() compares
+ * against ctx->rx_queue_index.
+ */
+struct bpf_map_def SEC("maps") qidconf_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .max_entries = 1,
+};
+
+/* Redirect targets for bpf_redirect_map() in xdp_sock_prog().  Sized with
+ * MAX_SOCKS (xdpsock.h) because the program masks its index with
+ * MAX_SOCKS - 1, so map size and index range cannot drift apart.
+ */
+struct bpf_map_def SEC("maps") xsks_map = {
+	.type = BPF_MAP_TYPE_XSKMAP,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = MAX_SOCKS,
+};
+
+/* Per-CPU round-robin cursor; only read and written by xdp_sock_prog()
+ * when RR_LB is non-zero.
+ */
+struct bpf_map_def SEC("maps") rr_map = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(unsigned int),
+ .max_entries = 1,
+};
+
+/*
+ * XDP entry point.  Packets whose rx_queue_index differs from the value
+ * stored in qidconf_map[0] are passed on (XDP_PASS); all others are
+ * redirected through xsks_map, to index 0 or - with RR_LB - to the next
+ * index in round-robin order.  A failed map lookup yields XDP_ABORTED.
+ */
+SEC("xdp_sock")
+int xdp_sock_prog(struct xdp_md *ctx)
+{
+	int *qidconf, key = 0, idx;
+#if RR_LB /* only the round-robin path needs the cursor */
+	unsigned int *rr;
+#endif
+
+	qidconf = bpf_map_lookup_elem(&qidconf_map, &key);
+	if (!qidconf)
+		return XDP_ABORTED;
+
+	if (*qidconf != ctx->rx_queue_index)
+		return XDP_PASS;
+
+#if RR_LB /* NB! RR_LB is configured in xdpsock.h */
+	rr = bpf_map_lookup_elem(&rr_map, &key);
+	if (!rr)
+		return XDP_ABORTED;
+
+	/* MAX_SOCKS is a power of two, so the mask wraps the cursor */
+	*rr = (*rr + 1) & (MAX_SOCKS - 1);
+	idx = *rr;
+#else
+	idx = 0;
+#endif
+
+	return bpf_redirect_map(&xsks_map, idx, 0);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
new file mode 100644
index 000000000000..d69c8d78d3fd
--- /dev/null
+++ b/samples/bpf/xdpsock_user.c
@@ -0,0 +1,962 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2017 - 2018 Intel Corporation. */
+
+#include <assert.h>
+#include <errno.h>
+#include <getopt.h>
+#include <libgen.h>
+#include <linux/bpf.h>
+#include <linux/if_link.h>
+#include <linux/if_xdp.h>
+#include <linux/if_ether.h>
+#include <net/if.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <net/ethernet.h>
+#include <sys/resource.h>
+#include <sys/socket.h>
+#include <sys/mman.h>
+#include <time.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <locale.h>
+#include <sys/types.h>
+#include <poll.h>
+
+#include "bpf_load.h"
+#include "bpf_util.h"
+#include <bpf/bpf.h>
+
+#include "xdpsock.h"
+
+#ifndef SOL_XDP
+#define SOL_XDP 283
+#endif
+
+#ifndef AF_XDP
+#define AF_XDP 44
+#endif
+
+#ifndef PF_XDP
+#define PF_XDP AF_XDP
+#endif
+
+#define NUM_FRAMES 131072
+#define FRAME_HEADROOM 0
+#define FRAME_SHIFT 11
+#define FRAME_SIZE 2048
+#define NUM_DESCS 1024
+#define BATCH_SIZE 16
+
+#define FQ_NUM_DESCS 1024
+#define CQ_NUM_DESCS 1024
+
+#define DEBUG_HEXDUMP 0
+
+typedef __u64 u64;
+typedef __u32 u32;
+
+/* get_nsecs() timestamp taken at the previous dump_stats() call */
+static unsigned long prev_time;
+
+/* Benchmark mode, selected with -r / -t / -l */
+enum benchmark_type {
+ BENCH_RXDROP = 0,
+ BENCH_TXONLY = 1,
+ BENCH_L2FWD = 2,
+};
+
+/* Settings filled in by parse_command_line() */
+static enum benchmark_type opt_bench = BENCH_RXDROP;
+static u32 opt_xdp_flags;
+static const char *opt_if = "";
+static int opt_ifindex;
+static int opt_queue;
+static int opt_poll;
+static int opt_shared_packet_buffer;
+static int opt_interval = 1;
+static u32 opt_xdp_bind_flags;
+
+/* Userspace view of one umem ring (fill or completion) of u64 addresses */
+struct xdp_umem_uqueue {
+ u32 cached_prod; /* local copy of the producer index */
+ u32 cached_cons; /* local consumer index; umem_nb_free() stores consumer + size here */
+ u32 mask; /* ring size - 1, used to wrap indices */
+ u32 size; /* number of ring entries */
+ u32 *producer; /* shared producer index inside the mmap()ed area */
+ u32 *consumer; /* shared consumer index inside the mmap()ed area */
+ u64 *ring; /* address entries inside the mmap()ed area */
+ void *map; /* base of the mmap()ed area */
+};
+
+/* Packet buffer area plus its fill and completion rings */
+struct xdp_umem {
+ char *frames; /* start of the packet buffer area */
+ struct xdp_umem_uqueue fq; /* fill ring */
+ struct xdp_umem_uqueue cq; /* completion ring */
+ int fd; /* socket the umem was registered on */
+};
+
+/* Userspace view of one socket ring (RX or TX) of struct xdp_desc entries */
+struct xdp_uqueue {
+ u32 cached_prod; /* local copy of the producer index */
+ u32 cached_cons; /* local consumer index; xq_nb_free() stores consumer + size here */
+ u32 mask; /* ring size - 1, used to wrap indices */
+ u32 size; /* number of ring entries */
+ u32 *producer; /* shared producer index inside the mmap()ed area */
+ u32 *consumer; /* shared consumer index inside the mmap()ed area */
+ struct xdp_desc *ring; /* descriptor entries inside the mmap()ed area */
+ void *map; /* base of the mmap()ed area */
+};
+
+/* One AF_XDP socket with its rings, umem and packet counters */
+struct xdpsock {
+ struct xdp_uqueue rx;
+ struct xdp_uqueue tx;
+ int sfd; /* socket file descriptor */
+ struct xdp_umem *umem; /* own umem, or the one shared with another socket */
+ u32 outstanding_tx; /* frames queued on TX and not yet seen on the completion ring */
+ unsigned long rx_npkts; /* running totals */
+ unsigned long tx_npkts;
+ unsigned long prev_rx_npkts; /* totals at the previous dump_stats() call */
+ unsigned long prev_tx_npkts;
+};
+
+/* MAX_SOCKS comes from xdpsock.h (included above), shared with the BPF side */
+static int num_socks;
+struct xdpsock *xsks[MAX_SOCKS];
+
+/* Current CLOCK_MONOTONIC reading, expressed in nanoseconds. */
+static unsigned long get_nsecs(void)
+{
+	struct timespec now;
+	unsigned long nsecs;
+
+	clock_gettime(CLOCK_MONOTONIC, &now);
+
+	nsecs = now.tv_sec * 1000000000UL;
+	nsecs += now.tv_nsec;
+	return nsecs;
+}
+
+static void dump_stats(void);
+
+/*
+ * Assertion that, on failure, reports the location, the stringified
+ * expression and the current errno on stderr, prints the statistics via
+ * dump_stats() and exits with EXIT_FAILURE.
+ */
+#define lassert(expr) \
+ do { \
+ if (!(expr)) { \
+ fprintf(stderr, "%s:%s:%i: Assertion failed: " \
+ #expr ": errno: %d/\"%s\"\n", \
+ __FILE__, __func__, __LINE__, \
+ errno, strerror(errno)); \
+ dump_stats(); \
+ exit(EXIT_FAILURE); \
+ } \
+ } while (0)
+
+/*
+ * barrier() is an empty asm statement with a "memory" clobber, i.e. it only
+ * constrains the compiler; u_smp_rmb()/u_smp_wmb() are plain aliases of it.
+ * NOTE(review): no CPU fence instruction is emitted - confirm this is
+ * sufficient on every architecture the sample is meant to run on.
+ */
+#define barrier() __asm__ __volatile__("": : :"memory")
+#define u_smp_rmb() barrier()
+#define u_smp_wmb() barrier()
+/* Branch-prediction hints */
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+
+/*
+ * Template frame, 60 bytes excluding the string's trailing NUL.  It is
+ * copied into umem frames by gen_eth_frame(), and xq_enq_tx_only() uses the
+ * same length for its TX descriptors.
+ * NOTE(review): the bytes look like Ethernet + IPv4 + UDP headers followed
+ * by payload (ethertype 0x0800, version/IHL 0x45, protocol 0x11) - confirm.
+ */
+static const char pkt_data[] =
+ "\x3c\xfd\xfe\x9e\x7f\x71\xec\xb1\xd7\x98\x3a\xc0\x08\x00\x45\x00"
+ "\x00\x2e\x00\x00\x00\x00\x40\x11\x88\x97\x05\x08\x07\x08\xc8\x14"
+ "\x1e\x04\x10\x92\x10\x92\x00\x1a\x6d\xa3\x34\x33\x1f\x69\x40\x6b"
+ "\x54\x59\xb6\x14\x2d\x11\x44\xbf\xaf\xd9\xbe\xaa";
+
+/*
+ * Free slots in umem ring @q as seen by the producer.  The shared consumer
+ * index is only re-read when the cached view shows fewer than @nb slots.
+ */
+static inline u32 umem_nb_free(struct xdp_umem_uqueue *q, u32 nb)
+{
+	u32 room = q->cached_cons - q->cached_prod;
+
+	if (room < nb) {
+		/* Refresh the local tail pointer */
+		q->cached_cons = *q->consumer + q->size;
+		room = q->cached_cons - q->cached_prod;
+	}
+
+	return room;
+}
+
+/*
+ * Free slots in descriptor ring @q as seen by the producer; the shared
+ * consumer index is consulted only when the cached view holds fewer than
+ * @ndescs slots.
+ */
+static inline u32 xq_nb_free(struct xdp_uqueue *q, u32 ndescs)
+{
+	u32 cached_room = q->cached_cons - q->cached_prod;
+
+	if (cached_room >= ndescs)
+		return cached_room;
+
+	/* Refresh the local tail pointer, then recompute */
+	q->cached_cons = q->size + *q->consumer;
+
+	return q->cached_cons - q->cached_prod;
+}
+
+/*
+ * Entries ready for consumption in umem ring @q, capped at @nb.  The shared
+ * producer index is re-read only when the cached view is empty.
+ */
+static inline u32 umem_nb_avail(struct xdp_umem_uqueue *q, u32 nb)
+{
+	u32 ready = q->cached_prod - q->cached_cons;
+
+	if (!ready) {
+		q->cached_prod = *q->producer;
+		ready = q->cached_prod - q->cached_cons;
+	}
+
+	if (ready > nb)
+		ready = nb;
+
+	return ready;
+}
+
+/*
+ * Entries ready for consumption in descriptor ring @q, capped at @ndescs.
+ * The shared producer index is re-read only when the cached view is empty.
+ */
+static inline u32 xq_nb_avail(struct xdp_uqueue *q, u32 ndescs)
+{
+	u32 pending = q->cached_prod - q->cached_cons;
+
+	if (pending == 0) {
+		q->cached_prod = *q->producer;
+		pending = q->cached_prod - q->cached_cons;
+	}
+
+	if (pending <= ndescs)
+		return pending;
+
+	return ndescs;
+}
+
+/*
+ * Post the addresses of the @nb descriptors in @d on fill ring @fq.
+ * Returns 0 on success, or -ENOSPC (nothing queued) when fewer than @nb
+ * slots are free.
+ */
+static inline int umem_fill_to_kernel_ex(struct xdp_umem_uqueue *fq,
+ struct xdp_desc *d,
+ size_t nb)
+{
+ u32 i;
+
+ if (umem_nb_free(fq, nb) < nb)
+ return -ENOSPC;
+
+ for (i = 0; i < nb; i++) {
+ u32 idx = fq->cached_prod++ & fq->mask;
+
+ fq->ring[idx] = d[i].addr;
+ }
+
+ /* Ring entries are written before the producer index is published */
+ u_smp_wmb();
+
+ *fq->producer = fq->cached_prod;
+
+ return 0;
+}
+
+/*
+ * Post the @nb addresses in @d on fill ring @fq.
+ * Returns 0 on success, or -ENOSPC (nothing queued) when fewer than @nb
+ * slots are free.
+ */
+static inline int umem_fill_to_kernel(struct xdp_umem_uqueue *fq, u64 *d,
+ size_t nb)
+{
+ u32 i;
+
+ if (umem_nb_free(fq, nb) < nb)
+ return -ENOSPC;
+
+ for (i = 0; i < nb; i++) {
+ u32 idx = fq->cached_prod++ & fq->mask;
+
+ fq->ring[idx] = d[i];
+ }
+
+ /* Ring entries are written before the producer index is published */
+ u_smp_wmb();
+
+ *fq->producer = fq->cached_prod;
+
+ return 0;
+}
+
+/*
+ * Copy up to @nb addresses from completion ring @cq into @d and advance
+ * the shared consumer index.  Returns the number of addresses copied,
+ * which may be 0.
+ */
+static inline size_t umem_complete_from_kernel(struct xdp_umem_uqueue *cq,
+ u64 *d, size_t nb)
+{
+ u32 idx, i, entries = umem_nb_avail(cq, nb);
+
+ /* The producer index is read (above) before the ring entries */
+ u_smp_rmb();
+
+ for (i = 0; i < entries; i++) {
+ idx = cq->cached_cons++ & cq->mask;
+ d[i] = cq->ring[idx];
+ }
+
+ if (entries > 0) {
+ /* Entries are copied out before the consumer index is published */
+ u_smp_wmb();
+
+ *cq->consumer = cq->cached_cons;
+ }
+
+ return entries;
+}
+
+/* Translate umem offset @addr into a pointer inside @xsk's frame area. */
+static inline void *xq_get_data(struct xdpsock *xsk, u64 addr)
+{
+	char *base = xsk->umem->frames;
+
+	return base + addr;
+}
+
+/*
+ * Queue @ndescs descriptors (address and length only) from @descs on ring
+ * @uq.  Returns 0 on success, or -ENOSPC (nothing queued) when fewer than
+ * @ndescs slots are free.
+ */
+static inline int xq_enq(struct xdp_uqueue *uq,
+ const struct xdp_desc *descs,
+ unsigned int ndescs)
+{
+ struct xdp_desc *r = uq->ring;
+ unsigned int i;
+
+ if (xq_nb_free(uq, ndescs) < ndescs)
+ return -ENOSPC;
+
+ for (i = 0; i < ndescs; i++) {
+ u32 idx = uq->cached_prod++ & uq->mask;
+
+ r[idx].addr = descs[i].addr;
+ r[idx].len = descs[i].len;
+ }
+
+ /* Descriptors are written before the producer index is published */
+ u_smp_wmb();
+
+ *uq->producer = uq->cached_prod;
+ return 0;
+}
+
+/*
+ * Queue @ndescs TX descriptors for consecutive frames starting at frame
+ * number @id, each with the length of the pkt_data template.
+ * Returns 0 on success, or -ENOSPC (nothing queued) when fewer than
+ * @ndescs slots are free.
+ */
+static inline int xq_enq_tx_only(struct xdp_uqueue *uq,
+ unsigned int id, unsigned int ndescs)
+{
+ struct xdp_desc *r = uq->ring;
+ unsigned int i;
+
+ if (xq_nb_free(uq, ndescs) < ndescs)
+ return -ENOSPC;
+
+ for (i = 0; i < ndescs; i++) {
+ u32 idx = uq->cached_prod++ & uq->mask;
+
+ /* frame number -> byte offset into the umem */
+ r[idx].addr = (id + i) << FRAME_SHIFT;
+ r[idx].len = sizeof(pkt_data) - 1;
+ }
+
+ /* Descriptors are written before the producer index is published */
+ u_smp_wmb();
+
+ *uq->producer = uq->cached_prod;
+ return 0;
+}
+
+/*
+ * Copy up to @ndescs descriptors from ring @uq into @descs and advance the
+ * shared consumer index.  Returns the number of descriptors copied, which
+ * may be 0.
+ */
+static inline int xq_deq(struct xdp_uqueue *uq,
+ struct xdp_desc *descs,
+ int ndescs)
+{
+ struct xdp_desc *r = uq->ring;
+ unsigned int idx;
+ int i, entries;
+
+ entries = xq_nb_avail(uq, ndescs);
+
+ /* The producer index is read (above) before the ring entries */
+ u_smp_rmb();
+
+ for (i = 0; i < entries; i++) {
+ idx = uq->cached_cons++ & uq->mask;
+ descs[i] = r[idx];
+ }
+
+ if (entries > 0) {
+ /* Entries are copied out before the consumer index is published */
+ u_smp_wmb();
+
+ *uq->consumer = uq->cached_cons;
+ }
+
+ return entries;
+}
+
+/* Exchange the source and destination MAC of the Ethernet header at @data. */
+static void swap_mac_addresses(void *data)
+{
+	struct ether_header *hdr = (struct ether_header *)data;
+	struct ether_addr *src = (struct ether_addr *)&hdr->ether_shost;
+	struct ether_addr *dst = (struct ether_addr *)&hdr->ether_dhost;
+	struct ether_addr saved_src = *src;
+
+	*src = *dst;
+	*dst = saved_src;
+}
+
+/*
+ * Debug helper: print @length bytes at @pkt as rows of 32 hex bytes, each
+ * row prefixed with "addr=<addr>" and followed by an ASCII column in which
+ * bytes below 33 or equal to 255 are shown as '.'.  A short last row is
+ * padded with "__".  Does nothing unless DEBUG_HEXDUMP is non-zero.
+ */
+static void hex_dump(void *pkt, size_t length, u64 addr)
+{
+	const unsigned char *address = (unsigned char *)pkt;
+	const unsigned char *line = address;
+	size_t line_size = 32;
+	unsigned char c;
+	char buf[32];
+	int i = 0;
+
+	if (!DEBUG_HEXDUMP)
+		return;
+
+	/*
+	 * Bounded write, and an explicit cast so the argument matches %llu
+	 * even where u64 is not unsigned long long.
+	 */
+	snprintf(buf, sizeof(buf), "addr=%llu", (unsigned long long)addr);
+	printf("length = %zu\n", length);
+	printf("%s | ", buf);
+	while (length-- > 0) {
+		printf("%02X ", *address++);
+		/* End of a full row, or end of data on a partial row */
+		if (!(++i % line_size) || (length == 0 && i % line_size)) {
+			if (length == 0) {
+				while (i++ % line_size)
+					printf("__ ");
+			}
+			printf(" | "); /* right close */
+			while (line < address) {
+				c = *line++;
+				printf("%c", (c < 33 || c == 255) ? 0x2E : c);
+			}
+			printf("\n");
+			if (length > 0)
+				printf("%s | ", buf);
+		}
+	}
+	printf("\n");
+}
+
+/* Copy the pkt_data template into @frame and return its length in bytes. */
+static size_t gen_eth_frame(char *frame)
+{
+	const size_t frame_len = sizeof(pkt_data) - 1;
+
+	memcpy(frame, pkt_data, frame_len);
+
+	return frame_len;
+}
+
+/*
+ * Allocate a page-aligned buffer of NUM_FRAMES * FRAME_SIZE bytes, register
+ * it as the umem of socket @sfd, size and mmap() the fill and completion
+ * rings, and return the resulting descriptor.  For the txonly benchmark
+ * every frame is pre-filled with the packet template.
+ * Any failure terminates the program through lassert().
+ */
+static struct xdp_umem *xdp_umem_configure(int sfd)
+{
+ int fq_size = FQ_NUM_DESCS, cq_size = CQ_NUM_DESCS;
+ struct xdp_mmap_offsets off;
+ struct xdp_umem_reg mr;
+ struct xdp_umem *umem;
+ socklen_t optlen;
+ void *bufs;
+
+ umem = calloc(1, sizeof(*umem));
+ lassert(umem);
+
+ lassert(posix_memalign(&bufs, getpagesize(), /* PAGE_SIZE aligned */
+ NUM_FRAMES * FRAME_SIZE) == 0);
+
+ mr.addr = (__u64)bufs;
+ mr.len = NUM_FRAMES * FRAME_SIZE;
+ mr.chunk_size = FRAME_SIZE;
+ mr.headroom = FRAME_HEADROOM;
+
+ lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)) == 0);
+ lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_FILL_RING, &fq_size,
+ sizeof(int)) == 0);
+ lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &cq_size,
+ sizeof(int)) == 0);
+
+ /* Ask where producer, consumer and descriptors live in each mapping */
+ optlen = sizeof(off);
+ lassert(getsockopt(sfd, SOL_XDP, XDP_MMAP_OFFSETS, &off,
+ &optlen) == 0);
+
+ umem->fq.map = mmap(0, off.fr.desc +
+ FQ_NUM_DESCS * sizeof(u64),
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, sfd,
+ XDP_UMEM_PGOFF_FILL_RING);
+ lassert(umem->fq.map != MAP_FAILED);
+
+ umem->fq.mask = FQ_NUM_DESCS - 1;
+ umem->fq.size = FQ_NUM_DESCS;
+ umem->fq.producer = umem->fq.map + off.fr.producer;
+ umem->fq.consumer = umem->fq.map + off.fr.consumer;
+ umem->fq.ring = umem->fq.map + off.fr.desc;
+ /* Userspace produces on the fill ring: start with every slot free */
+ umem->fq.cached_cons = FQ_NUM_DESCS;
+
+ umem->cq.map = mmap(0, off.cr.desc +
+ CQ_NUM_DESCS * sizeof(u64),
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, sfd,
+ XDP_UMEM_PGOFF_COMPLETION_RING);
+ lassert(umem->cq.map != MAP_FAILED);
+
+ umem->cq.mask = CQ_NUM_DESCS - 1;
+ umem->cq.size = CQ_NUM_DESCS;
+ umem->cq.producer = umem->cq.map + off.cr.producer;
+ umem->cq.consumer = umem->cq.map + off.cr.consumer;
+ umem->cq.ring = umem->cq.map + off.cr.desc;
+
+ umem->frames = bufs;
+ umem->fd = sfd;
+
+ if (opt_bench == BENCH_TXONLY) {
+ int i;
+
+ /* i is a byte offset: one template copy at the start of each frame */
+ for (i = 0; i < NUM_FRAMES * FRAME_SIZE; i += FRAME_SIZE)
+ (void)gen_eth_frame(&umem->frames[i]);
+ }
+
+ return umem;
+}
+
+/*
+ * Create an AF_XDP socket, size and mmap() its RX and TX rings and bind it
+ * to opt_ifindex/opt_queue.  With @umem == NULL the socket gets its own
+ * umem and NUM_DESCS frames are posted on the fill ring; otherwise @umem is
+ * shared and the socket is bound with XDP_SHARED_UMEM.
+ * Any failure terminates the program through lassert().
+ */
+static struct xdpsock *xsk_configure(struct xdp_umem *umem)
+{
+ struct sockaddr_xdp sxdp = {};
+ struct xdp_mmap_offsets off;
+ int sfd, ndescs = NUM_DESCS;
+ struct xdpsock *xsk;
+ bool shared = true;
+ socklen_t optlen;
+ u64 i;
+
+ sfd = socket(PF_XDP, SOCK_RAW, 0);
+ lassert(sfd >= 0);
+
+ xsk = calloc(1, sizeof(*xsk));
+ lassert(xsk);
+
+ xsk->sfd = sfd;
+ xsk->outstanding_tx = 0;
+
+ if (!umem) {
+ shared = false;
+ xsk->umem = xdp_umem_configure(sfd);
+ } else {
+ xsk->umem = umem;
+ }
+
+ lassert(setsockopt(sfd, SOL_XDP, XDP_RX_RING,
+ &ndescs, sizeof(int)) == 0);
+ lassert(setsockopt(sfd, SOL_XDP, XDP_TX_RING,
+ &ndescs, sizeof(int)) == 0);
+ optlen = sizeof(off);
+ lassert(getsockopt(sfd, SOL_XDP, XDP_MMAP_OFFSETS, &off,
+ &optlen) == 0);
+
+ /* Rx */
+ xsk->rx.map = mmap(NULL,
+ off.rx.desc +
+ NUM_DESCS * sizeof(struct xdp_desc),
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, sfd,
+ XDP_PGOFF_RX_RING);
+ lassert(xsk->rx.map != MAP_FAILED);
+
+ if (!shared) {
+ /* i is a byte offset: post the first NUM_DESCS frames, one per call */
+ for (i = 0; i < NUM_DESCS * FRAME_SIZE; i += FRAME_SIZE)
+ lassert(umem_fill_to_kernel(&xsk->umem->fq, &i, 1)
+ == 0);
+ }
+
+ /* Tx */
+ xsk->tx.map = mmap(NULL,
+ off.tx.desc +
+ NUM_DESCS * sizeof(struct xdp_desc),
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, sfd,
+ XDP_PGOFF_TX_RING);
+ lassert(xsk->tx.map != MAP_FAILED);
+
+ xsk->rx.mask = NUM_DESCS - 1;
+ xsk->rx.size = NUM_DESCS;
+ xsk->rx.producer = xsk->rx.map + off.rx.producer;
+ xsk->rx.consumer = xsk->rx.map + off.rx.consumer;
+ xsk->rx.ring = xsk->rx.map + off.rx.desc;
+
+ xsk->tx.mask = NUM_DESCS - 1;
+ xsk->tx.size = NUM_DESCS;
+ xsk->tx.producer = xsk->tx.map + off.tx.producer;
+ xsk->tx.consumer = xsk->tx.map + off.tx.consumer;
+ xsk->tx.ring = xsk->tx.map + off.tx.desc;
+ /* Userspace produces on the TX ring: start with every slot free */
+ xsk->tx.cached_cons = NUM_DESCS;
+
+ sxdp.sxdp_family = PF_XDP;
+ sxdp.sxdp_ifindex = opt_ifindex;
+ sxdp.sxdp_queue_id = opt_queue;
+
+ if (shared) {
+ sxdp.sxdp_flags = XDP_SHARED_UMEM;
+ sxdp.sxdp_shared_umem_fd = umem->fd;
+ } else {
+ sxdp.sxdp_flags = opt_xdp_bind_flags;
+ }
+
+ lassert(bind(sfd, (struct sockaddr *)&sxdp, sizeof(sxdp)) == 0);
+
+ return xsk;
+}
+
+/*
+ * Print "<if>:<queue> <benchmark> <xdp mode> [poll()]" on stdout; with
+ * @running set, also append "running..." and flush.
+ */
+static void print_benchmark(bool running)
+{
+	const char *name;
+
+	switch (opt_bench) {
+	case BENCH_RXDROP:
+		name = "rxdrop";
+		break;
+	case BENCH_TXONLY:
+		name = "txonly";
+		break;
+	case BENCH_L2FWD:
+		name = "l2fwd";
+		break;
+	default:
+		name = "INVALID";
+		break;
+	}
+
+	printf("%s:%d %s ", opt_if, opt_queue, name);
+
+	if (opt_xdp_flags & XDP_FLAGS_SKB_MODE)
+		printf("xdp-skb ");
+	else if (opt_xdp_flags & XDP_FLAGS_DRV_MODE)
+		printf("xdp-drv ");
+	else
+		printf(" ");
+
+	if (opt_poll)
+		printf("poll() ");
+
+	if (!running)
+		return;
+
+	printf("running...");
+	fflush(stdout);
+}
+
+/*
+ * Print, for every socket, the rx/tx packet rate since the previous call
+ * together with the running totals, then remember the current totals and
+ * timestamp for the next call.
+ */
+static void dump_stats(void)
+{
+	static const char row_fmt[] = "%-15s %'-11.0f %'-11lu\n";
+	unsigned long now = get_nsecs();
+	long dt = now - prev_time;
+	int i;
+
+	prev_time = now;
+
+	for (i = 0; i < num_socks; i++) {
+		struct xdpsock *xsk = xsks[i];
+		double rx_pps, tx_pps;
+
+		/* packets since the last dump, scaled from ns to seconds */
+		rx_pps = (xsk->rx_npkts - xsk->prev_rx_npkts) *
+			 1000000000. / dt;
+		tx_pps = (xsk->tx_npkts - xsk->prev_tx_npkts) *
+			 1000000000. / dt;
+
+		printf("\n sock%d@", i);
+		print_benchmark(false);
+		printf("\n");
+
+		printf("%-15s %-11s %-11s %-11.2f\n", "", "pps", "pkts",
+		       dt / 1000000000.);
+		printf(row_fmt, "rx", rx_pps, xsk->rx_npkts);
+		printf(row_fmt, "tx", tx_pps, xsk->tx_npkts);
+
+		xsk->prev_rx_npkts = xsk->rx_npkts;
+		xsk->prev_tx_npkts = xsk->tx_npkts;
+	}
+}
+
+/* Thread body: print the statistics every opt_interval seconds, forever. */
+static void *poller(void *arg)
+{
+	(void)arg;
+
+	while (1) {
+		sleep(opt_interval);
+		dump_stats();
+	}
+
+	return NULL;
+}
+
+/*
+ * Signal handler: print the final statistics, remove the XDP program from
+ * the interface (fd -1) and exit successfully.
+ */
+static void int_exit(int sig)
+{
+	(void)sig;
+
+	dump_stats();
+
+	bpf_set_link_xdp_fd(opt_ifindex, -1, opt_xdp_flags);
+
+	exit(EXIT_SUCCESS);
+}
+
+/* Long options for getopt_long(); each one returns the short option letter
+ * that parse_command_line() switches on.
+ */
+static struct option long_options[] = {
+ {"rxdrop", no_argument, 0, 'r'},
+ {"txonly", no_argument, 0, 't'},
+ {"l2fwd", no_argument, 0, 'l'},
+ {"interface", required_argument, 0, 'i'},
+ {"queue", required_argument, 0, 'q'},
+ {"poll", no_argument, 0, 'p'},
+ {"shared-buffer", no_argument, 0, 's'},
+ {"xdp-skb", no_argument, 0, 'S'},
+ {"xdp-native", no_argument, 0, 'N'},
+ {"interval", required_argument, 0, 'n'},
+ {0, 0, 0, 0}
+};
+
+/*
+ * Print the option summary for @prog on stderr and exit with EXIT_FAILURE;
+ * this function does not return.
+ */
+static void usage(const char *prog)
+{
+	/*
+	 * --xdp-skb and --xdp-native are declared no_argument in
+	 * long_options, so they are listed without "=n".
+	 */
+	const char *str =
+		" Usage: %s [OPTIONS]\n"
+		" Options:\n"
+		" -r, --rxdrop Discard all incoming packets (default)\n"
+		" -t, --txonly Only send packets\n"
+		" -l, --l2fwd MAC swap L2 forwarding\n"
+		" -i, --interface=n Run on interface n\n"
+		" -q, --queue=n Use queue n (default 0)\n"
+		" -p, --poll Use poll syscall\n"
+		" -s, --shared-buffer Use shared packet buffer\n"
+		" -S, --xdp-skb Use XDP skb-mode\n"
+		" -N, --xdp-native Enforce XDP native mode\n"
+		" -n, --interval=n Specify statistics update interval (default 1 sec).\n"
+		"\n";
+	fprintf(stderr, str, prog);
+	exit(EXIT_FAILURE);
+}
+
+/*
+ * Fill the opt_* globals from the command line and resolve the interface
+ * name to opt_ifindex.  An unknown option or an interface that does not
+ * exist ends in usage(), which exits.
+ */
+static void parse_command_line(int argc, char **argv)
+{
+	int longidx, opt;
+
+	/* getopt's own diagnostics are suppressed; usage() is shown instead */
+	opterr = 0;
+
+	while ((opt = getopt_long(argc, argv, "rtli:q:psSNn:", long_options,
+				  &longidx)) != -1) {
+		switch (opt) {
+		case 'r':
+			opt_bench = BENCH_RXDROP;
+			break;
+		case 't':
+			opt_bench = BENCH_TXONLY;
+			break;
+		case 'l':
+			opt_bench = BENCH_L2FWD;
+			break;
+		case 'i':
+			opt_if = optarg;
+			break;
+		case 'q':
+			opt_queue = atoi(optarg);
+			break;
+		case 's':
+			opt_shared_packet_buffer = 1;
+			break;
+		case 'p':
+			opt_poll = 1;
+			break;
+		case 'S':
+			opt_xdp_flags |= XDP_FLAGS_SKB_MODE;
+			opt_xdp_bind_flags |= XDP_COPY;
+			break;
+		case 'N':
+			opt_xdp_flags |= XDP_FLAGS_DRV_MODE;
+			break;
+		case 'n':
+			opt_interval = atoi(optarg);
+			break;
+		default:
+			usage(basename(argv[0]));
+		}
+	}
+
+	opt_ifindex = if_nametoindex(opt_if);
+	if (opt_ifindex)
+		return;
+
+	fprintf(stderr, "ERROR: interface \"%s\" does not exist\n",
+		opt_if);
+	usage(basename(argv[0]));
+}
+
+/*
+ * Issue an empty, non-blocking sendto() on @fd.  ENOBUFS and EAGAIN are
+ * tolerated; any other failure aborts through lassert().
+ */
+static void kick_tx(int fd)
+{
+	int rc = sendto(fd, NULL, 0, MSG_DONTWAIT, NULL, 0);
+
+	if (rc < 0 && errno != ENOBUFS && errno != EAGAIN)
+		lassert(0);
+}
+
+/*
+ * l2fwd completion step: when frames are outstanding, kick the TX path,
+ * collect up to BATCH_SIZE completed addresses and post them on the fill
+ * ring again, updating the counters.
+ */
+static inline void complete_tx_l2fwd(struct xdpsock *xsk)
+{
+	u64 done[BATCH_SIZE];
+	size_t budget = BATCH_SIZE;
+	unsigned int n;
+
+	if (xsk->outstanding_tx == 0)
+		return;
+
+	kick_tx(xsk->sfd);
+
+	/* never ask for more completions than frames in flight */
+	if (xsk->outstanding_tx <= BATCH_SIZE)
+		budget = xsk->outstanding_tx;
+
+	/* re-add completed Tx buffers */
+	n = umem_complete_from_kernel(&xsk->umem->cq, done, budget);
+	if (n == 0)
+		return;
+
+	umem_fill_to_kernel(&xsk->umem->fq, done, n);
+	xsk->outstanding_tx -= n;
+	xsk->tx_npkts += n;
+}
+
+/*
+ * txonly completion step: when frames are outstanding, kick the TX path
+ * and account for up to BATCH_SIZE completed frames.
+ */
+static inline void complete_tx_only(struct xdpsock *xsk)
+{
+	u64 done[BATCH_SIZE];
+	unsigned int n;
+
+	if (xsk->outstanding_tx == 0)
+		return;
+
+	kick_tx(xsk->sfd);
+
+	n = umem_complete_from_kernel(&xsk->umem->cq, done, BATCH_SIZE);
+	if (n == 0)
+		return;
+
+	xsk->outstanding_tx -= n;
+	xsk->tx_npkts += n;
+}
+
+/*
+ * Receive up to BATCH_SIZE descriptors on @xsk, optionally hex-dump them,
+ * count them and hand the frames back through the fill ring.
+ */
+static void rx_drop(struct xdpsock *xsk)
+{
+	struct xdp_desc batch[BATCH_SIZE];
+	unsigned int n, k;
+
+	n = xq_deq(&xsk->rx, batch, BATCH_SIZE);
+	if (n == 0)
+		return;
+
+	for (k = 0; k < n; k++) {
+		char *pkt = xq_get_data(xsk, batch[k].addr);
+
+		hex_dump(pkt, batch[k].len, batch[k].addr);
+	}
+
+	xsk->rx_npkts += n;
+
+	umem_fill_to_kernel_ex(&xsk->umem->fq, batch, n);
+}
+
+/*
+ * rxdrop main loop: drain every socket forever.  With opt_poll set, wait
+ * (up to one second at a time) until at least one socket is readable
+ * before draining.
+ */
+static void rx_drop_all(void)
+{
+	struct pollfd fds[MAX_SOCKS + 1];
+	int timeout = 1000; /* 1 s, in milliseconds */
+	int i, ret;
+
+	memset(fds, 0, sizeof(fds));
+
+	for (i = 0; i < num_socks; i++) {
+		fds[i].fd = xsks[i]->sfd;
+		fds[i].events = POLLIN;
+	}
+
+	for (;;) {
+		if (opt_poll) {
+			/*
+			 * Poll all configured sockets; polling only the
+			 * first entry would ignore traffic on the others.
+			 */
+			ret = poll(fds, num_socks, timeout);
+			if (ret <= 0)
+				continue;
+		}
+
+		for (i = 0; i < num_socks; i++)
+			rx_drop(xsks[i]);
+	}
+}
+
+/*
+ * txonly main loop: keep queueing batches of BATCH_SIZE pre-built frames
+ * on @xsk's TX ring, cycling through the umem frames, and reap completions.
+ * With opt_poll set, wait for POLLOUT before each round.
+ */
+static void tx_only(struct xdpsock *xsk)
+{
+	int timeout, ret, nfds = 1;
+	struct pollfd fds[nfds + 1];
+	unsigned int frame_nr = 0;
+
+	memset(fds, 0, sizeof(fds));
+	fds[0].fd = xsk->sfd;
+	fds[0].events = POLLOUT;
+	timeout = 1000; /* 1 s, in milliseconds */
+
+	while (1) {
+		if (opt_poll) {
+			ret = poll(fds, nfds, timeout);
+			if (ret <= 0)
+				continue;
+
+			if (fds[0].fd != xsk->sfd ||
+			    !(fds[0].revents & POLLOUT))
+				continue;
+		}
+
+		if (xq_nb_free(&xsk->tx, BATCH_SIZE) >= BATCH_SIZE) {
+			lassert(xq_enq_tx_only(&xsk->tx, frame_nr, BATCH_SIZE) == 0);
+
+			xsk->outstanding_tx += BATCH_SIZE;
+			/* advance to the next batch of frames, wrapping */
+			frame_nr = (frame_nr + BATCH_SIZE) % NUM_FRAMES;
+		}
+
+		complete_tx_only(xsk);
+	}
+}
+
+/*
+ * l2fwd main loop: busy-wait for received frames (reaping TX completions
+ * meanwhile), swap the MAC addresses of each frame in place and queue the
+ * same descriptors on the TX ring.
+ */
+static void l2fwd(struct xdpsock *xsk)
+{
+	while (1) {
+		struct xdp_desc batch[BATCH_SIZE];
+		unsigned int n, k;
+		int ret;
+
+		do {
+			complete_tx_l2fwd(xsk);
+			n = xq_deq(&xsk->rx, batch, BATCH_SIZE);
+		} while (n == 0);
+
+		for (k = 0; k < n; k++) {
+			char *pkt = xq_get_data(xsk, batch[k].addr);
+
+			swap_mac_addresses(pkt);
+
+			hex_dump(pkt, batch[k].len, batch[k].addr);
+		}
+
+		xsk->rx_npkts += n;
+
+		ret = xq_enq(&xsk->tx, batch, n);
+		lassert(ret == 0);
+		xsk->outstanding_tx += n;
+	}
+}
+
+/*
+ * Load <argv[0]>_kern.o, attach its first program to the chosen interface,
+ * store the queue id in map 0, create the AF_XDP socket(s) and register
+ * them in map 1, start the statistics thread and run the selected
+ * benchmark loop.  Any setup failure exits with EXIT_FAILURE.
+ */
+int main(int argc, char **argv)
+{
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ char xdp_filename[256];
+ int i, ret, key = 0;
+ pthread_t pt;
+
+ parse_command_line(argc, argv);
+
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ fprintf(stderr, "ERROR: setrlimit(RLIMIT_MEMLOCK) \"%s\"\n",
+ strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ snprintf(xdp_filename, sizeof(xdp_filename), "%s_kern.o", argv[0]);
+
+ if (load_bpf_file(xdp_filename)) {
+ fprintf(stderr, "ERROR: load_bpf_file %s\n", bpf_log_buf);
+ exit(EXIT_FAILURE);
+ }
+
+ if (!prog_fd[0]) {
+ fprintf(stderr, "ERROR: load_bpf_file: \"%s\"\n",
+ strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ if (bpf_set_link_xdp_fd(opt_ifindex, prog_fd[0], opt_xdp_flags) < 0) {
+ fprintf(stderr, "ERROR: link set xdp fd failed\n");
+ exit(EXIT_FAILURE);
+ }
+
+ /* NOTE(review): map_fd[0] is presumably qidconf_map (see the error text) - confirm */
+ ret = bpf_map_update_elem(map_fd[0], &key, &opt_queue, 0);
+ if (ret) {
+ fprintf(stderr, "ERROR: bpf_map_update_elem qidconf\n");
+ exit(EXIT_FAILURE);
+ }
+
+ /* Create sockets... */
+ xsks[num_socks++] = xsk_configure(NULL);
+
+#if RR_LB
+ /* The remaining sockets share the first socket's umem */
+ for (i = 0; i < MAX_SOCKS - 1; i++)
+ xsks[num_socks++] = xsk_configure(xsks[0]->umem);
+#endif
+
+ /* ...and insert them into the map. */
+ for (i = 0; i < num_socks; i++) {
+ key = i;
+ ret = bpf_map_update_elem(map_fd[1], &key, &xsks[i]->sfd, 0);
+ if (ret) {
+ fprintf(stderr, "ERROR: bpf_map_update_elem %d\n", i);
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ /* int_exit() prints the final stats and detaches the XDP program */
+ signal(SIGINT, int_exit);
+ signal(SIGTERM, int_exit);
+ signal(SIGABRT, int_exit);
+
+ setlocale(LC_ALL, "");
+
+ /* Periodic statistics are printed from a separate thread */
+ ret = pthread_create(&pt, NULL, poller, NULL);
+ lassert(ret == 0);
+
+ prev_time = get_nsecs();
+
+ /* The benchmark loops below never return */
+ if (opt_bench == BENCH_RXDROP)
+ rx_drop_all();
+ else if (opt_bench == BENCH_TXONLY)
+ tx_only(xsks[0]);
+ else
+ l2fwd(xsks[0]);
+
+ return 0;
+}
diff --git a/samples/sockmap/Makefile b/samples/sockmap/Makefile
deleted file mode 100644
index fa53f4d77834..000000000000
--- a/samples/sockmap/Makefile
+++ /dev/null
@@ -1,78 +0,0 @@
-# List of programs to build
-hostprogs-y := sockmap
-
-# Libbpf dependencies
-LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o
-
-HOSTCFLAGS += -I$(objtree)/usr/include
-HOSTCFLAGS += -I$(srctree)/tools/lib/
-HOSTCFLAGS += -I$(srctree)/tools/testing/selftests/bpf/
-HOSTCFLAGS += -I$(srctree)/tools/lib/ -I$(srctree)/tools/include
-HOSTCFLAGS += -I$(srctree)/tools/perf
-
-sockmap-objs := ../bpf/bpf_load.o $(LIBBPF) sockmap_user.o
-
-# Tell kbuild to always build the programs
-always := $(hostprogs-y)
-always += sockmap_kern.o
-
-HOSTLOADLIBES_sockmap += -lelf -lpthread
-
-# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
-# make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
-LLC ?= llc
-CLANG ?= clang
-
-# Trick to allow make to be run from this directory
-all:
- $(MAKE) -C ../../ $(CURDIR)/
-
-clean:
- $(MAKE) -C ../../ M=$(CURDIR) clean
- @rm -f *~
-
-$(obj)/syscall_nrs.s: $(src)/syscall_nrs.c
- $(call if_changed_dep,cc_s_c)
-
-$(obj)/syscall_nrs.h: $(obj)/syscall_nrs.s FORCE
- $(call filechk,offsets,__SYSCALL_NRS_H__)
-
-clean-files += syscall_nrs.h
-
-FORCE:
-
-
-# Verify LLVM compiler tools are available and bpf target is supported by llc
-.PHONY: verify_cmds verify_target_bpf $(CLANG) $(LLC)
-
-verify_cmds: $(CLANG) $(LLC)
- @for TOOL in $^ ; do \
- if ! (which -- "$${TOOL}" > /dev/null 2>&1); then \
- echo "*** ERROR: Cannot find LLVM tool $${TOOL}" ;\
- exit 1; \
- else true; fi; \
- done
-
-verify_target_bpf: verify_cmds
- @if ! (${LLC} -march=bpf -mattr=help > /dev/null 2>&1); then \
- echo "*** ERROR: LLVM (${LLC}) does not support 'bpf' target" ;\
- echo " NOTICE: LLVM version >= 3.7.1 required" ;\
- exit 2; \
- else true; fi
-
-$(src)/*.c: verify_target_bpf
-
-# asm/sysreg.h - inline assembly used by it is incompatible with llvm.
-# But, there is no easy way to fix it, so just exclude it since it is
-# useless for BPF samples.
-#
-# -target bpf option required with SK_MSG programs, this is to ensure
-# reading 'void *' data types for data and data_end are __u64 reads.
-$(obj)/%.o: $(src)/%.c
- $(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) -I$(obj) \
- -D__KERNEL__ -D__ASM_SYSREG_H -Wno-unused-value -Wno-pointer-sign \
- -Wno-compare-distinct-pointer-types \
- -Wno-gnu-variable-sized-type-not-at-end \
- -Wno-address-of-packed-member -Wno-tautological-compare \
- -Wno-unknown-warning-option -O2 -target bpf \
- -emit-llvm -c $< -o -| $(LLC) -march=bpf -filetype=obj -o $@
diff --git a/samples/sockmap/sockmap_kern.c b/samples/sockmap/sockmap_kern.c
deleted file mode 100644
index 9ff8bc5dc206..000000000000
--- a/samples/sockmap/sockmap_kern.c
+++ /dev/null
@@ -1,341 +0,0 @@
-/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- */
-#include <uapi/linux/bpf.h>
-#include <uapi/linux/if_ether.h>
-#include <uapi/linux/if_packet.h>
-#include <uapi/linux/ip.h>
-#include "../../tools/testing/selftests/bpf/bpf_helpers.h"
-#include "../../tools/testing/selftests/bpf/bpf_endian.h"
-
-/* Sockmap sample program connects a client and a backend together
- * using cgroups.
- *
- * client:X <---> frontend:80 client:X <---> backend:80
- *
- * For simplicity we hard code values here and bind 1:1. The hard
- * coded values are part of the setup in sockmap.sh script that
- * is associated with this BPF program.
- *
- * The bpf_printk is verbose and prints information as connections
- * are established and verdicts are decided.
- */
-
-#define bpf_printk(fmt, ...) \
-({ \
- char ____fmt[] = fmt; \
- bpf_trace_printk(____fmt, sizeof(____fmt), \
- ##__VA_ARGS__); \
-})
-
-struct bpf_map_def SEC("maps") sock_map = {
- .type = BPF_MAP_TYPE_SOCKMAP,
- .key_size = sizeof(int),
- .value_size = sizeof(int),
- .max_entries = 20,
-};
-
-struct bpf_map_def SEC("maps") sock_map_txmsg = {
- .type = BPF_MAP_TYPE_SOCKMAP,
- .key_size = sizeof(int),
- .value_size = sizeof(int),
- .max_entries = 20,
-};
-
-struct bpf_map_def SEC("maps") sock_map_redir = {
- .type = BPF_MAP_TYPE_SOCKMAP,
- .key_size = sizeof(int),
- .value_size = sizeof(int),
- .max_entries = 20,
-};
-
-struct bpf_map_def SEC("maps") sock_apply_bytes = {
- .type = BPF_MAP_TYPE_ARRAY,
- .key_size = sizeof(int),
- .value_size = sizeof(int),
- .max_entries = 1
-};
-
-struct bpf_map_def SEC("maps") sock_cork_bytes = {
- .type = BPF_MAP_TYPE_ARRAY,
- .key_size = sizeof(int),
- .value_size = sizeof(int),
- .max_entries = 1
-};
-
-struct bpf_map_def SEC("maps") sock_pull_bytes = {
- .type = BPF_MAP_TYPE_ARRAY,
- .key_size = sizeof(int),
- .value_size = sizeof(int),
- .max_entries = 2
-};
-
-struct bpf_map_def SEC("maps") sock_redir_flags = {
- .type = BPF_MAP_TYPE_ARRAY,
- .key_size = sizeof(int),
- .value_size = sizeof(int),
- .max_entries = 1
-};
-
-struct bpf_map_def SEC("maps") sock_skb_opts = {
- .type = BPF_MAP_TYPE_ARRAY,
- .key_size = sizeof(int),
- .value_size = sizeof(int),
- .max_entries = 1
-};
-
-SEC("sk_skb1")
-int bpf_prog1(struct __sk_buff *skb)
-{
- return skb->len;
-}
-
-SEC("sk_skb2")
-int bpf_prog2(struct __sk_buff *skb)
-{
- __u32 lport = skb->local_port;
- __u32 rport = skb->remote_port;
- int len, *f, ret, zero = 0;
- __u64 flags = 0;
-
- if (lport == 10000)
- ret = 10;
- else
- ret = 1;
-
- len = (__u32)skb->data_end - (__u32)skb->data;
- f = bpf_map_lookup_elem(&sock_skb_opts, &zero);
- if (f && *f) {
- ret = 3;
- flags = *f;
- }
-
- bpf_printk("sk_skb2: redirect(%iB) flags=%i\n",
- len, flags);
- return bpf_sk_redirect_map(skb, &sock_map, ret, flags);
-}
-
-SEC("sockops")
-int bpf_sockmap(struct bpf_sock_ops *skops)
-{
- __u32 lport, rport;
- int op, err = 0, index, key, ret;
-
-
- op = (int) skops->op;
-
- switch (op) {
- case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
- lport = skops->local_port;
- rport = skops->remote_port;
-
- if (lport == 10000) {
- ret = 1;
- err = bpf_sock_map_update(skops, &sock_map, &ret,
- BPF_NOEXIST);
- bpf_printk("passive(%i -> %i) map ctx update err: %d\n",
- lport, bpf_ntohl(rport), err);
- }
- break;
- case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
- lport = skops->local_port;
- rport = skops->remote_port;
-
- if (bpf_ntohl(rport) == 10001) {
- ret = 10;
- err = bpf_sock_map_update(skops, &sock_map, &ret,
- BPF_NOEXIST);
- bpf_printk("active(%i -> %i) map ctx update err: %d\n",
- lport, bpf_ntohl(rport), err);
- }
- break;
- default:
- break;
- }
-
- return 0;
-}
-
-SEC("sk_msg1")
-int bpf_prog4(struct sk_msg_md *msg)
-{
- int *bytes, zero = 0, one = 1;
- int *start, *end;
-
- bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
- if (bytes)
- bpf_msg_apply_bytes(msg, *bytes);
- bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
- if (bytes)
- bpf_msg_cork_bytes(msg, *bytes);
- start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
- end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
- if (start && end)
- bpf_msg_pull_data(msg, *start, *end, 0);
- return SK_PASS;
-}
-
-SEC("sk_msg2")
-int bpf_prog5(struct sk_msg_md *msg)
-{
- int err1 = -1, err2 = -1, zero = 0, one = 1;
- int *bytes, *start, *end, len1, len2;
-
- bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
- if (bytes)
- err1 = bpf_msg_apply_bytes(msg, *bytes);
- bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
- if (bytes)
- err2 = bpf_msg_cork_bytes(msg, *bytes);
- len1 = (__u64)msg->data_end - (__u64)msg->data;
- start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
- end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
- if (start && end) {
- int err;
-
- bpf_printk("sk_msg2: pull(%i:%i)\n",
- start ? *start : 0, end ? *end : 0);
- err = bpf_msg_pull_data(msg, *start, *end, 0);
- if (err)
- bpf_printk("sk_msg2: pull_data err %i\n",
- err);
- len2 = (__u64)msg->data_end - (__u64)msg->data;
- bpf_printk("sk_msg2: length update %i->%i\n",
- len1, len2);
- }
- bpf_printk("sk_msg2: data length %i err1 %i err2 %i\n",
- len1, err1, err2);
- return SK_PASS;
-}
-
-SEC("sk_msg3")
-int bpf_prog6(struct sk_msg_md *msg)
-{
- int *bytes, zero = 0, one = 1, key = 0;
- int *start, *end, *f;
- __u64 flags = 0;
-
- bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
- if (bytes)
- bpf_msg_apply_bytes(msg, *bytes);
- bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
- if (bytes)
- bpf_msg_cork_bytes(msg, *bytes);
- start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
- end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
- if (start && end)
- bpf_msg_pull_data(msg, *start, *end, 0);
- f = bpf_map_lookup_elem(&sock_redir_flags, &zero);
- if (f && *f) {
- key = 2;
- flags = *f;
- }
- return bpf_msg_redirect_map(msg, &sock_map_redir, key, flags);
-}
-
-SEC("sk_msg4")
-int bpf_prog7(struct sk_msg_md *msg)
-{
- int err1 = 0, err2 = 0, zero = 0, one = 1, key = 0;
- int *f, *bytes, *start, *end, len1, len2;
- __u64 flags = 0;
-
- int err;
- bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
- if (bytes)
- err1 = bpf_msg_apply_bytes(msg, *bytes);
- bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
- if (bytes)
- err2 = bpf_msg_cork_bytes(msg, *bytes);
- len1 = (__u64)msg->data_end - (__u64)msg->data;
- start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
- end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
- if (start && end) {
-
- bpf_printk("sk_msg2: pull(%i:%i)\n",
- start ? *start : 0, end ? *end : 0);
- err = bpf_msg_pull_data(msg, *start, *end, 0);
- if (err)
- bpf_printk("sk_msg2: pull_data err %i\n",
- err);
- len2 = (__u64)msg->data_end - (__u64)msg->data;
- bpf_printk("sk_msg2: length update %i->%i\n",
- len1, len2);
- }
- f = bpf_map_lookup_elem(&sock_redir_flags, &zero);
- if (f && *f) {
- key = 2;
- flags = *f;
- }
- bpf_printk("sk_msg3: redirect(%iB) flags=%i err=%i\n",
- len1, flags, err1 ? err1 : err2);
- err = bpf_msg_redirect_map(msg, &sock_map_redir, key, flags);
- bpf_printk("sk_msg3: err %i\n", err);
- return err;
-}
-
-SEC("sk_msg5")
-int bpf_prog8(struct sk_msg_md *msg)
-{
- void *data_end = (void *)(long) msg->data_end;
- void *data = (void *)(long) msg->data;
- int ret = 0, *bytes, zero = 0;
-
- bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
- if (bytes) {
- ret = bpf_msg_apply_bytes(msg, *bytes);
- if (ret)
- return SK_DROP;
- } else {
- return SK_DROP;
- }
- return SK_PASS;
-}
-SEC("sk_msg6")
-int bpf_prog9(struct sk_msg_md *msg)
-{
- void *data_end = (void *)(long) msg->data_end;
- void *data = (void *)(long) msg->data;
- int ret = 0, *bytes, zero = 0;
-
- bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
- if (bytes) {
- if (((__u64)data_end - (__u64)data) >= *bytes)
- return SK_PASS;
- ret = bpf_msg_cork_bytes(msg, *bytes);
- if (ret)
- return SK_DROP;
- }
- return SK_PASS;
-}
-
-SEC("sk_msg7")
-int bpf_prog10(struct sk_msg_md *msg)
-{
- int *bytes, zero = 0, one = 1;
- int *start, *end;
-
- bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
- if (bytes)
- bpf_msg_apply_bytes(msg, *bytes);
- bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
- if (bytes)
- bpf_msg_cork_bytes(msg, *bytes);
- start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
- end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
- if (start && end)
- bpf_msg_pull_data(msg, *start, *end, 0);
-
- return SK_DROP;
-}
-
-
-char _license[] SEC("license") = "GPL";
diff --git a/samples/sockmap/sockmap_test.sh b/samples/sockmap/sockmap_test.sh
deleted file mode 100755
index ace75f070eb8..000000000000
--- a/samples/sockmap/sockmap_test.sh
+++ /dev/null
@@ -1,488 +0,0 @@
-#Test a bunch of positive cases to verify basic functionality
-for prog in "--txmsg_redir --txmsg_skb" "--txmsg_redir --txmsg_ingress" "--txmsg" "--txmsg_redir" "--txmsg_redir --txmsg_ingress" "--txmsg_drop"; do
-for t in "sendmsg" "sendpage"; do
-for r in 1 10 100; do
- for i in 1 10 100; do
- for l in 1 10 100; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
- echo $TEST
- $TEST
- sleep 2
- done
- done
-done
-done
-done
-
-#Test max iov
-t="sendmsg"
-r=1
-i=1024
-l=1
-prog="--txmsg"
-
-TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-echo $TEST
-$TEST
-sleep 2
-prog="--txmsg_redir"
-TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-echo $TEST
-$TEST
-
-# Test max iov with 1k send
-
-t="sendmsg"
-r=1
-i=1024
-l=1024
-prog="--txmsg"
-
-TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-echo $TEST
-$TEST
-sleep 2
-prog="--txmsg_redir"
-TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-echo $TEST
-$TEST
-sleep 2
-
-# Test apply with 1B
-r=1
-i=1024
-l=1024
-prog="--txmsg_apply 1"
-
-for t in "sendmsg" "sendpage"; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
- echo $TEST
- $TEST
- sleep 2
-done
-
-# Test apply with larger value than send
-r=1
-i=8
-l=1024
-prog="--txmsg_apply 2048"
-
-for t in "sendmsg" "sendpage"; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
- echo $TEST
- $TEST
- sleep 2
-done
-
-# Test apply with apply that never reaches limit
-r=1024
-i=1
-l=1
-prog="--txmsg_apply 2048"
-
-for t in "sendmsg" "sendpage"; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
- echo $TEST
- $TEST
- sleep 2
-done
-
-# Test apply and redirect with 1B
-r=1
-i=1024
-l=1024
-prog="--txmsg_redir --txmsg_apply 1"
-
-for t in "sendmsg" "sendpage"; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
- echo $TEST
- $TEST
- sleep 2
-done
-
-prog="--txmsg_redir --txmsg_apply 1 --txmsg_ingress"
-
-for t in "sendmsg" "sendpage"; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
- echo $TEST
- $TEST
- sleep 2
-done
-
-prog="--txmsg_redir --txmsg_apply 1 --txmsg_skb"
-
-for t in "sendmsg" "sendpage"; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
- echo $TEST
- $TEST
- sleep 2
-done
-
-
-# Test apply and redirect with larger value than send
-r=1
-i=8
-l=1024
-prog="--txmsg_redir --txmsg_apply 2048"
-
-for t in "sendmsg" "sendpage"; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
- echo $TEST
- $TEST
- sleep 2
-done
-
-prog="--txmsg_redir --txmsg_apply 2048 --txmsg_ingress"
-
-for t in "sendmsg" "sendpage"; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
- echo $TEST
- $TEST
- sleep 2
-done
-
-prog="--txmsg_redir --txmsg_apply 2048 --txmsg_skb"
-
-for t in "sendmsg" "sendpage"; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
- echo $TEST
- $TEST
- sleep 2
-done
-
-
-# Test apply and redirect with apply that never reaches limit
-r=1024
-i=1
-l=1
-prog="--txmsg_apply 2048"
-
-for t in "sendmsg" "sendpage"; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
- echo $TEST
- $TEST
- sleep 2
-done
-
-# Test cork with 1B not really useful but test it anyways
-r=1
-i=1024
-l=1024
-prog="--txmsg_cork 1"
-
-for t in "sendpage" "sendmsg"; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
- echo $TEST
- $TEST
- sleep 2
-done
-
-# Test cork with a more reasonable 100B
-r=1
-i=1000
-l=1000
-prog="--txmsg_cork 100"
-
-for t in "sendpage" "sendmsg"; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
- echo $TEST
- $TEST
- sleep 2
-done
-
-# Test cork with larger value than send
-r=1
-i=8
-l=1024
-prog="--txmsg_cork 2048"
-
-for t in "sendpage" "sendmsg"; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
- echo $TEST
- $TEST
- sleep 2
-done
-
-# Test cork with cork that never reaches limit
-r=1024
-i=1
-l=1
-prog="--txmsg_cork 2048"
-
-for t in "sendpage" "sendmsg"; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
- echo $TEST
- $TEST
- sleep 2
-done
-
-r=1
-i=1024
-l=1024
-prog="--txmsg_redir --txmsg_cork 1"
-
-for t in "sendpage" "sendmsg"; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
- echo $TEST
- $TEST
- sleep 2
-done
-
-# Test cork with a more reasonable 100B
-r=1
-i=1000
-l=1000
-prog="--txmsg_redir --txmsg_cork 100"
-
-for t in "sendpage" "sendmsg"; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
- echo $TEST
- $TEST
- sleep 2
-done
-
-# Test cork with larger value than send
-r=1
-i=8
-l=1024
-prog="--txmsg_redir --txmsg_cork 2048"
-
-for t in "sendpage" "sendmsg"; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
- echo $TEST
- $TEST
- sleep 2
-done
-
-# Test cork with cork that never reaches limit
-r=1024
-i=1
-l=1
-prog="--txmsg_cork 2048"
-
-for t in "sendpage" "sendmsg"; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
- echo $TEST
- $TEST
- sleep 2
-done
-
-
-# mix and match cork and apply not really useful but valid programs
-
-# Test apply < cork
-r=100
-i=1
-l=5
-prog="--txmsg_apply 10 --txmsg_cork 100"
-for t in "sendpage" "sendmsg"; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
- echo $TEST
- $TEST
- sleep 2
-done
-
-# Try again with larger sizes so we hit overflow case
-r=100
-i=1000
-l=2048
-prog="--txmsg_apply 4096 --txmsg_cork 8096"
-for t in "sendpage" "sendmsg"; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
- echo $TEST
- $TEST
- sleep 2
-done
-
-# Test apply > cork
-r=100
-i=1
-l=5
-prog="--txmsg_apply 100 --txmsg_cork 10"
-for t in "sendpage" "sendmsg"; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
- echo $TEST
- $TEST
- sleep 2
-done
-
-# Again with larger sizes so we hit overflow cases
-r=100
-i=1000
-l=2048
-prog="--txmsg_apply 8096 --txmsg_cork 4096"
-for t in "sendpage" "sendmsg"; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
- echo $TEST
- $TEST
- sleep 2
-done
-
-
-# Test apply = cork
-r=100
-i=1
-l=5
-prog="--txmsg_apply 10 --txmsg_cork 10"
-for t in "sendpage" "sendmsg"; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
- echo $TEST
- $TEST
- sleep 2
-done
-
-r=100
-i=1000
-l=2048
-prog="--txmsg_apply 4096 --txmsg_cork 4096"
-for t in "sendpage" "sendmsg"; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
- echo $TEST
- $TEST
- sleep 2
-done
-
-# Test apply < cork
-r=100
-i=1
-l=5
-prog="--txmsg_redir --txmsg_apply 10 --txmsg_cork 100"
-for t in "sendpage" "sendmsg"; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
- echo $TEST
- $TEST
- sleep 2
-done
-
-# Try again with larger sizes so we hit overflow case
-r=100
-i=1000
-l=2048
-prog="--txmsg_redir --txmsg_apply 4096 --txmsg_cork 8096"
-for t in "sendpage" "sendmsg"; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
- echo $TEST
- $TEST
- sleep 2
-done
-
-# Test apply > cork
-r=100
-i=1
-l=5
-prog="--txmsg_redir --txmsg_apply 100 --txmsg_cork 10"
-for t in "sendpage" "sendmsg"; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
- echo $TEST
- $TEST
- sleep 2
-done
-
-# Again with larger sizes so we hit overflow cases
-r=100
-i=1000
-l=2048
-prog="--txmsg_redir --txmsg_apply 8096 --txmsg_cork 4096"
-for t in "sendpage" "sendmsg"; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
- echo $TEST
- $TEST
- sleep 2
-done
-
-
-# Test apply = cork
-r=100
-i=1
-l=5
-prog="--txmsg_redir --txmsg_apply 10 --txmsg_cork 10"
-for t in "sendpage" "sendmsg"; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
- echo $TEST
- $TEST
- sleep 2
-done
-
-r=100
-i=1000
-l=2048
-prog="--txmsg_redir --txmsg_apply 4096 --txmsg_cork 4096"
-for t in "sendpage" "sendmsg"; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
- echo $TEST
- $TEST
- sleep 2
-done
-
-# Tests for bpf_msg_pull_data()
-for i in `seq 99 100 1600`; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \
- --txmsg --txmsg_start 0 --txmsg_end $i --txmsg_cork 1600"
- echo $TEST
- $TEST
- sleep 2
-done
-
-for i in `seq 199 100 1600`; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \
- --txmsg --txmsg_start 100 --txmsg_end $i --txmsg_cork 1600"
- echo $TEST
- $TEST
- sleep 2
-done
-
-TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \
- --txmsg --txmsg_start 1500 --txmsg_end 1600 --txmsg_cork 1600"
-echo $TEST
-$TEST
-sleep 2
-
-TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \
- --txmsg --txmsg_start 1111 --txmsg_end 1112 --txmsg_cork 1600"
-echo $TEST
-$TEST
-sleep 2
-
-TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \
- --txmsg --txmsg_start 1111 --txmsg_end 0 --txmsg_cork 1600"
-echo $TEST
-$TEST
-sleep 2
-
-TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \
- --txmsg --txmsg_start 0 --txmsg_end 1601 --txmsg_cork 1600"
-echo $TEST
-$TEST
-sleep 2
-
-TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \
- --txmsg --txmsg_start 0 --txmsg_end 1601 --txmsg_cork 1602"
-echo $TEST
-$TEST
-sleep 2
-
-# Run through gamut again with start and end
-for prog in "--txmsg" "--txmsg_redir" "--txmsg_drop"; do
-for t in "sendmsg" "sendpage"; do
-for r in 1 10 100; do
- for i in 1 10 100; do
- for l in 1 10 100; do
- TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog --txmsg_start 1 --txmsg_end 2"
- echo $TEST
- $TEST
- sleep 2
- done
- done
-done
-done
-done
-
-# Some specific tests to cover specific code paths
-./sockmap --cgroup /mnt/cgroup2/ -t sendpage \
- -r 5 -i 1 -l 1 --txmsg_redir --txmsg_cork 5 --txmsg_apply 3
-./sockmap --cgroup /mnt/cgroup2/ -t sendmsg \
- -r 5 -i 1 -l 1 --txmsg_redir --txmsg_cork 5 --txmsg_apply 3
-./sockmap --cgroup /mnt/cgroup2/ -t sendpage \
- -r 5 -i 1 -l 1 --txmsg_redir --txmsg_cork 5 --txmsg_apply 5
-./sockmap --cgroup /mnt/cgroup2/ -t sendmsg \
- -r 5 -i 1 -l 1 --txmsg_redir --txmsg_cork 5 --txmsg_apply 5
diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c
deleted file mode 100644
index 6f2334912283..000000000000
--- a/samples/sockmap/sockmap_user.c
+++ /dev/null
@@ -1,894 +0,0 @@
-/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- */
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/socket.h>
-#include <sys/ioctl.h>
-#include <sys/select.h>
-#include <netinet/in.h>
-#include <arpa/inet.h>
-#include <unistd.h>
-#include <string.h>
-#include <errno.h>
-#include <sys/ioctl.h>
-#include <stdbool.h>
-#include <signal.h>
-#include <fcntl.h>
-#include <sys/wait.h>
-#include <time.h>
-
-#include <sys/time.h>
-#include <sys/resource.h>
-#include <sys/types.h>
-#include <sys/sendfile.h>
-
-#include <linux/netlink.h>
-#include <linux/socket.h>
-#include <linux/sock_diag.h>
-#include <linux/bpf.h>
-#include <linux/if_link.h>
-#include <assert.h>
-#include <libgen.h>
-
-#include <getopt.h>
-
-#include "../bpf/bpf_load.h"
-#include "../bpf/bpf_util.h"
-#include "../bpf/libbpf.h"
-
-int running;
-void running_handler(int a);
-
-/* randomly selected ports for testing on lo */
-#define S1_PORT 10000
-#define S2_PORT 10001
-
-/* global sockets */
-int s1, s2, c1, c2, p1, p2;
-
-int txmsg_pass;
-int txmsg_noisy;
-int txmsg_redir;
-int txmsg_redir_noisy;
-int txmsg_drop;
-int txmsg_apply;
-int txmsg_cork;
-int txmsg_start;
-int txmsg_end;
-int txmsg_ingress;
-int txmsg_skb;
-
-static const struct option long_options[] = {
- {"help", no_argument, NULL, 'h' },
- {"cgroup", required_argument, NULL, 'c' },
- {"rate", required_argument, NULL, 'r' },
- {"verbose", no_argument, NULL, 'v' },
- {"iov_count", required_argument, NULL, 'i' },
- {"length", required_argument, NULL, 'l' },
- {"test", required_argument, NULL, 't' },
- {"data_test", no_argument, NULL, 'd' },
- {"txmsg", no_argument, &txmsg_pass, 1 },
- {"txmsg_noisy", no_argument, &txmsg_noisy, 1 },
- {"txmsg_redir", no_argument, &txmsg_redir, 1 },
- {"txmsg_redir_noisy", no_argument, &txmsg_redir_noisy, 1},
- {"txmsg_drop", no_argument, &txmsg_drop, 1 },
- {"txmsg_apply", required_argument, NULL, 'a'},
- {"txmsg_cork", required_argument, NULL, 'k'},
- {"txmsg_start", required_argument, NULL, 's'},
- {"txmsg_end", required_argument, NULL, 'e'},
- {"txmsg_ingress", no_argument, &txmsg_ingress, 1 },
- {"txmsg_skb", no_argument, &txmsg_skb, 1 },
- {0, 0, NULL, 0 }
-};
-
-static void usage(char *argv[])
-{
- int i;
-
- printf(" Usage: %s --cgroup <cgroup_path>\n", argv[0]);
- printf(" options:\n");
- for (i = 0; long_options[i].name != 0; i++) {
- printf(" --%-12s", long_options[i].name);
- if (long_options[i].flag != NULL)
- printf(" flag (internal value:%d)\n",
- *long_options[i].flag);
- else
- printf(" -%c\n", long_options[i].val);
- }
- printf("\n");
-}
-
-static int sockmap_init_sockets(void)
-{
- int i, err, one = 1;
- struct sockaddr_in addr;
- int *fds[4] = {&s1, &s2, &c1, &c2};
-
- s1 = s2 = p1 = p2 = c1 = c2 = 0;
-
- /* Init sockets */
- for (i = 0; i < 4; i++) {
- *fds[i] = socket(AF_INET, SOCK_STREAM, 0);
- if (*fds[i] < 0) {
- perror("socket s1 failed()");
- return errno;
- }
- }
-
- /* Allow reuse */
- for (i = 0; i < 2; i++) {
- err = setsockopt(*fds[i], SOL_SOCKET, SO_REUSEADDR,
- (char *)&one, sizeof(one));
- if (err) {
- perror("setsockopt failed()");
- return errno;
- }
- }
-
- /* Non-blocking sockets */
- for (i = 0; i < 2; i++) {
- err = ioctl(*fds[i], FIONBIO, (char *)&one);
- if (err < 0) {
- perror("ioctl s1 failed()");
- return errno;
- }
- }
-
- /* Bind server sockets */
- memset(&addr, 0, sizeof(struct sockaddr_in));
- addr.sin_family = AF_INET;
- addr.sin_addr.s_addr = inet_addr("127.0.0.1");
-
- addr.sin_port = htons(S1_PORT);
- err = bind(s1, (struct sockaddr *)&addr, sizeof(addr));
- if (err < 0) {
- perror("bind s1 failed()\n");
- return errno;
- }
-
- addr.sin_port = htons(S2_PORT);
- err = bind(s2, (struct sockaddr *)&addr, sizeof(addr));
- if (err < 0) {
- perror("bind s2 failed()\n");
- return errno;
- }
-
- /* Listen server sockets */
- addr.sin_port = htons(S1_PORT);
- err = listen(s1, 32);
- if (err < 0) {
- perror("listen s1 failed()\n");
- return errno;
- }
-
- addr.sin_port = htons(S2_PORT);
- err = listen(s2, 32);
- if (err < 0) {
- perror("listen s1 failed()\n");
- return errno;
- }
-
- /* Initiate Connect */
- addr.sin_port = htons(S1_PORT);
- err = connect(c1, (struct sockaddr *)&addr, sizeof(addr));
- if (err < 0 && errno != EINPROGRESS) {
- perror("connect c1 failed()\n");
- return errno;
- }
-
- addr.sin_port = htons(S2_PORT);
- err = connect(c2, (struct sockaddr *)&addr, sizeof(addr));
- if (err < 0 && errno != EINPROGRESS) {
- perror("connect c2 failed()\n");
- return errno;
- } else if (err < 0) {
- err = 0;
- }
-
- /* Accept Connecrtions */
- p1 = accept(s1, NULL, NULL);
- if (p1 < 0) {
- perror("accept s1 failed()\n");
- return errno;
- }
-
- p2 = accept(s2, NULL, NULL);
- if (p2 < 0) {
- perror("accept s1 failed()\n");
- return errno;
- }
-
- printf("connected sockets: c1 <-> p1, c2 <-> p2\n");
- printf("cgroups binding: c1(%i) <-> s1(%i) - - - c2(%i) <-> s2(%i)\n",
- c1, s1, c2, s2);
- return 0;
-}
-
-struct msg_stats {
- size_t bytes_sent;
- size_t bytes_recvd;
- struct timespec start;
- struct timespec end;
-};
-
-struct sockmap_options {
- int verbose;
- bool base;
- bool sendpage;
- bool data_test;
- bool drop_expected;
-};
-
-static int msg_loop_sendpage(int fd, int iov_length, int cnt,
- struct msg_stats *s,
- struct sockmap_options *opt)
-{
- bool drop = opt->drop_expected;
- unsigned char k = 0;
- FILE *file;
- int i, fp;
-
- file = fopen(".sendpage_tst.tmp", "w+");
- for (i = 0; i < iov_length * cnt; i++, k++)
- fwrite(&k, sizeof(char), 1, file);
- fflush(file);
- fseek(file, 0, SEEK_SET);
- fclose(file);
-
- fp = open(".sendpage_tst.tmp", O_RDONLY);
- clock_gettime(CLOCK_MONOTONIC, &s->start);
- for (i = 0; i < cnt; i++) {
- int sent = sendfile(fd, fp, NULL, iov_length);
-
- if (!drop && sent < 0) {
- perror("send loop error:");
- close(fp);
- return sent;
- } else if (drop && sent >= 0) {
- printf("sendpage loop error expected: %i\n", sent);
- close(fp);
- return -EIO;
- }
-
- if (sent > 0)
- s->bytes_sent += sent;
- }
- clock_gettime(CLOCK_MONOTONIC, &s->end);
- close(fp);
- return 0;
-}
-
-static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
- struct msg_stats *s, bool tx,
- struct sockmap_options *opt)
-{
- struct msghdr msg = {0};
- int err, i, flags = MSG_NOSIGNAL;
- struct iovec *iov;
- unsigned char k;
- bool data_test = opt->data_test;
- bool drop = opt->drop_expected;
-
- iov = calloc(iov_count, sizeof(struct iovec));
- if (!iov)
- return errno;
-
- k = 0;
- for (i = 0; i < iov_count; i++) {
- unsigned char *d = calloc(iov_length, sizeof(char));
-
- if (!d) {
- fprintf(stderr, "iov_count %i/%i OOM\n", i, iov_count);
- goto out_errno;
- }
- iov[i].iov_base = d;
- iov[i].iov_len = iov_length;
-
- if (data_test && tx) {
- int j;
-
- for (j = 0; j < iov_length; j++)
- d[j] = k++;
- }
- }
-
- msg.msg_iov = iov;
- msg.msg_iovlen = iov_count;
- k = 0;
-
- if (tx) {
- clock_gettime(CLOCK_MONOTONIC, &s->start);
- for (i = 0; i < cnt; i++) {
- int sent = sendmsg(fd, &msg, flags);
-
- if (!drop && sent < 0) {
- perror("send loop error:");
- goto out_errno;
- } else if (drop && sent >= 0) {
- printf("send loop error expected: %i\n", sent);
- errno = -EIO;
- goto out_errno;
- }
- if (sent > 0)
- s->bytes_sent += sent;
- }
- clock_gettime(CLOCK_MONOTONIC, &s->end);
- } else {
- int slct, recv, max_fd = fd;
- struct timeval timeout;
- float total_bytes;
- fd_set w;
-
- total_bytes = (float)iov_count * (float)iov_length * (float)cnt;
- err = clock_gettime(CLOCK_MONOTONIC, &s->start);
- if (err < 0)
- perror("recv start time: ");
- while (s->bytes_recvd < total_bytes) {
- timeout.tv_sec = 1;
- timeout.tv_usec = 0;
-
- /* FD sets */
- FD_ZERO(&w);
- FD_SET(fd, &w);
-
- slct = select(max_fd + 1, &w, NULL, NULL, &timeout);
- if (slct == -1) {
- perror("select()");
- clock_gettime(CLOCK_MONOTONIC, &s->end);
- goto out_errno;
- } else if (!slct) {
- fprintf(stderr, "unexpected timeout\n");
- errno = -EIO;
- clock_gettime(CLOCK_MONOTONIC, &s->end);
- goto out_errno;
- }
-
- recv = recvmsg(fd, &msg, flags);
- if (recv < 0) {
- if (errno != EWOULDBLOCK) {
- clock_gettime(CLOCK_MONOTONIC, &s->end);
- perror("recv failed()\n");
- goto out_errno;
- }
- }
-
- s->bytes_recvd += recv;
-
- if (data_test) {
- int j;
-
- for (i = 0; i < msg.msg_iovlen; i++) {
- unsigned char *d = iov[i].iov_base;
-
- for (j = 0;
- j < iov[i].iov_len && recv; j++) {
- if (d[j] != k++) {
- errno = -EIO;
- fprintf(stderr,
- "detected data corruption @iov[%i]:%i %02x != %02x, %02x ?= %02x\n",
- i, j, d[j], k - 1, d[j+1], k + 1);
- goto out_errno;
- }
- recv--;
- }
- }
- }
- }
- clock_gettime(CLOCK_MONOTONIC, &s->end);
- }
-
- for (i = 0; i < iov_count; i++)
- free(iov[i].iov_base);
- free(iov);
- return 0;
-out_errno:
- for (i = 0; i < iov_count; i++)
- free(iov[i].iov_base);
- free(iov);
- return errno;
-}
-
-static float giga = 1000000000;
-
-static inline float sentBps(struct msg_stats s)
-{
- return s.bytes_sent / (s.end.tv_sec - s.start.tv_sec);
-}
-
-static inline float recvdBps(struct msg_stats s)
-{
- return s.bytes_recvd / (s.end.tv_sec - s.start.tv_sec);
-}
-
-static int sendmsg_test(int iov_count, int iov_buf, int cnt,
- struct sockmap_options *opt)
-{
- float sent_Bps = 0, recvd_Bps = 0;
- int rx_fd, txpid, rxpid, err = 0;
- struct msg_stats s = {0};
- int status;
-
- errno = 0;
-
- if (opt->base)
- rx_fd = p1;
- else
- rx_fd = p2;
-
- rxpid = fork();
- if (rxpid == 0) {
- if (opt->drop_expected)
- exit(1);
-
- if (opt->sendpage)
- iov_count = 1;
- err = msg_loop(rx_fd, iov_count, iov_buf,
- cnt, &s, false, opt);
- if (err)
- fprintf(stderr,
- "msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n",
- iov_count, iov_buf, cnt, err);
- shutdown(p2, SHUT_RDWR);
- shutdown(p1, SHUT_RDWR);
- if (s.end.tv_sec - s.start.tv_sec) {
- sent_Bps = sentBps(s);
- recvd_Bps = recvdBps(s);
- }
- fprintf(stdout,
- "rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s\n",
- s.bytes_sent, sent_Bps, sent_Bps/giga,
- s.bytes_recvd, recvd_Bps, recvd_Bps/giga);
- exit(1);
- } else if (rxpid == -1) {
- perror("msg_loop_rx: ");
- return errno;
- }
-
- txpid = fork();
- if (txpid == 0) {
- if (opt->sendpage)
- err = msg_loop_sendpage(c1, iov_buf, cnt, &s, opt);
- else
- err = msg_loop(c1, iov_count, iov_buf,
- cnt, &s, true, opt);
-
- if (err)
- fprintf(stderr,
- "msg_loop_tx: iov_count %i iov_buf %i cnt %i err %i\n",
- iov_count, iov_buf, cnt, err);
- shutdown(c1, SHUT_RDWR);
- if (s.end.tv_sec - s.start.tv_sec) {
- sent_Bps = sentBps(s);
- recvd_Bps = recvdBps(s);
- }
- fprintf(stdout,
- "tx_sendmsg: TX: %zuB %fB/s %f GB/s RX: %zuB %fB/s %fGB/s\n",
- s.bytes_sent, sent_Bps, sent_Bps/giga,
- s.bytes_recvd, recvd_Bps, recvd_Bps/giga);
- exit(1);
- } else if (txpid == -1) {
- perror("msg_loop_tx: ");
- return errno;
- }
-
- assert(waitpid(rxpid, &status, 0) == rxpid);
- assert(waitpid(txpid, &status, 0) == txpid);
- return err;
-}
-
-static int forever_ping_pong(int rate, struct sockmap_options *opt)
-{
- struct timeval timeout;
- char buf[1024] = {0};
- int sc;
-
- timeout.tv_sec = 10;
- timeout.tv_usec = 0;
-
- /* Ping/Pong data from client to server */
- sc = send(c1, buf, sizeof(buf), 0);
- if (sc < 0) {
- perror("send failed()\n");
- return sc;
- }
-
- do {
- int s, rc, i, max_fd = p2;
- fd_set w;
-
- /* FD sets */
- FD_ZERO(&w);
- FD_SET(c1, &w);
- FD_SET(c2, &w);
- FD_SET(p1, &w);
- FD_SET(p2, &w);
-
- s = select(max_fd + 1, &w, NULL, NULL, &timeout);
- if (s == -1) {
- perror("select()");
- break;
- } else if (!s) {
- fprintf(stderr, "unexpected timeout\n");
- break;
- }
-
- for (i = 0; i <= max_fd && s > 0; ++i) {
- if (!FD_ISSET(i, &w))
- continue;
-
- s--;
-
- rc = recv(i, buf, sizeof(buf), 0);
- if (rc < 0) {
- if (errno != EWOULDBLOCK) {
- perror("recv failed()\n");
- return rc;
- }
- }
-
- if (rc == 0) {
- close(i);
- break;
- }
-
- sc = send(i, buf, rc, 0);
- if (sc < 0) {
- perror("send failed()\n");
- return sc;
- }
- }
-
- if (rate)
- sleep(rate);
-
- if (opt->verbose) {
- printf(".");
- fflush(stdout);
-
- }
- } while (running);
-
- return 0;
-}
-
-enum {
- PING_PONG,
- SENDMSG,
- BASE,
- BASE_SENDPAGE,
- SENDPAGE,
-};
-
-int main(int argc, char **argv)
-{
- int iov_count = 1, length = 1024, rate = 1, tx_prog_fd;
- struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY};
- int opt, longindex, err, cg_fd = 0;
- struct sockmap_options options = {0};
- int test = PING_PONG;
- char filename[256];
-
- while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:",
- long_options, &longindex)) != -1) {
- switch (opt) {
- case 's':
- txmsg_start = atoi(optarg);
- break;
- case 'e':
- txmsg_end = atoi(optarg);
- break;
- case 'a':
- txmsg_apply = atoi(optarg);
- break;
- case 'k':
- txmsg_cork = atoi(optarg);
- break;
- case 'c':
- cg_fd = open(optarg, O_DIRECTORY, O_RDONLY);
- if (cg_fd < 0) {
- fprintf(stderr,
- "ERROR: (%i) open cg path failed: %s\n",
- cg_fd, optarg);
- return cg_fd;
- }
- break;
- case 'r':
- rate = atoi(optarg);
- break;
- case 'v':
- options.verbose = 1;
- break;
- case 'i':
- iov_count = atoi(optarg);
- break;
- case 'l':
- length = atoi(optarg);
- break;
- case 'd':
- options.data_test = true;
- break;
- case 't':
- if (strcmp(optarg, "ping") == 0) {
- test = PING_PONG;
- } else if (strcmp(optarg, "sendmsg") == 0) {
- test = SENDMSG;
- } else if (strcmp(optarg, "base") == 0) {
- test = BASE;
- } else if (strcmp(optarg, "base_sendpage") == 0) {
- test = BASE_SENDPAGE;
- } else if (strcmp(optarg, "sendpage") == 0) {
- test = SENDPAGE;
- } else {
- usage(argv);
- return -1;
- }
- break;
- case 0:
- break;
- case 'h':
- default:
- usage(argv);
- return -1;
- }
- }
-
- if (!cg_fd) {
- fprintf(stderr, "%s requires cgroup option: --cgroup <path>\n",
- argv[0]);
- return -1;
- }
-
- if (setrlimit(RLIMIT_MEMLOCK, &r)) {
- perror("setrlimit(RLIMIT_MEMLOCK)");
- return 1;
- }
-
- snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
-
- running = 1;
-
- /* catch SIGINT */
- signal(SIGINT, running_handler);
-
- if (load_bpf_file(filename)) {
- fprintf(stderr, "load_bpf_file: (%s) %s\n",
- filename, strerror(errno));
- return 1;
- }
-
- /* If base test skip BPF setup */
- if (test == BASE || test == BASE_SENDPAGE)
- goto run;
-
- /* Attach programs to sockmap */
- err = bpf_prog_attach(prog_fd[0], map_fd[0],
- BPF_SK_SKB_STREAM_PARSER, 0);
- if (err) {
- fprintf(stderr, "ERROR: bpf_prog_attach (sockmap): %d (%s)\n",
- err, strerror(errno));
- return err;
- }
-
- err = bpf_prog_attach(prog_fd[1], map_fd[0],
- BPF_SK_SKB_STREAM_VERDICT, 0);
- if (err) {
- fprintf(stderr, "ERROR: bpf_prog_attach (sockmap): %d (%s)\n",
- err, strerror(errno));
- return err;
- }
-
- /* Attach to cgroups */
- err = bpf_prog_attach(prog_fd[2], cg_fd, BPF_CGROUP_SOCK_OPS, 0);
- if (err) {
- fprintf(stderr, "ERROR: bpf_prog_attach (groups): %d (%s)\n",
- err, strerror(errno));
- return err;
- }
-
-run:
- err = sockmap_init_sockets();
- if (err) {
- fprintf(stderr, "ERROR: test socket failed: %d\n", err);
- goto out;
- }
-
- /* Attach txmsg program to sockmap */
- if (txmsg_pass)
- tx_prog_fd = prog_fd[3];
- else if (txmsg_noisy)
- tx_prog_fd = prog_fd[4];
- else if (txmsg_redir)
- tx_prog_fd = prog_fd[5];
- else if (txmsg_redir_noisy)
- tx_prog_fd = prog_fd[6];
- else if (txmsg_drop)
- tx_prog_fd = prog_fd[9];
- /* apply and cork must be last */
- else if (txmsg_apply)
- tx_prog_fd = prog_fd[7];
- else if (txmsg_cork)
- tx_prog_fd = prog_fd[8];
- else
- tx_prog_fd = 0;
-
- if (tx_prog_fd) {
- int redir_fd, i = 0;
-
- err = bpf_prog_attach(tx_prog_fd,
- map_fd[1], BPF_SK_MSG_VERDICT, 0);
- if (err) {
- fprintf(stderr,
- "ERROR: bpf_prog_attach (txmsg): %d (%s)\n",
- err, strerror(errno));
- return err;
- }
-
- err = bpf_map_update_elem(map_fd[1], &i, &c1, BPF_ANY);
- if (err) {
- fprintf(stderr,
- "ERROR: bpf_map_update_elem (txmsg): %d (%s\n",
- err, strerror(errno));
- return err;
- }
-
- if (txmsg_redir || txmsg_redir_noisy)
- redir_fd = c2;
- else
- redir_fd = c1;
-
- err = bpf_map_update_elem(map_fd[2], &i, &redir_fd, BPF_ANY);
- if (err) {
- fprintf(stderr,
- "ERROR: bpf_map_update_elem (txmsg): %d (%s\n",
- err, strerror(errno));
- return err;
- }
-
- if (txmsg_apply) {
- err = bpf_map_update_elem(map_fd[3],
- &i, &txmsg_apply, BPF_ANY);
- if (err) {
- fprintf(stderr,
- "ERROR: bpf_map_update_elem (apply_bytes): %d (%s\n",
- err, strerror(errno));
- return err;
- }
- }
-
- if (txmsg_cork) {
- err = bpf_map_update_elem(map_fd[4],
- &i, &txmsg_cork, BPF_ANY);
- if (err) {
- fprintf(stderr,
- "ERROR: bpf_map_update_elem (cork_bytes): %d (%s\n",
- err, strerror(errno));
- return err;
- }
- }
-
- if (txmsg_start) {
- err = bpf_map_update_elem(map_fd[5],
- &i, &txmsg_start, BPF_ANY);
- if (err) {
- fprintf(stderr,
- "ERROR: bpf_map_update_elem (txmsg_start): %d (%s)\n",
- err, strerror(errno));
- return err;
- }
- }
-
- if (txmsg_end) {
- i = 1;
- err = bpf_map_update_elem(map_fd[5],
- &i, &txmsg_end, BPF_ANY);
- if (err) {
- fprintf(stderr,
- "ERROR: bpf_map_update_elem (txmsg_end): %d (%s)\n",
- err, strerror(errno));
- return err;
- }
- }
-
- if (txmsg_ingress) {
- int in = BPF_F_INGRESS;
-
- i = 0;
- err = bpf_map_update_elem(map_fd[6], &i, &in, BPF_ANY);
- if (err) {
- fprintf(stderr,
- "ERROR: bpf_map_update_elem (txmsg_ingress): %d (%s)\n",
- err, strerror(errno));
- }
- i = 1;
- err = bpf_map_update_elem(map_fd[1], &i, &p1, BPF_ANY);
- if (err) {
- fprintf(stderr,
- "ERROR: bpf_map_update_elem (p1 txmsg): %d (%s)\n",
- err, strerror(errno));
- }
- err = bpf_map_update_elem(map_fd[2], &i, &p1, BPF_ANY);
- if (err) {
- fprintf(stderr,
- "ERROR: bpf_map_update_elem (p1 redir): %d (%s)\n",
- err, strerror(errno));
- }
-
- i = 2;
- err = bpf_map_update_elem(map_fd[2], &i, &p2, BPF_ANY);
- if (err) {
- fprintf(stderr,
- "ERROR: bpf_map_update_elem (p2 txmsg): %d (%s)\n",
- err, strerror(errno));
- }
- }
-
- if (txmsg_skb) {
- int skb_fd = (test == SENDMSG || test == SENDPAGE) ? p2 : p1;
- int ingress = BPF_F_INGRESS;
-
- i = 0;
- err = bpf_map_update_elem(map_fd[7], &i, &ingress, BPF_ANY);
- if (err) {
- fprintf(stderr,
- "ERROR: bpf_map_update_elem (txmsg_ingress): %d (%s)\n",
- err, strerror(errno));
- }
-
- i = 3;
- err = bpf_map_update_elem(map_fd[0], &i, &skb_fd, BPF_ANY);
- if (err) {
- fprintf(stderr,
- "ERROR: bpf_map_update_elem (c1 sockmap): %d (%s)\n",
- err, strerror(errno));
- }
- }
- }
-
- if (txmsg_drop)
- options.drop_expected = true;
-
- if (test == PING_PONG)
- err = forever_ping_pong(rate, &options);
- else if (test == SENDMSG) {
- options.base = false;
- options.sendpage = false;
- err = sendmsg_test(iov_count, length, rate, &options);
- } else if (test == SENDPAGE) {
- options.base = false;
- options.sendpage = true;
- err = sendmsg_test(iov_count, length, rate, &options);
- } else if (test == BASE) {
- options.base = true;
- options.sendpage = false;
- err = sendmsg_test(iov_count, length, rate, &options);
- } else if (test == BASE_SENDPAGE) {
- options.base = true;
- options.sendpage = true;
- err = sendmsg_test(iov_count, length, rate, &options);
- } else
- fprintf(stderr, "unknown test\n");
-out:
- bpf_prog_detach2(prog_fd[2], cg_fd, BPF_CGROUP_SOCK_OPS);
- close(s1);
- close(s2);
- close(p1);
- close(p2);
- close(c1);
- close(c2);
- close(cg_fd);
- return err;
-}
-
-void running_handler(int a)
-{
- running = 0;
-}
diff --git a/samples/vfio-mdev/Makefile b/samples/vfio-mdev/Makefile
index cbbd868a50a8..7db889ca135c 100644
--- a/samples/vfio-mdev/Makefile
+++ b/samples/vfio-mdev/Makefile
@@ -1 +1,4 @@
obj-$(CONFIG_SAMPLE_VFIO_MDEV_MTTY) += mtty.o
+obj-$(CONFIG_SAMPLE_VFIO_MDEV_MDPY) += mdpy.o
+obj-$(CONFIG_SAMPLE_VFIO_MDEV_MDPY_FB) += mdpy-fb.o
+obj-$(CONFIG_SAMPLE_VFIO_MDEV_MBOCHS) += mbochs.o
diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c
new file mode 100644
index 000000000000..2960e26c6ea4
--- /dev/null
+++ b/samples/vfio-mdev/mbochs.c
@@ -0,0 +1,1406 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Mediated virtual PCI display host device driver
+ *
+ * Emulate enough of qemu stdvga to make bochs-drm.ko happy. That is
+ * basically the vram memory bar and the bochs dispi interface vbe
+ * registers in the mmio register bar. Specifically it does *not*
+ * include any legacy vga stuff. Device looks a lot like "qemu -device
+ * secondary-vga".
+ *
+ * (c) Gerd Hoffmann <kraxel@redhat.com>
+ *
+ * based on mtty driver which is:
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ * Author: Neo Jia <cjia@nvidia.com>
+ * Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/cdev.h>
+#include <linux/vfio.h>
+#include <linux/iommu.h>
+#include <linux/sysfs.h>
+#include <linux/mdev.h>
+#include <linux/pci.h>
+#include <linux/dma-buf.h>
+#include <linux/highmem.h>
+#include <drm/drm_fourcc.h>
+#include <drm/drm_rect.h>
+#include <drm/drm_modeset_lock.h>
+#include <drm/drm_property.h>
+#include <drm/drm_plane.h>
+
+
+#define VBE_DISPI_INDEX_ID 0x0
+#define VBE_DISPI_INDEX_XRES 0x1
+#define VBE_DISPI_INDEX_YRES 0x2
+#define VBE_DISPI_INDEX_BPP 0x3
+#define VBE_DISPI_INDEX_ENABLE 0x4
+#define VBE_DISPI_INDEX_BANK 0x5
+#define VBE_DISPI_INDEX_VIRT_WIDTH 0x6
+#define VBE_DISPI_INDEX_VIRT_HEIGHT 0x7
+#define VBE_DISPI_INDEX_X_OFFSET 0x8
+#define VBE_DISPI_INDEX_Y_OFFSET 0x9
+#define VBE_DISPI_INDEX_VIDEO_MEMORY_64K 0xa
+#define VBE_DISPI_INDEX_COUNT 0xb
+
+#define VBE_DISPI_ID0 0xB0C0
+#define VBE_DISPI_ID1 0xB0C1
+#define VBE_DISPI_ID2 0xB0C2
+#define VBE_DISPI_ID3 0xB0C3
+#define VBE_DISPI_ID4 0xB0C4
+#define VBE_DISPI_ID5 0xB0C5
+
+#define VBE_DISPI_DISABLED 0x00
+#define VBE_DISPI_ENABLED 0x01
+#define VBE_DISPI_GETCAPS 0x02
+#define VBE_DISPI_8BIT_DAC 0x20
+#define VBE_DISPI_LFB_ENABLED 0x40
+#define VBE_DISPI_NOCLEARMEM 0x80
+
+
+#define MBOCHS_NAME "mbochs"
+#define MBOCHS_CLASS_NAME "mbochs"
+
+/*
+ * Layout of the device file offsets as decoded by mdev_access():
+ * PCI config space starts at offset 0, the MMIO register BAR occupies
+ * one page at MBOCHS_MMIO_BAR_OFFSET and the video memory BAR starts
+ * directly behind it at MBOCHS_MEMORY_BAR_OFFSET.
+ */
+#define MBOCHS_CONFIG_SPACE_SIZE 0xff
+#define MBOCHS_MMIO_BAR_OFFSET PAGE_SIZE
+#define MBOCHS_MMIO_BAR_SIZE PAGE_SIZE
+#define MBOCHS_MEMORY_BAR_OFFSET (MBOCHS_MMIO_BAR_OFFSET + \
+ MBOCHS_MMIO_BAR_SIZE)
+
+/*
+ * Store a 16/32 bit value at @addr.  Both arguments are parenthesized so
+ * arbitrary expressions can be passed without precedence surprises.
+ *
+ * NOTE(review): despite the names no byte swapping is done, the value is
+ * stored in host byte order -- confirm whether cpu_to_le16/32() is wanted.
+ */
+#define STORE_LE16(addr, val)	(*(u16 *)(addr) = (val))
+#define STORE_LE32(addr, val)	(*(u32 *)(addr) = (val))
+
+
+MODULE_LICENSE("GPL v2");
+
+/*
+ * Upper limit, in megabytes, for the video memory of all mbochs devices
+ * together (checked in mbochs_create()).  Exposed read-only as module
+ * parameter "count"; the description has to use that same name, otherwise
+ * it is attached to a parameter that does not exist.
+ */
+static int max_mbytes = 256;
+module_param_named(count, max_mbytes, int, 0444);
+MODULE_PARM_DESC(count, "megabytes available to " MBOCHS_NAME " devices");
+
+
+#define MBOCHS_TYPE_1 "small"
+#define MBOCHS_TYPE_2 "medium"
+#define MBOCHS_TYPE_3 "large"
+
+/*
+ * Device types offered by this driver.  mbochs_find_type() matches .name
+ * against the kobject name; the types differ only in the amount of video
+ * memory.
+ */
+static const struct mbochs_type {
+ const char *name;
+ u32 mbytes; /* video memory size in megabytes */
+} mbochs_types[] = {
+ {
+ .name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_1,
+ .mbytes = 4,
+ }, {
+ .name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_2,
+ .mbytes = 16,
+ }, {
+ .name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_3,
+ .mbytes = 64,
+ },
+};
+
+
+static dev_t mbochs_devt;
+static struct class *mbochs_class;
+static struct cdev mbochs_cdev;
+static struct device mbochs_dev;
+/* megabytes claimed by existing devices: mbochs_create() adds, mbochs_remove() subtracts */
+static int mbochs_used_mbytes;
+
+/*
+ * Framebuffer layout as computed by mbochs_check_framebuffer() from the
+ * vbe registers.  Modes are compared with memcmp() in
+ * mbochs_modes_equal(); presumably that is why the padding is explicit.
+ */
+struct mbochs_mode {
+ u32 drm_format; /* DRM fourcc code */
+ u32 bytepp; /* bytes per pixel */
+ u32 width; /* from VBE_DISPI_INDEX_XRES */
+ u32 height; /* from VBE_DISPI_INDEX_YRES */
+ u32 stride; /* bytes per scanline */
+ u32 __pad;
+ u64 offset; /* byte offset of the framebuffer in video memory */
+ u64 size; /* stride * height, in bytes */
+};
+
+/* One framebuffer snapshot that can be exported as dma-buf. */
+struct mbochs_dmabuf {
+ struct mbochs_mode mode; /* layout this dmabuf was created for */
+ u32 id; /* taken from mdev_state->next_id */
+ struct page **pages; /* referenced pages backing the framebuffer */
+ pgoff_t pagecount; /* number of entries in pages[] */
+ struct dma_buf *buf; /* set by mbochs_dmabuf_export(), cleared on release */
+ struct mdev_state *mdev_state; /* owning device */
+ struct list_head next; /* link in mdev_state->dmabufs */
+ bool unlinked; /* if set, mbochs_release_dmabuf() frees the struct */
+};
+
+/* State of each mdev device */
+struct mdev_state {
+ u8 *vconfig; /* virtual PCI config space, MBOCHS_CONFIG_SPACE_SIZE bytes */
+ u64 bar_mask[3]; /* size masks used for BAR sizing writes */
+ u32 memory_bar_mask;
+ struct mutex ops_lock; /* protects vbe[], pages[] and the dmabuf list */
+ struct mdev_device *mdev;
+ struct vfio_device_info dev_info;
+
+ const struct mbochs_type *type;
+ u16 vbe[VBE_DISPI_INDEX_COUNT]; /* bochs dispi registers */
+ u64 memsize; /* video memory size in bytes */
+ struct page **pages; /* video memory pages, allocated on first use */
+ pgoff_t pagecount; /* memsize >> PAGE_SHIFT */
+
+ struct list_head dmabufs; /* list of struct mbochs_dmabuf */
+ u32 active_id;
+ u32 next_id; /* id handed to the next allocated dmabuf */
+};
+
+/* Printable names of the dispi registers, used by vbe_name() for debug output. */
+static const char *vbe_name_list[VBE_DISPI_INDEX_COUNT] = {
+ [VBE_DISPI_INDEX_ID] = "id",
+ [VBE_DISPI_INDEX_XRES] = "xres",
+ [VBE_DISPI_INDEX_YRES] = "yres",
+ [VBE_DISPI_INDEX_BPP] = "bpp",
+ [VBE_DISPI_INDEX_ENABLE] = "enable",
+ [VBE_DISPI_INDEX_BANK] = "bank",
+ [VBE_DISPI_INDEX_VIRT_WIDTH] = "virt-width",
+ [VBE_DISPI_INDEX_VIRT_HEIGHT] = "virt-height",
+ [VBE_DISPI_INDEX_X_OFFSET] = "x-offset",
+ [VBE_DISPI_INDEX_Y_OFFSET] = "y-offset",
+ [VBE_DISPI_INDEX_VIDEO_MEMORY_64K] = "video-mem",
+};
+
+/* Translate a dispi register index into its name, "(invalid)" if out of range. */
+static const char *vbe_name(u32 index)
+{
+	return index < ARRAY_SIZE(vbe_name_list) ?
+		vbe_name_list[index] : "(invalid)";
+}
+
+static struct page *mbochs_get_page(struct mdev_state *mdev_state,
+ pgoff_t pgoff);
+
+/* Look up the mbochs type whose name equals the kobject name, NULL if none. */
+static const struct mbochs_type *mbochs_find_type(struct kobject *kobj)
+{
+	const struct mbochs_type *type = mbochs_types;
+	const struct mbochs_type *end = mbochs_types + ARRAY_SIZE(mbochs_types);
+
+	for (; type < end; type++) {
+		if (!strcmp(type->name, kobj->name))
+			return type;
+	}
+	return NULL;
+}
+
+/*
+ * Fill in the virtual PCI config space: vendor/device and subsystem ids,
+ * command register, class code, revision and the type bits of BAR 0
+ * (video memory, prefetchable) and BAR 2 (MMIO registers).
+ *
+ * bar_mask[] is set to the two's complement of the BAR size; it is what
+ * handle_pci_cfg_write() applies when all-ones is written to a BAR.
+ */
+static void mbochs_create_config_space(struct mdev_state *mdev_state)
+{
+ STORE_LE16((u16 *) &mdev_state->vconfig[PCI_VENDOR_ID],
+ 0x1234);
+ STORE_LE16((u16 *) &mdev_state->vconfig[PCI_DEVICE_ID],
+ 0x1111);
+ STORE_LE16((u16 *) &mdev_state->vconfig[PCI_SUBSYSTEM_VENDOR_ID],
+ PCI_SUBVENDOR_ID_REDHAT_QUMRANET);
+ STORE_LE16((u16 *) &mdev_state->vconfig[PCI_SUBSYSTEM_ID],
+ PCI_SUBDEVICE_ID_QEMU);
+
+ STORE_LE16((u16 *) &mdev_state->vconfig[PCI_COMMAND],
+ PCI_COMMAND_IO | PCI_COMMAND_MEMORY);
+ STORE_LE16((u16 *) &mdev_state->vconfig[PCI_CLASS_DEVICE],
+ PCI_CLASS_DISPLAY_OTHER);
+ mdev_state->vconfig[PCI_CLASS_REVISION] = 0x01;
+
+ STORE_LE32((u32 *) &mdev_state->vconfig[PCI_BASE_ADDRESS_0],
+ PCI_BASE_ADDRESS_SPACE_MEMORY |
+ PCI_BASE_ADDRESS_MEM_TYPE_32 |
+ PCI_BASE_ADDRESS_MEM_PREFETCH);
+ mdev_state->bar_mask[0] = ~(mdev_state->memsize) + 1;
+
+ STORE_LE32((u32 *) &mdev_state->vconfig[PCI_BASE_ADDRESS_2],
+ PCI_BASE_ADDRESS_SPACE_MEMORY |
+ PCI_BASE_ADDRESS_MEM_TYPE_32);
+ mdev_state->bar_mask[2] = ~(MBOCHS_MMIO_BAR_SIZE) + 1;
+}
+
+/*
+ * Derive the current framebuffer layout from the vbe registers.
+ *
+ * On success @mode is filled in and 0 is returned.  If the display is
+ * not enabled, the bpp is not 32, the resolution is below 64x64 or the
+ * framebuffer does not fit into the device memory, @mode is zeroed and
+ * -EINVAL is returned.  ops_lock is expected to be held (WARNs if not).
+ */
+static int mbochs_check_framebuffer(struct mdev_state *mdev_state,
+ struct mbochs_mode *mode)
+{
+ struct device *dev = mdev_dev(mdev_state->mdev);
+ u16 *vbe = mdev_state->vbe;
+ u32 virt_width;
+
+ WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
+
+ if (!(vbe[VBE_DISPI_INDEX_ENABLE] & VBE_DISPI_ENABLED))
+ goto nofb;
+
+ memset(mode, 0, sizeof(*mode));
+ switch (vbe[VBE_DISPI_INDEX_BPP]) {
+ case 32:
+ mode->drm_format = DRM_FORMAT_XRGB8888;
+ mode->bytepp = 4;
+ break;
+ default:
+ dev_info_ratelimited(dev, "%s: bpp %d not supported\n",
+ __func__, vbe[VBE_DISPI_INDEX_BPP]);
+ goto nofb;
+ }
+
+ mode->width = vbe[VBE_DISPI_INDEX_XRES];
+ mode->height = vbe[VBE_DISPI_INDEX_YRES];
+ virt_width = vbe[VBE_DISPI_INDEX_VIRT_WIDTH];
+ /* the virtual width can not be smaller than the visible width */
+ if (virt_width < mode->width)
+ virt_width = mode->width;
+ mode->stride = virt_width * mode->bytepp;
+ mode->size = (u64)mode->stride * mode->height;
+ mode->offset = ((u64)vbe[VBE_DISPI_INDEX_X_OFFSET] * mode->bytepp +
+ (u64)vbe[VBE_DISPI_INDEX_Y_OFFSET] * mode->stride);
+
+ if (mode->width < 64 || mode->height < 64) {
+ dev_info_ratelimited(dev, "%s: invalid resolution %dx%d\n",
+ __func__, mode->width, mode->height);
+ goto nofb;
+ }
+ if (mode->offset + mode->size > mdev_state->memsize) {
+ dev_info_ratelimited(dev, "%s: framebuffer memory overflow\n",
+ __func__);
+ goto nofb;
+ }
+
+ return 0;
+
+nofb:
+ /* do not leave a partially filled mode behind */
+ memset(mode, 0, sizeof(*mode));
+ return -EINVAL;
+}
+
+/* Two modes are equal when they match byte for byte. */
+static bool mbochs_modes_equal(struct mbochs_mode *mode1,
+			       struct mbochs_mode *mode2)
+{
+	return !memcmp(mode1, mode2, sizeof(*mode1));
+}
+
+/*
+ * Handle a write to the virtual PCI config space.  Only writes to BAR 0
+ * and BAR 2 are acted upon, every other offset is silently ignored.
+ *
+ * Writing all-ones stores the BAR size mask (BAR sizing), any other
+ * value is taken as the new BAR address.  The low type/flag bits already
+ * present in vconfig are preserved.
+ *
+ * NOTE(review): @count is not looked at and @buf is always read as u32;
+ * callers may pass 1 or 2 byte buffers -- confirm this cannot over-read.
+ */
+static void handle_pci_cfg_write(struct mdev_state *mdev_state, u16 offset,
+ char *buf, u32 count)
+{
+ struct device *dev = mdev_dev(mdev_state->mdev);
+ int index = (offset - PCI_BASE_ADDRESS_0) / 0x04;
+ u32 cfg_addr;
+
+ switch (offset) {
+ case PCI_BASE_ADDRESS_0:
+ case PCI_BASE_ADDRESS_2:
+ cfg_addr = *(u32 *)buf;
+
+ if (cfg_addr == 0xffffffff) {
+ cfg_addr = (cfg_addr & mdev_state->bar_mask[index]);
+ } else {
+ cfg_addr &= PCI_BASE_ADDRESS_MEM_MASK;
+ if (cfg_addr)
+ dev_info(dev, "BAR #%d @ 0x%x\n",
+ index, cfg_addr);
+ }
+
+ /* keep the BAR type bits that were set up at creation time */
+ cfg_addr |= (mdev_state->vconfig[offset] &
+ ~PCI_BASE_ADDRESS_MEM_MASK);
+ STORE_LE32(&mdev_state->vconfig[offset], cfg_addr);
+ break;
+ }
+}
+
+/*
+ * Handle a write to the MMIO register BAR (@offset is relative to the
+ * BAR).  Only 16 bit writes to the bochs dispi range 0x500-0x515 are
+ * stored into vbe[]; all other accesses are logged via dev_dbg() and
+ * dropped.
+ */
+static void handle_mmio_write(struct mdev_state *mdev_state, u16 offset,
+ char *buf, u32 count)
+{
+ struct device *dev = mdev_dev(mdev_state->mdev);
+ int index;
+ u16 reg16;
+
+ switch (offset) {
+ case 0x400 ... 0x41f: /* vga ioports remapped */
+ goto unhandled;
+ case 0x500 ... 0x515: /* bochs dispi interface */
+ if (count != 2)
+ goto unhandled;
+ /* registers are 16 bit wide, two bytes per index */
+ index = (offset - 0x500) / 2;
+ reg16 = *(u16 *)buf;
+ if (index < ARRAY_SIZE(mdev_state->vbe))
+ mdev_state->vbe[index] = reg16;
+ dev_dbg(dev, "%s: vbe write %d = %d (%s)\n",
+ __func__, index, reg16, vbe_name(index));
+ break;
+ case 0x600 ... 0x607: /* qemu extended regs */
+ goto unhandled;
+ default:
+unhandled:
+ dev_dbg(dev, "%s: @0x%03x, count %d (unhandled)\n",
+ __func__, offset, count);
+ break;
+ }
+}
+
+/*
+ * Handle a read from the MMIO register BAR (@offset is relative to the
+ * BAR).  16 bit reads from the bochs dispi range 0x500-0x515 return the
+ * matching vbe[] register; every other access is logged via dev_dbg()
+ * and returns @count zero bytes.
+ */
+static void handle_mmio_read(struct mdev_state *mdev_state, u16 offset,
+ char *buf, u32 count)
+{
+ struct device *dev = mdev_dev(mdev_state->mdev);
+ u16 reg16 = 0;
+ int index;
+
+ switch (offset) {
+ case 0x500 ... 0x515: /* bochs dispi interface */
+ if (count != 2)
+ goto unhandled;
+ /* registers are 16 bit wide, two bytes per index */
+ index = (offset - 0x500) / 2;
+ if (index < ARRAY_SIZE(mdev_state->vbe))
+ reg16 = mdev_state->vbe[index];
+ dev_dbg(dev, "%s: vbe read %d = %d (%s)\n",
+ __func__, index, reg16, vbe_name(index));
+ *(u16 *)buf = reg16;
+ break;
+ default:
+unhandled:
+ dev_dbg(dev, "%s: @0x%03x, count %d (unhandled)\n",
+ __func__, offset, count);
+ memset(buf, 0, count);
+ break;
+ }
+}
+
+/*
+ * Common backend for reads and writes on the device file.
+ *
+ * @pos selects the target: PCI config space, the MMIO register BAR or
+ * the video memory BAR.  @buf is a kernel buffer of @count bytes that is
+ * the data source (@is_write) or destination.
+ *
+ * Returns @count on success and a negative value if the range is not
+ * handled or the backing page can not be obtained.
+ */
+static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count,
+			   loff_t pos, bool is_write)
+{
+	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+	struct device *dev = mdev_dev(mdev);
+	int ret = 0;
+
+	if (pos >= MBOCHS_MEMORY_BAR_OFFSET &&
+	    pos + count <=
+	    MBOCHS_MEMORY_BAR_OFFSET + mdev_state->memsize) {
+		struct page *pg;
+		loff_t poff;
+		char *map;
+
+		/*
+		 * Video memory is handled without ops_lock held:
+		 * mbochs_get_page() takes that (non-recursive) mutex
+		 * itself, so calling it with the lock held would deadlock.
+		 */
+
+		/* offset relative to the start of the memory BAR */
+		pos -= MBOCHS_MEMORY_BAR_OFFSET;
+		poff = pos & ~PAGE_MASK;
+		pg = mbochs_get_page(mdev_state, pos >> PAGE_SHIFT);
+		if (!pg)
+			return -ENOMEM;
+		map = kmap(pg);
+		if (is_write)
+			memcpy(map + poff, buf, count);
+		else
+			memcpy(buf, map + poff, count);
+		kunmap(pg);
+		put_page(pg);
+		return count;
+	}
+
+	mutex_lock(&mdev_state->ops_lock);
+
+	if (pos < MBOCHS_CONFIG_SPACE_SIZE) {
+		/*
+		 * NOTE(review): only the start offset is checked against
+		 * the config space size, @count is not -- confirm.
+		 */
+		if (is_write)
+			handle_pci_cfg_write(mdev_state, pos, buf, count);
+		else
+			memcpy(buf, (mdev_state->vconfig + pos), count);
+
+	} else if (pos >= MBOCHS_MMIO_BAR_OFFSET &&
+		   pos + count <= MBOCHS_MEMORY_BAR_OFFSET) {
+		pos -= MBOCHS_MMIO_BAR_OFFSET;
+		if (is_write)
+			handle_mmio_write(mdev_state, pos, buf, count);
+		else
+			handle_mmio_read(mdev_state, pos, buf, count);
+
+	} else {
+		dev_dbg(dev, "%s: %s @0x%llx (unhandled)\n",
+			__func__, is_write ? "WR" : "RD", pos);
+		ret = -1;
+		goto accessfailed;
+	}
+
+	ret = count;
+
+accessfailed:
+	mutex_unlock(&mdev_state->ops_lock);
+
+	return ret;
+}
+
+/*
+ * Reset the dispi registers: everything is cleared, then the interface
+ * id and the video memory size (in 64k units) are filled in again.
+ */
+static int mbochs_reset(struct mdev_device *mdev)
+{
+	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+	u16 *vbe = mdev_state->vbe;
+
+	memset(vbe, 0, sizeof(mdev_state->vbe));
+	vbe[VBE_DISPI_INDEX_ID] = VBE_DISPI_ID5;
+	vbe[VBE_DISPI_INDEX_VIDEO_MEMORY_64K] =
+		mdev_state->memsize / (64 * 1024);
+	return 0;
+}
+
+/*
+ * Create callback: allocate and initialise the per-device state.
+ *
+ * The type is looked up from @kobj (the first type is used if nothing
+ * matches).  Fails with -ENOMEM if the type's memory would exceed the
+ * max_mbytes budget or if an allocation fails.  Only the page pointer
+ * array is allocated here; the pages themselves are allocated on first
+ * use by __mbochs_get_page().
+ */
+static int mbochs_create(struct kobject *kobj, struct mdev_device *mdev)
+{
+ const struct mbochs_type *type = mbochs_find_type(kobj);
+ struct device *dev = mdev_dev(mdev);
+ struct mdev_state *mdev_state;
+
+ if (!type)
+ type = &mbochs_types[0];
+ if (type->mbytes + mbochs_used_mbytes > max_mbytes)
+ return -ENOMEM;
+
+ mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL);
+ if (mdev_state == NULL)
+ return -ENOMEM;
+
+ mdev_state->vconfig = kzalloc(MBOCHS_CONFIG_SPACE_SIZE, GFP_KERNEL);
+ if (mdev_state->vconfig == NULL)
+ goto err_mem;
+
+ mdev_state->memsize = type->mbytes * 1024 * 1024;
+ mdev_state->pagecount = mdev_state->memsize >> PAGE_SHIFT;
+ mdev_state->pages = kcalloc(mdev_state->pagecount,
+ sizeof(struct page *),
+ GFP_KERNEL);
+ if (!mdev_state->pages)
+ goto err_mem;
+
+ dev_info(dev, "%s: %s, %d MB, %ld pages\n", __func__,
+ kobj->name, type->mbytes, mdev_state->pagecount);
+
+ mutex_init(&mdev_state->ops_lock);
+ mdev_state->mdev = mdev;
+ mdev_set_drvdata(mdev, mdev_state);
+ INIT_LIST_HEAD(&mdev_state->dmabufs);
+ mdev_state->next_id = 1;
+
+ mdev_state->type = type;
+ mbochs_create_config_space(mdev_state);
+ mbochs_reset(mdev);
+
+ /* account the memory only once nothing can fail anymore */
+ mbochs_used_mbytes += type->mbytes;
+ return 0;
+
+err_mem:
+ /* vconfig may still be NULL here, kfree() copes with that */
+ kfree(mdev_state->vconfig);
+ kfree(mdev_state);
+ return -ENOMEM;
+}
+
+/*
+ * Remove callback: give the memory budget back and free the per-device
+ * state, the config space buffer and the page pointer array.
+ */
+static int mbochs_remove(struct mdev_device *mdev)
+{
+	struct mdev_state *state = mdev_get_drvdata(mdev);
+
+	mbochs_used_mbytes -= state->type->mbytes;
+	mdev_set_drvdata(mdev, NULL);
+
+	kfree(state->pages);
+	kfree(state->vconfig);
+	kfree(state);
+	return 0;
+}
+
+/*
+ * Read callback.  The request is split into naturally aligned 4, 2 or
+ * 1 byte accesses which are passed to mdev_access() one by one and then
+ * copied to userspace.  Returns the number of bytes read, or -EFAULT if
+ * any access or copy fails.
+ */
+static ssize_t mbochs_read(struct mdev_device *mdev, char __user *buf,
+			   size_t count, loff_t *ppos)
+{
+	unsigned int done = 0;
+
+	while (count) {
+		size_t filled;
+		u32 val;
+
+		/* largest access that is aligned and still fits */
+		if (count >= 4 && !(*ppos % 4))
+			filled = 4;
+		else if (count >= 2 && !(*ppos % 2))
+			filled = 2;
+		else
+			filled = 1;
+
+		if (mdev_access(mdev, (char *)&val, filled, *ppos, false) <= 0)
+			return -EFAULT;
+		if (copy_to_user(buf, &val, filled))
+			return -EFAULT;
+
+		count -= filled;
+		done += filled;
+		*ppos += filled;
+		buf += filled;
+	}
+
+	return done;
+}
+
+/*
+ * Write callback.  The request is split into naturally aligned 4, 2 or
+ * 1 byte accesses which are copied from userspace and passed to
+ * mdev_access() one by one.  Returns the number of bytes written, or
+ * -EFAULT if any copy or access fails.
+ */
+static ssize_t mbochs_write(struct mdev_device *mdev, const char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	unsigned int done = 0;
+
+	while (count) {
+		size_t filled;
+		/*
+		 * Always a zeroed 32 bit temporary: handle_pci_cfg_write()
+		 * reads the buffer as u32 for BAR offsets regardless of the
+		 * access size, a u8/u16 temporary would be over-read.
+		 */
+		u32 val = 0;
+
+		/* largest access that is aligned and still fits */
+		if (count >= 4 && !(*ppos % 4))
+			filled = 4;
+		else if (count >= 2 && !(*ppos % 2))
+			filled = 2;
+		else
+			filled = 1;
+
+		if (copy_from_user(&val, buf, filled))
+			goto write_err;
+
+		if (mdev_access(mdev, (char *)&val, filled, *ppos, true) <= 0)
+			goto write_err;
+
+		count -= filled;
+		done += filled;
+		*ppos += filled;
+		buf += filled;
+	}
+
+	return done;
+write_err:
+	return -EFAULT;
+}
+
+/*
+ * Return the video memory page at index @pgoff with an extra reference
+ * taken, allocating a zeroed page on first use.  Returns NULL if the
+ * allocation fails.  ops_lock is expected to be held (WARNs if not);
+ * @pgoff is not range checked here.
+ */
+static struct page *__mbochs_get_page(struct mdev_state *mdev_state,
+				      pgoff_t pgoff)
+{
+	struct page **slot = &mdev_state->pages[pgoff];
+
+	WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
+
+	if (*slot == NULL) {
+		*slot = alloc_pages(GFP_HIGHUSER | __GFP_ZERO, 0);
+		if (*slot == NULL)
+			return NULL;
+	}
+
+	get_page(*slot);
+	return *slot;
+}
+
+/*
+ * Range checked wrapper around __mbochs_get_page() that takes ops_lock
+ * itself.  Returns NULL (after a WARN) if @pgoff is out of range.
+ */
+static struct page *mbochs_get_page(struct mdev_state *mdev_state,
+				    pgoff_t pgoff)
+{
+	struct page *page = NULL;
+
+	if (!WARN_ON(pgoff >= mdev_state->pagecount)) {
+		mutex_lock(&mdev_state->ops_lock);
+		page = __mbochs_get_page(mdev_state, pgoff);
+		mutex_unlock(&mdev_state->ops_lock);
+	}
+
+	return page;
+}
+
+/*
+ * Drop the reference on every allocated video memory page and clear the
+ * page pointer array.  ops_lock is expected to be held (WARNs if not).
+ */
+static void mbochs_put_pages(struct mdev_state *mdev_state)
+{
+	struct device *dev = mdev_dev(mdev_state->mdev);
+	int released = 0;
+	int i;
+
+	WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
+
+	for (i = 0; i < mdev_state->pagecount; i++) {
+		struct page *page = mdev_state->pages[i];
+
+		if (page) {
+			put_page(page);
+			mdev_state->pages[i] = NULL;
+			released++;
+		}
+	}
+	dev_dbg(dev, "%s: %d pages released\n", __func__, released);
+}
+
+/*
+ * Fault handler for mappings of the video memory BAR: hand out the page
+ * matching the faulting address, SIGBUS if it is out of range or can
+ * not be obtained.
+ */
+static int mbochs_region_vm_fault(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct mdev_state *mdev_state = vma->vm_private_data;
+	pgoff_t page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
+	struct page *page;
+
+	if (page_offset >= mdev_state->pagecount)
+		return VM_FAULT_SIGBUS;
+
+	page = mbochs_get_page(mdev_state, page_offset);
+	vmf->page = page;
+
+	return page ? 0 : VM_FAULT_SIGBUS;
+}
+
+/* vm operations installed by mbochs_mmap() for the video memory BAR */
+static const struct vm_operations_struct mbochs_region_vm_ops = {
+ .fault = mbochs_region_vm_fault,
+};
+
+/*
+ * mmap callback.  Only shared mappings that start at the beginning of
+ * the video memory BAR and are not larger than the video memory are
+ * accepted; pages are provided on demand by the fault handler.
+ */
+static int mbochs_mmap(struct mdev_device *mdev, struct vm_area_struct *vma)
+{
+	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+
+	if (vma->vm_pgoff != MBOCHS_MEMORY_BAR_OFFSET >> PAGE_SHIFT ||
+	    vma->vm_end < vma->vm_start ||
+	    vma->vm_end - vma->vm_start > mdev_state->memsize ||
+	    (vma->vm_flags & VM_SHARED) == 0)
+		return -EINVAL;
+
+	vma->vm_ops = &mbochs_region_vm_ops;
+	vma->vm_private_data = mdev_state;
+	return 0;
+}
+
+/*
+ * Fault handler for mappings of an exported dma-buf: hand out the page
+ * at the faulting page offset with an extra reference, SIGBUS (after a
+ * WARN) if the offset is beyond the dmabuf.
+ */
+static int mbochs_dmabuf_vm_fault(struct vm_fault *vmf)
+{
+	struct mbochs_dmabuf *dmabuf = vmf->vma->vm_private_data;
+	struct page *page;
+
+	if (WARN_ON(vmf->pgoff >= dmabuf->pagecount))
+		return VM_FAULT_SIGBUS;
+
+	page = dmabuf->pages[vmf->pgoff];
+	vmf->page = page;
+	get_page(page);
+	return 0;
+}
+
+/* vm operations installed by mbochs_mmap_dmabuf() for exported dma-bufs */
+static const struct vm_operations_struct mbochs_dmabuf_vm_ops = {
+ .fault = mbochs_dmabuf_vm_fault,
+};
+
+/*
+ * dma-buf mmap callback.  Only shared mappings are accepted; pages are
+ * provided on demand by mbochs_dmabuf_vm_fault().
+ */
+static int mbochs_mmap_dmabuf(struct dma_buf *buf, struct vm_area_struct *vma)
+{
+	struct mbochs_dmabuf *dmabuf = buf->priv;
+	struct device *dev = mdev_dev(dmabuf->mdev_state->mdev);
+
+	dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id);
+
+	if (!(vma->vm_flags & VM_SHARED))
+		return -EINVAL;
+
+	vma->vm_private_data = dmabuf;
+	vma->vm_ops = &mbochs_dmabuf_vm_ops;
+	return 0;
+}
+
+/*
+ * Debug helper: log id, fourcc, geometry, offset, size and page count of
+ * @dmabuf, prefixed with @prefix.  A zero fourcc is printed as "----".
+ */
+static void mbochs_print_dmabuf(struct mbochs_dmabuf *dmabuf,
+ const char *prefix)
+{
+ struct device *dev = mdev_dev(dmabuf->mdev_state->mdev);
+ u32 fourcc = dmabuf->mode.drm_format;
+
+ dev_dbg(dev, "%s/%d: %c%c%c%c, %dx%d, stride %d, off 0x%llx, size 0x%llx, pages %ld\n",
+ prefix, dmabuf->id,
+ fourcc ? ((fourcc >> 0) & 0xff) : '-',
+ fourcc ? ((fourcc >> 8) & 0xff) : '-',
+ fourcc ? ((fourcc >> 16) & 0xff) : '-',
+ fourcc ? ((fourcc >> 24) & 0xff) : '-',
+ dmabuf->mode.width, dmabuf->mode.height, dmabuf->mode.stride,
+ dmabuf->mode.offset, dmabuf->mode.size, dmabuf->pagecount);
+}
+
+/*
+ * dma-buf map callback: build a scatter table from the dmabuf pages and
+ * map it for the attached device.  Returns the table, or
+ * ERR_PTR(-ENOMEM) if any step fails.
+ */
+static struct sg_table *mbochs_map_dmabuf(struct dma_buf_attachment *at,
+					  enum dma_data_direction direction)
+{
+	struct mbochs_dmabuf *dmabuf = at->dmabuf->priv;
+	struct device *dev = mdev_dev(dmabuf->mdev_state->mdev);
+	struct sg_table *table;
+
+	dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id);
+
+	table = kzalloc(sizeof(*table), GFP_KERNEL);
+	if (!table)
+		return ERR_PTR(-ENOMEM);
+
+	if (sg_alloc_table_from_pages(table, dmabuf->pages, dmabuf->pagecount,
+				      0, dmabuf->mode.size, GFP_KERNEL) < 0) {
+		kfree(table);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	if (!dma_map_sg(at->dev, table->sgl, table->nents, direction)) {
+		sg_free_table(table);
+		kfree(table);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	return table;
+}
+
+/*
+ * dma-buf unmap callback: undo mbochs_map_dmabuf().  The scatterlist is
+ * unmapped from the attached device before the table is freed; without
+ * that the DMA mapping set up by dma_map_sg() would never be released.
+ */
+static void mbochs_unmap_dmabuf(struct dma_buf_attachment *at,
+				struct sg_table *sg,
+				enum dma_data_direction direction)
+{
+	struct mbochs_dmabuf *dmabuf = at->dmabuf->priv;
+	struct device *dev = mdev_dev(dmabuf->mdev_state->mdev);
+
+	dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id);
+
+	/* same sgl/nents/direction as passed to dma_map_sg() */
+	dma_unmap_sg(at->dev, sg->sgl, sg->nents, direction);
+	sg_free_table(sg);
+	kfree(sg);
+}
+
+/*
+ * dma-buf release callback: drop the page references held by the dmabuf
+ * and mark it as no longer exported.  If the dmabuf was already
+ * unlinked, nobody else refers to it anymore, so it is freed here
+ * together with its page pointer array (which used to be leaked).
+ */
+static void mbochs_release_dmabuf(struct dma_buf *buf)
+{
+	struct mbochs_dmabuf *dmabuf = buf->priv;
+	struct mdev_state *mdev_state = dmabuf->mdev_state;
+	struct device *dev = mdev_dev(mdev_state->mdev);
+	pgoff_t pg;
+
+	dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id);
+
+	for (pg = 0; pg < dmabuf->pagecount; pg++)
+		put_page(dmabuf->pages[pg]);
+
+	mutex_lock(&mdev_state->ops_lock);
+	dmabuf->buf = NULL;
+	if (dmabuf->unlinked) {
+		kfree(dmabuf->pages);
+		kfree(dmabuf);
+	}
+	mutex_unlock(&mdev_state->ops_lock);
+}
+
+/* dma-buf map_atomic callback: kmap_atomic() page @page_num of the dmabuf. */
+static void *mbochs_kmap_atomic_dmabuf(struct dma_buf *buf,
+				       unsigned long page_num)
+{
+	struct mbochs_dmabuf *dmabuf = buf->priv;
+
+	return kmap_atomic(dmabuf->pages[page_num]);
+}
+
+/* dma-buf map callback: kmap() page @page_num of the dmabuf. */
+static void *mbochs_kmap_dmabuf(struct dma_buf *buf, unsigned long page_num)
+{
+	struct mbochs_dmabuf *dmabuf = buf->priv;
+
+	return kmap(dmabuf->pages[page_num]);
+}
+
+/* dma-buf callbacks, handed to dma_buf_export() by mbochs_dmabuf_export() */
+static struct dma_buf_ops mbochs_dmabuf_ops = {
+ .map_dma_buf = mbochs_map_dmabuf,
+ .unmap_dma_buf = mbochs_unmap_dmabuf,
+ .release = mbochs_release_dmabuf,
+ .map_atomic = mbochs_kmap_atomic_dmabuf,
+ .map = mbochs_kmap_dmabuf,
+ .mmap = mbochs_mmap_dmabuf,
+};
+
+/*
+ * Allocate a dmabuf tracking structure for @mode, take a reference on
+ * every video memory page backing that framebuffer and add the dmabuf to
+ * mdev_state->dmabufs.  Returns NULL if an allocation fails.  ops_lock
+ * is expected to be held (WARNs if not).
+ */
+static struct mbochs_dmabuf *mbochs_dmabuf_alloc(struct mdev_state *mdev_state,
+ struct mbochs_mode *mode)
+{
+ struct mbochs_dmabuf *dmabuf;
+ pgoff_t page_offset, pg;
+
+ WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
+
+ dmabuf = kzalloc(sizeof(struct mbochs_dmabuf), GFP_KERNEL);
+ if (!dmabuf)
+ return NULL;
+
+ dmabuf->mode = *mode;
+ dmabuf->id = mdev_state->next_id++;
+ dmabuf->pagecount = DIV_ROUND_UP(mode->size, PAGE_SIZE);
+ dmabuf->pages = kcalloc(dmabuf->pagecount, sizeof(struct page *),
+ GFP_KERNEL);
+ if (!dmabuf->pages)
+ goto err_free_dmabuf;
+
+ /* index of the first video memory page of the framebuffer */
+ page_offset = dmabuf->mode.offset >> PAGE_SHIFT;
+ for (pg = 0; pg < dmabuf->pagecount; pg++) {
+ dmabuf->pages[pg] = __mbochs_get_page(mdev_state,
+ page_offset + pg);
+ if (!dmabuf->pages[pg])
+ goto err_free_pages;
+ }
+
+ dmabuf->mdev_state = mdev_state;
+ list_add(&dmabuf->next, &mdev_state->dmabufs);
+
+ mbochs_print_dmabuf(dmabuf, __func__);
+ return dmabuf;
+
+err_free_pages:
+ /* drop only the references taken so far */
+ while (pg > 0)
+ put_page(dmabuf->pages[--pg]);
+ kfree(dmabuf->pages);
+err_free_dmabuf:
+ kfree(dmabuf);
+ return NULL;
+}
+
+/*
+ * Return the first entry on mdev_state->dmabufs whose mode compares equal
+ * to @mode (per mbochs_modes_equal()), or NULL if there is none.
+ * Must be called with ops_lock held.
+ */
+static struct mbochs_dmabuf *
+mbochs_dmabuf_find_by_mode(struct mdev_state *mdev_state,
+			   struct mbochs_mode *mode)
+{
+	struct mbochs_dmabuf *cur;
+
+	WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
+
+	list_for_each_entry(cur, &mdev_state->dmabufs, next) {
+		if (!mbochs_modes_equal(&cur->mode, mode))
+			continue;
+		return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Return the entry on mdev_state->dmabufs whose id equals @id, or NULL if
+ * there is none.  Must be called with ops_lock held.
+ */
+static struct mbochs_dmabuf *
+mbochs_dmabuf_find_by_id(struct mdev_state *mdev_state, u32 id)
+{
+	struct mbochs_dmabuf *cur;
+
+	WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
+
+	list_for_each_entry(cur, &mdev_state->dmabufs, next) {
+		if (cur->id != id)
+			continue;
+		return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Export @dmabuf as a dma_buf and remember it in dmabuf->buf.
+ * Must be called with ops_lock held.
+ *
+ * Returns 0 on success, -EINVAL if the framebuffer offset is not
+ * page-aligned, or the error reported by dma_buf_export().
+ */
+static int mbochs_dmabuf_export(struct mbochs_dmabuf *dmabuf)
+{
+	struct mdev_state *mdev_state = dmabuf->mdev_state;
+	struct device *dev = mdev_dev(mdev_state->mdev);
+	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
+	struct dma_buf *exported;
+
+	WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
+
+	if (!IS_ALIGNED(dmabuf->mode.offset, PAGE_SIZE)) {
+		dev_info_ratelimited(dev, "%s: framebuffer not page-aligned\n",
+				     __func__);
+		return -EINVAL;
+	}
+
+	exp_info.priv = dmabuf;
+	exp_info.size = dmabuf->mode.size;
+	exp_info.ops = &mbochs_dmabuf_ops;
+
+	exported = dma_buf_export(&exp_info);
+	if (!IS_ERR(exported)) {
+		dmabuf->buf = exported;
+		dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id);
+		return 0;
+	}
+
+	dev_info_ratelimited(dev, "%s: dma_buf_export failed: %ld\n",
+			     __func__, PTR_ERR(exported));
+	return PTR_ERR(exported);
+}
+
+/*
+ * Fill in offset, size and flags of the region selected by
+ * region_info->index.  The config space and BAR2 (MMIO) are read/write,
+ * BAR0 (video memory) is additionally mmap-able; any other valid index
+ * is reported as an empty region.  @cap_type_id and @cap_type are not
+ * used.
+ *
+ * Returns 0 on success, -EINVAL if the mdev has no driver data or the
+ * index is out of range.
+ */
+static int mbochs_get_region_info(struct mdev_device *mdev,
+				  struct vfio_region_info *region_info,
+				  u16 *cap_type_id, void **cap_type)
+{
+	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+	u64 offset = 0, size = 0;
+	u32 flags = 0;
+
+	if (!mdev_state)
+		return -EINVAL;
+	if (region_info->index >= VFIO_PCI_NUM_REGIONS)
+		return -EINVAL;
+
+	switch (region_info->index) {
+	case VFIO_PCI_CONFIG_REGION_INDEX:
+		size = MBOCHS_CONFIG_SPACE_SIZE;
+		flags = VFIO_REGION_INFO_FLAG_READ |
+			VFIO_REGION_INFO_FLAG_WRITE;
+		break;
+	case VFIO_PCI_BAR0_REGION_INDEX:
+		offset = MBOCHS_MEMORY_BAR_OFFSET;
+		size = mdev_state->memsize;
+		flags = VFIO_REGION_INFO_FLAG_READ |
+			VFIO_REGION_INFO_FLAG_WRITE |
+			VFIO_REGION_INFO_FLAG_MMAP;
+		break;
+	case VFIO_PCI_BAR2_REGION_INDEX:
+		offset = MBOCHS_MMIO_BAR_OFFSET;
+		size = MBOCHS_MMIO_BAR_SIZE;
+		flags = VFIO_REGION_INFO_FLAG_READ |
+			VFIO_REGION_INFO_FLAG_WRITE;
+		break;
+	default:
+		/* unimplemented region: leave everything zero */
+		break;
+	}
+
+	region_info->offset = offset;
+	region_info->size = size;
+	region_info->flags = flags;
+	return 0;
+}
+
+/* Report that the queried interrupt index has no interrupts; always returns 0. */
+static int mbochs_get_irq_info(struct mdev_device *mdev,
+			       struct vfio_irq_info *irq_info)
+{
+	irq_info->count = 0;
+
+	return 0;
+}
+
+/*
+ * Describe the device as a vfio-pci device with the standard number of
+ * regions and interrupt indexes; always returns 0.
+ */
+static int mbochs_get_device_info(struct mdev_device *mdev,
+				  struct vfio_device_info *dev_info)
+{
+	dev_info->num_irqs = VFIO_PCI_NUM_IRQS;
+	dev_info->num_regions = VFIO_PCI_NUM_REGIONS;
+	dev_info->flags = VFIO_DEVICE_FLAGS_PCI;
+
+	return 0;
+}
+
+/*
+ * Handle VFIO_DEVICE_QUERY_GFX_PLANE for dma-buf backed planes.
+ *
+ * A probe request succeeds only for exactly PROBE | DMABUF.  For a real
+ * query the current framebuffer mode is checked with
+ * mbochs_check_framebuffer() (primary plane only); if that fails, or a
+ * different plane type was requested, the plane is reported with all
+ * geometry fields zeroed and 0 is still returned.  Otherwise a dmabuf
+ * matching the mode is looked up - or allocated if none exists yet - and
+ * its geometry and id are copied into @plane.  For the primary plane the
+ * reported id also becomes mdev_state->active_id.
+ *
+ * Returns 0 on success, -EINVAL for unsupported flags, -ENOMEM if a new
+ * dmabuf could not be allocated.
+ */
+static int mbochs_query_gfx_plane(struct mdev_device *mdev,
+				  struct vfio_device_gfx_plane_info *plane)
+{
+	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+	struct device *dev = mdev_dev(mdev);
+	struct mbochs_dmabuf *dmabuf;
+	struct mbochs_mode mode;
+	int ret;
+
+	if (plane->flags & VFIO_GFX_PLANE_TYPE_PROBE) {
+		if (plane->flags == (VFIO_GFX_PLANE_TYPE_PROBE |
+				     VFIO_GFX_PLANE_TYPE_DMABUF))
+			return 0;
+		return -EINVAL;
+	}
+
+	if (plane->flags != VFIO_GFX_PLANE_TYPE_DMABUF)
+		return -EINVAL;
+
+	plane->drm_format_mod = 0;
+	plane->x_pos = 0;
+	plane->y_pos = 0;
+	plane->x_hot = 0;
+	plane->y_hot = 0;
+
+	mutex_lock(&mdev_state->ops_lock);
+
+	ret = -EINVAL;
+	if (plane->drm_plane_type == DRM_PLANE_TYPE_PRIMARY)
+		ret = mbochs_check_framebuffer(mdev_state, &mode);
+	if (ret < 0) {
+		/* no usable framebuffer: report an empty plane */
+		plane->drm_format = 0;
+		plane->width = 0;
+		plane->height = 0;
+		plane->stride = 0;
+		plane->size = 0;
+		plane->dmabuf_id = 0;
+		goto done;
+	}
+
+	dmabuf = mbochs_dmabuf_find_by_mode(mdev_state, &mode);
+	if (!dmabuf) {
+		/*
+		 * The return value must be kept: without the assignment the
+		 * freshly created buffer is never used and the check below
+		 * fails with -ENOMEM on the first query for every new mode.
+		 */
+		dmabuf = mbochs_dmabuf_alloc(mdev_state, &mode);
+	}
+	if (!dmabuf) {
+		mutex_unlock(&mdev_state->ops_lock);
+		return -ENOMEM;
+	}
+
+	plane->drm_format = dmabuf->mode.drm_format;
+	plane->width = dmabuf->mode.width;
+	plane->height = dmabuf->mode.height;
+	plane->stride = dmabuf->mode.stride;
+	plane->size = dmabuf->mode.size;
+	plane->dmabuf_id = dmabuf->id;
+
+done:
+	if (plane->drm_plane_type == DRM_PLANE_TYPE_PRIMARY &&
+	    mdev_state->active_id != plane->dmabuf_id) {
+		dev_dbg(dev, "%s: primary: %d => %d\n", __func__,
+			mdev_state->active_id, plane->dmabuf_id);
+		mdev_state->active_id = plane->dmabuf_id;
+	}
+	mutex_unlock(&mdev_state->ops_lock);
+	return 0;
+}
+
+/*
+ * Handle VFIO_DEVICE_GET_GFX_DMABUF: look up the dmabuf with @id, export
+ * it on first use and return a new file descriptor for it.
+ *
+ * Returns the value of dma_buf_fd() on success, -ENOENT if no dmabuf has
+ * that id, or -EINVAL if the buffer is not exported (the specific export
+ * error is not propagated).
+ *
+ * NOTE(review): dmabuf->buf is read again after ops_lock is dropped -
+ * confirm this cannot race with mbochs_release_dmabuf().
+ */
+static int mbochs_get_gfx_dmabuf(struct mdev_device *mdev,
+				 u32 id)
+{
+	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+	struct mbochs_dmabuf *dmabuf;
+
+	mutex_lock(&mdev_state->ops_lock);
+	dmabuf = mbochs_dmabuf_find_by_id(mdev_state, id);
+	if (dmabuf && !dmabuf->buf)
+		mbochs_dmabuf_export(dmabuf);
+	mutex_unlock(&mdev_state->ops_lock);
+
+	if (!dmabuf)
+		return -ENOENT;
+	if (!dmabuf->buf)
+		return -EINVAL;
+
+	return dma_buf_fd(dmabuf->buf, 0);
+}
+
+/*
+ * ioctl entry point of the mediated device.
+ *
+ * The GET_*_INFO and QUERY_GFX_PLANE commands copy the fixed-size head of
+ * the user structure in, reject it with -EINVAL when argsz is smaller
+ * than that head, let the matching helper fill it in and copy the same
+ * number of bytes back.  Failed user copies return -EFAULT, unknown
+ * commands return -ENOTTY.
+ *
+ * NOTE(review): GET_IRQ_INFO validates the index against the cached
+ * dev_info.num_irqs, which is only written by a GET_INFO call.
+ */
+static long mbochs_ioctl(struct mdev_device *mdev, unsigned int cmd,
+			 unsigned long arg)
+{
+	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+	void __user *uarg = (void __user *)arg;
+	unsigned long minsz;
+	int ret;
+
+	switch (cmd) {
+	case VFIO_DEVICE_GET_INFO:
+	{
+		struct vfio_device_info info;
+
+		minsz = offsetofend(struct vfio_device_info, num_irqs);
+		if (copy_from_user(&info, uarg, minsz))
+			return -EFAULT;
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		ret = mbochs_get_device_info(mdev, &info);
+		if (ret)
+			return ret;
+
+		/* cache the result for later index validation */
+		memcpy(&mdev_state->dev_info, &info, sizeof(info));
+
+		return copy_to_user(uarg, &info, minsz) ? -EFAULT : 0;
+	}
+
+	case VFIO_DEVICE_GET_REGION_INFO:
+	{
+		struct vfio_region_info info;
+		u16 cap_type_id = 0;
+		void *cap_type = NULL;
+
+		minsz = offsetofend(struct vfio_region_info, offset);
+		if (copy_from_user(&info, uarg, minsz))
+			return -EFAULT;
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		ret = mbochs_get_region_info(mdev, &info, &cap_type_id,
+					     &cap_type);
+		if (ret)
+			return ret;
+
+		return copy_to_user(uarg, &info, minsz) ? -EFAULT : 0;
+	}
+
+	case VFIO_DEVICE_GET_IRQ_INFO:
+	{
+		struct vfio_irq_info info;
+
+		minsz = offsetofend(struct vfio_irq_info, count);
+		if (copy_from_user(&info, uarg, minsz))
+			return -EFAULT;
+		if (info.argsz < minsz)
+			return -EINVAL;
+		if (info.index >= mdev_state->dev_info.num_irqs)
+			return -EINVAL;
+
+		ret = mbochs_get_irq_info(mdev, &info);
+		if (ret)
+			return ret;
+
+		return copy_to_user(uarg, &info, minsz) ? -EFAULT : 0;
+	}
+
+	case VFIO_DEVICE_QUERY_GFX_PLANE:
+	{
+		struct vfio_device_gfx_plane_info plane;
+
+		minsz = offsetofend(struct vfio_device_gfx_plane_info,
+				    region_index);
+		if (copy_from_user(&plane, uarg, minsz))
+			return -EFAULT;
+		if (plane.argsz < minsz)
+			return -EINVAL;
+
+		ret = mbochs_query_gfx_plane(mdev, &plane);
+		if (ret)
+			return ret;
+
+		return copy_to_user(uarg, &plane, minsz) ? -EFAULT : 0;
+	}
+
+	case VFIO_DEVICE_GET_GFX_DMABUF:
+	{
+		u32 dmabuf_id;
+
+		if (get_user(dmabuf_id, (__u32 __user *)arg))
+			return -EFAULT;
+
+		return mbochs_get_gfx_dmabuf(mdev, dmabuf_id);
+	}
+
+	case VFIO_DEVICE_SET_IRQS:
+		/* the device has no interrupts to configure */
+		return -EINVAL;
+
+	case VFIO_DEVICE_RESET:
+		return mbochs_reset(mdev);
+	}
+
+	return -ENOTTY;
+}
+
+/*
+ * mdev open callback: take a reference on this module for as long as the
+ * device is open.  Returns 0, or -ENODEV if the reference cannot be taken.
+ */
+static int mbochs_open(struct mdev_device *mdev)
+{
+	return try_module_get(THIS_MODULE) ? 0 : -ENODEV;
+}
+
+/*
+ * mdev release callback: empty the dmabuf list, put the device pages and
+ * drop the module reference taken in mbochs_open().
+ *
+ * Entries that are still exported cannot be freed yet; they are marked
+ * ->unlinked so that mbochs_release_dmabuf() frees them later.
+ */
+static void mbochs_close(struct mdev_device *mdev)
+{
+	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+	struct mbochs_dmabuf *cur, *n;
+
+	mutex_lock(&mdev_state->ops_lock);
+
+	list_for_each_entry_safe(cur, n, &mdev_state->dmabufs, next) {
+		list_del(&cur->next);
+		if (!cur->buf) {
+			kfree(cur);
+			continue;
+		}
+		/* still exported: free in mbochs_release_dmabuf() */
+		cur->unlinked = true;
+	}
+	mbochs_put_pages(mdev_state);
+
+	mutex_unlock(&mdev_state->ops_lock);
+	module_put(THIS_MODULE);
+}
+
+/* sysfs "memory" attribute: prints the mbytes value of the device's type. */
+static ssize_t
+memory_show(struct device *dev, struct device_attribute *attr,
+	    char *buf)
+{
+	struct mdev_state *mdev_state = mdev_get_drvdata(mdev_from_dev(dev));
+
+	return sprintf(buf, "%d MB\n", mdev_state->type->mbytes);
+}
+static DEVICE_ATTR_RO(memory);
+
+/* Per-device sysfs attributes, exposed in the "vendor" group. */
+static struct attribute *mdev_dev_attrs[] = {
+	&dev_attr_memory.attr,
+	NULL,
+};
+
+static const struct attribute_group mdev_dev_group = {
+	.name  = "vendor",
+	.attrs = mdev_dev_attrs,
+};
+
+/*
+ * Only referenced through mdev_fops in this file, so keep it static: a
+ * global symbol with this generic name can clash with identically named
+ * tables in other drivers.
+ */
+static const struct attribute_group *mdev_dev_groups[] = {
+	&mdev_dev_group,
+	NULL,
+};
+
+/* mdev type "name" attribute: prints the name of the type's kobject. */
+static ssize_t
+name_show(struct kobject *kobj, struct device *dev, char *buf)
+{
+	const char *type_name = kobj->name;
+
+	return sprintf(buf, "%s\n", type_name);
+}
+MDEV_TYPE_ATTR_RO(name);
+
+/*
+ * mdev type "description" attribute: prints the amount of video memory of
+ * the type, or 0 MB when the type cannot be found.
+ */
+static ssize_t
+description_show(struct kobject *kobj, struct device *dev, char *buf)
+{
+	const struct mbochs_type *type = mbochs_find_type(kobj);
+	int mbytes = 0;
+
+	if (type)
+		mbytes = type->mbytes;
+
+	return sprintf(buf, "virtual display, %d MB video memory\n", mbytes);
+}
+MDEV_TYPE_ATTR_RO(description);
+
+/*
+ * mdev type "available_instances" attribute: prints how many more devices
+ * of this type fit into the remaining memory budget
+ * (max_mbytes - mbochs_used_mbytes).
+ *
+ * An unknown type is reported as 0 instances instead of dereferencing the
+ * NULL result of mbochs_find_type(), matching description_show().
+ */
+static ssize_t
+available_instances_show(struct kobject *kobj, struct device *dev, char *buf)
+{
+	const struct mbochs_type *type = mbochs_find_type(kobj);
+	int count = 0;
+
+	if (type)
+		count = (max_mbytes - mbochs_used_mbytes) / type->mbytes;
+
+	return sprintf(buf, "%d\n", count);
+}
+MDEV_TYPE_ATTR_RO(available_instances);
+
+/* mdev type "device_api" attribute: this driver always reports the vfio-pci API string. */
+static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
+			       char *buf)
+{
+	static const char api[] = VFIO_DEVICE_API_PCI_STRING;
+
+	return sprintf(buf, "%s\n", api);
+}
+MDEV_TYPE_ATTR_RO(device_api);
+
+static struct attribute *mdev_types_attrs[] = { /* attribute set shared by all three type groups below */
+ &mdev_type_attr_name.attr,
+ &mdev_type_attr_description.attr,
+ &mdev_type_attr_device_api.attr,
+ &mdev_type_attr_available_instances.attr,
+ NULL, /* terminator */
+};
+
+static struct attribute_group mdev_type_group1 = { /* one group per MBOCHS_TYPE_n, all with the same attributes */
+ .name = MBOCHS_TYPE_1,
+ .attrs = mdev_types_attrs,
+};
+
+static struct attribute_group mdev_type_group2 = {
+ .name = MBOCHS_TYPE_2,
+ .attrs = mdev_types_attrs,
+};
+
+static struct attribute_group mdev_type_group3 = {
+ .name = MBOCHS_TYPE_3,
+ .attrs = mdev_types_attrs,
+};
+
+static struct attribute_group *mdev_type_groups[] = { /* handed to the mdev core via mdev_fops.supported_type_groups */
+ &mdev_type_group1,
+ &mdev_type_group2,
+ &mdev_type_group3,
+ NULL, /* terminator */
+};
+
+/* Parent-device operations registered with the mdev core in mbochs_dev_init(). */
+static const struct mdev_parent_ops mdev_fops = {
+	.owner			= THIS_MODULE,
+
+	/* sysfs */
+	.supported_type_groups	= mdev_type_groups,
+	.mdev_attr_groups	= mdev_dev_groups,
+
+	/* device lifetime */
+	.create			= mbochs_create,
+	.remove			= mbochs_remove,
+	.open			= mbochs_open,
+	.release		= mbochs_close,
+
+	/* device access */
+	.read			= mbochs_read,
+	.write			= mbochs_write,
+	.mmap			= mbochs_mmap,
+	.ioctl			= mbochs_ioctl,
+};
+
+static const struct file_operations vd_fops = { /* passed to cdev_init() in mbochs_dev_init(); only .owner is set */
+ .owner = THIS_MODULE,
+};
+
+/*
+ * Release callback installed on mbochs_dev in mbochs_dev_init().
+ * Intentionally empty.
+ */
+static void mbochs_device_release(struct device *dev)
+{
+}
+
+/*
+ * Module init: reserve a char device region, add the cdev, create the
+ * class, register the parent device and finally register it with the
+ * mdev core.  Every step is undone in reverse order if a later one fails.
+ *
+ * Returns 0 on success or the negative errno of the failing step.
+ */
+static int __init mbochs_dev_init(void)
+{
+	int ret;
+
+	ret = alloc_chrdev_region(&mbochs_devt, 0, MINORMASK, MBOCHS_NAME);
+	if (ret < 0) {
+		pr_err("Error: failed to register mbochs_dev, err: %d\n", ret);
+		return ret;
+	}
+
+	cdev_init(&mbochs_cdev, &vd_fops);
+	/* cdev_add() can fail; do not carry on with a half-registered cdev */
+	ret = cdev_add(&mbochs_cdev, mbochs_devt, MINORMASK);
+	if (ret < 0) {
+		pr_err("Error: failed to add mbochs_dev cdev, err: %d\n", ret);
+		goto failed0;
+	}
+	pr_info("%s: major %d\n", __func__, MAJOR(mbochs_devt));
+
+	mbochs_class = class_create(THIS_MODULE, MBOCHS_CLASS_NAME);
+	if (IS_ERR(mbochs_class)) {
+		pr_err("Error: failed to register mbochs_dev class\n");
+		ret = PTR_ERR(mbochs_class);
+		goto failed1;
+	}
+	mbochs_dev.class = mbochs_class;
+	mbochs_dev.release = mbochs_device_release;
+	dev_set_name(&mbochs_dev, "%s", MBOCHS_NAME);
+
+	ret = device_register(&mbochs_dev);
+	if (ret) {
+		/*
+		 * device_register() initialises the reference count even
+		 * when it fails; the reference has to be dropped with
+		 * put_device() so the device name is released.
+		 */
+		put_device(&mbochs_dev);
+		goto failed2;
+	}
+
+	ret = mdev_register_device(&mbochs_dev, &mdev_fops);
+	if (ret)
+		goto failed3;
+
+	return 0;
+
+failed3:
+	device_unregister(&mbochs_dev);
+failed2:
+	class_destroy(mbochs_class);
+failed1:
+	cdev_del(&mbochs_cdev);
+failed0:
+	unregister_chrdev_region(mbochs_devt, MINORMASK);
+	return ret;
+}
+
+/*
+ * Module exit: unregister from the mdev core first, then tear down the
+ * parent device, the cdev, the char device region and the class.
+ */
+static void __exit mbochs_dev_exit(void)
+{
+	/* mdev core */
+	mbochs_dev.bus = NULL;
+	mdev_unregister_device(&mbochs_dev);
+
+	/* parent device and char device */
+	device_unregister(&mbochs_dev);
+	cdev_del(&mbochs_cdev);
+	unregister_chrdev_region(mbochs_devt, MINORMASK);
+
+	/* class */
+	class_destroy(mbochs_class);
+	mbochs_class = NULL;
+}
+
+module_init(mbochs_dev_init)
+module_exit(mbochs_dev_exit)
diff --git a/samples/vfio-mdev/mdpy-defs.h b/samples/vfio-mdev/mdpy-defs.h
new file mode 100644
index 000000000000..96b3b1b49d34
--- /dev/null
+++ b/samples/vfio-mdev/mdpy-defs.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Simple pci display device.
+ *
+ * Framebuffer memory is pci bar 0.
+ * Configuration (read-only) is in pci config space.
+ * Format field uses drm fourcc codes.
+ * ATM only DRM_FORMAT_XRGB8888 is supported.
+ */
+
+/* pci ids: written into the virtual config space by mdpy, matched by mdpy-fb */
+#define MDPY_PCI_VENDOR_ID 0x1b36 /* redhat */
+#define MDPY_PCI_DEVICE_ID 0x000f
+#define MDPY_PCI_SUBVENDOR_ID PCI_SUBVENDOR_ID_REDHAT_QUMRANET
+#define MDPY_PCI_SUBDEVICE_ID PCI_SUBDEVICE_ID_QEMU
+
+/* pci cfg space offsets for fb config (dword) */
+#define MDPY_VENDORCAP_OFFSET 0x40 /* start of the vendor capability; also the capability list head */
+#define MDPY_VENDORCAP_SIZE 0x10 /* stored in byte 2 of the capability */
+#define MDPY_FORMAT_OFFSET (MDPY_VENDORCAP_OFFSET + 0x04) /* drm fourcc code */
+#define MDPY_WIDTH_OFFSET (MDPY_VENDORCAP_OFFSET + 0x08) /* framebuffer width */
+#define MDPY_HEIGHT_OFFSET (MDPY_VENDORCAP_OFFSET + 0x0c) /* framebuffer height */
diff --git a/samples/vfio-mdev/mdpy-fb.c b/samples/vfio-mdev/mdpy-fb.c
new file mode 100644
index 000000000000..2719bb259653
--- /dev/null
+++ b/samples/vfio-mdev/mdpy-fb.c
@@ -0,0 +1,232 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Framebuffer driver for mdpy (mediated virtual pci display device).
+ *
+ * See mdpy-defs.h for device specs
+ *
+ * (c) Gerd Hoffmann <kraxel@redhat.com>
+ *
+ * Using some code snippets from simplefb and cirrusfb.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#include <linux/errno.h>
+#include <linux/fb.h>
+#include <linux/io.h>
+#include <linux/pci.h>
+#include <linux/module.h>
+#include <drm/drm_fourcc.h>
+#include "mdpy-defs.h"
+
+/* Template for info->fix; mdpy_fb_probe() adds smem_start, smem_len and line_length. */
+static const struct fb_fix_screeninfo mdpy_fb_fix = {
+	.id	= "mdpy-fb",
+	.accel	= FB_ACCEL_NONE,
+	.visual	= FB_VISUAL_TRUECOLOR,
+	.type	= FB_TYPE_PACKED_PIXELS,
+};
+
+/*
+ * Template for info->var; mdpy_fb_probe() adds the resolution.
+ * Pixels are 32 bit wide with 8 bits per channel.
+ */
+static const struct fb_var_screeninfo mdpy_fb_var = {
+	.width		= -1,
+	.height		= -1,
+	.vmode		= FB_VMODE_NONINTERLACED,
+	.activate	= FB_ACTIVATE_NOW,
+
+	.bits_per_pixel	= 32,
+
+	/* channel layout, most significant byte first */
+	.transp.length	= 8,
+	.transp.offset	= 24,
+	.red.length	= 8,
+	.red.offset	= 16,
+	.green.length	= 8,
+	.green.offset	= 8,
+	.blue.length	= 8,
+	.blue.offset	= 0,
+};
+
+#define PSEUDO_PALETTE_SIZE 16 /* mdpy_fb_setcolreg() rejects register numbers >= this */
+
+struct mdpy_fb_par { /* driver-private data allocated together with the fb_info */
+ u32 palette[PSEUDO_PALETTE_SIZE]; /* installed as info->pseudo_palette in mdpy_fb_probe() */
+};
+
+/*
+ * fb_ops.fb_setcolreg: store colour register @regno in the pseudo palette.
+ *
+ * The 16-bit @red/@green/@blue components are reduced to the channel
+ * widths given in info->var and packed at the channel offsets; if the
+ * format has a transparency channel, all of its bits are set (@transp
+ * itself is ignored).
+ *
+ * Returns 0 on success, -EINVAL if @regno is outside the palette.
+ */
+static int mdpy_fb_setcolreg(u_int regno, u_int red, u_int green, u_int blue,
+			     u_int transp, struct fb_info *info)
+{
+	const struct fb_var_screeninfo *var = &info->var;
+	u32 *palette = info->pseudo_palette;
+	u32 r, g, b, pixel;
+
+	if (regno >= PSEUDO_PALETTE_SIZE)
+		return -EINVAL;
+
+	r = red >> (16 - var->red.length);
+	g = green >> (16 - var->green.length);
+	b = blue >> (16 - var->blue.length);
+
+	pixel = (r << var->red.offset) |
+		(g << var->green.offset) |
+		(b << var->blue.offset);
+
+	if (var->transp.length > 0) {
+		u32 alpha = (1 << var->transp.length) - 1;
+
+		pixel |= alpha << var->transp.offset;
+	}
+
+	palette[regno] = pixel;
+	return 0;
+}
+
+/* fb_ops.fb_destroy: unmap the framebuffer mapping if one was set up. */
+static void mdpy_fb_destroy(struct fb_info *info)
+{
+	if (!info->screen_base)
+		return;
+
+	iounmap(info->screen_base);
+}
+
+/* Framebuffer operations: generic cfb_* drawing helpers plus the palette and destroy hooks. */
+static struct fb_ops mdpy_fb_ops = {
+	.owner		= THIS_MODULE,
+	.fb_setcolreg	= mdpy_fb_setcolreg,
+	.fb_destroy	= mdpy_fb_destroy,
+	/* drawing */
+	.fb_imageblit	= cfb_imageblit,
+	.fb_copyarea	= cfb_copyarea,
+	.fb_fillrect	= cfb_fillrect,
+};
+
+/*
+ * PCI probe: validate the display configuration found in config space,
+ * map BAR 0 as the framebuffer and register an fbdev device for it.
+ *
+ * The format must be DRM_FORMAT_XRGB8888 and width/height must each be
+ * within 100..10000.
+ *
+ * Every failure path releases what was acquired before it (mapping,
+ * fb_info, PCI regions, device enable) and returns a negative errno.
+ */
+static int mdpy_fb_probe(struct pci_dev *pdev,
+			 const struct pci_device_id *ent)
+{
+	struct fb_info *info;
+	struct mdpy_fb_par *par;
+	u32 format, width, height;
+	int ret;
+
+	ret = pci_enable_device(pdev);
+	if (ret < 0)
+		return ret;
+
+	ret = pci_request_regions(pdev, "mdpy-fb");
+	if (ret < 0)
+		goto err_disable_dev;
+
+	pci_read_config_dword(pdev, MDPY_FORMAT_OFFSET, &format);
+	pci_read_config_dword(pdev, MDPY_WIDTH_OFFSET, &width);
+	pci_read_config_dword(pdev, MDPY_HEIGHT_OFFSET, &height);
+	if (format != DRM_FORMAT_XRGB8888) {
+		pci_err(pdev, "format mismatch (0x%x != 0x%x)\n",
+			format, DRM_FORMAT_XRGB8888);
+		ret = -EINVAL;
+		goto err_release_regions;
+	}
+	if (width < 100 || width > 10000) {
+		pci_err(pdev, "width (%d) out of range\n", width);
+		ret = -EINVAL;
+		goto err_release_regions;
+	}
+	if (height < 100 || height > 10000) {
+		pci_err(pdev, "height (%d) out of range\n", height);
+		ret = -EINVAL;
+		goto err_release_regions;
+	}
+	pci_info(pdev, "mdpy found: %dx%d framebuffer\n",
+		 width, height);
+
+	info = framebuffer_alloc(sizeof(struct mdpy_fb_par), &pdev->dev);
+	if (!info) {
+		/* ret still holds 0 from pci_request_regions() here */
+		ret = -ENOMEM;
+		goto err_release_regions;
+	}
+	pci_set_drvdata(pdev, info);
+	par = info->par;
+
+	info->fix = mdpy_fb_fix;
+	info->fix.smem_start = pci_resource_start(pdev, 0);
+	info->fix.smem_len = pci_resource_len(pdev, 0);
+	info->fix.line_length = width * 4;
+
+	info->var = mdpy_fb_var;
+	info->var.xres = width;
+	info->var.yres = height;
+	info->var.xres_virtual = width;
+	info->var.yres_virtual = height;
+
+	info->screen_size = info->fix.smem_len;
+	info->screen_base = ioremap(info->fix.smem_start,
+				    info->screen_size);
+	if (!info->screen_base) {
+		pci_err(pdev, "ioremap(pcibar) failed\n");
+		ret = -EIO;
+		goto err_release_fb;
+	}
+
+	info->apertures = alloc_apertures(1);
+	if (!info->apertures) {
+		ret = -ENOMEM;
+		goto err_unmap;
+	}
+	info->apertures->ranges[0].base = info->fix.smem_start;
+	info->apertures->ranges[0].size = info->fix.smem_len;
+
+	info->fbops = &mdpy_fb_ops;
+	info->flags = FBINFO_DEFAULT;
+	info->pseudo_palette = par->palette;
+
+	ret = register_framebuffer(info);
+	if (ret < 0) {
+		pci_err(pdev, "mdpy-fb device register failed: %d\n", ret);
+		goto err_unmap;
+	}
+
+	pci_info(pdev, "fb%d registered\n", info->node);
+	return 0;
+
+err_unmap:
+	iounmap(info->screen_base);
+
+err_release_fb:
+	framebuffer_release(info);
+
+err_release_regions:
+	pci_release_regions(pdev);
+
+err_disable_dev:
+	pci_disable_device(pdev);
+
+	return ret;
+}
+
+/*
+ * PCI remove: unregister and release the framebuffer, then give back the
+ * PCI resources that mdpy_fb_probe() acquired with pci_request_regions()
+ * and pci_enable_device().
+ */
+static void mdpy_fb_remove(struct pci_dev *pdev)
+{
+	struct fb_info *info = pci_get_drvdata(pdev);
+
+	unregister_framebuffer(info);
+	framebuffer_release(info);
+
+	/* undo probe: without this the regions stay claimed after unbind */
+	pci_release_regions(pdev);
+	pci_disable_device(pdev);
+}
+
+/* Devices handled by this driver: vendor, device and subsystem ids must all match. */
+static struct pci_device_id mdpy_fb_pci_table[] = {
+	{
+		PCI_DEVICE_SUB(MDPY_PCI_VENDOR_ID, MDPY_PCI_DEVICE_ID,
+			       MDPY_PCI_SUBVENDOR_ID, MDPY_PCI_SUBDEVICE_ID),
+	},
+	{ /* end of list */ }
+};
+
+/* PCI driver glue, registered in mdpy_fb_init(). */
+static struct pci_driver mdpy_fb_pci_driver = {
+	.name		= "mdpy-fb",
+	.probe		= mdpy_fb_probe,
+	.remove		= mdpy_fb_remove,
+	.id_table	= mdpy_fb_pci_table,
+};
+
+static int __init mdpy_fb_init(void)
+{
+ int ret;
+
+ ret = pci_register_driver(&mdpy_fb_pci_driver);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+module_init(mdpy_fb_init);
+
+MODULE_DEVICE_TABLE(pci, mdpy_fb_pci_table);
+MODULE_LICENSE("GPL v2");
diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c
new file mode 100644
index 000000000000..96e7969c473a
--- /dev/null
+++ b/samples/vfio-mdev/mdpy.c
@@ -0,0 +1,807 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Mediated virtual PCI display host device driver
+ *
+ * See mdpy-defs.h for device specs
+ *
+ * (c) Gerd Hoffmann <kraxel@redhat.com>
+ *
+ * based on mtty driver which is:
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ * Author: Neo Jia <cjia@nvidia.com>
+ * Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/cdev.h>
+#include <linux/vfio.h>
+#include <linux/iommu.h>
+#include <linux/sysfs.h>
+#include <linux/mdev.h>
+#include <linux/pci.h>
+#include <drm/drm_fourcc.h>
+#include "mdpy-defs.h"
+
+#define MDPY_NAME		"mdpy"
+#define MDPY_CLASS_NAME		"mdpy"
+
+/* size of the virtual PCI config space buffer, in bytes */
+#define MDPY_CONFIG_SPACE_SIZE	0xff
+/* offset of the framebuffer BAR within the device file */
+#define MDPY_MEMORY_BAR_OFFSET	PAGE_SIZE
+/* extra region index that aliases the framebuffer BAR */
+#define MDPY_DISPLAY_REGION	16
+
+/*
+ * Store a 16/32 bit value at @addr.  Both arguments are parenthesized so
+ * that expressions such as "base + 2" or "a | b" expand as intended.
+ * Despite the names these are plain host-order stores; no byte swapping
+ * is performed.
+ */
+#define STORE_LE16(addr, val)	(*(u16 *)(addr) = (val))
+#define STORE_LE32(addr, val)	(*(u32 *)(addr) = (val))
+
+
+MODULE_LICENSE("GPL v2");
+
+static int max_devices = 4; /* limit checked against mdpy_count in mdpy_create() */
+module_param_named(count, max_devices, int, 0444); /* exposed as read-only parameter "count" */
+MODULE_PARM_DESC(count, "number of " MDPY_NAME " devices");
+
+
+/* suffixes of the supported mdev type names */
+#define MDPY_TYPE_1 "vga"
+#define MDPY_TYPE_2 "xga"
+#define MDPY_TYPE_3 "hd"
+
+/*
+ * Supported display types.  The first entry is also the fallback used by
+ * mdpy_create() when the requested type name is not found.
+ */
+static const struct mdpy_type {
+	const char *name;	/* compared against the type kobject name */
+	u32 format;		/* drm fourcc code */
+	u32 bytepp;		/* bytes per pixel */
+	u32 width;
+	u32 height;
+} mdpy_types[] = {
+	{
+		.name	= MDPY_CLASS_NAME "-" MDPY_TYPE_1,
+		.width	= 640,
+		.height	= 480,
+		.bytepp	= 4,
+		.format	= DRM_FORMAT_XRGB8888,
+	}, {
+		.name	= MDPY_CLASS_NAME "-" MDPY_TYPE_2,
+		.width	= 1024,
+		.height	= 768,
+		.bytepp	= 4,
+		.format	= DRM_FORMAT_XRGB8888,
+	}, {
+		.name	= MDPY_CLASS_NAME "-" MDPY_TYPE_3,
+		.width	= 1920,
+		.height	= 1080,
+		.bytepp	= 4,
+		.format	= DRM_FORMAT_XRGB8888,
+	},
+};
+
+static dev_t mdpy_devt;
+static struct class *mdpy_class;
+static struct cdev mdpy_cdev;
+static struct device mdpy_dev;
+static u32 mdpy_count; /* live instances: incremented in mdpy_create(), decremented in mdpy_remove() */
+
+/* State of each mdev device */
+struct mdev_state {
+ u8 *vconfig; /* virtual PCI config space, MDPY_CONFIG_SPACE_SIZE bytes */
+ u32 bar_mask; /* applied to BAR0 when the guest writes 0xffffffff to it */
+ struct mutex ops_lock; /* held for the duration of mdev_access() */
+ struct mdev_device *mdev;
+ struct vfio_device_info dev_info; /* copy of the last VFIO_DEVICE_GET_INFO result */
+
+ const struct mdpy_type *type; /* entry of mdpy_types[] this device was created with */
+ u32 memsize; /* size of memblk: framebuffer size rounded up to a power of two */
+ void *memblk; /* framebuffer memory from vmalloc_user() */
+};
+
+/*
+ * Return the mdpy_types[] entry whose name equals the name of @kobj, or
+ * NULL if there is no such entry.
+ */
+static const struct mdpy_type *mdpy_find_type(struct kobject *kobj)
+{
+	const struct mdpy_type *type;
+
+	for (type = mdpy_types;
+	     type < mdpy_types + ARRAY_SIZE(mdpy_types); type++) {
+		if (!strcmp(type->name, kobj->name))
+			return type;
+	}
+
+	return NULL;
+}
+
+/*
+ * Populate the virtual PCI config space: ids, command/status, class,
+ * BAR0 and the vendor capability that carries format, width and height
+ * of the display.  Also derives bar_mask from the memory size.
+ */
+static void mdpy_create_config_space(struct mdev_state *mdev_state)
+{
+	const struct mdpy_type *type = mdev_state->type;
+	u8 *cfg = mdev_state->vconfig;
+
+	/* identification */
+	STORE_LE16((u16 *) &cfg[PCI_VENDOR_ID], MDPY_PCI_VENDOR_ID);
+	STORE_LE16((u16 *) &cfg[PCI_DEVICE_ID], MDPY_PCI_DEVICE_ID);
+	STORE_LE16((u16 *) &cfg[PCI_SUBSYSTEM_VENDOR_ID],
+		   MDPY_PCI_SUBVENDOR_ID);
+	STORE_LE16((u16 *) &cfg[PCI_SUBSYSTEM_ID], MDPY_PCI_SUBDEVICE_ID);
+
+	/* command, status, class */
+	STORE_LE16((u16 *) &cfg[PCI_COMMAND],
+		   PCI_COMMAND_IO | PCI_COMMAND_MEMORY);
+	STORE_LE16((u16 *) &cfg[PCI_STATUS], PCI_STATUS_CAP_LIST);
+	STORE_LE16((u16 *) &cfg[PCI_CLASS_DEVICE], PCI_CLASS_DISPLAY_OTHER);
+	cfg[PCI_CLASS_REVISION] = 0x01;
+
+	/* BAR0: 32 bit prefetchable memory */
+	STORE_LE32((u32 *) &cfg[PCI_BASE_ADDRESS_0],
+		   PCI_BASE_ADDRESS_SPACE_MEMORY |
+		   PCI_BASE_ADDRESS_MEM_TYPE_32 |
+		   PCI_BASE_ADDRESS_MEM_PREFETCH);
+	mdev_state->bar_mask = ~(mdev_state->memsize) + 1;
+
+	/* vendor specific capability for the config registers */
+	cfg[PCI_CAPABILITY_LIST] = MDPY_VENDORCAP_OFFSET;
+	cfg[MDPY_VENDORCAP_OFFSET + 0] = 0x09; /* vendor cap */
+	cfg[MDPY_VENDORCAP_OFFSET + 1] = 0x00; /* next ptr */
+	cfg[MDPY_VENDORCAP_OFFSET + 2] = MDPY_VENDORCAP_SIZE;
+	STORE_LE32((u32 *) &cfg[MDPY_FORMAT_OFFSET], type->format);
+	STORE_LE32((u32 *) &cfg[MDPY_WIDTH_OFFSET], type->width);
+	STORE_LE32((u32 *) &cfg[MDPY_HEIGHT_OFFSET], type->height);
+}
+
+/*
+ * Emulate a config space write.  Only writes that start at
+ * PCI_BASE_ADDRESS_0 have an effect; everything else is ignored.
+ *
+ * Writing 0xffffffff is answered with bar_mask; any other value is
+ * reduced to its address bits and logged when non-zero.  The low flag
+ * bits of the register are preserved.
+ *
+ * NOTE(review): @buf is always read as a u32, whatever @count says.
+ */
+static void handle_pci_cfg_write(struct mdev_state *mdev_state, u16 offset,
+				 char *buf, u32 count)
+{
+	struct device *dev = mdev_dev(mdev_state->mdev);
+	u32 bar;
+
+	if (offset != PCI_BASE_ADDRESS_0)
+		return;
+
+	bar = *(u32 *)buf;
+	if (bar == 0xffffffff) {
+		bar &= mdev_state->bar_mask;
+	} else {
+		bar &= PCI_BASE_ADDRESS_MEM_MASK;
+		if (bar)
+			dev_info(dev, "BAR0 @ 0x%x\n", bar);
+	}
+
+	/* keep the read-only type/prefetch bits of the register */
+	bar |= mdev_state->vconfig[offset] & ~PCI_BASE_ADDRESS_MEM_MASK;
+	STORE_LE32(&mdev_state->vconfig[offset], bar);
+}
+
+/*
+ * Perform one read or write of @count bytes at device-file offset @pos,
+ * using the kernel buffer @buf.  Runs under ops_lock.
+ *
+ * Offsets below MDPY_CONFIG_SPACE_SIZE address the virtual config space;
+ * offsets inside [MDPY_MEMORY_BAR_OFFSET, +memsize) address the
+ * framebuffer.  Anything else is logged and rejected.
+ *
+ * Returns @count on success, -1 for an unhandled offset.
+ */
+static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count,
+			   loff_t pos, bool is_write)
+{
+	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+	struct device *dev = mdev_dev(mdev);
+	int ret = 0;
+
+	mutex_lock(&mdev_state->ops_lock);
+
+	if (pos < MDPY_CONFIG_SPACE_SIZE) {
+		if (is_write) {
+			handle_pci_cfg_write(mdev_state, pos, buf, count);
+		} else {
+			/*
+			 * A read that starts inside the config space may
+			 * extend past its end; copy only the bytes that
+			 * exist and return zeroes for the rest instead of
+			 * reading beyond the vconfig allocation.
+			 */
+			size_t avail = min_t(size_t, count,
+					     MDPY_CONFIG_SPACE_SIZE - pos);
+
+			memcpy(buf, mdev_state->vconfig + pos, avail);
+			memset(buf + avail, 0, count - avail);
+		}
+
+	} else if ((pos >= MDPY_MEMORY_BAR_OFFSET) &&
+		   (pos + count <=
+		    MDPY_MEMORY_BAR_OFFSET + mdev_state->memsize)) {
+		/* translate to an offset inside the framebuffer and use it */
+		pos -= MDPY_MEMORY_BAR_OFFSET;
+		if (is_write)
+			memcpy(mdev_state->memblk + pos, buf, count);
+		else
+			memcpy(buf, mdev_state->memblk + pos, count);
+
+	} else {
+		dev_info(dev, "%s: %s @0x%llx (unhandled)\n",
+			 __func__, is_write ? "WR" : "RD", pos);
+		ret = -1;
+		goto accessfailed;
+	}
+
+	ret = count;
+
+accessfailed:
+	mutex_unlock(&mdev_state->ops_lock);
+
+	return ret;
+}
+
+/*
+ * Fill the framebuffer with a vertical gray gradient: every row is set to
+ * a single byte value that grows from 0 towards 255 with the row number.
+ * Always returns 0.
+ */
+static int mdpy_reset(struct mdev_device *mdev)
+{
+	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+	const struct mdpy_type *type = mdev_state->type;
+	u32 stride = type->width * type->bytepp;
+	u32 row;
+
+	for (row = 0; row < type->height; row++) {
+		void *line = mdev_state->memblk + row * stride;
+
+		memset(line, row * 255 / type->height, stride);
+	}
+
+	return 0;
+}
+
+/*
+ * mdev create callback: allocate the per-device state, the virtual config
+ * space and the framebuffer, then initialise config space and framebuffer
+ * contents.
+ *
+ * The type is looked up by the name of @kobj; an unknown name falls back
+ * to the first entry of mdpy_types[].  The framebuffer size is rounded up
+ * to a power of two.
+ *
+ * Returns 0 on success, -ENOMEM if the device limit is reached or an
+ * allocation fails.
+ */
+static int mdpy_create(struct kobject *kobj, struct mdev_device *mdev)
+{
+	const struct mdpy_type *type = mdpy_find_type(kobj);
+	struct device *dev = mdev_dev(mdev);
+	struct mdev_state *state;
+	u32 fbsize;
+
+	if (mdpy_count >= max_devices)
+		return -ENOMEM;
+
+	state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL);
+	if (!state)
+		return -ENOMEM;
+
+	state->vconfig = kzalloc(MDPY_CONFIG_SPACE_SIZE, GFP_KERNEL);
+	if (!state->vconfig)
+		goto err_free_state;
+
+	if (!type)
+		type = &mdpy_types[0];
+	fbsize = roundup_pow_of_two(type->width * type->height * type->bytepp);
+
+	state->memblk = vmalloc_user(fbsize);
+	if (!state->memblk)
+		goto err_free_vconfig;
+
+	dev_info(dev, "%s: %s (%dx%d)\n",
+		 __func__, kobj->name, type->width, type->height);
+
+	mutex_init(&state->ops_lock);
+	state->mdev = mdev;
+	mdev_set_drvdata(mdev, state);
+
+	state->type = type;
+	state->memsize = fbsize;
+	mdpy_create_config_space(state);
+	mdpy_reset(mdev);
+
+	mdpy_count++;
+	return 0;
+
+err_free_vconfig:
+	kfree(state->vconfig);
+err_free_state:
+	kfree(state);
+	return -ENOMEM;
+}
+
+/*
+ * mdev remove callback: detach the driver data from @mdev and free the
+ * framebuffer, the config space and the state itself.  Always returns 0.
+ */
+static int mdpy_remove(struct mdev_device *mdev)
+{
+	struct mdev_state *state = mdev_get_drvdata(mdev);
+
+	dev_info(mdev_dev(mdev), "%s\n", __func__);
+
+	mdev_set_drvdata(mdev, NULL);
+
+	vfree(state->memblk);
+	kfree(state->vconfig);
+	kfree(state);
+
+	mdpy_count--;
+	return 0;
+}
+
+/*
+ * mdev read callback: copy @count bytes starting at *@ppos to user space.
+ *
+ * The request is split into naturally aligned accesses of 4, 2 or 1
+ * bytes, each of which goes through mdev_access().  *@ppos is advanced as
+ * data is transferred.
+ *
+ * Returns the number of bytes read, or -EFAULT if a device access or a
+ * copy to user space fails.
+ */
+static ssize_t mdpy_read(struct mdev_device *mdev, char __user *buf,
+			 size_t count, loff_t *ppos)
+{
+	unsigned int done = 0;
+
+	while (count) {
+		size_t width;
+		u32 val;
+
+		/* largest access that is aligned and still fits */
+		if (count >= 4 && !(*ppos % 4))
+			width = 4;
+		else if (count >= 2 && !(*ppos % 2))
+			width = 2;
+		else
+			width = 1;
+
+		if (mdev_access(mdev, (char *)&val, width, *ppos, false) <= 0)
+			return -EFAULT;
+
+		if (copy_to_user(buf, &val, width))
+			return -EFAULT;
+
+		count -= width;
+		done += width;
+		*ppos += width;
+		buf += width;
+	}
+
+	return done;
+}
+
+/*
+ * mdev write callback: copy @count bytes from user space to the device,
+ * starting at *@ppos.
+ *
+ * The request is split into naturally aligned accesses of 4, 2 or 1
+ * bytes, each of which goes through mdev_access().  *@ppos is advanced as
+ * data is transferred.
+ *
+ * Returns the number of bytes written, or -EFAULT if a copy from user
+ * space or a device access fails.
+ */
+static ssize_t mdpy_write(struct mdev_device *mdev, const char __user *buf,
+			  size_t count, loff_t *ppos)
+{
+	unsigned int done = 0;
+
+	while (count) {
+		size_t width;
+		/*
+		 * Always stage the data in a zeroed 32 bit buffer: the
+		 * config write handler reads its input as a u32 even for
+		 * 1 and 2 byte accesses, so a smaller temporary would be
+		 * read beyond its end.
+		 */
+		u32 val = 0;
+
+		/* largest access that is aligned and still fits */
+		if (count >= 4 && !(*ppos % 4))
+			width = 4;
+		else if (count >= 2 && !(*ppos % 2))
+			width = 2;
+		else
+			width = 1;
+
+		if (copy_from_user(&val, buf, width))
+			return -EFAULT;
+
+		if (mdev_access(mdev, (char *)&val, width, *ppos, true) <= 0)
+			return -EFAULT;
+
+		count -= width;
+		done += width;
+		*ppos += width;
+		buf += width;
+	}
+
+	return done;
+}
+
+/*
+ * mdev mmap callback: map the framebuffer into user space.
+ *
+ * The mapping must start at the framebuffer BAR offset, must not be
+ * larger than the framebuffer and must be shared; otherwise -EINVAL is
+ * returned.
+ */
+static int mdpy_mmap(struct mdev_device *mdev, struct vm_area_struct *vma)
+{
+	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+	unsigned long len = vma->vm_end - vma->vm_start;
+
+	if (vma->vm_pgoff != MDPY_MEMORY_BAR_OFFSET >> PAGE_SHIFT ||
+	    vma->vm_end < vma->vm_start ||
+	    len > mdev_state->memsize ||
+	    !(vma->vm_flags & VM_SHARED))
+		return -EINVAL;
+
+	return remap_vmalloc_range_partial(vma, vma->vm_start,
+					   mdev_state->memblk, len);
+}
+
+/*
+ * Fill in offset, size and flags of the region selected by
+ * region_info->index.  The config space is read/write; BAR0 and the
+ * extra MDPY_DISPLAY_REGION both describe the framebuffer and are
+ * additionally mmap-able.  Other valid indexes are reported as empty.
+ * @cap_type_id and @cap_type are not used.
+ *
+ * Returns 0 on success, -EINVAL if the mdev has no driver data or the
+ * index is neither a standard PCI region nor MDPY_DISPLAY_REGION.
+ */
+static int mdpy_get_region_info(struct mdev_device *mdev,
+				struct vfio_region_info *region_info,
+				u16 *cap_type_id, void **cap_type)
+{
+	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+	u64 offset = 0, size = 0;
+	u32 flags = 0;
+
+	if (!mdev_state)
+		return -EINVAL;
+
+	if (region_info->index >= VFIO_PCI_NUM_REGIONS &&
+	    region_info->index != MDPY_DISPLAY_REGION)
+		return -EINVAL;
+
+	switch (region_info->index) {
+	case VFIO_PCI_CONFIG_REGION_INDEX:
+		size = MDPY_CONFIG_SPACE_SIZE;
+		flags = VFIO_REGION_INFO_FLAG_READ |
+			VFIO_REGION_INFO_FLAG_WRITE;
+		break;
+	case VFIO_PCI_BAR0_REGION_INDEX:
+	case MDPY_DISPLAY_REGION:
+		offset = MDPY_MEMORY_BAR_OFFSET;
+		size = mdev_state->memsize;
+		flags = VFIO_REGION_INFO_FLAG_READ |
+			VFIO_REGION_INFO_FLAG_WRITE |
+			VFIO_REGION_INFO_FLAG_MMAP;
+		break;
+	default:
+		/* unimplemented region: leave everything zero */
+		break;
+	}
+
+	region_info->offset = offset;
+	region_info->size = size;
+	region_info->flags = flags;
+	return 0;
+}
+
+/* Report that the queried interrupt index has no interrupts; always returns 0. */
+static int mdpy_get_irq_info(struct mdev_device *mdev,
+			     struct vfio_irq_info *irq_info)
+{
+	irq_info->count = 0;
+
+	return 0;
+}
+
+/*
+ * Describe the device as a vfio-pci device with the standard number of
+ * regions and interrupt indexes; always returns 0.
+ */
+static int mdpy_get_device_info(struct mdev_device *mdev,
+				struct vfio_device_info *dev_info)
+{
+	dev_info->num_irqs = VFIO_PCI_NUM_IRQS;
+	dev_info->num_regions = VFIO_PCI_NUM_REGIONS;
+	dev_info->flags = VFIO_DEVICE_FLAGS_PCI;
+
+	return 0;
+}
+
+/*
+ * Handle VFIO_DEVICE_QUERY_GFX_PLANE for region backed planes.
+ *
+ * A probe request succeeds only for exactly PROBE | REGION.  A real query
+ * reports the fixed geometry of the device type and points user space at
+ * MDPY_DISPLAY_REGION for the pixel data.
+ *
+ * Returns 0 on success, -EINVAL for unsupported flags.
+ */
+static int mdpy_query_gfx_plane(struct mdev_device *mdev,
+				struct vfio_device_gfx_plane_info *plane)
+{
+	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+	const struct mdpy_type *type;
+
+	if (plane->flags & VFIO_GFX_PLANE_TYPE_PROBE) {
+		if (plane->flags != (VFIO_GFX_PLANE_TYPE_PROBE |
+				     VFIO_GFX_PLANE_TYPE_REGION))
+			return -EINVAL;
+		return 0;
+	}
+
+	if (plane->flags != VFIO_GFX_PLANE_TYPE_REGION)
+		return -EINVAL;
+
+	type = mdev_state->type;
+
+	plane->drm_format = type->format;
+	plane->width = type->width;
+	plane->height = type->height;
+	plane->stride = type->width * type->bytepp;
+	plane->size = mdev_state->memsize;
+	plane->region_index = MDPY_DISPLAY_REGION;
+
+	/* unused */
+	plane->drm_format_mod = 0;
+	plane->x_pos = 0;
+	plane->y_pos = 0;
+	plane->x_hot = 0;
+	plane->y_hot = 0;
+
+	return 0;
+}
+
+/*
+ * ioctl entry point (installed as mdev_fops.ioctl).
+ *
+ * The four "get/query" commands share one pattern: copy the fixed-size
+ * head (minsz bytes) of the user structure in, reject it if the caller's
+ * argsz is smaller than minsz, let the matching helper fill it in, then
+ * copy the same minsz bytes back to user space.
+ *
+ * Returns 0 on success, -EFAULT if a user copy fails, -EINVAL for a
+ * too-small argsz or a request the helper rejects, and -ENOTTY for a
+ * command not listed here.  VFIO_DEVICE_SET_IRQS always yields -EINVAL;
+ * VFIO_DEVICE_RESET returns whatever mdpy_reset() returns.
+ */
+static long mdpy_ioctl(struct mdev_device *mdev, unsigned int cmd,
+ unsigned long arg)
+{
+ int ret = 0;
+ unsigned long minsz;
+ struct mdev_state *mdev_state;
+
+ mdev_state = mdev_get_drvdata(mdev);
+
+ switch (cmd) {
+ case VFIO_DEVICE_GET_INFO:
+ {
+ struct vfio_device_info info;
+
+ minsz = offsetofend(struct vfio_device_info, num_irqs);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ ret = mdpy_get_device_info(mdev, &info);
+ if (ret)
+ return ret;
+
+ /* keep a copy; the GET_IRQ_INFO case reads num_irqs from it */
+ memcpy(&mdev_state->dev_info, &info, sizeof(info));
+
+ if (copy_to_user((void __user *)arg, &info, minsz))
+ return -EFAULT;
+
+ return 0;
+ }
+ case VFIO_DEVICE_GET_REGION_INFO:
+ {
+ struct vfio_region_info info;
+ u16 cap_type_id = 0;
+ void *cap_type = NULL;
+
+ minsz = offsetofend(struct vfio_region_info, offset);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ ret = mdpy_get_region_info(mdev, &info, &cap_type_id,
+ &cap_type);
+ if (ret)
+ return ret;
+
+ if (copy_to_user((void __user *)arg, &info, minsz))
+ return -EFAULT;
+
+ return 0;
+ }
+
+ case VFIO_DEVICE_GET_IRQ_INFO:
+ {
+ struct vfio_irq_info info;
+
+ minsz = offsetofend(struct vfio_irq_info, count);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ /*
+ * NOTE(review): the index is checked against the cached
+ * dev_info.num_irqs, which this function only writes in the
+ * VFIO_DEVICE_GET_INFO case.  The result of calling this
+ * before any GET_INFO depends on how dev_info is initialised
+ * elsewhere -- confirm.
+ */
+ if ((info.argsz < minsz) ||
+ (info.index >= mdev_state->dev_info.num_irqs))
+ return -EINVAL;
+
+ ret = mdpy_get_irq_info(mdev, &info);
+ if (ret)
+ return ret;
+
+ if (copy_to_user((void __user *)arg, &info, minsz))
+ return -EFAULT;
+
+ return 0;
+ }
+
+ case VFIO_DEVICE_QUERY_GFX_PLANE:
+ {
+ struct vfio_device_gfx_plane_info plane;
+
+ minsz = offsetofend(struct vfio_device_gfx_plane_info,
+ region_index);
+
+ if (copy_from_user(&plane, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (plane.argsz < minsz)
+ return -EINVAL;
+
+ ret = mdpy_query_gfx_plane(mdev, &plane);
+ if (ret)
+ return ret;
+
+ if (copy_to_user((void __user *)arg, &plane, minsz))
+ return -EFAULT;
+
+ return 0;
+ }
+
+ /* setting irqs is always rejected */
+ case VFIO_DEVICE_SET_IRQS:
+ return -EINVAL;
+
+ case VFIO_DEVICE_RESET:
+ return mdpy_reset(mdev);
+ }
+ /* any command not handled above */
+ return -ENOTTY;
+}
+
+/*
+ * Open callback: takes a reference on this module.
+ *
+ * Returns 0 on success, -ENODEV if the module reference cannot be taken.
+ */
+static int mdpy_open(struct mdev_device *mdev)
+{
+	return try_module_get(THIS_MODULE) ? 0 : -ENODEV;
+}
+
+/* Release callback: drops the module reference taken in mdpy_open(). */
+static void mdpy_close(struct mdev_device *mdev)
+{
+ module_put(THIS_MODULE);
+}
+
+/*
+ * Read-only "resolution" device attribute: prints the width and height
+ * of the device's type as "<width>x<height>" followed by a newline.
+ */
+static ssize_t
+resolution_show(struct device *dev, struct device_attribute *attr,
+		char *buf)
+{
+	struct mdev_state *mdev_state = mdev_get_drvdata(mdev_from_dev(dev));
+
+	return sprintf(buf, "%dx%d\n", mdev_state->type->width,
+		       mdev_state->type->height);
+}
+static DEVICE_ATTR_RO(resolution);
+
+/* Per-device sysfs attributes; NULL-terminated. */
+static struct attribute *mdev_dev_attrs[] = {
+	&dev_attr_resolution.attr,
+	NULL,
+};
+
+/* The attributes above are grouped under the name "vendor". */
+static const struct attribute_group mdev_dev_group = {
+	.name = "vendor",
+	.attrs = mdev_dev_attrs,
+};
+
+/*
+ * NULL-terminated group list used as mdev_fops.mdev_attr_groups.
+ * Kept file-local (static) so this generically named symbol does not
+ * leak into the global kernel namespace.
+ */
+static const struct attribute_group *mdev_dev_groups[] = {
+	&mdev_dev_group,
+	NULL,
+};
+
+/* Type attribute "name": prints the kobject's name followed by a newline. */
+static ssize_t
+name_show(struct kobject *kobj, struct device *dev, char *buf)
+{
+ return sprintf(buf, "%s\n", kobj->name);
+}
+MDEV_TYPE_ATTR_RO(name);
+
+/*
+ * Type attribute "description": prints a one-line description with the
+ * framebuffer width and height of the type that mdpy_find_type() returns
+ * for kobj.  If no type is found, both dimensions are printed as 0.
+ */
+static ssize_t
+description_show(struct kobject *kobj, struct device *dev, char *buf)
+{
+ const struct mdpy_type *type = mdpy_find_type(kobj);
+
+ return sprintf(buf, "virtual display, %dx%d framebuffer\n",
+ type ? type->width : 0,
+ type ? type->height : 0);
+}
+MDEV_TYPE_ATTR_RO(description);
+
+/*
+ * Type attribute "available_instances": prints max_devices minus
+ * mdpy_count.  The same value is reported for every type (kobj is not
+ * consulted).
+ */
+static ssize_t
+available_instances_show(struct kobject *kobj, struct device *dev, char *buf)
+{
+ return sprintf(buf, "%d\n", max_devices - mdpy_count);
+}
+MDEV_TYPE_ATTR_RO(available_instances);
+
+/* Type attribute "device_api": prints VFIO_DEVICE_API_PCI_STRING. */
+static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
+ char *buf)
+{
+ return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING);
+}
+MDEV_TYPE_ATTR_RO(device_api);
+
+/* Attribute set shared by all three type groups below; NULL-terminated. */
+static struct attribute *mdev_types_attrs[] = {
+ &mdev_type_attr_name.attr,
+ &mdev_type_attr_description.attr,
+ &mdev_type_attr_device_api.attr,
+ &mdev_type_attr_available_instances.attr,
+ NULL,
+};
+
+/* One group per type name; each exposes the same attribute set. */
+static struct attribute_group mdev_type_group1 = {
+ .name = MDPY_TYPE_1,
+ .attrs = mdev_types_attrs,
+};
+
+static struct attribute_group mdev_type_group2 = {
+ .name = MDPY_TYPE_2,
+ .attrs = mdev_types_attrs,
+};
+
+static struct attribute_group mdev_type_group3 = {
+ .name = MDPY_TYPE_3,
+ .attrs = mdev_types_attrs,
+};
+
+/* NULL-terminated list used as mdev_fops.supported_type_groups. */
+static struct attribute_group *mdev_type_groups[] = {
+ &mdev_type_group1,
+ &mdev_type_group2,
+ &mdev_type_group3,
+ NULL,
+};
+
+/*
+ * Parent ops passed to mdev_register_device() in mdpy_dev_init(): wires
+ * up the sysfs attribute groups and the mdpy_* callbacks.
+ */
+static const struct mdev_parent_ops mdev_fops = {
+ .owner = THIS_MODULE,
+ .mdev_attr_groups = mdev_dev_groups,
+ .supported_type_groups = mdev_type_groups,
+ .create = mdpy_create,
+ .remove = mdpy_remove,
+ .open = mdpy_open,
+ .release = mdpy_close,
+ .read = mdpy_read,
+ .write = mdpy_write,
+ .ioctl = mdpy_ioctl,
+ .mmap = mdpy_mmap,
+};
+
+/*
+ * File operations handed to cdev_init() for mdpy_cdev; only the owner
+ * is set, no callbacks are provided.
+ */
+static const struct file_operations vd_fops = {
+ .owner = THIS_MODULE,
+};
+
+/*
+ * Release callback installed on mdpy_dev by mdpy_dev_init().
+ * Intentionally empty.
+ */
+static void mdpy_device_release(struct device *dev)
+{
+ /* nothing to release here */
+}
+
+/*
+ * Module init: reserve a char device region, add the cdev, create the
+ * device class, register the parent device and finally register that
+ * device with the mdev core.
+ *
+ * Every failure unwinds exactly the steps that succeeded before it.
+ *
+ * Returns 0 on success or a negative errno from the failing step.
+ */
+static int __init mdpy_dev_init(void)
+{
+	int ret = 0;
+
+	ret = alloc_chrdev_region(&mdpy_devt, 0, MINORMASK, MDPY_NAME);
+	if (ret < 0) {
+		pr_err("Error: failed to register mdpy_dev, err: %d\n", ret);
+		return ret;
+	}
+	cdev_init(&mdpy_cdev, &vd_fops);
+	/* cdev_add() can fail; do not carry on with a half-registered cdev */
+	ret = cdev_add(&mdpy_cdev, mdpy_devt, MINORMASK);
+	if (ret < 0) {
+		pr_err("Error: failed to add mdpy_dev cdev, err: %d\n", ret);
+		goto failed0;
+	}
+	pr_info("%s: major %d\n", __func__, MAJOR(mdpy_devt));
+
+	mdpy_class = class_create(THIS_MODULE, MDPY_CLASS_NAME);
+	if (IS_ERR(mdpy_class)) {
+		pr_err("Error: failed to register mdpy_dev class\n");
+		ret = PTR_ERR(mdpy_class);
+		goto failed1;
+	}
+	mdpy_dev.class = mdpy_class;
+	mdpy_dev.release = mdpy_device_release;
+	dev_set_name(&mdpy_dev, "%s", MDPY_NAME);
+
+	ret = device_register(&mdpy_dev);
+	if (ret)
+		goto failed2;
+
+	ret = mdev_register_device(&mdpy_dev, &mdev_fops);
+	if (ret)
+		goto failed3;
+
+	return 0;
+
+failed3:
+	device_del(&mdpy_dev);
+failed2:
+	/*
+	 * device_register() leaves a reference behind even when it fails,
+	 * so the reference must be dropped on both unwind paths.
+	 */
+	put_device(&mdpy_dev);
+	class_destroy(mdpy_class);
+failed1:
+	cdev_del(&mdpy_cdev);
+failed0:
+	unregister_chrdev_region(mdpy_devt, MINORMASK);
+	return ret;
+}
+
+/*
+ * Module exit: unregister from the mdev core, then unregister the
+ * device, delete the cdev, release the char device region and destroy
+ * the class.
+ */
+static void __exit mdpy_dev_exit(void)
+{
+ /* NOTE(review): reason for clearing .bus first is not visible here */
+ mdpy_dev.bus = NULL;
+ mdev_unregister_device(&mdpy_dev);
+
+ device_unregister(&mdpy_dev);
+ cdev_del(&mdpy_cdev);
+ unregister_chrdev_region(mdpy_devt, MINORMASK);
+ class_destroy(mdpy_class);
+ mdpy_class = NULL;
+}
+
+/* Module entry and exit points. */
+module_init(mdpy_dev_init)
+module_exit(mdpy_dev_exit)