summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile4
-rw-r--r--kernel/acct.c7
-rw-r--r--kernel/audit.c9
-rw-r--r--kernel/audit_fsnotify.c2
-rw-r--r--kernel/audit_tree.c2
-rw-r--r--kernel/audit_watch.c2
-rw-r--r--kernel/auditsc.c38
-rw-r--r--kernel/bpf/syscall.c20
-rw-r--r--kernel/bpf/task_iter.c33
-rw-r--r--kernel/crash_core.c1
-rw-r--r--kernel/dma/Kconfig18
-rw-r--r--kernel/dma/Makefile2
-rw-r--r--kernel/dma/contiguous.c2
-rw-r--r--kernel/dma/direct.c1
-rw-r--r--kernel/dma/map_benchmark.c361
-rw-r--r--kernel/dma/mapping.c12
-rw-r--r--kernel/dma/pool.c3
-rw-r--r--kernel/dma/swiotlb.c20
-rw-r--r--kernel/dma/virt.c61
-rw-r--r--kernel/events/core.c12
-rw-r--r--kernel/fail_function.c6
-rw-r--r--kernel/fork.c22
-rw-r--r--kernel/gcov/gcc_4_7.c10
-rw-r--r--kernel/kcmp.c59
-rw-r--r--kernel/livepatch/Kconfig2
-rw-r--r--kernel/livepatch/patch.c15
-rw-r--r--kernel/module.c200
-rw-r--r--kernel/params.c10
-rw-r--r--kernel/pid.c4
-rw-r--r--kernel/printk/printk.c257
-rw-r--r--kernel/printk/printk_ringbuffer.c32
-rw-r--r--kernel/reboot.c244
-rw-r--r--kernel/relay.c109
-rw-r--r--kernel/resource.c34
-rw-r--r--kernel/resource_kunit.c152
-rw-r--r--kernel/sched/wait.c17
-rw-r--r--kernel/seccomp.c296
-rw-r--r--kernel/signal.c22
-rw-r--r--kernel/sys_ni.c2
-rw-r--r--kernel/task_work.c30
-rw-r--r--kernel/time/Kconfig18
-rw-r--r--kernel/time/Makefile1
-rw-r--r--kernel/time/clocksource.c8
-rw-r--r--kernel/time/tick-legacy.c37
-rw-r--r--kernel/time/timekeeping.c41
-rw-r--r--kernel/time/timekeeping.h1
-rw-r--r--kernel/trace/Kconfig70
-rw-r--r--kernel/trace/Makefile1
-rw-r--r--kernel/trace/blktrace.c187
-rw-r--r--kernel/trace/bpf_trace.c2
-rw-r--r--kernel/trace/fgraph.c3
-rw-r--r--kernel/trace/ftrace.c57
-rw-r--r--kernel/trace/ring_buffer.c223
-rw-r--r--kernel/trace/synth_event_gen_test.c2
-rw-r--r--kernel/trace/trace.c53
-rw-r--r--kernel/trace/trace.h182
-rw-r--r--kernel/trace/trace_benchmark.c6
-rw-r--r--kernel/trace/trace_boot.c2
-rw-r--r--kernel/trace/trace_dynevent.c2
-rw-r--r--kernel/trace/trace_dynevent.h6
-rw-r--r--kernel/trace/trace_entries.h6
-rw-r--r--kernel/trace/trace_event_perf.c15
-rw-r--r--kernel/trace/trace_events.c9
-rw-r--r--kernel/trace/trace_events_filter.c23
-rw-r--r--kernel/trace/trace_events_hist.c2
-rw-r--r--kernel/trace/trace_events_synth.c4
-rw-r--r--kernel/trace/trace_export.c2
-rw-r--r--kernel/trace/trace_functions.c23
-rw-r--r--kernel/trace/trace_functions_graph.c2
-rw-r--r--kernel/trace/trace_hwlat.c4
-rw-r--r--kernel/trace/trace_irqsoff.c2
-rw-r--r--kernel/trace/trace_kprobe.c9
-rw-r--r--kernel/trace/trace_output.c6
-rw-r--r--kernel/trace/trace_output.h1
-rw-r--r--kernel/trace/trace_recursion_record.c236
-rw-r--r--kernel/trace/trace_sched_wakeup.c2
-rw-r--r--kernel/trace/trace_selftest.c29
-rw-r--r--kernel/trace/trace_stack.c3
-rw-r--r--kernel/trace/tracing_map.c6
-rw-r--r--kernel/trace/tracing_map.h2
80 files changed, 2260 insertions, 1161 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c9f19911be0..aa7368c7eabf 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -34,8 +34,11 @@ KCOV_INSTRUMENT_extable.o := n
KCOV_INSTRUMENT_stacktrace.o := n
# Don't self-instrument.
KCOV_INSTRUMENT_kcov.o := n
+# If sanitizers detect any issues in kcov, it may lead to recursion
+# via printk, etc.
KASAN_SANITIZE_kcov.o := n
KCSAN_SANITIZE_kcov.o := n
+UBSAN_SANITIZE_kcov.o := n
CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector
obj-y += sched/
@@ -122,6 +125,7 @@ obj-$(CONFIG_HAS_IOMEM) += iomem.o
obj-$(CONFIG_RSEQ) += rseq.o
obj-$(CONFIG_WATCH_QUEUE) += watch_queue.o
+obj-$(CONFIG_RESOURCE_KUNIT_TEST) += resource_kunit.o
obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o
CFLAGS_stackleak.o += $(DISABLE_STACKLEAK_PLUGIN)
diff --git a/kernel/acct.c b/kernel/acct.c
index f175df8f6aa4..a64102be2bb0 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -381,9 +381,7 @@ static comp2_t encode_comp2_t(u64 value)
return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1));
}
}
-#endif
-
-#if ACCT_VERSION == 3
+#elif ACCT_VERSION == 3
/*
* encode an u64 into a 32 bit IEEE float
*/
@@ -500,8 +498,7 @@ static void do_acct_process(struct bsd_acct_struct *acct)
/* backward-compatible 16 bit fields */
ac.ac_uid16 = ac.ac_uid;
ac.ac_gid16 = ac.ac_gid;
-#endif
-#if ACCT_VERSION == 3
+#elif ACCT_VERSION == 3
{
struct pid_namespace *ns = acct->ns;
diff --git a/kernel/audit.c b/kernel/audit.c
index 68cee3bc8cfe..1ffc2e059027 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -67,7 +67,7 @@
#define AUDIT_DISABLED -1
#define AUDIT_UNINITIALIZED 0
#define AUDIT_INITIALIZED 1
-static int audit_initialized;
+static int audit_initialized = AUDIT_UNINITIALIZED;
u32 audit_enabled = AUDIT_OFF;
bool audit_ever_enabled = !!AUDIT_OFF;
@@ -523,7 +523,7 @@ static int auditd_set(struct pid *pid, u32 portid, struct net *net)
}
/**
- * kauditd_print_skb - Print the audit record to the ring buffer
+ * kauditd_printk_skb - Print the audit record to the ring buffer
* @skb: audit record
*
* Whatever the reason, this packet may not make it to the auditd connection
@@ -1779,7 +1779,7 @@ unsigned int audit_serial(void)
{
static atomic_t serial = ATOMIC_INIT(0);
- return atomic_add_return(1, &serial);
+ return atomic_inc_return(&serial);
}
static inline void audit_get_stamp(struct audit_context *ctx,
@@ -1865,6 +1865,9 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
}
audit_get_stamp(ab->ctx, &t, &serial);
+ /* cancel dummy context to enable supporting records */
+ if (ctx)
+ ctx->dummy = 0;
audit_log_format(ab, "audit(%llu.%03lu:%u): ",
(unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial);
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index bfcfcd61adb6..5b3f01da172b 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -154,7 +154,7 @@ static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark)
/* Update mark data in audit rules based on fsnotify events. */
static int audit_mark_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
struct inode *inode, struct inode *dir,
- const struct qstr *dname)
+ const struct qstr *dname, u32 cookie)
{
struct audit_fsnotify_mark *audit_mark;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 83e1c07fc99e..6c91902f4f45 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -1037,7 +1037,7 @@ static void evict_chunk(struct audit_chunk *chunk)
static int audit_tree_handle_event(struct fsnotify_mark *mark, u32 mask,
struct inode *inode, struct inode *dir,
- const struct qstr *file_name)
+ const struct qstr *file_name, u32 cookie)
{
return 0;
}
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 246e5ba704c0..2acf7ca49154 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -466,7 +466,7 @@ void audit_remove_watch_rule(struct audit_krule *krule)
/* Update watch data in audit rules based on fsnotify events. */
static int audit_watch_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
struct inode *inode, struct inode *dir,
- const struct qstr *dname)
+ const struct qstr *dname, u32 cookie)
{
struct audit_parent *parent;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index c00aa5837965..ce8c9e2279ba 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -102,8 +102,6 @@ struct audit_aux_data {
int type;
};
-#define AUDIT_AUX_IPCPERM 0
-
/* Number of target pids per aux struct. */
#define AUDIT_AUX_PIDS 16
@@ -552,11 +550,11 @@ static int audit_filter_rules(struct task_struct *tsk,
break;
case AUDIT_EXIT:
- if (ctx && ctx->return_valid)
+ if (ctx && ctx->return_valid != AUDITSC_INVALID)
result = audit_comparator(ctx->return_code, f->op, f->val);
break;
case AUDIT_SUCCESS:
- if (ctx && ctx->return_valid) {
+ if (ctx && ctx->return_valid != AUDITSC_INVALID) {
if (f->val)
result = audit_comparator(ctx->return_valid, f->op, AUDITSC_SUCCESS);
else
@@ -929,6 +927,8 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
INIT_LIST_HEAD(&context->killed_trees);
INIT_LIST_HEAD(&context->names_list);
+ context->fds[0] = -1;
+ context->return_valid = AUDITSC_INVALID;
return context;
}
@@ -1367,7 +1367,10 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,
/* name was specified as a relative path and the
* directory component is the cwd
*/
- audit_log_d_path(ab, " name=", &context->pwd);
+ if (context->pwd.dentry && context->pwd.mnt)
+ audit_log_d_path(ab, " name=", &context->pwd);
+ else
+ audit_log_format(ab, " name=(null)");
break;
default:
/* log the name's directory component */
@@ -1435,9 +1438,6 @@ static void audit_log_proctitle(void)
struct audit_context *context = audit_context();
struct audit_buffer *ab;
- if (!context || context->dummy)
- return;
-
ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE);
if (!ab)
return; /* audit_panic or being filtered */
@@ -1487,7 +1487,7 @@ static void audit_log_exit(void)
context->arch, context->major);
if (context->personality != PER_LINUX)
audit_log_format(ab, " per=%lx", context->personality);
- if (context->return_valid)
+ if (context->return_valid != AUDITSC_INVALID)
audit_log_format(ab, " success=%s exit=%ld",
(context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
context->return_code);
@@ -1624,7 +1624,7 @@ void __audit_free(struct task_struct *tsk)
* need to log via audit_log_exit().
*/
if (tsk == current && !context->dummy && context->in_syscall) {
- context->return_valid = 0;
+ context->return_valid = AUDITSC_INVALID;
context->return_code = 0;
audit_filter_syscall(tsk, context,
@@ -1866,6 +1866,8 @@ static struct audit_names *audit_alloc_name(struct audit_context *context,
list_add_tail(&aname->list, &context->names_list);
context->name_count++;
+ if (!context->pwd.dentry)
+ get_fs_pwd(current->fs, &context->pwd);
return aname;
}
@@ -1894,20 +1896,6 @@ __audit_reusename(const __user char *uptr)
return NULL;
}
-inline void _audit_getcwd(struct audit_context *context)
-{
- if (!context->pwd.dentry)
- get_fs_pwd(current->fs, &context->pwd);
-}
-
-void __audit_getcwd(void)
-{
- struct audit_context *context = audit_context();
-
- if (context->in_syscall)
- _audit_getcwd(context);
-}
-
/**
* __audit_getname - add a name to the list
* @name: name to add
@@ -1931,8 +1919,6 @@ void __audit_getname(struct filename *name)
n->name_len = AUDIT_NAME_FULL;
name->aname = n;
name->refcnt++;
-
- _audit_getcwd(context);
}
static inline int audit_copy_fcaps(struct audit_names *name,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 287be337d5f6..4caf06fe4152 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3874,7 +3874,6 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
pid_t pid = attr->task_fd_query.pid;
u32 fd = attr->task_fd_query.fd;
const struct perf_event *event;
- struct files_struct *files;
struct task_struct *task;
struct file *file;
int err;
@@ -3892,23 +3891,11 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
if (!task)
return -ENOENT;
- files = get_files_struct(task);
- put_task_struct(task);
- if (!files)
- return -ENOENT;
-
err = 0;
- spin_lock(&files->file_lock);
- file = fcheck_files(files, fd);
+ file = fget_task(task, fd);
+ put_task_struct(task);
if (!file)
- err = -EBADF;
- else
- get_file(file);
- spin_unlock(&files->file_lock);
- put_files_struct(files);
-
- if (err)
- goto out;
+ return -EBADF;
if (file->f_op == &bpf_link_fops) {
struct bpf_link *link = file->private_data;
@@ -3948,7 +3935,6 @@ out_not_supp:
err = -ENOTSUPP;
put_file:
fput(file);
-out:
return err;
}
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 0458a40edf10..e73c07593024 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -130,7 +130,6 @@ struct bpf_iter_seq_task_file_info {
*/
struct bpf_iter_seq_task_common common;
struct task_struct *task;
- struct files_struct *files;
u32 tid;
u32 fd;
};
@@ -139,37 +138,26 @@ static struct file *
task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
{
struct pid_namespace *ns = info->common.ns;
- u32 curr_tid = info->tid, max_fds;
- struct files_struct *curr_files;
+ u32 curr_tid = info->tid;
struct task_struct *curr_task;
- int curr_fd = info->fd;
+ unsigned int curr_fd = info->fd;
/* If this function returns a non-NULL file object,
- * it held a reference to the task/files_struct/file.
+ * it held a reference to the task/file.
* Otherwise, it does not hold any reference.
*/
again:
if (info->task) {
curr_task = info->task;
- curr_files = info->files;
curr_fd = info->fd;
} else {
curr_task = task_seq_get_next(ns, &curr_tid, true);
if (!curr_task) {
info->task = NULL;
- info->files = NULL;
return NULL;
}
- curr_files = get_files_struct(curr_task);
- if (!curr_files) {
- put_task_struct(curr_task);
- curr_tid = ++(info->tid);
- info->fd = 0;
- goto again;
- }
-
- info->files = curr_files;
+ /* set info->task and info->tid */
info->task = curr_task;
if (curr_tid == info->tid) {
curr_fd = info->fd;
@@ -180,13 +168,11 @@ again:
}
rcu_read_lock();
- max_fds = files_fdtable(curr_files)->max_fds;
- for (; curr_fd < max_fds; curr_fd++) {
+ for (;; curr_fd++) {
struct file *f;
-
- f = fcheck_files(curr_files, curr_fd);
+ f = task_lookup_next_fd_rcu(curr_task, &curr_fd);
if (!f)
- continue;
+ break;
if (!get_file_rcu(f))
continue;
@@ -198,10 +184,8 @@ again:
/* the current task is done, go to the next task */
rcu_read_unlock();
- put_files_struct(curr_files);
put_task_struct(curr_task);
info->task = NULL;
- info->files = NULL;
info->fd = 0;
curr_tid = ++(info->tid);
goto again;
@@ -213,7 +197,6 @@ static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
struct file *file;
info->task = NULL;
- info->files = NULL;
file = task_file_seq_get_next(info);
if (file && *pos == 0)
++*pos;
@@ -275,9 +258,7 @@ static void task_file_seq_stop(struct seq_file *seq, void *v)
(void)__task_file_seq_show(seq, v, true);
} else {
fput((struct file *)v);
- put_files_struct(info->files);
put_task_struct(info->task);
- info->files = NULL;
info->task = NULL;
}
}
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 4fcfe0b70c4e..825284baaf46 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -447,6 +447,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_PAGESIZE(PAGE_SIZE);
VMCOREINFO_SYMBOL(init_uts_ns);
+ VMCOREINFO_OFFSET(uts_namespace, name);
VMCOREINFO_SYMBOL(node_online_map);
#ifdef CONFIG_MMU
VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir);
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index c99de4a21458..479fc145acfc 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -20,6 +20,10 @@ config DMA_OPS
config DMA_OPS_BYPASS
bool
+# Lets platform IOMMU driver choose between bypass and IOMMU
+config ARCH_HAS_DMA_MAP_DIRECT
+ bool
+
config NEED_SG_DMA_LENGTH
bool
@@ -75,11 +79,6 @@ config ARCH_HAS_DMA_PREP_COHERENT
config ARCH_HAS_FORCE_DMA_UNENCRYPTED
bool
-config DMA_VIRT_OPS
- bool
- depends on HAS_DMA
- select DMA_OPS
-
config SWIOTLB
bool
select NEED_DMA_MAP_STATE
@@ -225,3 +224,12 @@ config DMA_API_DEBUG_SG
is technically out-of-spec.
If unsure, say N.
+
+config DMA_MAP_BENCHMARK
+ bool "Enable benchmarking of streaming DMA mapping"
+ depends on DEBUG_FS
+ help
+ Provides /sys/kernel/debug/dma_map_benchmark that helps with testing
+ performance of dma_(un)map_page.
+
+ See tools/testing/selftests/dma/dma_map_benchmark.c
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
index dc755ab68aab..0dd65ec1d234 100644
--- a/kernel/dma/Makefile
+++ b/kernel/dma/Makefile
@@ -5,8 +5,8 @@ obj-$(CONFIG_DMA_OPS) += ops_helpers.o
obj-$(CONFIG_DMA_OPS) += dummy.o
obj-$(CONFIG_DMA_CMA) += contiguous.o
obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o
-obj-$(CONFIG_DMA_VIRT_OPS) += virt.o
obj-$(CONFIG_DMA_API_DEBUG) += debug.o
obj-$(CONFIG_SWIOTLB) += swiotlb.o
obj-$(CONFIG_DMA_COHERENT_POOL) += pool.o
obj-$(CONFIG_DMA_REMAP) += remap.o
+obj-$(CONFIG_DMA_MAP_BENCHMARK) += map_benchmark.o
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 16b95ff12e4d..3d63d91cba5c 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -20,7 +20,7 @@
* coders, etc.
*
* Such devices often require big memory buffers (a full HD frame
- * is, for instance, more then 2 mega pixels large, i.e. more than 6
+ * is, for instance, more than 2 mega pixels large, i.e. more than 6
* MB of memory), which makes mechanisms such as kmalloc() or
* alloc_page() ineffective.
*
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 06c111544f61..002268262c9a 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -547,4 +547,3 @@ int dma_direct_set_offset(struct device *dev, phys_addr_t cpu_start,
dev->dma_range_map = map;
return 0;
}
-EXPORT_SYMBOL_GPL(dma_direct_set_offset);
diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c
new file mode 100644
index 000000000000..b1496e744c68
--- /dev/null
+++ b/kernel/dma/map_benchmark.c
@@ -0,0 +1,361 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Hisilicon Limited.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+#include <linux/math64.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/timekeeping.h>
+
+#define DMA_MAP_BENCHMARK _IOWR('d', 1, struct map_benchmark)
+#define DMA_MAP_MAX_THREADS 1024
+#define DMA_MAP_MAX_SECONDS 300
+
+#define DMA_MAP_BIDIRECTIONAL 0
+#define DMA_MAP_TO_DEVICE 1
+#define DMA_MAP_FROM_DEVICE 2
+
+struct map_benchmark {
+ __u64 avg_map_100ns; /* average map latency in 100ns */
+ __u64 map_stddev; /* standard deviation of map latency */
+ __u64 avg_unmap_100ns; /* as above */
+ __u64 unmap_stddev;
+ __u32 threads; /* how many threads will do map/unmap in parallel */
+ __u32 seconds; /* how long the test will last */
+ __s32 node; /* which numa node this benchmark will run on */
+ __u32 dma_bits; /* DMA addressing capability */
+ __u32 dma_dir; /* DMA data direction */
+ __u64 expansion[10]; /* For future use */
+};
+
+struct map_benchmark_data {
+ struct map_benchmark bparam;
+ struct device *dev;
+ struct dentry *debugfs;
+ enum dma_data_direction dir;
+ atomic64_t sum_map_100ns;
+ atomic64_t sum_unmap_100ns;
+ atomic64_t sum_sq_map;
+ atomic64_t sum_sq_unmap;
+ atomic64_t loops;
+};
+
+static int map_benchmark_thread(void *data)
+{
+ void *buf;
+ dma_addr_t dma_addr;
+ struct map_benchmark_data *map = data;
+ int ret = 0;
+
+ buf = (void *)__get_free_page(GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ while (!kthread_should_stop()) {
+ u64 map_100ns, unmap_100ns, map_sq, unmap_sq;
+ ktime_t map_stime, map_etime, unmap_stime, unmap_etime;
+ ktime_t map_delta, unmap_delta;
+
+ /*
+ * for a non-coherent device, if we don't stain them in the
+ * cache, this will give an underestimate of the real-world
+ * overhead of BIDIRECTIONAL or TO_DEVICE mappings;
+ * 66 means evertything goes well! 66 is lucky.
+ */
+ if (map->dir != DMA_FROM_DEVICE)
+ memset(buf, 0x66, PAGE_SIZE);
+
+ map_stime = ktime_get();
+ dma_addr = dma_map_single(map->dev, buf, PAGE_SIZE, map->dir);
+ if (unlikely(dma_mapping_error(map->dev, dma_addr))) {
+ pr_err("dma_map_single failed on %s\n",
+ dev_name(map->dev));
+ ret = -ENOMEM;
+ goto out;
+ }
+ map_etime = ktime_get();
+ map_delta = ktime_sub(map_etime, map_stime);
+
+ unmap_stime = ktime_get();
+ dma_unmap_single(map->dev, dma_addr, PAGE_SIZE, map->dir);
+ unmap_etime = ktime_get();
+ unmap_delta = ktime_sub(unmap_etime, unmap_stime);
+
+ /* calculate sum and sum of squares */
+
+ map_100ns = div64_ul(map_delta, 100);
+ unmap_100ns = div64_ul(unmap_delta, 100);
+ map_sq = map_100ns * map_100ns;
+ unmap_sq = unmap_100ns * unmap_100ns;
+
+ atomic64_add(map_100ns, &map->sum_map_100ns);
+ atomic64_add(unmap_100ns, &map->sum_unmap_100ns);
+ atomic64_add(map_sq, &map->sum_sq_map);
+ atomic64_add(unmap_sq, &map->sum_sq_unmap);
+ atomic64_inc(&map->loops);
+ }
+
+out:
+ free_page((unsigned long)buf);
+ return ret;
+}
+
+static int do_map_benchmark(struct map_benchmark_data *map)
+{
+ struct task_struct **tsk;
+ int threads = map->bparam.threads;
+ int node = map->bparam.node;
+ const cpumask_t *cpu_mask = cpumask_of_node(node);
+ u64 loops;
+ int ret = 0;
+ int i;
+
+ tsk = kmalloc_array(threads, sizeof(*tsk), GFP_KERNEL);
+ if (!tsk)
+ return -ENOMEM;
+
+ get_device(map->dev);
+
+ for (i = 0; i < threads; i++) {
+ tsk[i] = kthread_create_on_node(map_benchmark_thread, map,
+ map->bparam.node, "dma-map-benchmark/%d", i);
+ if (IS_ERR(tsk[i])) {
+ pr_err("create dma_map thread failed\n");
+ ret = PTR_ERR(tsk[i]);
+ goto out;
+ }
+
+ if (node != NUMA_NO_NODE)
+ kthread_bind_mask(tsk[i], cpu_mask);
+ }
+
+ /* clear the old value in the previous benchmark */
+ atomic64_set(&map->sum_map_100ns, 0);
+ atomic64_set(&map->sum_unmap_100ns, 0);
+ atomic64_set(&map->sum_sq_map, 0);
+ atomic64_set(&map->sum_sq_unmap, 0);
+ atomic64_set(&map->loops, 0);
+
+ for (i = 0; i < threads; i++)
+ wake_up_process(tsk[i]);
+
+ msleep_interruptible(map->bparam.seconds * 1000);
+
+ /* wait for the completion of benchmark threads */
+ for (i = 0; i < threads; i++) {
+ ret = kthread_stop(tsk[i]);
+ if (ret)
+ goto out;
+ }
+
+ loops = atomic64_read(&map->loops);
+ if (likely(loops > 0)) {
+ u64 map_variance, unmap_variance;
+ u64 sum_map = atomic64_read(&map->sum_map_100ns);
+ u64 sum_unmap = atomic64_read(&map->sum_unmap_100ns);
+ u64 sum_sq_map = atomic64_read(&map->sum_sq_map);
+ u64 sum_sq_unmap = atomic64_read(&map->sum_sq_unmap);
+
+ /* average latency */
+ map->bparam.avg_map_100ns = div64_u64(sum_map, loops);
+ map->bparam.avg_unmap_100ns = div64_u64(sum_unmap, loops);
+
+ /* standard deviation of latency */
+ map_variance = div64_u64(sum_sq_map, loops) -
+ map->bparam.avg_map_100ns *
+ map->bparam.avg_map_100ns;
+ unmap_variance = div64_u64(sum_sq_unmap, loops) -
+ map->bparam.avg_unmap_100ns *
+ map->bparam.avg_unmap_100ns;
+ map->bparam.map_stddev = int_sqrt64(map_variance);
+ map->bparam.unmap_stddev = int_sqrt64(unmap_variance);
+ }
+
+out:
+ put_device(map->dev);
+ kfree(tsk);
+ return ret;
+}
+
+static long map_benchmark_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct map_benchmark_data *map = file->private_data;
+ void __user *argp = (void __user *)arg;
+ u64 old_dma_mask;
+
+ int ret;
+
+ if (copy_from_user(&map->bparam, argp, sizeof(map->bparam)))
+ return -EFAULT;
+
+ switch (cmd) {
+ case DMA_MAP_BENCHMARK:
+ if (map->bparam.threads == 0 ||
+ map->bparam.threads > DMA_MAP_MAX_THREADS) {
+ pr_err("invalid thread number\n");
+ return -EINVAL;
+ }
+
+ if (map->bparam.seconds == 0 ||
+ map->bparam.seconds > DMA_MAP_MAX_SECONDS) {
+ pr_err("invalid duration seconds\n");
+ return -EINVAL;
+ }
+
+ if (map->bparam.node != NUMA_NO_NODE &&
+ !node_possible(map->bparam.node)) {
+ pr_err("invalid numa node\n");
+ return -EINVAL;
+ }
+
+ switch (map->bparam.dma_dir) {
+ case DMA_MAP_BIDIRECTIONAL:
+ map->dir = DMA_BIDIRECTIONAL;
+ break;
+ case DMA_MAP_FROM_DEVICE:
+ map->dir = DMA_FROM_DEVICE;
+ break;
+ case DMA_MAP_TO_DEVICE:
+ map->dir = DMA_TO_DEVICE;
+ break;
+ default:
+ pr_err("invalid DMA direction\n");
+ return -EINVAL;
+ }
+
+ old_dma_mask = dma_get_mask(map->dev);
+
+ ret = dma_set_mask(map->dev,
+ DMA_BIT_MASK(map->bparam.dma_bits));
+ if (ret) {
+ pr_err("failed to set dma_mask on device %s\n",
+ dev_name(map->dev));
+ return -EINVAL;
+ }
+
+ ret = do_map_benchmark(map);
+
+ /*
+ * restore the original dma_mask as many devices' dma_mask are
+ * set by architectures, acpi, busses. When we bind them back
+ * to their original drivers, those drivers shouldn't see
+ * dma_mask changed by benchmark
+ */
+ dma_set_mask(map->dev, old_dma_mask);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (copy_to_user(argp, &map->bparam, sizeof(map->bparam)))
+ return -EFAULT;
+
+ return ret;
+}
+
+static const struct file_operations map_benchmark_fops = {
+ .open = simple_open,
+ .unlocked_ioctl = map_benchmark_ioctl,
+};
+
+static void map_benchmark_remove_debugfs(void *data)
+{
+ struct map_benchmark_data *map = (struct map_benchmark_data *)data;
+
+ debugfs_remove(map->debugfs);
+}
+
+static int __map_benchmark_probe(struct device *dev)
+{
+ struct dentry *entry;
+ struct map_benchmark_data *map;
+ int ret;
+
+ map = devm_kzalloc(dev, sizeof(*map), GFP_KERNEL);
+ if (!map)
+ return -ENOMEM;
+ map->dev = dev;
+
+ ret = devm_add_action(dev, map_benchmark_remove_debugfs, map);
+ if (ret) {
+ pr_err("Can't add debugfs remove action\n");
+ return ret;
+ }
+
+ /*
+ * we only permit a device bound with this driver, 2nd probe
+ * will fail
+ */
+ entry = debugfs_create_file("dma_map_benchmark", 0600, NULL, map,
+ &map_benchmark_fops);
+ if (IS_ERR(entry))
+ return PTR_ERR(entry);
+ map->debugfs = entry;
+
+ return 0;
+}
+
+static int map_benchmark_platform_probe(struct platform_device *pdev)
+{
+ return __map_benchmark_probe(&pdev->dev);
+}
+
+static struct platform_driver map_benchmark_platform_driver = {
+ .driver = {
+ .name = "dma_map_benchmark",
+ },
+ .probe = map_benchmark_platform_probe,
+};
+
+static int
+map_benchmark_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+ return __map_benchmark_probe(&pdev->dev);
+}
+
+static struct pci_driver map_benchmark_pci_driver = {
+ .name = "dma_map_benchmark",
+ .probe = map_benchmark_pci_probe,
+};
+
+static int __init map_benchmark_init(void)
+{
+ int ret;
+
+ ret = pci_register_driver(&map_benchmark_pci_driver);
+ if (ret)
+ return ret;
+
+ ret = platform_driver_register(&map_benchmark_platform_driver);
+ if (ret) {
+ pci_unregister_driver(&map_benchmark_pci_driver);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void __exit map_benchmark_cleanup(void)
+{
+ platform_driver_unregister(&map_benchmark_platform_driver);
+ pci_unregister_driver(&map_benchmark_pci_driver);
+}
+
+module_init(map_benchmark_init);
+module_exit(map_benchmark_cleanup);
+
+MODULE_AUTHOR("Barry Song <song.bao.hua@hisilicon.com>");
+MODULE_DESCRIPTION("dma_map benchmark driver");
+MODULE_LICENSE("GPL");
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 51bb8fa8eb89..f87a89d08654 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -149,7 +149,8 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
if (WARN_ON_ONCE(!dev->dma_mask))
return DMA_MAPPING_ERROR;
- if (dma_map_direct(dev, ops))
+ if (dma_map_direct(dev, ops) ||
+ arch_dma_map_page_direct(dev, page_to_phys(page) + offset + size))
addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
else
addr = ops->map_page(dev, page, offset, size, dir, attrs);
@@ -165,7 +166,8 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size,
const struct dma_map_ops *ops = get_dma_ops(dev);
BUG_ON(!valid_dma_direction(dir));
- if (dma_map_direct(dev, ops))
+ if (dma_map_direct(dev, ops) ||
+ arch_dma_unmap_page_direct(dev, addr + size))
dma_direct_unmap_page(dev, addr, size, dir, attrs);
else if (ops->unmap_page)
ops->unmap_page(dev, addr, size, dir, attrs);
@@ -188,7 +190,8 @@ int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents,
if (WARN_ON_ONCE(!dev->dma_mask))
return 0;
- if (dma_map_direct(dev, ops))
+ if (dma_map_direct(dev, ops) ||
+ arch_dma_map_sg_direct(dev, sg, nents))
ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
else
ents = ops->map_sg(dev, sg, nents, dir, attrs);
@@ -207,7 +210,8 @@ void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
BUG_ON(!valid_dma_direction(dir));
debug_dma_unmap_sg(dev, sg, nents, dir);
- if (dma_map_direct(dev, ops))
+ if (dma_map_direct(dev, ops) ||
+ arch_dma_unmap_sg_direct(dev, sg, nents))
dma_direct_unmap_sg(dev, sg, nents, dir, attrs);
else if (ops->unmap_sg)
ops->unmap_sg(dev, sg, nents, dir, attrs);
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index d4637f72239b..5f84e6cdb78e 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -38,9 +38,6 @@ static void __init dma_atomic_pool_debugfs_init(void)
struct dentry *root;
root = debugfs_create_dir("dma_pools", NULL);
- if (IS_ERR_OR_NULL(root))
- return;
-
debugfs_create_ulong("pool_size_dma", 0400, root, &pool_size_dma);
debugfs_create_ulong("pool_size_dma32", 0400, root, &pool_size_dma32);
debugfs_create_ulong("pool_size_kernel", 0400, root, &pool_size_kernel);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 781b9dca197c..7c42df6e6100 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -152,8 +152,6 @@ void swiotlb_set_max_segment(unsigned int val)
max_segment = rounddown(val, PAGE_SIZE);
}
-/* default to 64MB */
-#define IO_TLB_DEFAULT_SIZE (64UL<<20)
unsigned long swiotlb_size_or_default(void)
{
unsigned long size;
@@ -163,6 +161,24 @@ unsigned long swiotlb_size_or_default(void)
return size ? size : (IO_TLB_DEFAULT_SIZE);
}
+void __init swiotlb_adjust_size(unsigned long new_size)
+{
+ unsigned long size;
+
+ /*
+ * If swiotlb parameter has not been specified, give a chance to
+ * architectures such as those supporting memory encryption to
+ * adjust/expand SWIOTLB size for their use.
+ */
+ if (!io_tlb_nslabs) {
+ size = ALIGN(new_size, 1 << IO_TLB_SHIFT);
+ io_tlb_nslabs = size >> IO_TLB_SHIFT;
+ io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+
+ pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20);
+ }
+}
+
void swiotlb_print_info(void)
{
unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT;
diff --git a/kernel/dma/virt.c b/kernel/dma/virt.c
deleted file mode 100644
index 59d32317dd57..000000000000
--- a/kernel/dma/virt.c
+++ /dev/null
@@ -1,61 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * DMA operations that map to virtual addresses without flushing memory.
- */
-#include <linux/export.h>
-#include <linux/mm.h>
-#include <linux/dma-map-ops.h>
-#include <linux/scatterlist.h>
-
-static void *dma_virt_alloc(struct device *dev, size_t size,
- dma_addr_t *dma_handle, gfp_t gfp,
- unsigned long attrs)
-{
- void *ret;
-
- ret = (void *)__get_free_pages(gfp | __GFP_ZERO, get_order(size));
- if (ret)
- *dma_handle = (uintptr_t)ret;
- return ret;
-}
-
-static void dma_virt_free(struct device *dev, size_t size,
- void *cpu_addr, dma_addr_t dma_addr,
- unsigned long attrs)
-{
- free_pages((unsigned long)cpu_addr, get_order(size));
-}
-
-static dma_addr_t dma_virt_map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size,
- enum dma_data_direction dir,
- unsigned long attrs)
-{
- return (uintptr_t)(page_address(page) + offset);
-}
-
-static int dma_virt_map_sg(struct device *dev, struct scatterlist *sgl,
- int nents, enum dma_data_direction dir,
- unsigned long attrs)
-{
- int i;
- struct scatterlist *sg;
-
- for_each_sg(sgl, sg, nents, i) {
- BUG_ON(!sg_page(sg));
- sg_dma_address(sg) = (uintptr_t)sg_virt(sg);
- sg_dma_len(sg) = sg->length;
- }
-
- return nents;
-}
-
-const struct dma_map_ops dma_virt_ops = {
- .alloc = dma_virt_alloc,
- .free = dma_virt_free,
- .map_page = dma_virt_map_page,
- .map_sg = dma_virt_map_sg,
- .alloc_pages = dma_common_alloc_pages,
- .free_pages = dma_common_free_pages,
-};
-EXPORT_SYMBOL(dma_virt_ops);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 19ae6c931c52..55d18791a72d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1327,7 +1327,7 @@ static void put_ctx(struct perf_event_context *ctx)
* function.
*
* Lock order:
- * exec_update_mutex
+ * exec_update_lock
* task_struct::perf_event_mutex
* perf_event_context::mutex
* perf_event::child_mutex;
@@ -11959,14 +11959,14 @@ SYSCALL_DEFINE5(perf_event_open,
}
if (task) {
- err = mutex_lock_interruptible(&task->signal->exec_update_mutex);
+ err = down_read_interruptible(&task->signal->exec_update_lock);
if (err)
goto err_file;
/*
* Preserve ptrace permission check for backwards compatibility.
*
- * We must hold exec_update_mutex across this and any potential
+ * We must hold exec_update_lock across this and any potential
* perf_install_in_context() call for this new event to
* serialize against exec() altering our credentials (and the
* perf_event_exit_task() that could imply).
@@ -12129,7 +12129,7 @@ SYSCALL_DEFINE5(perf_event_open,
mutex_unlock(&ctx->mutex);
if (task) {
- mutex_unlock(&task->signal->exec_update_mutex);
+ up_read(&task->signal->exec_update_lock);
put_task_struct(task);
}
@@ -12153,7 +12153,7 @@ err_locked:
mutex_unlock(&ctx->mutex);
err_cred:
if (task)
- mutex_unlock(&task->signal->exec_update_mutex);
+ up_read(&task->signal->exec_update_lock);
err_file:
fput(event_file);
err_context:
@@ -12470,7 +12470,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
/*
* When a child task exits, feed back event values to parent events.
*
- * Can be called with exec_update_mutex held when called from
+ * Can be called with exec_update_lock held when called from
* setup_new_exec().
*/
void perf_event_exit_task(struct task_struct *child)
diff --git a/kernel/fail_function.c b/kernel/fail_function.c
index b0b1ad93fa95..60dc825ecc2b 100644
--- a/kernel/fail_function.c
+++ b/kernel/fail_function.c
@@ -37,9 +37,7 @@ static unsigned long adjust_error_retval(unsigned long addr, unsigned long retv)
{
switch (get_injectable_error_type(addr)) {
case EI_ETYPE_NULL:
- if (retv != 0)
- return 0;
- break;
+ return 0;
case EI_ETYPE_ERRNO:
if (retv < (unsigned long)-MAX_ERRNO)
return (unsigned long)-EINVAL;
@@ -48,6 +46,8 @@ static unsigned long adjust_error_retval(unsigned long addr, unsigned long retv)
if (retv != 0 && retv < (unsigned long)-MAX_ERRNO)
return (unsigned long)-EINVAL;
break;
+ case EI_ETYPE_TRUE:
+ return 1;
}
return retv;
diff --git a/kernel/fork.c b/kernel/fork.c
index 7425b3224891..37720a6d04ea 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -225,8 +225,8 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
if (!s)
continue;
- /* Clear the KASAN shadow of the stack. */
- kasan_unpoison_shadow(s->addr, THREAD_SIZE);
+ /* Mark stack accessible for KASAN. */
+ kasan_unpoison_range(s->addr, THREAD_SIZE);
/* Clear stale pointers from reused stack. */
memset(s->addr, 0, THREAD_SIZE);
@@ -1225,7 +1225,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
struct mm_struct *mm;
int err;
- err = mutex_lock_killable(&task->signal->exec_update_mutex);
+ err = down_read_killable(&task->signal->exec_update_lock);
if (err)
return ERR_PTR(err);
@@ -1235,7 +1235,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
mmput(mm);
mm = ERR_PTR(-EACCES);
}
- mutex_unlock(&task->signal->exec_update_mutex);
+ up_read(&task->signal->exec_update_lock);
return mm;
}
@@ -1595,7 +1595,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
sig->oom_score_adj_min = current->signal->oom_score_adj_min;
mutex_init(&sig->cred_guard_mutex);
- mutex_init(&sig->exec_update_mutex);
+ init_rwsem(&sig->exec_update_lock);
return 0;
}
@@ -3031,21 +3031,21 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
* the exec layer of the kernel.
*/
-int unshare_files(struct files_struct **displaced)
+int unshare_files(void)
{
struct task_struct *task = current;
- struct files_struct *copy = NULL;
+ struct files_struct *old, *copy = NULL;
int error;
error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &copy);
- if (error || !copy) {
- *displaced = NULL;
+ if (error || !copy)
return error;
- }
- *displaced = task->files;
+
+ old = task->files;
task_lock(task);
task->files = copy;
task_unlock(task);
+ put_files_struct(old);
return 0;
}
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index 53c67c87f141..c53408a00d0b 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -25,10 +25,8 @@
#define GCOV_COUNTERS 9
#elif (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1)
#define GCOV_COUNTERS 10
-#elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9
-#define GCOV_COUNTERS 9
#else
-#define GCOV_COUNTERS 8
+#define GCOV_COUNTERS 9
#endif
#define GCOV_TAG_FUNCTION_LENGTH 3
@@ -229,10 +227,10 @@ int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
/**
* gcov_info_add - add up profiling data
- * @dest: profiling data set to which data is added
- * @source: profiling data set which is added
+ * @dst: profiling data set to which data is added
+ * @src: profiling data set which is added
*
- * Adds profiling counts of @source to @dest.
+ * Adds profiling counts of @src to @dst.
*/
void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
{
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index b3ff9288c6cc..5353edfad8e1 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -61,39 +61,34 @@ static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type)
static struct file *
get_file_raw_ptr(struct task_struct *task, unsigned int idx)
{
- struct file *file = NULL;
+ struct file *file;
- task_lock(task);
rcu_read_lock();
-
- if (task->files)
- file = fcheck_files(task->files, idx);
-
+ file = task_lookup_fd_rcu(task, idx);
rcu_read_unlock();
- task_unlock(task);
return file;
}
-static void kcmp_unlock(struct mutex *m1, struct mutex *m2)
+static void kcmp_unlock(struct rw_semaphore *l1, struct rw_semaphore *l2)
{
- if (likely(m2 != m1))
- mutex_unlock(m2);
- mutex_unlock(m1);
+ if (likely(l2 != l1))
+ up_read(l2);
+ up_read(l1);
}
-static int kcmp_lock(struct mutex *m1, struct mutex *m2)
+static int kcmp_lock(struct rw_semaphore *l1, struct rw_semaphore *l2)
{
int err;
- if (m2 > m1)
- swap(m1, m2);
+ if (l2 > l1)
+ swap(l1, l2);
- err = mutex_lock_killable(m1);
- if (!err && likely(m1 != m2)) {
- err = mutex_lock_killable_nested(m2, SINGLE_DEPTH_NESTING);
+ err = down_read_killable(l1);
+ if (!err && likely(l1 != l2)) {
+ err = down_read_killable_nested(l2, SINGLE_DEPTH_NESTING);
if (err)
- mutex_unlock(m1);
+ up_read(l1);
}
return err;
@@ -107,7 +102,6 @@ static int kcmp_epoll_target(struct task_struct *task1,
{
struct file *filp, *filp_epoll, *filp_tgt;
struct kcmp_epoll_slot slot;
- struct files_struct *files;
if (copy_from_user(&slot, uslot, sizeof(slot)))
return -EFAULT;
@@ -116,23 +110,12 @@ static int kcmp_epoll_target(struct task_struct *task1,
if (!filp)
return -EBADF;
- files = get_files_struct(task2);
- if (!files)
+ filp_epoll = fget_task(task2, slot.efd);
+ if (!filp_epoll)
return -EBADF;
- spin_lock(&files->file_lock);
- filp_epoll = fcheck_files(files, slot.efd);
- if (filp_epoll)
- get_file(filp_epoll);
- else
- filp_tgt = ERR_PTR(-EBADF);
- spin_unlock(&files->file_lock);
- put_files_struct(files);
-
- if (filp_epoll) {
- filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff);
- fput(filp_epoll);
- }
+ filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff);
+ fput(filp_epoll);
if (IS_ERR(filp_tgt))
return PTR_ERR(filp_tgt);
@@ -173,8 +156,8 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
/*
* One should have enough rights to inspect task details.
*/
- ret = kcmp_lock(&task1->signal->exec_update_mutex,
- &task2->signal->exec_update_mutex);
+ ret = kcmp_lock(&task1->signal->exec_update_lock,
+ &task2->signal->exec_update_lock);
if (ret)
goto err;
if (!ptrace_may_access(task1, PTRACE_MODE_READ_REALCREDS) ||
@@ -229,8 +212,8 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
}
err_unlock:
- kcmp_unlock(&task1->signal->exec_update_mutex,
- &task2->signal->exec_update_mutex);
+ kcmp_unlock(&task1->signal->exec_update_lock,
+ &task2->signal->exec_update_lock);
err:
put_task_struct(task1);
put_task_struct(task2);
diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig
index 54102deb50ba..53d51ed619a3 100644
--- a/kernel/livepatch/Kconfig
+++ b/kernel/livepatch/Kconfig
@@ -6,7 +6,7 @@ config HAVE_LIVEPATCH
config LIVEPATCH
bool "Kernel Live Patching"
- depends on DYNAMIC_FTRACE_WITH_REGS
+ depends on DYNAMIC_FTRACE_WITH_REGS || DYNAMIC_FTRACE_WITH_ARGS
depends on MODULES
depends on SYSFS
depends on KALLSYMS_ALL
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index b552cf2d85f8..e8029aea67f1 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -40,14 +40,18 @@ struct klp_ops *klp_find_ops(void *old_func)
static void notrace klp_ftrace_handler(unsigned long ip,
unsigned long parent_ip,
struct ftrace_ops *fops,
- struct pt_regs *regs)
+ struct ftrace_regs *fregs)
{
struct klp_ops *ops;
struct klp_func *func;
int patch_state;
+ int bit;
ops = container_of(fops, struct klp_ops, fops);
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (WARN_ON_ONCE(bit < 0))
+ return;
/*
* A variant of synchronize_rcu() is used to allow patching functions
* where RCU is not watching, see klp_synchronize_transition().
@@ -113,10 +117,11 @@ static void notrace klp_ftrace_handler(unsigned long ip,
if (func->nop)
goto unlock;
- klp_arch_set_pc(regs, (unsigned long)func->new_func);
+ klp_arch_set_pc(fregs, (unsigned long)func->new_func);
unlock:
preempt_enable_notrace();
+ ftrace_test_recursion_unlock(bit);
}
/*
@@ -194,8 +199,10 @@ static int klp_patch_func(struct klp_func *func)
return -ENOMEM;
ops->fops.func = klp_ftrace_handler;
- ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS |
- FTRACE_OPS_FL_DYNAMIC |
+ ops->fops.flags = FTRACE_OPS_FL_DYNAMIC |
+#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
+ FTRACE_OPS_FL_SAVE_REGS |
+#endif
FTRACE_OPS_FL_IPMODIFY |
FTRACE_OPS_FL_PERMANENT;
diff --git a/kernel/module.c b/kernel/module.c
index c3a9e972d3b2..4bf30e4b3eaa 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1,9 +1,8 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
- Copyright (C) 2002 Richard Henderson
- Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
-
-*/
+ * Copyright (C) 2002 Richard Henderson
+ * Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
+ */
#define INCLUDE_VERMAGIC
@@ -86,7 +85,8 @@
* 1) List of modules (also safely readable with preempt_disable),
* 2) module_use links,
* 3) module_addr_min/module_addr_max.
- * (delete and add uses RCU list operations). */
+ * (delete and add uses RCU list operations).
+ */
DEFINE_MUTEX(module_mutex);
EXPORT_SYMBOL_GPL(module_mutex);
static LIST_HEAD(modules);
@@ -615,8 +615,10 @@ static bool find_exported_symbol_in_section(const struct symsearch *syms,
return false;
}
-/* Find an exported symbol and return it, along with, (optional) crc and
- * (optional) module which owns it. Needs preempt disabled or module_mutex. */
+/*
+ * Find an exported symbol and return it, along with, (optional) crc and
+ * (optional) module which owns it. Needs preempt disabled or module_mutex.
+ */
static const struct kernel_symbol *find_symbol(const char *name,
struct module **owner,
const s32 **crc,
@@ -756,13 +758,12 @@ bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
}
/**
- * is_module_percpu_address - test whether address is from module static percpu
+ * is_module_percpu_address() - test whether address is from module static percpu
* @addr: address to test
*
* Test whether @addr belongs to module static percpu area.
*
- * RETURNS:
- * %true if @addr is from module static percpu area
+ * Return: %true if @addr is from module static percpu area
*/
bool is_module_percpu_address(unsigned long addr)
{
@@ -986,11 +987,10 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
}
/**
- * module_refcount - return the refcount or -1 if unloading
- *
+ * module_refcount() - return the refcount or -1 if unloading
* @mod: the module we're checking
*
- * Returns:
+ * Return:
* -1 if the module is in the process of unloading
* otherwise the number of references in the kernel to the module
*/
@@ -1675,8 +1675,10 @@ static void remove_sect_attrs(struct module *mod)
if (mod->sect_attrs) {
sysfs_remove_group(&mod->mkobj.kobj,
&mod->sect_attrs->grp);
- /* We are positive that no one is using any sect attrs
- * at this point. Deallocate immediately. */
+ /*
+ * We are positive that no one is using any sect attrs
+ * at this point. Deallocate immediately.
+ */
free_sect_attrs(mod->sect_attrs);
mod->sect_attrs = NULL;
}
@@ -1924,7 +1926,6 @@ static int mod_sysfs_init(struct module *mod)
if (err)
mod_kobject_put(mod);
- /* delay uevent until full sysfs population */
out:
return err;
}
@@ -1961,7 +1962,6 @@ static int mod_sysfs_setup(struct module *mod,
add_sect_attrs(mod, info);
add_notes_attrs(mod, info);
- kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
return 0;
out_unreg_modinfo_attrs:
@@ -2247,8 +2247,10 @@ static void free_module(struct module *mod)
mod_sysfs_teardown(mod);
- /* We leave it in list to prevent duplicate loads, but make sure
- * that noone uses it while it's being deconstructed. */
+ /*
+ * We leave it in list to prevent duplicate loads, but make sure
+ * that noone uses it while it's being deconstructed.
+ */
mutex_lock(&module_mutex);
mod->state = MODULE_STATE_UNFORMED;
mutex_unlock(&module_mutex);
@@ -2365,8 +2367,10 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
if (!strncmp(name, "__gnu_lto", 9))
break;
- /* We compiled with -fno-common. These are not
- supposed to happen. */
+ /*
+ * We compiled with -fno-common. These are not
+ * supposed to happen.
+ */
pr_debug("Common symbol: %s\n", name);
pr_warn("%s: please compile with -fno-common\n",
mod->name);
@@ -2469,16 +2473,20 @@ static long get_offset(struct module *mod, unsigned int *size,
return ret;
}
-/* Lay out the SHF_ALLOC sections in a way not dissimilar to how ld
- might -- code, read-only data, read-write data, small data. Tally
- sizes, and place the offsets into sh_entsize fields: high bit means it
- belongs in init. */
+/*
+ * Lay out the SHF_ALLOC sections in a way not dissimilar to how ld
+ * might -- code, read-only data, read-write data, small data. Tally
+ * sizes, and place the offsets into sh_entsize fields: high bit means it
+ * belongs in init.
+ */
static void layout_sections(struct module *mod, struct load_info *info)
{
static unsigned long const masks[][2] = {
- /* NOTE: all executable code must be the first section
+ /*
+ * NOTE: all executable code must be the first section
* in this array; otherwise modify the text_size
- * finder in the two loops below */
+ * finder in the two loops below
+ */
{ SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL },
{ SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL },
{ SHF_RO_AFTER_INIT | SHF_ALLOC, ARCH_SHF_SMALL },
@@ -2924,40 +2932,43 @@ static int module_sig_check(struct load_info *info, int flags)
/* We truncate the module to discard the signature */
info->len -= markerlen;
err = mod_verify_sig(mod, info);
+ if (!err) {
+ info->sig_ok = true;
+ return 0;
+ }
}
+ /*
+ * We don't permit modules to be loaded into the trusted kernels
+ * without a valid signature on them, but if we're not enforcing,
+ * certain errors are non-fatal.
+ */
switch (err) {
- case 0:
- info->sig_ok = true;
- return 0;
-
- /* We don't permit modules to be loaded into trusted kernels
- * without a valid signature on them, but if we're not
- * enforcing, certain errors are non-fatal.
- */
case -ENODATA:
- reason = "Loading of unsigned module";
- goto decide;
+ reason = "unsigned module";
+ break;
case -ENOPKG:
- reason = "Loading of module with unsupported crypto";
- goto decide;
+ reason = "module with unsupported crypto";
+ break;
case -ENOKEY:
- reason = "Loading of module with unavailable key";
- decide:
- if (is_module_sig_enforced()) {
- pr_notice("%s: %s is rejected\n", info->name, reason);
- return -EKEYREJECTED;
- }
-
- return security_locked_down(LOCKDOWN_MODULE_SIGNATURE);
+ reason = "module with unavailable key";
+ break;
- /* All other errors are fatal, including nomem, unparseable
- * signatures and signature check failures - even if signatures
- * aren't required.
- */
default:
+ /*
+ * All other errors are fatal, including lack of memory,
+ * unparseable signatures, and signature check failures --
+ * even if signatures aren't required.
+ */
return err;
}
+
+ if (is_module_sig_enforced()) {
+ pr_notice("%s: loading of %s is rejected\n", info->name, reason);
+ return -EKEYREJECTED;
+ }
+
+ return security_locked_down(LOCKDOWN_MODULE_SIGNATURE);
}
#else /* !CONFIG_MODULE_SIG */
static int module_sig_check(struct load_info *info, int flags)
@@ -3090,8 +3101,10 @@ static int rewrite_section_headers(struct load_info *info, int flags)
return -ENOEXEC;
}
- /* Mark all sections sh_addr with their address in the
- temporary image. */
+ /*
+ * Mark all sections sh_addr with their address in the
+ * temporary image.
+ */
shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset;
#ifndef CONFIG_MODULE_UNLOAD
@@ -3525,9 +3538,11 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
if (ndx)
info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
- /* Determine total sizes, and put offsets in sh_entsize. For now
- this is done generically; there doesn't appear to be any
- special cases for the architectures. */
+ /*
+ * Determine total sizes, and put offsets in sh_entsize. For now
+ * this is done generically; there doesn't appear to be any
+ * special cases for the architectures.
+ */
layout_sections(info->mod, info);
layout_symtab(info->mod, info);
@@ -3671,6 +3686,9 @@ static noinline int do_init_module(struct module *mod)
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_LIVE, mod);
+ /* Delay uevent until module has finished its init routine */
+ kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
+
/*
* We need to finish all async code before the module init sequence
* is done. This has potential to deadlock. For example, a newly
@@ -3815,8 +3833,10 @@ static int complete_formation(struct module *mod, struct load_info *info)
module_enable_nx(mod);
module_enable_x(mod);
- /* Mark state as coming so strong_try_module_get() ignores us,
- * but kallsyms etc. can see us. */
+ /*
+ * Mark state as coming so strong_try_module_get() ignores us,
+ * but kallsyms etc. can see us.
+ */
mod->state = MODULE_STATE_COMING;
mutex_unlock(&module_mutex);
@@ -3863,8 +3883,10 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname,
return 0;
}
-/* Allocate and load the module: note that size of section 0 is always
- zero, and we rely on this for optional sections. */
+/*
+ * Allocate and load the module: note that size of section 0 is always
+ * zero, and we rely on this for optional sections.
+ */
static int load_module(struct load_info *info, const char __user *uargs,
int flags)
{
@@ -3938,8 +3960,10 @@ static int load_module(struct load_info *info, const char __user *uargs,
init_param_lock(mod);
- /* Now we've got everything in the final locations, we can
- * find optional sections. */
+ /*
+ * Now we've got everything in the final locations, we can
+ * find optional sections.
+ */
err = find_module_sections(mod, info);
if (err)
goto free_unload;
@@ -4027,6 +4051,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
MODULE_STATE_GOING, mod);
klp_module_going(mod);
bug_cleanup:
+ mod->state = MODULE_STATE_GOING;
/* module_bug_cleanup needs module_mutex protection */
mutex_lock(&module_mutex);
module_bug_cleanup(mod);
@@ -4152,8 +4177,10 @@ static const char *find_kallsyms_symbol(struct module *mod,
bestval = kallsyms_symbol_value(&kallsyms->symtab[best]);
- /* Scan for closest preceding symbol, and next symbol. (ELF
- starts real symbols at 1). */
+ /*
+ * Scan for closest preceding symbol, and next symbol. (ELF
+ * starts real symbols at 1).
+ */
for (i = 1; i < kallsyms->num_symtab; i++) {
const Elf_Sym *sym = &kallsyms->symtab[i];
unsigned long thisval = kallsyms_symbol_value(sym);
@@ -4161,8 +4188,10 @@ static const char *find_kallsyms_symbol(struct module *mod,
if (sym->st_shndx == SHN_UNDEF)
continue;
- /* We ignore unnamed symbols: they're uninformative
- * and inserted at a whim. */
+ /*
+ * We ignore unnamed symbols: they're uninformative
+ * and inserted at a whim.
+ */
if (*kallsyms_symbol_name(kallsyms, i) == '\0'
|| is_arm_mapping_symbol(kallsyms_symbol_name(kallsyms, i)))
continue;
@@ -4192,8 +4221,10 @@ void * __weak dereference_module_function_descriptor(struct module *mod,
return ptr;
}
-/* For kallsyms to ask for address resolution. NULL means not found. Careful
- * not to lock to avoid deadlock on oopses, simply disable preemption. */
+/*
+ * For kallsyms to ask for address resolution. NULL means not found. Careful
+ * not to lock to avoid deadlock on oopses, simply disable preemption.
+ */
const char *module_address_lookup(unsigned long addr,
unsigned long *size,
unsigned long *offset,
@@ -4451,11 +4482,12 @@ static int m_show(struct seq_file *m, void *p)
return 0;
}
-/* Format: modulename size refcount deps address
-
- Where refcount is a number or -, and deps is a comma-separated list
- of depends or -.
-*/
+/*
+ * Format: modulename size refcount deps address
+ *
+ * Where refcount is a number or -, and deps is a comma-separated list
+ * of depends or -.
+ */
static const struct seq_operations modules_op = {
.start = m_start,
.next = m_next,
@@ -4525,8 +4557,8 @@ out:
return e;
}
-/*
- * is_module_address - is this address inside a module?
+/**
+ * is_module_address() - is this address inside a module?
* @addr: the address to check.
*
* See is_module_text_address() if you simply want to see if the address
@@ -4543,8 +4575,8 @@ bool is_module_address(unsigned long addr)
return ret;
}
-/*
- * __module_address - get the module which contains an address.
+/**
+ * __module_address() - get the module which contains an address.
* @addr: the address.
*
* Must be called with preempt disabled or module mutex held so that
@@ -4568,8 +4600,8 @@ struct module *__module_address(unsigned long addr)
return mod;
}
-/*
- * is_module_text_address - is this address inside module code?
+/**
+ * is_module_text_address() - is this address inside module code?
* @addr: the address to check.
*
* See is_module_address() if you simply want to see if the address is
@@ -4587,8 +4619,8 @@ bool is_module_text_address(unsigned long addr)
return ret;
}
-/*
- * __module_text_address - get the module whose code contains an address.
+/**
+ * __module_text_address() - get the module whose code contains an address.
* @addr: the address.
*
* Must be called with preempt disabled or module mutex held so that
@@ -4627,8 +4659,10 @@ void print_modules(void)
}
#ifdef CONFIG_MODVERSIONS
-/* Generate the signature for all relevant module structures here.
- * If these change, we don't want to try to parse the module. */
+/*
+ * Generate the signature for all relevant module structures here.
+ * If these change, we don't want to try to parse the module.
+ */
void module_layout(struct module *mod,
struct modversion_info *ver,
struct kernel_param *kp,
diff --git a/kernel/params.c b/kernel/params.c
index 164d79330849..2daa2780a92c 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -843,18 +843,16 @@ ssize_t __modver_version_show(struct module_attribute *mattr,
return scnprintf(buf, PAGE_SIZE, "%s\n", vattr->version);
}
-extern const struct module_version_attribute *__start___modver[];
-extern const struct module_version_attribute *__stop___modver[];
+extern const struct module_version_attribute __start___modver[];
+extern const struct module_version_attribute __stop___modver[];
static void __init version_sysfs_builtin(void)
{
- const struct module_version_attribute **p;
+ const struct module_version_attribute *vattr;
struct module_kobject *mk;
int err;
- for (p = __start___modver; p < __stop___modver; p++) {
- const struct module_version_attribute *vattr = *p;
-
+ for (vattr = __start___modver; vattr < __stop___modver; vattr++) {
mk = locate_module_kobject(vattr->module_name);
if (mk) {
err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr);
diff --git a/kernel/pid.c b/kernel/pid.c
index 47466d0bbc5b..ebdf9c60cd0b 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -628,7 +628,7 @@ static struct file *__pidfd_fget(struct task_struct *task, int fd)
struct file *file;
int ret;
- ret = mutex_lock_killable(&task->signal->exec_update_mutex);
+ ret = down_read_killable(&task->signal->exec_update_lock);
if (ret)
return ERR_PTR(ret);
@@ -637,7 +637,7 @@ static struct file *__pidfd_fget(struct task_struct *task, int fd)
else
file = ERR_PTR(-EPERM);
- mutex_unlock(&task->signal->exec_update_mutex);
+ up_read(&task->signal->exec_update_lock);
return file ?: ERR_PTR(-EBADF);
}
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 28713dda3f6b..ffdd0dc7ec6d 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -491,52 +491,6 @@ static void truncate_msg(u16 *text_len, u16 *trunc_msg_len)
*trunc_msg_len = 0;
}
-/* insert record into the buffer, discard old ones, update heads */
-static int log_store(u32 caller_id, int facility, int level,
- enum log_flags flags, u64 ts_nsec,
- const struct dev_printk_info *dev_info,
- const char *text, u16 text_len)
-{
- struct prb_reserved_entry e;
- struct printk_record r;
- u16 trunc_msg_len = 0;
-
- prb_rec_init_wr(&r, text_len);
-
- if (!prb_reserve(&e, prb, &r)) {
- /* truncate the message if it is too long for empty buffer */
- truncate_msg(&text_len, &trunc_msg_len);
- prb_rec_init_wr(&r, text_len + trunc_msg_len);
- /* survive when the log buffer is too small for trunc_msg */
- if (!prb_reserve(&e, prb, &r))
- return 0;
- }
-
- /* fill message */
- memcpy(&r.text_buf[0], text, text_len);
- if (trunc_msg_len)
- memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len);
- r.info->text_len = text_len + trunc_msg_len;
- r.info->facility = facility;
- r.info->level = level & 7;
- r.info->flags = flags & 0x1f;
- if (ts_nsec > 0)
- r.info->ts_nsec = ts_nsec;
- else
- r.info->ts_nsec = local_clock();
- r.info->caller_id = caller_id;
- if (dev_info)
- memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info));
-
- /* A message without a trailing newline can be continued. */
- if (!(flags & LOG_NEWLINE))
- prb_commit(&e);
- else
- prb_final_commit(&e);
-
- return (text_len + trunc_msg_len);
-}
-
int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT);
static int syslog_action_restricted(int type)
@@ -741,7 +695,6 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
if (LOG_FACILITY(u) != 0)
facility = LOG_FACILITY(u);
endp++;
- len -= endp - line;
line = endp;
}
}
@@ -1172,7 +1125,7 @@ void __init setup_log_buf(int early)
new_descs, ilog2(new_descs_count),
new_infos);
- logbuf_lock_irqsave(flags);
+ printk_safe_enter_irqsave(flags);
log_buf_len = new_log_buf_len;
log_buf = new_log_buf;
@@ -1189,7 +1142,7 @@ void __init setup_log_buf(int early)
*/
prb = &printk_rb_dynamic;
- logbuf_unlock_irqrestore(flags);
+ printk_safe_exit_irqrestore(flags);
if (seq != prb_next_seq(&printk_rb_static)) {
pr_err("dropped %llu messages\n",
@@ -1907,83 +1860,177 @@ static inline u32 printk_caller_id(void)
0x80000000 + raw_smp_processor_id();
}
-static size_t log_output(int facility, int level, enum log_flags lflags,
- const struct dev_printk_info *dev_info,
- char *text, size_t text_len)
+/**
+ * parse_prefix - Parse level and control flags.
+ *
+ * @text: The terminated text message.
+ * @level: A pointer to the current level value, will be updated.
+ * @lflags: A pointer to the current log flags, will be updated.
+ *
+ * @level may be NULL if the caller is not interested in the parsed value.
+ * Otherwise the variable pointed to by @level must be set to
+ * LOGLEVEL_DEFAULT in order to be updated with the parsed value.
+ *
+ * @lflags may be NULL if the caller is not interested in the parsed value.
+ * Otherwise the variable pointed to by @lflags will be OR'd with the parsed
+ * value.
+ *
+ * Return: The length of the parsed level and control flags.
+ */
+static u16 parse_prefix(char *text, int *level, enum log_flags *lflags)
{
- const u32 caller_id = printk_caller_id();
+ u16 prefix_len = 0;
+ int kern_level;
- if (lflags & LOG_CONT) {
- struct prb_reserved_entry e;
- struct printk_record r;
+ while (*text) {
+ kern_level = printk_get_level(text);
+ if (!kern_level)
+ break;
- prb_rec_init_wr(&r, text_len);
- if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) {
- memcpy(&r.text_buf[r.info->text_len], text, text_len);
- r.info->text_len += text_len;
- if (lflags & LOG_NEWLINE) {
- r.info->flags |= LOG_NEWLINE;
- prb_final_commit(&e);
- } else {
- prb_commit(&e);
- }
- return text_len;
+ switch (kern_level) {
+ case '0' ... '7':
+ if (level && *level == LOGLEVEL_DEFAULT)
+ *level = kern_level - '0';
+ break;
+ case 'c': /* KERN_CONT */
+ if (lflags)
+ *lflags |= LOG_CONT;
+ }
+
+ prefix_len += 2;
+ text += 2;
+ }
+
+ return prefix_len;
+}
+
+static u16 printk_sprint(char *text, u16 size, int facility, enum log_flags *lflags,
+ const char *fmt, va_list args)
+{
+ u16 text_len;
+
+ text_len = vscnprintf(text, size, fmt, args);
+
+ /* Mark and strip a trailing newline. */
+ if (text_len && text[text_len - 1] == '\n') {
+ text_len--;
+ *lflags |= LOG_NEWLINE;
+ }
+
+ /* Strip log level and control flags. */
+ if (facility == 0) {
+ u16 prefix_len;
+
+ prefix_len = parse_prefix(text, NULL, NULL);
+ if (prefix_len) {
+ text_len -= prefix_len;
+ memmove(text, text + prefix_len, text_len);
}
}
- /* Store it in the record log */
- return log_store(caller_id, facility, level, lflags, 0,
- dev_info, text, text_len);
+ return text_len;
}
-/* Must be called under logbuf_lock. */
+__printf(4, 0)
int vprintk_store(int facility, int level,
const struct dev_printk_info *dev_info,
const char *fmt, va_list args)
{
- static char textbuf[LOG_LINE_MAX];
- char *text = textbuf;
- size_t text_len;
+ const u32 caller_id = printk_caller_id();
+ struct prb_reserved_entry e;
enum log_flags lflags = 0;
+ struct printk_record r;
+ u16 trunc_msg_len = 0;
+ char prefix_buf[8];
+ u16 reserve_size;
+ va_list args2;
+ u16 text_len;
+ u64 ts_nsec;
/*
- * The printf needs to come first; we need the syslog
- * prefix which might be passed-in as a parameter.
+ * Since the duration of printk() can vary depending on the message
+ * and state of the ringbuffer, grab the timestamp now so that it is
+ * close to the call of printk(). This provides a more deterministic
+ * timestamp with respect to the caller.
*/
- text_len = vscnprintf(text, sizeof(textbuf), fmt, args);
+ ts_nsec = local_clock();
- /* mark and strip a trailing newline */
- if (text_len && text[text_len-1] == '\n') {
- text_len--;
+ /*
+ * The sprintf needs to come first since the syslog prefix might be
+ * passed in as a parameter. An extra byte must be reserved so that
+ * later the vscnprintf() into the reserved buffer has room for the
+ * terminating '\0', which is not counted by vsnprintf().
+ */
+ va_copy(args2, args);
+ reserve_size = vsnprintf(&prefix_buf[0], sizeof(prefix_buf), fmt, args2) + 1;
+ va_end(args2);
+
+ if (reserve_size > LOG_LINE_MAX)
+ reserve_size = LOG_LINE_MAX;
+
+ /* Extract log level or control flags. */
+ if (facility == 0)
+ parse_prefix(&prefix_buf[0], &level, &lflags);
+
+ if (level == LOGLEVEL_DEFAULT)
+ level = default_message_loglevel;
+
+ if (dev_info)
lflags |= LOG_NEWLINE;
- }
- /* strip kernel syslog prefix and extract log level or control flags */
- if (facility == 0) {
- int kern_level;
+ if (lflags & LOG_CONT) {
+ prb_rec_init_wr(&r, reserve_size);
+ if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) {
+ text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size,
+ facility, &lflags, fmt, args);
+ r.info->text_len += text_len;
- while ((kern_level = printk_get_level(text)) != 0) {
- switch (kern_level) {
- case '0' ... '7':
- if (level == LOGLEVEL_DEFAULT)
- level = kern_level - '0';
- break;
- case 'c': /* KERN_CONT */
- lflags |= LOG_CONT;
+ if (lflags & LOG_NEWLINE) {
+ r.info->flags |= LOG_NEWLINE;
+ prb_final_commit(&e);
+ } else {
+ prb_commit(&e);
}
- text_len -= 2;
- text += 2;
+ return text_len;
}
}
- if (level == LOGLEVEL_DEFAULT)
- level = default_message_loglevel;
+ /*
+ * Explicitly initialize the record before every prb_reserve() call.
+ * prb_reserve_in_last() and prb_reserve() purposely invalidate the
+ * structure when they fail.
+ */
+ prb_rec_init_wr(&r, reserve_size);
+ if (!prb_reserve(&e, prb, &r)) {
+ /* truncate the message if it is too long for empty buffer */
+ truncate_msg(&reserve_size, &trunc_msg_len);
+
+ prb_rec_init_wr(&r, reserve_size + trunc_msg_len);
+ if (!prb_reserve(&e, prb, &r))
+ return 0;
+ }
+ /* fill message */
+ text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &lflags, fmt, args);
+ if (trunc_msg_len)
+ memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len);
+ r.info->text_len = text_len + trunc_msg_len;
+ r.info->facility = facility;
+ r.info->level = level & 7;
+ r.info->flags = lflags & 0x1f;
+ r.info->ts_nsec = ts_nsec;
+ r.info->caller_id = caller_id;
if (dev_info)
- lflags |= LOG_NEWLINE;
+ memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info));
+
+ /* A message without a trailing newline can be continued. */
+ if (!(lflags & LOG_NEWLINE))
+ prb_commit(&e);
+ else
+ prb_final_commit(&e);
- return log_output(facility, level, lflags, dev_info, text, text_len);
+ return (text_len + trunc_msg_len);
}
asmlinkage int vprintk_emit(int facility, int level,
@@ -2006,10 +2053,9 @@ asmlinkage int vprintk_emit(int facility, int level,
boot_delay_msec(level);
printk_delay();
- /* This stops the holder of console_sem just where we want him */
- logbuf_lock_irqsave(flags);
+ printk_safe_enter_irqsave(flags);
printed_len = vprintk_store(facility, level, dev_info, fmt, args);
- logbuf_unlock_irqrestore(flags);
+ printk_safe_exit_irqrestore(flags);
/* If called from the scheduler, we can not call up(). */
if (!in_sched) {
@@ -2189,8 +2235,15 @@ static int __init console_setup(char *str)
char *s, *options, *brl_options = NULL;
int idx;
- if (str[0] == 0)
+ /*
+ * console="" or console=null have been suggested as a way to
+ * disable console output. Use ttynull that has been created
+ * for exacly this purpose.
+ */
+ if (str[0] == 0 || strcmp(str, "null") == 0) {
+ __add_preferred_console("ttynull", 0, NULL, NULL, true);
return 1;
+ }
if (_braille_console_setup(&str, &brl_options))
return 1;
diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c
index 74e25a1704f2..6704f06e0417 100644
--- a/kernel/printk/printk_ringbuffer.c
+++ b/kernel/printk/printk_ringbuffer.c
@@ -559,11 +559,12 @@ static void desc_make_reusable(struct prb_desc_ring *desc_ring,
* on error the caller can re-load the tail lpos to determine the situation.
*/
static bool data_make_reusable(struct printk_ringbuffer *rb,
- struct prb_data_ring *data_ring,
unsigned long lpos_begin,
unsigned long lpos_end,
unsigned long *lpos_out)
{
+
+ struct prb_data_ring *data_ring = &rb->text_data_ring;
struct prb_desc_ring *desc_ring = &rb->desc_ring;
struct prb_data_block *blk;
enum desc_state d_state;
@@ -625,10 +626,9 @@ static bool data_make_reusable(struct printk_ringbuffer *rb,
* descriptors into the reusable state if the tail is pushed beyond
* their associated data block.
*/
-static bool data_push_tail(struct printk_ringbuffer *rb,
- struct prb_data_ring *data_ring,
- unsigned long lpos)
+static bool data_push_tail(struct printk_ringbuffer *rb, unsigned long lpos)
{
+ struct prb_data_ring *data_ring = &rb->text_data_ring;
unsigned long tail_lpos_new;
unsigned long tail_lpos;
unsigned long next_lpos;
@@ -669,8 +669,7 @@ static bool data_push_tail(struct printk_ringbuffer *rb,
* Make all descriptors reusable that are associated with
* data blocks before @lpos.
*/
- if (!data_make_reusable(rb, data_ring, tail_lpos, lpos,
- &next_lpos)) {
+ if (!data_make_reusable(rb, tail_lpos, lpos, &next_lpos)) {
/*
* 1. Guarantee the block ID loaded in
* data_make_reusable() is performed before
@@ -807,7 +806,7 @@ static bool desc_push_tail(struct printk_ringbuffer *rb,
* data blocks once their associated descriptor is gone.
*/
- if (!data_push_tail(rb, &rb->text_data_ring, desc.text_blk_lpos.next))
+ if (!data_push_tail(rb, desc.text_blk_lpos.next))
return false;
/*
@@ -1019,10 +1018,10 @@ static unsigned long get_next_lpos(struct prb_data_ring *data_ring,
* if necessary. This function also associates the data block with
* a specified descriptor.
*/
-static char *data_alloc(struct printk_ringbuffer *rb,
- struct prb_data_ring *data_ring, unsigned int size,
+static char *data_alloc(struct printk_ringbuffer *rb, unsigned int size,
struct prb_data_blk_lpos *blk_lpos, unsigned long id)
{
+ struct prb_data_ring *data_ring = &rb->text_data_ring;
struct prb_data_block *blk;
unsigned long begin_lpos;
unsigned long next_lpos;
@@ -1041,7 +1040,7 @@ static char *data_alloc(struct printk_ringbuffer *rb,
do {
next_lpos = get_next_lpos(data_ring, begin_lpos, size);
- if (!data_push_tail(rb, data_ring, next_lpos - DATA_SIZE(data_ring))) {
+ if (!data_push_tail(rb, next_lpos - DATA_SIZE(data_ring))) {
/* Failed to allocate, specify a data-less block. */
blk_lpos->begin = FAILED_LPOS;
blk_lpos->next = FAILED_LPOS;
@@ -1100,10 +1099,10 @@ static char *data_alloc(struct printk_ringbuffer *rb,
* Return a pointer to the beginning of the entire data buffer or NULL on
* failure.
*/
-static char *data_realloc(struct printk_ringbuffer *rb,
- struct prb_data_ring *data_ring, unsigned int size,
+static char *data_realloc(struct printk_ringbuffer *rb, unsigned int size,
struct prb_data_blk_lpos *blk_lpos, unsigned long id)
{
+ struct prb_data_ring *data_ring = &rb->text_data_ring;
struct prb_data_block *blk;
unsigned long head_lpos;
unsigned long next_lpos;
@@ -1130,7 +1129,7 @@ static char *data_realloc(struct printk_ringbuffer *rb,
return &blk->data[0];
}
- if (!data_push_tail(rb, data_ring, next_lpos - DATA_SIZE(data_ring)))
+ if (!data_push_tail(rb, next_lpos - DATA_SIZE(data_ring)))
return NULL;
/* The memory barrier involvement is the same as data_alloc:A. */
@@ -1395,7 +1394,7 @@ bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer
if (r->text_buf_size > max_size)
goto fail;
- r->text_buf = data_alloc(rb, &rb->text_data_ring, r->text_buf_size,
+ r->text_buf = data_alloc(rb, r->text_buf_size,
&d->text_blk_lpos, id);
} else {
if (!get_data(&rb->text_data_ring, &d->text_blk_lpos, &data_size))
@@ -1419,7 +1418,7 @@ bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer
if (r->text_buf_size > max_size)
goto fail;
- r->text_buf = data_realloc(rb, &rb->text_data_ring, r->text_buf_size,
+ r->text_buf = data_realloc(rb, r->text_buf_size,
&d->text_blk_lpos, id);
}
if (r->text_buf_size && !r->text_buf)
@@ -1547,8 +1546,7 @@ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
if (info->seq > 0)
desc_make_final(desc_ring, DESC_ID(id - 1));
- r->text_buf = data_alloc(rb, &rb->text_data_ring, r->text_buf_size,
- &d->text_blk_lpos, id);
+ r->text_buf = data_alloc(rb, r->text_buf_size, &d->text_blk_lpos, id);
/* If text data allocation fails, a data-less record is committed. */
if (r->text_buf_size && !r->text_buf) {
prb_commit(e);
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 2a18b76ffc06..eb1b15850761 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -553,20 +553,24 @@ static int __init reboot_setup(char *str)
break;
case 's':
- if (isdigit(*(str+1)))
- reboot_cpu = simple_strtoul(str+1, NULL, 0);
- else if (str[1] == 'm' && str[2] == 'p' &&
- isdigit(*(str+3)))
- reboot_cpu = simple_strtoul(str+3, NULL, 0);
- else
+ /*
+ * reboot_cpu is s[mp]#### with #### being the processor
+ * to be used for rebooting. Skip 's' or 'smp' prefix.
+ */
+ str += str[1] == 'm' && str[2] == 'p' ? 3 : 1;
+
+ if (isdigit(str[0])) {
+ int cpu = simple_strtoul(str, NULL, 0);
+
+ if (cpu >= num_possible_cpus()) {
+ pr_err("Ignoring the CPU number in reboot= option. "
+ "CPU %d exceeds possible cpu number %d\n",
+ cpu, num_possible_cpus());
+ break;
+ }
+ reboot_cpu = cpu;
+ } else
*mode = REBOOT_SOFT;
- if (reboot_cpu >= num_possible_cpus()) {
- pr_err("Ignoring the CPU number in reboot= option. "
- "CPU %d exceeds possible cpu number %d\n",
- reboot_cpu, num_possible_cpus());
- reboot_cpu = 0;
- break;
- }
break;
case 'g':
@@ -596,3 +600,217 @@ static int __init reboot_setup(char *str)
return 1;
}
__setup("reboot=", reboot_setup);
+
+#ifdef CONFIG_SYSFS
+
+#define REBOOT_COLD_STR "cold"
+#define REBOOT_WARM_STR "warm"
+#define REBOOT_HARD_STR "hard"
+#define REBOOT_SOFT_STR "soft"
+#define REBOOT_GPIO_STR "gpio"
+#define REBOOT_UNDEFINED_STR "undefined"
+
+#define BOOT_TRIPLE_STR "triple"
+#define BOOT_KBD_STR "kbd"
+#define BOOT_BIOS_STR "bios"
+#define BOOT_ACPI_STR "acpi"
+#define BOOT_EFI_STR "efi"
+#define BOOT_PCI_STR "pci"
+
+static ssize_t mode_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ const char *val;
+
+ switch (reboot_mode) {
+ case REBOOT_COLD:
+ val = REBOOT_COLD_STR;
+ break;
+ case REBOOT_WARM:
+ val = REBOOT_WARM_STR;
+ break;
+ case REBOOT_HARD:
+ val = REBOOT_HARD_STR;
+ break;
+ case REBOOT_SOFT:
+ val = REBOOT_SOFT_STR;
+ break;
+ case REBOOT_GPIO:
+ val = REBOOT_GPIO_STR;
+ break;
+ default:
+ val = REBOOT_UNDEFINED_STR;
+ }
+
+ return sprintf(buf, "%s\n", val);
+}
+static ssize_t mode_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ if (!capable(CAP_SYS_BOOT))
+ return -EPERM;
+
+ if (!strncmp(buf, REBOOT_COLD_STR, strlen(REBOOT_COLD_STR)))
+ reboot_mode = REBOOT_COLD;
+ else if (!strncmp(buf, REBOOT_WARM_STR, strlen(REBOOT_WARM_STR)))
+ reboot_mode = REBOOT_WARM;
+ else if (!strncmp(buf, REBOOT_HARD_STR, strlen(REBOOT_HARD_STR)))
+ reboot_mode = REBOOT_HARD;
+ else if (!strncmp(buf, REBOOT_SOFT_STR, strlen(REBOOT_SOFT_STR)))
+ reboot_mode = REBOOT_SOFT;
+ else if (!strncmp(buf, REBOOT_GPIO_STR, strlen(REBOOT_GPIO_STR)))
+ reboot_mode = REBOOT_GPIO;
+ else
+ return -EINVAL;
+
+ reboot_default = 0;
+
+ return count;
+}
+static struct kobj_attribute reboot_mode_attr = __ATTR_RW(mode);
+
+#ifdef CONFIG_X86
+static ssize_t force_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", reboot_force);
+}
+static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ bool res;
+
+ if (!capable(CAP_SYS_BOOT))
+ return -EPERM;
+
+ if (kstrtobool(buf, &res))
+ return -EINVAL;
+
+ reboot_default = 0;
+ reboot_force = res;
+
+ return count;
+}
+static struct kobj_attribute reboot_force_attr = __ATTR_RW(force);
+
+static ssize_t type_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ const char *val;
+
+ switch (reboot_type) {
+ case BOOT_TRIPLE:
+ val = BOOT_TRIPLE_STR;
+ break;
+ case BOOT_KBD:
+ val = BOOT_KBD_STR;
+ break;
+ case BOOT_BIOS:
+ val = BOOT_BIOS_STR;
+ break;
+ case BOOT_ACPI:
+ val = BOOT_ACPI_STR;
+ break;
+ case BOOT_EFI:
+ val = BOOT_EFI_STR;
+ break;
+ case BOOT_CF9_FORCE:
+ val = BOOT_PCI_STR;
+ break;
+ default:
+ val = REBOOT_UNDEFINED_STR;
+ }
+
+ return sprintf(buf, "%s\n", val);
+}
+static ssize_t type_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ if (!capable(CAP_SYS_BOOT))
+ return -EPERM;
+
+ if (!strncmp(buf, BOOT_TRIPLE_STR, strlen(BOOT_TRIPLE_STR)))
+ reboot_type = BOOT_TRIPLE;
+ else if (!strncmp(buf, BOOT_KBD_STR, strlen(BOOT_KBD_STR)))
+ reboot_type = BOOT_KBD;
+ else if (!strncmp(buf, BOOT_BIOS_STR, strlen(BOOT_BIOS_STR)))
+ reboot_type = BOOT_BIOS;
+ else if (!strncmp(buf, BOOT_ACPI_STR, strlen(BOOT_ACPI_STR)))
+ reboot_type = BOOT_ACPI;
+ else if (!strncmp(buf, BOOT_EFI_STR, strlen(BOOT_EFI_STR)))
+ reboot_type = BOOT_EFI;
+ else if (!strncmp(buf, BOOT_PCI_STR, strlen(BOOT_PCI_STR)))
+ reboot_type = BOOT_CF9_FORCE;
+ else
+ return -EINVAL;
+
+ reboot_default = 0;
+
+ return count;
+}
+static struct kobj_attribute reboot_type_attr = __ATTR_RW(type);
+#endif
+
+#ifdef CONFIG_SMP
+static ssize_t cpu_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", reboot_cpu);
+}
+static ssize_t cpu_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned int cpunum;
+ int rc;
+
+ if (!capable(CAP_SYS_BOOT))
+ return -EPERM;
+
+ rc = kstrtouint(buf, 0, &cpunum);
+
+ if (rc)
+ return rc;
+
+ if (cpunum >= num_possible_cpus())
+ return -ERANGE;
+
+ reboot_default = 0;
+ reboot_cpu = cpunum;
+
+ return count;
+}
+static struct kobj_attribute reboot_cpu_attr = __ATTR_RW(cpu);
+#endif
+
+static struct attribute *reboot_attrs[] = {
+ &reboot_mode_attr.attr,
+#ifdef CONFIG_X86
+ &reboot_force_attr.attr,
+ &reboot_type_attr.attr,
+#endif
+#ifdef CONFIG_SMP
+ &reboot_cpu_attr.attr,
+#endif
+ NULL,
+};
+
+static const struct attribute_group reboot_attr_group = {
+ .attrs = reboot_attrs,
+};
+
+static int __init reboot_ksysfs_init(void)
+{
+ struct kobject *reboot_kobj;
+ int ret;
+
+ reboot_kobj = kobject_create_and_add("reboot", kernel_kobj);
+ if (!reboot_kobj)
+ return -ENOMEM;
+
+ ret = sysfs_create_group(reboot_kobj, &reboot_attr_group);
+ if (ret) {
+ kobject_put(reboot_kobj);
+ return ret;
+ }
+
+ return 0;
+}
+late_initcall(reboot_ksysfs_init);
+
+#endif
diff --git a/kernel/relay.c b/kernel/relay.c
index b08d936d5fa7..d1a67fbb819d 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -28,15 +28,6 @@ static DEFINE_MUTEX(relay_channels_mutex);
static LIST_HEAD(relay_channels);
/*
- * close() vm_op implementation for relay file mapping.
- */
-static void relay_file_mmap_close(struct vm_area_struct *vma)
-{
- struct rchan_buf *buf = vma->vm_private_data;
- buf->chan->cb->buf_unmapped(buf, vma->vm_file);
-}
-
-/*
* fault() vm_op implementation for relay file mapping.
*/
static vm_fault_t relay_buf_fault(struct vm_fault *vmf)
@@ -62,7 +53,6 @@ static vm_fault_t relay_buf_fault(struct vm_fault *vmf)
*/
static const struct vm_operations_struct relay_file_mmap_ops = {
.fault = relay_buf_fault,
- .close = relay_file_mmap_close,
};
/*
@@ -96,7 +86,6 @@ static void relay_free_page_array(struct page **array)
static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
{
unsigned long length = vma->vm_end - vma->vm_start;
- struct file *filp = vma->vm_file;
if (!buf)
return -EBADF;
@@ -107,7 +96,6 @@ static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
vma->vm_ops = &relay_file_mmap_ops;
vma->vm_flags |= VM_DONTEXPAND;
vma->vm_private_data = buf;
- buf->chan->cb->buf_mapped(buf, filp);
return 0;
}
@@ -264,70 +252,16 @@ EXPORT_SYMBOL_GPL(relay_buf_full);
* High-level relay kernel API and associated functions.
*/
-/*
- * rchan_callback implementations defining default channel behavior. Used
- * in place of corresponding NULL values in client callback struct.
- */
-
-/*
- * subbuf_start() default callback. Does nothing.
- */
-static int subbuf_start_default_callback (struct rchan_buf *buf,
- void *subbuf,
- void *prev_subbuf,
- size_t prev_padding)
-{
- if (relay_buf_full(buf))
- return 0;
-
- return 1;
-}
-
-/*
- * buf_mapped() default callback. Does nothing.
- */
-static void buf_mapped_default_callback(struct rchan_buf *buf,
- struct file *filp)
-{
-}
-
-/*
- * buf_unmapped() default callback. Does nothing.
- */
-static void buf_unmapped_default_callback(struct rchan_buf *buf,
- struct file *filp)
-{
-}
-
-/*
- * create_buf_file_create() default callback. Does nothing.
- */
-static struct dentry *create_buf_file_default_callback(const char *filename,
- struct dentry *parent,
- umode_t mode,
- struct rchan_buf *buf,
- int *is_global)
+static int relay_subbuf_start(struct rchan_buf *buf, void *subbuf,
+ void *prev_subbuf, size_t prev_padding)
{
- return NULL;
-}
+ if (!buf->chan->cb->subbuf_start)
+ return !relay_buf_full(buf);
-/*
- * remove_buf_file() default callback. Does nothing.
- */
-static int remove_buf_file_default_callback(struct dentry *dentry)
-{
- return -EINVAL;
+ return buf->chan->cb->subbuf_start(buf, subbuf,
+ prev_subbuf, prev_padding);
}
-/* relay channel default callbacks */
-static struct rchan_callbacks default_channel_callbacks = {
- .subbuf_start = subbuf_start_default_callback,
- .buf_mapped = buf_mapped_default_callback,
- .buf_unmapped = buf_unmapped_default_callback,
- .create_buf_file = create_buf_file_default_callback,
- .remove_buf_file = remove_buf_file_default_callback,
-};
-
/**
* wakeup_readers - wake up readers waiting on a channel
* @work: contains the channel buffer
@@ -371,7 +305,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
for (i = 0; i < buf->chan->n_subbufs; i++)
buf->padding[i] = 0;
- buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0);
+ relay_subbuf_start(buf, buf->data, NULL, 0);
}
/**
@@ -499,27 +433,6 @@ static void relay_close_buf(struct rchan_buf *buf)
kref_put(&buf->kref, relay_remove_buf);
}
-static void setup_callbacks(struct rchan *chan,
- struct rchan_callbacks *cb)
-{
- if (!cb) {
- chan->cb = &default_channel_callbacks;
- return;
- }
-
- if (!cb->subbuf_start)
- cb->subbuf_start = subbuf_start_default_callback;
- if (!cb->buf_mapped)
- cb->buf_mapped = buf_mapped_default_callback;
- if (!cb->buf_unmapped)
- cb->buf_unmapped = buf_unmapped_default_callback;
- if (!cb->create_buf_file)
- cb->create_buf_file = create_buf_file_default_callback;
- if (!cb->remove_buf_file)
- cb->remove_buf_file = remove_buf_file_default_callback;
- chan->cb = cb;
-}
-
int relay_prepare_cpu(unsigned int cpu)
{
struct rchan *chan;
@@ -565,7 +478,7 @@ struct rchan *relay_open(const char *base_filename,
struct dentry *parent,
size_t subbuf_size,
size_t n_subbufs,
- struct rchan_callbacks *cb,
+ const struct rchan_callbacks *cb,
void *private_data)
{
unsigned int i;
@@ -576,6 +489,8 @@ struct rchan *relay_open(const char *base_filename,
return NULL;
if (subbuf_size > UINT_MAX / n_subbufs)
return NULL;
+ if (!cb || !cb->create_buf_file || !cb->remove_buf_file)
+ return NULL;
chan = kzalloc(sizeof(struct rchan), GFP_KERNEL);
if (!chan)
@@ -597,7 +512,7 @@ struct rchan *relay_open(const char *base_filename,
chan->has_base_filename = 1;
strlcpy(chan->base_filename, base_filename, NAME_MAX);
}
- setup_callbacks(chan, cb);
+ chan->cb = cb;
kref_init(&chan->kref);
mutex_lock(&relay_channels_mutex);
@@ -780,7 +695,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
new = buf->start + new_subbuf * buf->chan->subbuf_size;
buf->offset = 0;
- if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) {
+ if (!relay_subbuf_start(buf, new, old, buf->prev_padding)) {
buf->offset = buf->chan->subbuf_size + 1;
return 0;
}
diff --git a/kernel/resource.c b/kernel/resource.c
index 3ae2f56cc79d..833394f9c608 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -320,9 +320,8 @@ int release_resource(struct resource *old)
EXPORT_SYMBOL(release_resource);
/**
- * Finds the lowest iomem resource that covers part of [@start..@end]. The
- * caller must specify @start, @end, @flags, and @desc (which may be
- * IORES_DESC_NONE).
+ * find_next_iomem_res - Finds the lowest iomem resource that covers part of
+ * [@start..@end].
*
* If a resource is found, returns 0 and @*res is overwritten with the part
* of the resource that's within [@start..@end]; if none is found, returns
@@ -337,6 +336,9 @@ EXPORT_SYMBOL(release_resource);
* @desc: descriptor the resource must have
* @first_lvl: walk only the first level children, if set
* @res: return ptr, if resource found
+ *
+ * The caller must specify @start, @end, @flags, and @desc
+ * (which may be IORES_DESC_NONE).
*/
static int find_next_iomem_res(resource_size_t start, resource_size_t end,
unsigned long flags, unsigned long desc,
@@ -416,11 +418,9 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
}
/**
- * Walks through iomem resources and calls func() with matching resource
- * ranges. This walks through whole tree and not just first level children.
- * All the memory ranges which overlap start,end and also match flags and
- * desc are valid candidates.
- *
+ * walk_iomem_res_desc - Walks through iomem resources and calls func()
+ * with matching resource ranges.
+ * *
* @desc: I/O resource descriptor. Use IORES_DESC_NONE to skip @desc check.
* @flags: I/O resource flags
* @start: start addr
@@ -428,6 +428,10 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
* @arg: function argument for the callback @func
* @func: callback function that is called for each qualifying resource area
*
+ * This walks through whole tree and not just first level children.
+ * All the memory ranges which overlap start,end and also match flags and
+ * desc are valid candidates.
+ *
* NOTE: For a new descriptor search, define a new IORES_DESC in
* <linux/ioport.h> and set it in 'desc' of a target resource entry.
*/
@@ -557,13 +561,13 @@ int region_intersects(resource_size_t start, size_t size, unsigned long flags,
}
read_unlock(&resource_lock);
- if (other == 0)
- return type ? REGION_INTERSECTS : REGION_DISJOINT;
+ if (type == 0)
+ return REGION_DISJOINT;
- if (type)
- return REGION_MIXED;
+ if (other == 0)
+ return REGION_INTERSECTS;
- return REGION_DISJOINT;
+ return REGION_MIXED;
}
EXPORT_SYMBOL_GPL(region_intersects);
@@ -1372,9 +1376,9 @@ static bool system_ram_resources_mergeable(struct resource *r1,
!r1->child && !r2->child;
}
-/*
+/**
* merge_system_ram_resource - mark the System RAM resource mergeable and try to
- * merge it with adjacent, mergeable resources
+ * merge it with adjacent, mergeable resources
* @res: resource descriptor
*
* This interface is intended for memory hotplug, whereby lots of contiguous
diff --git a/kernel/resource_kunit.c b/kernel/resource_kunit.c
new file mode 100644
index 000000000000..58ab9f914602
--- /dev/null
+++ b/kernel/resource_kunit.c
@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Test cases for API provided by resource.c and ioport.h
+ */
+
+#include <kunit/test.h>
+#include <linux/ioport.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+#define R0_START 0x0000
+#define R0_END 0xffff
+#define R1_START 0x1234
+#define R1_END 0x2345
+#define R2_START 0x4567
+#define R2_END 0x5678
+#define R3_START 0x6789
+#define R3_END 0x789a
+#define R4_START 0x2000
+#define R4_END 0x7000
+
+static struct resource r0 = { .start = R0_START, .end = R0_END };
+static struct resource r1 = { .start = R1_START, .end = R1_END };
+static struct resource r2 = { .start = R2_START, .end = R2_END };
+static struct resource r3 = { .start = R3_START, .end = R3_END };
+static struct resource r4 = { .start = R4_START, .end = R4_END };
+
+struct result {
+ struct resource *r1;
+ struct resource *r2;
+ struct resource r;
+ bool ret;
+};
+
+static struct result results_for_union[] = {
+ {
+ .r1 = &r1, .r2 = &r0, .r.start = R0_START, .r.end = R0_END, .ret = true,
+ }, {
+ .r1 = &r2, .r2 = &r0, .r.start = R0_START, .r.end = R0_END, .ret = true,
+ }, {
+ .r1 = &r3, .r2 = &r0, .r.start = R0_START, .r.end = R0_END, .ret = true,
+ }, {
+ .r1 = &r4, .r2 = &r0, .r.start = R0_START, .r.end = R0_END, .ret = true,
+ }, {
+ .r1 = &r2, .r2 = &r1, .ret = false,
+ }, {
+ .r1 = &r3, .r2 = &r1, .ret = false,
+ }, {
+ .r1 = &r4, .r2 = &r1, .r.start = R1_START, .r.end = R4_END, .ret = true,
+ }, {
+ .r1 = &r2, .r2 = &r3, .ret = false,
+ }, {
+ .r1 = &r2, .r2 = &r4, .r.start = R4_START, .r.end = R4_END, .ret = true,
+ }, {
+ .r1 = &r3, .r2 = &r4, .r.start = R4_START, .r.end = R3_END, .ret = true,
+ },
+};
+
+static struct result results_for_intersection[] = {
+ {
+ .r1 = &r1, .r2 = &r0, .r.start = R1_START, .r.end = R1_END, .ret = true,
+ }, {
+ .r1 = &r2, .r2 = &r0, .r.start = R2_START, .r.end = R2_END, .ret = true,
+ }, {
+ .r1 = &r3, .r2 = &r0, .r.start = R3_START, .r.end = R3_END, .ret = true,
+ }, {
+ .r1 = &r4, .r2 = &r0, .r.start = R4_START, .r.end = R4_END, .ret = true,
+ }, {
+ .r1 = &r2, .r2 = &r1, .ret = false,
+ }, {
+ .r1 = &r3, .r2 = &r1, .ret = false,
+ }, {
+ .r1 = &r4, .r2 = &r1, .r.start = R4_START, .r.end = R1_END, .ret = true,
+ }, {
+ .r1 = &r2, .r2 = &r3, .ret = false,
+ }, {
+ .r1 = &r2, .r2 = &r4, .r.start = R2_START, .r.end = R2_END, .ret = true,
+ }, {
+ .r1 = &r3, .r2 = &r4, .r.start = R3_START, .r.end = R4_END, .ret = true,
+ },
+};
+
+static void resource_do_test(struct kunit *test, bool ret, struct resource *r,
+ bool exp_ret, struct resource *exp_r,
+ struct resource *r1, struct resource *r2)
+{
+ KUNIT_EXPECT_EQ_MSG(test, ret, exp_ret, "Resources %pR %pR", r1, r2);
+ KUNIT_EXPECT_EQ_MSG(test, r->start, exp_r->start, "Start elements are not equal");
+ KUNIT_EXPECT_EQ_MSG(test, r->end, exp_r->end, "End elements are not equal");
+}
+
+static void resource_do_union_test(struct kunit *test, struct result *r)
+{
+ struct resource result;
+ bool ret;
+
+ memset(&result, 0, sizeof(result));
+ ret = resource_union(r->r1, r->r2, &result);
+ resource_do_test(test, ret, &result, r->ret, &r->r, r->r1, r->r2);
+
+ memset(&result, 0, sizeof(result));
+ ret = resource_union(r->r2, r->r1, &result);
+ resource_do_test(test, ret, &result, r->ret, &r->r, r->r2, r->r1);
+}
+
+static void resource_test_union(struct kunit *test)
+{
+ struct result *r = results_for_union;
+ unsigned int i = 0;
+
+ do {
+ resource_do_union_test(test, &r[i]);
+ } while (++i < ARRAY_SIZE(results_for_union));
+}
+
+static void resource_do_intersection_test(struct kunit *test, struct result *r)
+{
+ struct resource result;
+ bool ret;
+
+ memset(&result, 0, sizeof(result));
+ ret = resource_intersection(r->r1, r->r2, &result);
+ resource_do_test(test, ret, &result, r->ret, &r->r, r->r1, r->r2);
+
+ memset(&result, 0, sizeof(result));
+ ret = resource_intersection(r->r2, r->r1, &result);
+ resource_do_test(test, ret, &result, r->ret, &r->r, r->r2, r->r1);
+}
+
+static void resource_test_intersection(struct kunit *test)
+{
+ struct result *r = results_for_intersection;
+ unsigned int i = 0;
+
+ do {
+ resource_do_intersection_test(test, &r[i]);
+ } while (++i < ARRAY_SIZE(results_for_intersection));
+}
+
+static struct kunit_case resource_test_cases[] = {
+ KUNIT_CASE(resource_test_union),
+ KUNIT_CASE(resource_test_intersection),
+ {}
+};
+
+static struct kunit_suite resource_test_suite = {
+ .name = "resource",
+ .test_cases = resource_test_cases,
+};
+kunit_test_suite(resource_test_suite);
+
+MODULE_LICENSE("GPL");
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 01f5d3020589..183cc6ae68a6 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -37,6 +37,17 @@ void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue
}
EXPORT_SYMBOL(add_wait_queue_exclusive);
+void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
+{
+ unsigned long flags;
+
+ wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY;
+ spin_lock_irqsave(&wq_head->lock, flags);
+ __add_wait_queue(wq_head, wq_entry);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
+}
+EXPORT_SYMBOL_GPL(add_wait_queue_priority);
+
void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
unsigned long flags;
@@ -57,7 +68,11 @@ EXPORT_SYMBOL(remove_wait_queue);
/*
* The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
* wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
- * number) then we wake all the non-exclusive tasks and one exclusive task.
+ * number) then we wake that number of exclusive tasks, and potentially all
+ * the non-exclusive tasks. Normally, exclusive tasks will be at the end of
+ * the list and any non-exclusive tasks will be woken first. A priority task
+ * may be at the head of the list, and can consume the event without any other
+ * tasks being woken.
*
* There are circumstances in which we can try to wake a task which has already
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 15f47fc11d13..952dc1c90229 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -143,6 +143,38 @@ struct notification {
struct list_head notifications;
};
+#ifdef SECCOMP_ARCH_NATIVE
+/**
+ * struct action_cache - per-filter cache of seccomp actions per
+ * arch/syscall pair
+ *
+ * @allow_native: A bitmap where each bit represents whether the
+ * filter will always allow the syscall, for the
+ * native architecture.
+ * @allow_compat: A bitmap where each bit represents whether the
+ * filter will always allow the syscall, for the
+ * compat architecture.
+ */
+struct action_cache {
+ DECLARE_BITMAP(allow_native, SECCOMP_ARCH_NATIVE_NR);
+#ifdef SECCOMP_ARCH_COMPAT
+ DECLARE_BITMAP(allow_compat, SECCOMP_ARCH_COMPAT_NR);
+#endif
+};
+#else
+struct action_cache { };
+
+static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
+ const struct seccomp_data *sd)
+{
+ return false;
+}
+
+static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
+{
+}
+#endif /* SECCOMP_ARCH_NATIVE */
+
/**
* struct seccomp_filter - container for seccomp BPF programs
*
@@ -159,6 +191,7 @@ struct notification {
* this filter after reaching 0. The @users count is always smaller
* or equal to @refs. Hence, reaching 0 for @users does not mean
* the filter can be freed.
+ * @cache: cache of arch/syscall mappings to actions
* @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
* @prev: points to a previously installed, or inherited, filter
* @prog: the BPF program to evaluate
@@ -180,6 +213,7 @@ struct seccomp_filter {
refcount_t refs;
refcount_t users;
bool log;
+ struct action_cache cache;
struct seccomp_filter *prev;
struct bpf_prog *prog;
struct notification *notif;
@@ -298,6 +332,52 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
return 0;
}
+#ifdef SECCOMP_ARCH_NATIVE
+static inline bool seccomp_cache_check_allow_bitmap(const void *bitmap,
+ size_t bitmap_size,
+ int syscall_nr)
+{
+ if (unlikely(syscall_nr < 0 || syscall_nr >= bitmap_size))
+ return false;
+ syscall_nr = array_index_nospec(syscall_nr, bitmap_size);
+
+ return test_bit(syscall_nr, bitmap);
+}
+
+/**
+ * seccomp_cache_check_allow - lookup seccomp cache
+ * @sfilter: The seccomp filter
+ * @sd: The seccomp data to lookup the cache with
+ *
+ * Returns true if the seccomp_data is cached and allowed.
+ */
+static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
+ const struct seccomp_data *sd)
+{
+ int syscall_nr = sd->nr;
+ const struct action_cache *cache = &sfilter->cache;
+
+#ifndef SECCOMP_ARCH_COMPAT
+ /* A native-only architecture doesn't need to check sd->arch. */
+ return seccomp_cache_check_allow_bitmap(cache->allow_native,
+ SECCOMP_ARCH_NATIVE_NR,
+ syscall_nr);
+#else
+ if (likely(sd->arch == SECCOMP_ARCH_NATIVE))
+ return seccomp_cache_check_allow_bitmap(cache->allow_native,
+ SECCOMP_ARCH_NATIVE_NR,
+ syscall_nr);
+ if (likely(sd->arch == SECCOMP_ARCH_COMPAT))
+ return seccomp_cache_check_allow_bitmap(cache->allow_compat,
+ SECCOMP_ARCH_COMPAT_NR,
+ syscall_nr);
+#endif /* SECCOMP_ARCH_COMPAT */
+
+ WARN_ON_ONCE(true);
+ return false;
+}
+#endif /* SECCOMP_ARCH_NATIVE */
+
/**
* seccomp_run_filters - evaluates all seccomp filters against @sd
* @sd: optional seccomp data to be passed to filters
@@ -320,6 +400,9 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,
if (WARN_ON(f == NULL))
return SECCOMP_RET_KILL_PROCESS;
+ if (seccomp_cache_check_allow(f, sd))
+ return SECCOMP_RET_ALLOW;
+
/*
* All filters in the list are evaluated and the lowest BPF return
* value always takes priority (ignoring the DATA).
@@ -470,6 +553,9 @@ void seccomp_filter_release(struct task_struct *tsk)
{
struct seccomp_filter *orig = tsk->seccomp.filter;
+ /* We are effectively holding the siglock by not having any sighand. */
+ WARN_ON(tsk->sighand != NULL);
+
/* Detach task from its filter tree. */
tsk->seccomp.filter = NULL;
__seccomp_filter_release(orig);
@@ -544,7 +630,12 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
{
struct seccomp_filter *sfilter;
int ret;
- const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE);
+ const bool save_orig =
+#if defined(CONFIG_CHECKPOINT_RESTORE) || defined(SECCOMP_ARCH_NATIVE)
+ true;
+#else
+ false;
+#endif
if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
return ERR_PTR(-EINVAL);
@@ -609,6 +700,148 @@ out:
return filter;
}
+#ifdef SECCOMP_ARCH_NATIVE
+/**
+ * seccomp_is_const_allow - check if filter is constant allow with given data
+ * @fprog: The BPF programs
+ * @sd: The seccomp data to check against, only syscall number and arch
+ * number are considered constant.
+ */
+static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog,
+ struct seccomp_data *sd)
+{
+ unsigned int reg_value = 0;
+ unsigned int pc;
+ bool op_res;
+
+ if (WARN_ON_ONCE(!fprog))
+ return false;
+
+ for (pc = 0; pc < fprog->len; pc++) {
+ struct sock_filter *insn = &fprog->filter[pc];
+ u16 code = insn->code;
+ u32 k = insn->k;
+
+ switch (code) {
+ case BPF_LD | BPF_W | BPF_ABS:
+ switch (k) {
+ case offsetof(struct seccomp_data, nr):
+ reg_value = sd->nr;
+ break;
+ case offsetof(struct seccomp_data, arch):
+ reg_value = sd->arch;
+ break;
+ default:
+ /* can't optimize (non-constant value load) */
+ return false;
+ }
+ break;
+ case BPF_RET | BPF_K:
+ /* reached return with constant values only, check allow */
+ return k == SECCOMP_RET_ALLOW;
+ case BPF_JMP | BPF_JA:
+ pc += insn->k;
+ break;
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ switch (BPF_OP(code)) {
+ case BPF_JEQ:
+ op_res = reg_value == k;
+ break;
+ case BPF_JGE:
+ op_res = reg_value >= k;
+ break;
+ case BPF_JGT:
+ op_res = reg_value > k;
+ break;
+ case BPF_JSET:
+ op_res = !!(reg_value & k);
+ break;
+ default:
+ /* can't optimize (unknown jump) */
+ return false;
+ }
+
+ pc += op_res ? insn->jt : insn->jf;
+ break;
+ case BPF_ALU | BPF_AND | BPF_K:
+ reg_value &= k;
+ break;
+ default:
+ /* can't optimize (unknown insn) */
+ return false;
+ }
+ }
+
+ /* ran off the end of the filter?! */
+ WARN_ON(1);
+ return false;
+}
+
+static void seccomp_cache_prepare_bitmap(struct seccomp_filter *sfilter,
+ void *bitmap, const void *bitmap_prev,
+ size_t bitmap_size, int arch)
+{
+ struct sock_fprog_kern *fprog = sfilter->prog->orig_prog;
+ struct seccomp_data sd;
+ int nr;
+
+ if (bitmap_prev) {
+ /* The new filter must be as restrictive as the last. */
+ bitmap_copy(bitmap, bitmap_prev, bitmap_size);
+ } else {
+ /* Before any filters, all syscalls are always allowed. */
+ bitmap_fill(bitmap, bitmap_size);
+ }
+
+ for (nr = 0; nr < bitmap_size; nr++) {
+ /* No bitmap change: not a cacheable action. */
+ if (!test_bit(nr, bitmap))
+ continue;
+
+ sd.nr = nr;
+ sd.arch = arch;
+
+ /* No bitmap change: continue to always allow. */
+ if (seccomp_is_const_allow(fprog, &sd))
+ continue;
+
+ /*
+ * Not a cacheable action: always run filters.
+ * atomic clear_bit() not needed, filter not visible yet.
+ */
+ __clear_bit(nr, bitmap);
+ }
+}
+
+/**
+ * seccomp_cache_prepare - emulate the filter to find cachable syscalls
+ * @sfilter: The seccomp filter
+ *
+ * Returns 0 if successful or -errno if error occurred.
+ */
+static void seccomp_cache_prepare(struct seccomp_filter *sfilter)
+{
+ struct action_cache *cache = &sfilter->cache;
+ const struct action_cache *cache_prev =
+ sfilter->prev ? &sfilter->prev->cache : NULL;
+
+ seccomp_cache_prepare_bitmap(sfilter, cache->allow_native,
+ cache_prev ? cache_prev->allow_native : NULL,
+ SECCOMP_ARCH_NATIVE_NR,
+ SECCOMP_ARCH_NATIVE);
+
+#ifdef SECCOMP_ARCH_COMPAT
+ seccomp_cache_prepare_bitmap(sfilter, cache->allow_compat,
+ cache_prev ? cache_prev->allow_compat : NULL,
+ SECCOMP_ARCH_COMPAT_NR,
+ SECCOMP_ARCH_COMPAT);
+#endif /* SECCOMP_ARCH_COMPAT */
+}
+#endif /* SECCOMP_ARCH_NATIVE */
+
/**
* seccomp_attach_filter: validate and attach filter
* @flags: flags to change filter behavior
@@ -658,6 +891,7 @@ static long seccomp_attach_filter(unsigned int flags,
* task reference.
*/
filter->prev = current->seccomp.filter;
+ seccomp_cache_prepare(filter);
current->seccomp.filter = filter;
atomic_inc(&current->seccomp.filter_count);
@@ -1967,7 +2201,7 @@ static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names)
return true;
}
-static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer,
+static int read_actions_logged(struct ctl_table *ro_table, void *buffer,
size_t *lenp, loff_t *ppos)
{
char names[sizeof(seccomp_actions_avail)];
@@ -1985,7 +2219,7 @@ static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer,
return proc_dostring(&table, 0, buffer, lenp, ppos);
}
-static int write_actions_logged(struct ctl_table *ro_table, void __user *buffer,
+static int write_actions_logged(struct ctl_table *ro_table, void *buffer,
size_t *lenp, loff_t *ppos, u32 *actions_logged)
{
char names[sizeof(seccomp_actions_avail)];
@@ -2103,3 +2337,59 @@ static int __init seccomp_sysctl_init(void)
device_initcall(seccomp_sysctl_init)
#endif /* CONFIG_SYSCTL */
+
+#ifdef CONFIG_SECCOMP_CACHE_DEBUG
+/* Currently CONFIG_SECCOMP_CACHE_DEBUG implies SECCOMP_ARCH_NATIVE */
+static void proc_pid_seccomp_cache_arch(struct seq_file *m, const char *name,
+ const void *bitmap, size_t bitmap_size)
+{
+ int nr;
+
+ for (nr = 0; nr < bitmap_size; nr++) {
+ bool cached = test_bit(nr, bitmap);
+ char *status = cached ? "ALLOW" : "FILTER";
+
+ seq_printf(m, "%s %d %s\n", name, nr, status);
+ }
+}
+
+int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ struct seccomp_filter *f;
+ unsigned long flags;
+
+ /*
+ * We don't want some sandboxed process to know what their seccomp
+ * filters consist of.
+ */
+ if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
+ return -EACCES;
+
+ if (!lock_task_sighand(task, &flags))
+ return -ESRCH;
+
+ f = READ_ONCE(task->seccomp.filter);
+ if (!f) {
+ unlock_task_sighand(task, &flags);
+ return 0;
+ }
+
+ /* prevent filter from being freed while we are printing it */
+ __get_seccomp_filter(f);
+ unlock_task_sighand(task, &flags);
+
+ proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_NATIVE_NAME,
+ f->cache.allow_native,
+ SECCOMP_ARCH_NATIVE_NR);
+
+#ifdef SECCOMP_ARCH_COMPAT
+ proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_COMPAT_NAME,
+ f->cache.allow_compat,
+ SECCOMP_ARCH_COMPAT_NR);
+#endif /* SECCOMP_ARCH_COMPAT */
+
+ __put_seccomp_filter(f);
+ return 0;
+}
+#endif /* CONFIG_SECCOMP_CACHE_DEBUG */
diff --git a/kernel/signal.c b/kernel/signal.c
index c37170655171..5736c55aaa1a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2555,14 +2555,12 @@ bool get_signal(struct ksignal *ksig)
* that the arch handlers don't all have to do it. If we get here
* without TIF_SIGPENDING, just exit after running signal work.
*/
-#ifdef TIF_NOTIFY_SIGNAL
if (!IS_ENABLED(CONFIG_GENERIC_ENTRY)) {
if (test_thread_flag(TIF_NOTIFY_SIGNAL))
tracehook_notify_signal();
if (!task_sigpending(current))
return false;
}
-#endif
if (unlikely(uprobe_deny_signal()))
return false;
@@ -2576,26 +2574,6 @@ bool get_signal(struct ksignal *ksig)
relock:
spin_lock_irq(&sighand->siglock);
- /*
- * Make sure we can safely read ->jobctl() in task_work add. As Oleg
- * states:
- *
- * It pairs with mb (implied by cmpxchg) before READ_ONCE. So we
- * roughly have
- *
- * task_work_add: get_signal:
- * STORE(task->task_works, new_work); STORE(task->jobctl);
- * mb(); mb();
- * LOAD(task->jobctl); LOAD(task->task_works);
- *
- * and we can rely on STORE-MB-LOAD [ in task_work_add].
- */
- smp_store_mb(current->jobctl, current->jobctl & ~JOBCTL_TASK_WORK);
- if (unlikely(current->task_works)) {
- spin_unlock_irq(&sighand->siglock);
- task_work_run();
- goto relock;
- }
/*
* Every stopped thread goes here after wakeup. Check to see if
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index f27ac94d5fa7..19aa806890d5 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -68,6 +68,8 @@ COND_SYSCALL(epoll_create1);
COND_SYSCALL(epoll_ctl);
COND_SYSCALL(epoll_pwait);
COND_SYSCALL_COMPAT(epoll_pwait);
+COND_SYSCALL(epoll_pwait2);
+COND_SYSCALL_COMPAT(epoll_pwait2);
/* fs/fcntl.c */
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 15b087286bea..9cde961875c0 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -5,34 +5,6 @@
static struct callback_head work_exited; /* all we need is ->next == NULL */
-/*
- * TWA_SIGNAL signaling - use TIF_NOTIFY_SIGNAL, if available, as it's faster
- * than TIF_SIGPENDING as there's no dependency on ->sighand. The latter is
- * shared for threads, and can cause contention on sighand->lock. Even for
- * the non-threaded case TIF_NOTIFY_SIGNAL is more efficient, as no locking
- * or IRQ disabling is involved for notification (or running) purposes.
- */
-static void task_work_notify_signal(struct task_struct *task)
-{
-#if defined(TIF_NOTIFY_SIGNAL)
- set_notify_signal(task);
-#else
- unsigned long flags;
-
- /*
- * Only grab the sighand lock if we don't already have some
- * task_work pending. This pairs with the smp_store_mb()
- * in get_signal(), see comment there.
- */
- if (!(READ_ONCE(task->jobctl) & JOBCTL_TASK_WORK) &&
- lock_task_sighand(task, &flags)) {
- task->jobctl |= JOBCTL_TASK_WORK;
- signal_wake_up(task, 0);
- unlock_task_sighand(task, &flags);
- }
-#endif
-}
-
/**
* task_work_add - ask the @task to execute @work->func()
* @task: the task which should run the callback
@@ -76,7 +48,7 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
set_notify_resume(task);
break;
case TWA_SIGNAL:
- task_work_notify_signal(task);
+ set_notify_signal(task);
break;
default:
WARN_ON_ONCE(1);
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index a09b1d61df6a..9a41848b6ebb 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -26,13 +26,9 @@ config CLOCKSOURCE_VALIDATE_LAST_CYCLE
config GENERIC_TIME_VSYSCALL
bool
-# Old style timekeeping
-config ARCH_USES_GETTIMEOFFSET
- bool
-
# The generic clock events infrastructure
config GENERIC_CLOCKEVENTS
- bool
+ def_bool !LEGACY_TIMER_TICK
# Architecture can handle broadcast in a driver-agnostic way
config ARCH_HAS_TICK_BROADCAST
@@ -61,6 +57,13 @@ config POSIX_CPU_TIMERS_TASK_WORK
bool
default y if POSIX_TIMERS && HAVE_POSIX_CPU_TIMERS_TASK_WORK
+config LEGACY_TIMER_TICK
+ bool
+ help
+ The legacy timer tick helper is used by platforms that
+ lack support for the generic clockevent framework.
+ New platforms should use generic clockevents instead.
+
if GENERIC_CLOCKEVENTS
menu "Timers subsystem"
@@ -72,7 +75,6 @@ config TICK_ONESHOT
config NO_HZ_COMMON
bool
- depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
select TICK_ONESHOT
choice
@@ -87,7 +89,6 @@ config HZ_PERIODIC
config NO_HZ_IDLE
bool "Idle dynticks system (tickless idle)"
- depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
select NO_HZ_COMMON
help
This option enables a tickless idle system: timer interrupts
@@ -99,7 +100,6 @@ config NO_HZ_IDLE
config NO_HZ_FULL
bool "Full dynticks system (tickless)"
# NO_HZ_COMMON dependency
- depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
# We need at least one periodic CPU for timekeeping
depends on SMP
depends on HAVE_CONTEXT_TRACKING
@@ -158,7 +158,6 @@ config CONTEXT_TRACKING_FORCE
config NO_HZ
bool "Old Idle dynticks config"
- depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
help
This is the old config entry that enables dynticks idle.
We keep it around for a little while to enforce backward
@@ -166,7 +165,6 @@ config NO_HZ
config HIGH_RES_TIMERS
bool "High Resolution Timer Support"
- depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
select TICK_ONESHOT
help
This option enables high resolution timer support. If your
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index c8f00168afe8..1fb1c1ef6a19 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -16,6 +16,7 @@ ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y)
endif
obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o
obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o
+obj-$(CONFIG_LEGACY_TIMER_TICK) += tick-legacy.o
obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 02441ead3c3b..cce484a2cc7c 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -705,8 +705,6 @@ static inline void clocksource_update_max_deferment(struct clocksource *cs)
&cs->max_cycles);
}
-#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
-
static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur)
{
struct clocksource *cs;
@@ -798,12 +796,6 @@ static void clocksource_select_fallback(void)
__clocksource_select(true);
}
-#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */
-static inline void clocksource_select(void) { }
-static inline void clocksource_select_fallback(void) { }
-
-#endif
-
/*
* clocksource_done_booting - Called near the end of core bootup
*
diff --git a/kernel/time/tick-legacy.c b/kernel/time/tick-legacy.c
new file mode 100644
index 000000000000..af225b32f5b3
--- /dev/null
+++ b/kernel/time/tick-legacy.c
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Timer tick function for architectures that lack generic clockevents,
+ * consolidated here from m68k/ia64/parisc/arm.
+ */
+
+#include <linux/irq.h>
+#include <linux/profile.h>
+#include <linux/timekeeper_internal.h>
+
+#include "tick-internal.h"
+
+/**
+ * legacy_timer_tick() - advances the timekeeping infrastructure
+ * @ticks: number of ticks, that have elapsed since the last call.
+ *
+ * This is used by platforms that have not been converted to
+ * generic clockevents.
+ *
+ * If 'ticks' is zero, the CPU is not handling timekeeping, so
+ * only perform process accounting and profiling.
+ *
+ * Must be called with interrupts disabled.
+ */
+void legacy_timer_tick(unsigned long ticks)
+{
+ if (ticks) {
+ raw_spin_lock(&jiffies_lock);
+ write_seqcount_begin(&jiffies_seq);
+ do_timer(ticks);
+ write_seqcount_end(&jiffies_seq);
+ raw_spin_unlock(&jiffies_lock);
+ update_wall_time();
+ }
+ update_process_times(user_mode(get_irq_regs()));
+ profile_tick(CPU_PROFILING);
+}
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 74503c0151e5..a45cedda93a7 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -369,13 +369,6 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
/* Timekeeper helper functions. */
-#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
-static u32 default_arch_gettimeoffset(void) { return 0; }
-u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset;
-#else
-static inline u32 arch_gettimeoffset(void) { return 0; }
-#endif
-
static inline u64 timekeeping_delta_to_ns(const struct tk_read_base *tkr, u64 delta)
{
u64 nsec;
@@ -383,8 +376,7 @@ static inline u64 timekeeping_delta_to_ns(const struct tk_read_base *tkr, u64 de
nsec = delta * tkr->mult + tkr->xtime_nsec;
nsec >>= tkr->shift;
- /* If arch requires, add in get_arch_timeoffset() */
- return nsec + arch_gettimeoffset();
+ return nsec;
}
static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr)
@@ -790,16 +782,8 @@ static void timekeeping_forward_now(struct timekeeper *tk)
tk->tkr_raw.cycle_last = cycle_now;
tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult;
-
- /* If arch requires, add in get_arch_timeoffset() */
- tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift;
-
-
tk->tkr_raw.xtime_nsec += delta * tk->tkr_raw.mult;
- /* If arch requires, add in get_arch_timeoffset() */
- tk->tkr_raw.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_raw.shift;
-
tk_normalize_xtime(tk);
}
@@ -2146,19 +2130,12 @@ static void timekeeping_advance(enum timekeeping_adv_mode mode)
if (unlikely(timekeeping_suspended))
goto out;
-#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
- offset = real_tk->cycle_interval;
-
- if (mode != TK_ADV_TICK)
- goto out;
-#else
offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),
tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
/* Check if there's really nothing to do */
if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK)
goto out;
-#endif
/* Do some additional sanity checking */
timekeeping_check_update(tk, offset);
@@ -2474,19 +2451,3 @@ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
}
EXPORT_SYMBOL(hardpps);
#endif /* CONFIG_NTP_PPS */
-
-/**
- * xtime_update() - advances the timekeeping infrastructure
- * @ticks: number of ticks, that have elapsed since the last call.
- *
- * Must be called with interrupts disabled.
- */
-void xtime_update(unsigned long ticks)
-{
- raw_spin_lock(&jiffies_lock);
- write_seqcount_begin(&jiffies_seq);
- do_timer(ticks);
- write_seqcount_end(&jiffies_seq);
- raw_spin_unlock(&jiffies_lock);
- update_wall_time();
-}
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index 6c2cbd9ef999..543beba096c7 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -22,6 +22,7 @@ static inline int sched_clock_suspend(void) { return 0; }
static inline void sched_clock_resume(void) { }
#endif
+extern void update_process_times(int user);
extern void do_timer(unsigned long ticks);
extern void update_wall_time(void);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e1bf5228fb69..d5a19413d4f8 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -31,6 +31,15 @@ config HAVE_DYNAMIC_FTRACE_WITH_REGS
config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
bool
+config HAVE_DYNAMIC_FTRACE_WITH_ARGS
+ bool
+ help
+ If this is set, then arguments and stack can be found from
+ the pt_regs passed into the function callback regs parameter
+ by default, even without setting the REGS flag in the ftrace_ops.
+ This allows for use of regs_get_kernel_argument() and
+ kernel_stack_pointer().
+
config HAVE_FTRACE_MCOUNT_RECORD
bool
help
@@ -253,7 +262,6 @@ config IRQSOFF_TRACER
bool "Interrupts-off Latency Tracer"
default n
depends on TRACE_IRQFLAGS_SUPPORT
- depends on !ARCH_USES_GETTIMEOFFSET
select TRACE_IRQFLAGS
select GENERIC_TRACER
select TRACER_MAX_TRACE
@@ -277,7 +285,6 @@ config IRQSOFF_TRACER
config PREEMPT_TRACER
bool "Preemption-off Latency Tracer"
default n
- depends on !ARCH_USES_GETTIMEOFFSET
depends on PREEMPTION
select GENERIC_TRACER
select TRACER_MAX_TRACE
@@ -727,6 +734,45 @@ config TRACE_EVAL_MAP_FILE
If unsure, say N.
+config FTRACE_RECORD_RECURSION
+ bool "Record functions that recurse in function tracing"
+ depends on FUNCTION_TRACER
+ help
+ All callbacks that attach to the function tracing have some sort
+ of protection against recursion. Even though the protection exists,
+ it adds overhead. This option will create a file in the tracefs
+ file system called "recursed_functions" that will list the functions
+ that triggered a recursion.
+
+ This will add more overhead to cases that have recursion.
+
+ If unsure, say N
+
+config FTRACE_RECORD_RECURSION_SIZE
+ int "Max number of recursed functions to record"
+ default 128
+ depends on FTRACE_RECORD_RECURSION
+ help
+ This defines the limit of number of functions that can be
+ listed in the "recursed_functions" file, that lists all
+ the functions that caused a recursion to happen.
+ This file can be reset, but the limit can not change in
+ size at runtime.
+
+config RING_BUFFER_RECORD_RECURSION
+ bool "Record functions that recurse in the ring buffer"
+ depends on FTRACE_RECORD_RECURSION
+ # default y, because it is coupled with FTRACE_RECORD_RECURSION
+ default y
+ help
+ The ring buffer has its own internal recursion. Although when
+ recursion happens it wont cause harm because of the protection,
+ but it does cause an unwanted overhead. Enabling this option will
+ place where recursion was detected into the ftrace "recursed_functions"
+ file.
+
+ This will add more overhead to cases that have recursion.
+
config GCOV_PROFILE_FTRACE
bool "Enable GCOV profiling on ftrace subsystem"
depends on GCOV_KERNEL
@@ -797,6 +843,26 @@ config RING_BUFFER_STARTUP_TEST
If unsure, say N
+config RING_BUFFER_VALIDATE_TIME_DELTAS
+ bool "Verify ring buffer time stamp deltas"
+ depends on RING_BUFFER
+ help
+ This will audit the time stamps on the ring buffer sub
+ buffer to make sure that all the time deltas for the
+ events on a sub buffer matches the current time stamp.
+ This audit is performed for every event that is not
+ interrupted, or interrupting another event. A check
+ is also made when traversing sub buffers to make sure
+ that all the deltas on the previous sub buffer do not
+ add up to be greater than the current time stamp.
+
+ NOTE: This adds significant overhead to recording of events,
+ and should only be used to test the logic of the ring buffer.
+ Do not use it on production systems.
+
+ Only say Y if you understand what this does, and you
+ still want it enabled. Otherwise say N
+
config MMIOTRACE_TEST
tristate "Test module for mmiotrace"
depends on MMIOTRACE && m
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index e153be351548..7e44cea89fdc 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -92,6 +92,7 @@ obj-$(CONFIG_DYNAMIC_EVENTS) += trace_dynevent.o
obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o
obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o
+obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o
obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index f1022945e346..fb0fe4c66b84 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -449,7 +449,7 @@ static struct dentry *blk_create_buf_file_callback(const char *filename,
&relay_file_operations);
}
-static struct rchan_callbacks blk_relay_callbacks = {
+static const struct rchan_callbacks blk_relay_callbacks = {
.subbuf_start = blk_subbuf_start_callback,
.create_buf_file = blk_create_buf_file_callback,
.remove_buf_file = blk_remove_buf_file_callback,
@@ -458,14 +458,9 @@ static struct rchan_callbacks blk_relay_callbacks = {
static void blk_trace_setup_lba(struct blk_trace *bt,
struct block_device *bdev)
{
- struct hd_struct *part = NULL;
-
- if (bdev)
- part = bdev->bd_part;
-
- if (part) {
- bt->start_lba = part->start_sect;
- bt->end_lba = part->start_sect + part->nr_sects;
+ if (bdev) {
+ bt->start_lba = bdev->bd_start_sect;
+ bt->end_lba = bdev->bd_start_sect + bdev_nr_sectors(bdev);
} else {
bt->start_lba = 0;
bt->end_lba = -1ULL;
@@ -800,12 +795,12 @@ static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
#endif
static u64
-blk_trace_request_get_cgid(struct request_queue *q, struct request *rq)
+blk_trace_request_get_cgid(struct request *rq)
{
if (!rq->bio)
return 0;
/* Use the first bio */
- return blk_trace_bio_get_cgid(q, rq->bio);
+ return blk_trace_bio_get_cgid(rq->q, rq->bio);
}
/*
@@ -846,40 +841,35 @@ static void blk_add_trace_rq(struct request *rq, int error,
rcu_read_unlock();
}
-static void blk_add_trace_rq_insert(void *ignore,
- struct request_queue *q, struct request *rq)
+static void blk_add_trace_rq_insert(void *ignore, struct request *rq)
{
blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT,
- blk_trace_request_get_cgid(q, rq));
+ blk_trace_request_get_cgid(rq));
}
-static void blk_add_trace_rq_issue(void *ignore,
- struct request_queue *q, struct request *rq)
+static void blk_add_trace_rq_issue(void *ignore, struct request *rq)
{
blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE,
- blk_trace_request_get_cgid(q, rq));
+ blk_trace_request_get_cgid(rq));
}
-static void blk_add_trace_rq_merge(void *ignore,
- struct request_queue *q, struct request *rq)
+static void blk_add_trace_rq_merge(void *ignore, struct request *rq)
{
blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_BACKMERGE,
- blk_trace_request_get_cgid(q, rq));
+ blk_trace_request_get_cgid(rq));
}
-static void blk_add_trace_rq_requeue(void *ignore,
- struct request_queue *q,
- struct request *rq)
+static void blk_add_trace_rq_requeue(void *ignore, struct request *rq)
{
blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE,
- blk_trace_request_get_cgid(q, rq));
+ blk_trace_request_get_cgid(rq));
}
static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
int error, unsigned int nr_bytes)
{
blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE,
- blk_trace_request_get_cgid(rq->q, rq));
+ blk_trace_request_get_cgid(rq));
}
/**
@@ -911,10 +901,9 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
rcu_read_unlock();
}
-static void blk_add_trace_bio_bounce(void *ignore,
- struct request_queue *q, struct bio *bio)
+static void blk_add_trace_bio_bounce(void *ignore, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
+ blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_BOUNCE, 0);
}
static void blk_add_trace_bio_complete(void *ignore,
@@ -924,63 +913,24 @@ static void blk_add_trace_bio_complete(void *ignore,
blk_status_to_errno(bio->bi_status));
}
-static void blk_add_trace_bio_backmerge(void *ignore,
- struct request_queue *q,
- struct request *rq,
- struct bio *bio)
+static void blk_add_trace_bio_backmerge(void *ignore, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
+ blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_BACKMERGE, 0);
}
-static void blk_add_trace_bio_frontmerge(void *ignore,
- struct request_queue *q,
- struct request *rq,
- struct bio *bio)
+static void blk_add_trace_bio_frontmerge(void *ignore, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
+ blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_FRONTMERGE, 0);
}
-static void blk_add_trace_bio_queue(void *ignore,
- struct request_queue *q, struct bio *bio)
+static void blk_add_trace_bio_queue(void *ignore, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
+ blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_QUEUE, 0);
}
-static void blk_add_trace_getrq(void *ignore,
- struct request_queue *q,
- struct bio *bio, int rw)
+static void blk_add_trace_getrq(void *ignore, struct bio *bio)
{
- if (bio)
- blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
- else {
- struct blk_trace *bt;
-
- rcu_read_lock();
- bt = rcu_dereference(q->blk_trace);
- if (bt)
- __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0,
- NULL, 0);
- rcu_read_unlock();
- }
-}
-
-
-static void blk_add_trace_sleeprq(void *ignore,
- struct request_queue *q,
- struct bio *bio, int rw)
-{
- if (bio)
- blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
- else {
- struct blk_trace *bt;
-
- rcu_read_lock();
- bt = rcu_dereference(q->blk_trace);
- if (bt)
- __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ,
- 0, 0, NULL, 0);
- rcu_read_unlock();
- }
+ blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_GETRQ, 0);
}
static void blk_add_trace_plug(void *ignore, struct request_queue *q)
@@ -1015,10 +965,9 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
rcu_read_unlock();
}
-static void blk_add_trace_split(void *ignore,
- struct request_queue *q, struct bio *bio,
- unsigned int pdu)
+static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu)
{
+ struct request_queue *q = bio->bi_disk->queue;
struct blk_trace *bt;
rcu_read_lock();
@@ -1039,20 +988,16 @@ static void blk_add_trace_split(void *ignore,
/**
* blk_add_trace_bio_remap - Add a trace for a bio-remap operation
* @ignore: trace callback data parameter (not used)
- * @q: queue the io is for
* @bio: the source bio
- * @dev: target device
+ * @dev: source device
* @from: source sector
*
- * Description:
- * Device mapper or raid target sometimes need to split a bio because
- * it spans a stripe (or similar). Add a trace for that action.
- *
+ * Called after a bio is remapped to a different device and/or sector.
**/
-static void blk_add_trace_bio_remap(void *ignore,
- struct request_queue *q, struct bio *bio,
- dev_t dev, sector_t from)
+static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev,
+ sector_t from)
{
+ struct request_queue *q = bio->bi_disk->queue;
struct blk_trace *bt;
struct blk_io_trace_remap r;
@@ -1077,7 +1022,6 @@ static void blk_add_trace_bio_remap(void *ignore,
/**
* blk_add_trace_rq_remap - Add a trace for a request-remap operation
* @ignore: trace callback data parameter (not used)
- * @q: queue the io is for
* @rq: the source request
* @dev: target device
* @from: source sector
@@ -1087,16 +1031,14 @@ static void blk_add_trace_bio_remap(void *ignore,
* Add a trace for that action.
*
**/
-static void blk_add_trace_rq_remap(void *ignore,
- struct request_queue *q,
- struct request *rq, dev_t dev,
+static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev,
sector_t from)
{
struct blk_trace *bt;
struct blk_io_trace_remap r;
rcu_read_lock();
- bt = rcu_dereference(q->blk_trace);
+ bt = rcu_dereference(rq->q->blk_trace);
if (likely(!bt)) {
rcu_read_unlock();
return;
@@ -1108,13 +1050,12 @@ static void blk_add_trace_rq_remap(void *ignore,
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
rq_data_dir(rq), 0, BLK_TA_REMAP, 0,
- sizeof(r), &r, blk_trace_request_get_cgid(q, rq));
+ sizeof(r), &r, blk_trace_request_get_cgid(rq));
rcu_read_unlock();
}
/**
* blk_add_driver_data - Add binary message with driver-specific data
- * @q: queue the io is for
* @rq: io request
* @data: driver-specific data
* @len: length of driver-specific data
@@ -1123,14 +1064,12 @@ static void blk_add_trace_rq_remap(void *ignore,
* Some drivers might want to write driver-specific data per request.
*
**/
-void blk_add_driver_data(struct request_queue *q,
- struct request *rq,
- void *data, size_t len)
+void blk_add_driver_data(struct request *rq, void *data, size_t len)
{
struct blk_trace *bt;
rcu_read_lock();
- bt = rcu_dereference(q->blk_trace);
+ bt = rcu_dereference(rq->q->blk_trace);
if (likely(!bt)) {
rcu_read_unlock();
return;
@@ -1138,7 +1077,7 @@ void blk_add_driver_data(struct request_queue *q,
__blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
BLK_TA_DRV_DATA, 0, len, data,
- blk_trace_request_get_cgid(q, rq));
+ blk_trace_request_get_cgid(rq));
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(blk_add_driver_data);
@@ -1169,8 +1108,6 @@ static void blk_register_tracepoints(void)
WARN_ON(ret);
ret = register_trace_block_getrq(blk_add_trace_getrq, NULL);
WARN_ON(ret);
- ret = register_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
- WARN_ON(ret);
ret = register_trace_block_plug(blk_add_trace_plug, NULL);
WARN_ON(ret);
ret = register_trace_block_unplug(blk_add_trace_unplug, NULL);
@@ -1190,7 +1127,6 @@ static void blk_unregister_tracepoints(void)
unregister_trace_block_split(blk_add_trace_split, NULL);
unregister_trace_block_unplug(blk_add_trace_unplug, NULL);
unregister_trace_block_plug(blk_add_trace_plug, NULL);
- unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
@@ -1343,7 +1279,7 @@ static void blk_log_action(struct trace_iterator *iter, const char *act,
* ones now use the 64bit ino as the whole ID and
* no longer use generation.
*
- * Regarldess of the content, always output
+ * Regardless of the content, always output
* "LOW32,HIGH32" so that FILEID_INO32_GEN fid can
* be mapped back to @id on both 64 and 32bit ino
* setups. See __kernfs_fh_to_dentry().
@@ -1385,7 +1321,7 @@ static void blk_log_dump_pdu(struct trace_seq *s,
i == 0 ? "" : " ", pdu_buf[i]);
/*
- * stop when the rest is just zeroes and indicate so
+ * stop when the rest is just zeros and indicate so
* with a ".." appended
*/
if (i == end && end != pdu_len - 1) {
@@ -1815,30 +1751,15 @@ static ssize_t blk_trace_mask2str(char *buf, int mask)
return p - buf;
}
-static struct request_queue *blk_trace_get_queue(struct block_device *bdev)
-{
- if (bdev->bd_disk == NULL)
- return NULL;
-
- return bdev_get_queue(bdev);
-}
-
static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- struct block_device *bdev = bdget_part(dev_to_part(dev));
- struct request_queue *q;
+ struct block_device *bdev = dev_to_bdev(dev);
+ struct request_queue *q = bdev_get_queue(bdev);
struct blk_trace *bt;
ssize_t ret = -ENXIO;
- if (bdev == NULL)
- goto out;
-
- q = blk_trace_get_queue(bdev);
- if (q == NULL)
- goto out_bdput;
-
mutex_lock(&q->debugfs_mutex);
bt = rcu_dereference_protected(q->blk_trace,
@@ -1861,9 +1782,6 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
out_unlock_bdev:
mutex_unlock(&q->debugfs_mutex);
-out_bdput:
- bdput(bdev);
-out:
return ret;
}
@@ -1871,8 +1789,8 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
- struct block_device *bdev;
- struct request_queue *q;
+ struct block_device *bdev = dev_to_bdev(dev);
+ struct request_queue *q = bdev_get_queue(bdev);
struct blk_trace *bt;
u64 value;
ssize_t ret = -EINVAL;
@@ -1888,17 +1806,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
goto out;
value = ret;
}
- } else if (kstrtoull(buf, 0, &value))
- goto out;
-
- ret = -ENXIO;
- bdev = bdget_part(dev_to_part(dev));
- if (bdev == NULL)
- goto out;
-
- q = blk_trace_get_queue(bdev);
- if (q == NULL)
- goto out_bdput;
+ } else {
+ if (kstrtoull(buf, 0, &value))
+ goto out;
+ }
mutex_lock(&q->debugfs_mutex);
@@ -1936,8 +1847,6 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
out_unlock_bdev:
mutex_unlock(&q->debugfs_mutex);
-out_bdput:
- bdput(bdev);
out:
return ret ? ret : count;
}
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ebadaa83502c..6c0018abe68a 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -116,7 +116,7 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
* Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock
* to all call sites, we did a bpf_prog_array_valid() there to check
* whether call->prog_array is empty or not, which is
- * a heurisitc to speed up execution.
+ * a heuristic to speed up execution.
*
* If bpf_prog_array_valid() fetched prog_array was
* non-NULL, we go into trace_call_bpf() and do the actual
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 5658f13037b3..73edb9e4f354 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -334,8 +334,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
static struct ftrace_ops graph_ops = {
.func = ftrace_stub,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE |
- FTRACE_OPS_FL_INITIALIZED |
+ .flags = FTRACE_OPS_FL_INITIALIZED |
FTRACE_OPS_FL_PID |
FTRACE_OPS_FL_STUB,
#ifdef FTRACE_GRAPH_TRAMP_ADDR
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 9c1bba8cc51b..4d8e35575549 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -80,7 +80,7 @@ enum {
struct ftrace_ops ftrace_list_end __read_mostly = {
.func = ftrace_stub,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,
+ .flags = FTRACE_OPS_FL_STUB,
INIT_OPS_HASH(ftrace_list_end)
};
@@ -121,7 +121,7 @@ struct ftrace_ops global_ops;
#if ARCH_SUPPORTS_FTRACE_OPS
static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *regs);
+ struct ftrace_ops *op, struct ftrace_regs *fregs);
#else
/* See comment below, where ftrace_ops_list_func is defined */
static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
@@ -140,7 +140,7 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops)
}
static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *regs)
+ struct ftrace_ops *op, struct ftrace_regs *fregs)
{
struct trace_array *tr = op->private;
int pid;
@@ -154,7 +154,7 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
return;
}
- op->saved_func(ip, parent_ip, op, regs);
+ op->saved_func(ip, parent_ip, op, fregs);
}
static void ftrace_sync_ipi(void *data)
@@ -754,7 +754,7 @@ ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip)
static void
function_profile_call(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *ops, struct pt_regs *regs)
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
struct ftrace_profile_stat *stat;
struct ftrace_profile *rec;
@@ -866,7 +866,7 @@ static void unregister_ftrace_profiler(void)
#else
static struct ftrace_ops ftrace_profile_ops __read_mostly = {
.func = function_profile_call,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
+ .flags = FTRACE_OPS_FL_INITIALIZED,
INIT_OPS_HASH(ftrace_profile_ops)
};
@@ -1040,8 +1040,7 @@ struct ftrace_ops global_ops = {
.local_hash.notrace_hash = EMPTY_HASH,
.local_hash.filter_hash = EMPTY_HASH,
INIT_OPS_HASH(global_ops)
- .flags = FTRACE_OPS_FL_RECURSION_SAFE |
- FTRACE_OPS_FL_INITIALIZED |
+ .flags = FTRACE_OPS_FL_INITIALIZED |
FTRACE_OPS_FL_PID,
};
@@ -2146,6 +2145,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
else
rec->flags &= ~FTRACE_FL_TRAMP_EN;
}
+
if (flag & FTRACE_FL_DIRECT) {
/*
* If there's only one user (direct_ops helper)
@@ -2389,8 +2389,9 @@ unsigned long ftrace_find_rec_direct(unsigned long ip)
}
static void call_direct_funcs(unsigned long ip, unsigned long pip,
- struct ftrace_ops *ops, struct pt_regs *regs)
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
+ struct pt_regs *regs = ftrace_get_regs(fregs);
unsigned long addr;
addr = ftrace_find_rec_direct(ip);
@@ -2402,7 +2403,7 @@ static void call_direct_funcs(unsigned long ip, unsigned long pip,
struct ftrace_ops direct_ops = {
.func = call_direct_funcs,
- .flags = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_RECURSION_SAFE
+ .flags = FTRACE_OPS_FL_IPMODIFY
| FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS
| FTRACE_OPS_FL_PERMANENT,
/*
@@ -4183,7 +4184,6 @@ static void process_mod_list(struct list_head *head, struct ftrace_ops *ops,
struct ftrace_hash **orig_hash, *new_hash;
LIST_HEAD(process_mods);
char *func;
- int ret;
mutex_lock(&ops->func_hash->regex_lock);
@@ -4236,7 +4236,7 @@ static void process_mod_list(struct list_head *head, struct ftrace_ops *ops,
mutex_lock(&ftrace_lock);
- ret = ftrace_hash_move_and_update_ops(ops, orig_hash,
+ ftrace_hash_move_and_update_ops(ops, orig_hash,
new_hash, enable);
mutex_unlock(&ftrace_lock);
@@ -4314,7 +4314,7 @@ static int __init ftrace_mod_cmd_init(void)
core_initcall(ftrace_mod_cmd_init);
static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *pt_regs)
+ struct ftrace_ops *op, struct ftrace_regs *fregs)
{
struct ftrace_probe_ops *probe_ops;
struct ftrace_func_probe *probe;
@@ -5588,7 +5588,6 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
struct ftrace_hash **orig_hash;
struct trace_parser *parser;
int filter_hash;
- int ret;
if (file->f_mode & FMODE_READ) {
iter = m->private;
@@ -5616,7 +5615,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
orig_hash = &iter->ops->func_hash->notrace_hash;
mutex_lock(&ftrace_lock);
- ret = ftrace_hash_move_and_update_ops(iter->ops, orig_hash,
+ ftrace_hash_move_and_update_ops(iter->ops, orig_hash,
iter->hash, filter_hash);
mutex_unlock(&ftrace_lock);
} else {
@@ -6884,8 +6883,7 @@ void ftrace_init_trace_array(struct trace_array *tr)
struct ftrace_ops global_ops = {
.func = ftrace_stub,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE |
- FTRACE_OPS_FL_INITIALIZED |
+ .flags = FTRACE_OPS_FL_INITIALIZED |
FTRACE_OPS_FL_PID,
};
@@ -6935,12 +6933,13 @@ void ftrace_reset_array_ops(struct trace_array *tr)
static nokprobe_inline void
__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *ignored, struct pt_regs *regs)
+ struct ftrace_ops *ignored, struct ftrace_regs *fregs)
{
+ struct pt_regs *regs = ftrace_get_regs(fregs);
struct ftrace_ops *op;
int bit;
- bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
+ bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START, TRACE_LIST_MAX);
if (bit < 0)
return;
@@ -6969,7 +6968,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
pr_warn("op=%p %pS\n", op, op);
goto out;
}
- op->func(ip, parent_ip, op, regs);
+ op->func(ip, parent_ip, op, fregs);
}
} while_for_each_ftrace_op(op);
out:
@@ -6992,9 +6991,9 @@ out:
*/
#if ARCH_SUPPORTS_FTRACE_OPS
static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *regs)
+ struct ftrace_ops *op, struct ftrace_regs *fregs)
{
- __ftrace_ops_list_func(ip, parent_ip, NULL, regs);
+ __ftrace_ops_list_func(ip, parent_ip, NULL, fregs);
}
NOKPROBE_SYMBOL(ftrace_ops_list_func);
#else
@@ -7011,18 +7010,18 @@ NOKPROBE_SYMBOL(ftrace_ops_no_ops);
* this function will be called by the mcount trampoline.
*/
static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *regs)
+ struct ftrace_ops *op, struct ftrace_regs *fregs)
{
int bit;
- bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
+ bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START, TRACE_LIST_MAX);
if (bit < 0)
return;
preempt_disable_notrace();
if (!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching())
- op->func(ip, parent_ip, op, regs);
+ op->func(ip, parent_ip, op, fregs);
preempt_enable_notrace();
trace_clear_recursion(bit);
@@ -7043,11 +7042,11 @@ NOKPROBE_SYMBOL(ftrace_ops_assist_func);
ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
{
/*
- * If the function does not handle recursion, needs to be RCU safe,
- * or does per cpu logic, then we need to call the assist handler.
+ * If the function does not handle recursion or needs to be RCU safe,
+ * then we need to call the assist handler.
*/
- if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE) ||
- ops->flags & FTRACE_OPS_FL_RCU)
+ if (ops->flags & (FTRACE_OPS_FL_RECURSION |
+ FTRACE_OPS_FL_RCU))
return ftrace_ops_assist_func;
return ops->func;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a6268e09160a..ec08f948dd80 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4,6 +4,7 @@
*
* Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
*/
+#include <linux/trace_recursion.h>
#include <linux/trace_events.h>
#include <linux/ring_buffer.h>
#include <linux/trace_clock.h>
@@ -129,7 +130,16 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
#define RB_ALIGNMENT 4U
#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
-#define RB_ALIGN_DATA __aligned(RB_ALIGNMENT)
+
+#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS
+# define RB_FORCE_8BYTE_ALIGNMENT 0
+# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
+#else
+# define RB_FORCE_8BYTE_ALIGNMENT 1
+# define RB_ARCH_ALIGNMENT 8U
+#endif
+
+#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT)
/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
@@ -1422,7 +1432,8 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
return 0;
}
-static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu)
+static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
+ long nr_pages, struct list_head *pages)
{
struct buffer_page *bpage, *tmp;
bool user_thread = current->mm != NULL;
@@ -1462,13 +1473,15 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu)
struct page *page;
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
- mflags, cpu_to_node(cpu));
+ mflags, cpu_to_node(cpu_buffer->cpu));
if (!bpage)
goto free_pages;
+ rb_check_bpage(cpu_buffer, bpage);
+
list_add(&bpage->list, pages);
- page = alloc_pages_node(cpu_to_node(cpu), mflags, 0);
+ page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, 0);
if (!page)
goto free_pages;
bpage->page = page_address(page);
@@ -1500,7 +1513,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
WARN_ON(!nr_pages);
- if (__rb_allocate_pages(nr_pages, &pages, cpu_buffer->cpu))
+ if (__rb_allocate_pages(cpu_buffer, nr_pages, &pages))
return -ENOMEM;
/*
@@ -1973,8 +1986,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
if (nr_pages < 2)
nr_pages = 2;
- size = nr_pages * BUF_PAGE_SIZE;
-
/* prevent another thread from changing buffer sizes */
mutex_lock(&buffer->mutex);
@@ -2009,8 +2020,8 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
* allocated without receiving ENOMEM
*/
INIT_LIST_HEAD(&cpu_buffer->new_pages);
- if (__rb_allocate_pages(cpu_buffer->nr_pages_to_update,
- &cpu_buffer->new_pages, cpu)) {
+ if (__rb_allocate_pages(cpu_buffer, cpu_buffer->nr_pages_to_update,
+ &cpu_buffer->new_pages)) {
/* not enough memory for new pages */
err = -ENOMEM;
goto out_err;
@@ -2075,8 +2086,8 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
INIT_LIST_HEAD(&cpu_buffer->new_pages);
if (cpu_buffer->nr_pages_to_update > 0 &&
- __rb_allocate_pages(cpu_buffer->nr_pages_to_update,
- &cpu_buffer->new_pages, cpu_id)) {
+ __rb_allocate_pages(cpu_buffer, cpu_buffer->nr_pages_to_update,
+ &cpu_buffer->new_pages)) {
err = -ENOMEM;
goto out_err;
}
@@ -2628,9 +2639,6 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
return skip_time_extend(event);
}
-static inline bool rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event);
-
#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
static inline bool sched_clock_stable(void)
{
@@ -2719,7 +2727,7 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
event->time_delta = delta;
length -= RB_EVNT_HDR_SIZE;
- if (length > RB_MAX_SMALL_DATA) {
+ if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
event->type_len = 0;
event->array[0] = length;
} else
@@ -2734,11 +2742,11 @@ static unsigned rb_calculate_event_length(unsigned length)
if (!length)
length++;
- if (length > RB_MAX_SMALL_DATA)
+ if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
length += sizeof(event.array[0]);
length += RB_EVNT_HDR_SIZE;
- length = ALIGN(length, RB_ALIGNMENT);
+ length = ALIGN(length, RB_ARCH_ALIGNMENT);
/*
* In case the time delta is larger than the 27 bits for it
@@ -2758,20 +2766,6 @@ static unsigned rb_calculate_event_length(unsigned length)
return length;
}
-static __always_inline bool
-rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event)
-{
- unsigned long addr = (unsigned long)event;
- unsigned long index;
-
- index = rb_event_index(event);
- addr &= PAGE_MASK;
-
- return cpu_buffer->commit_page->page == (void *)addr &&
- rb_commit_index(cpu_buffer) == index;
-}
-
static u64 rb_time_delta(struct ring_buffer_event *event)
{
switch (event->type_len) {
@@ -3006,6 +3000,13 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
irq_work_queue(&cpu_buffer->irq_work.work);
}
+#ifdef CONFIG_RING_BUFFER_RECORD_RECURSION
+# define do_ring_buffer_record_recursion() \
+ do_ftrace_record_recursion(_THIS_IP_, _RET_IP_)
+#else
+# define do_ring_buffer_record_recursion() do { } while (0)
+#endif
+
/*
* The lock and unlock are done within a preempt disable section.
* The current_context per_cpu variable can only be modified
@@ -3088,8 +3089,10 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
* been updated yet. In this case, use the TRANSITION bit.
*/
bit = RB_CTX_TRANSITION;
- if (val & (1 << (bit + cpu_buffer->nest)))
+ if (val & (1 << (bit + cpu_buffer->nest))) {
+ do_ring_buffer_record_recursion();
return 1;
+ }
}
val |= (1 << (bit + cpu_buffer->nest));
@@ -3183,6 +3186,153 @@ int ring_buffer_unlock_commit(struct trace_buffer *buffer,
}
EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
+/* Special value to validate all deltas on a page. */
+#define CHECK_FULL_PAGE 1L
+
+#ifdef CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS
+static void dump_buffer_page(struct buffer_data_page *bpage,
+ struct rb_event_info *info,
+ unsigned long tail)
+{
+ struct ring_buffer_event *event;
+ u64 ts, delta;
+ int e;
+
+ ts = bpage->time_stamp;
+ pr_warn(" [%lld] PAGE TIME STAMP\n", ts);
+
+ for (e = 0; e < tail; e += rb_event_length(event)) {
+
+ event = (struct ring_buffer_event *)(bpage->data + e);
+
+ switch (event->type_len) {
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ delta = ring_buffer_event_time_stamp(event);
+ ts += delta;
+ pr_warn(" [%lld] delta:%lld TIME EXTEND\n", ts, delta);
+ break;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ delta = ring_buffer_event_time_stamp(event);
+ ts = delta;
+ pr_warn(" [%lld] absolute:%lld TIME STAMP\n", ts, delta);
+ break;
+
+ case RINGBUF_TYPE_PADDING:
+ ts += event->time_delta;
+ pr_warn(" [%lld] delta:%d PADDING\n", ts, event->time_delta);
+ break;
+
+ case RINGBUF_TYPE_DATA:
+ ts += event->time_delta;
+ pr_warn(" [%lld] delta:%d\n", ts, event->time_delta);
+ break;
+
+ default:
+ break;
+ }
+ }
+}
+
+static DEFINE_PER_CPU(atomic_t, checking);
+static atomic_t ts_dump;
+
+/*
+ * Check if the current event time stamp matches the deltas on
+ * the buffer page.
+ */
+static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
+ struct rb_event_info *info,
+ unsigned long tail)
+{
+ struct ring_buffer_event *event;
+ struct buffer_data_page *bpage;
+ u64 ts, delta;
+ bool full = false;
+ int e;
+
+ bpage = info->tail_page->page;
+
+ if (tail == CHECK_FULL_PAGE) {
+ full = true;
+ tail = local_read(&bpage->commit);
+ } else if (info->add_timestamp &
+ (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)) {
+ /* Ignore events with absolute time stamps */
+ return;
+ }
+
+ /*
+ * Do not check the first event (skip possible extends too).
+ * Also do not check if previous events have not been committed.
+ */
+ if (tail <= 8 || tail > local_read(&bpage->commit))
+ return;
+
+ /*
+ * If this interrupted another event,
+ */
+ if (atomic_inc_return(this_cpu_ptr(&checking)) != 1)
+ goto out;
+
+ ts = bpage->time_stamp;
+
+ for (e = 0; e < tail; e += rb_event_length(event)) {
+
+ event = (struct ring_buffer_event *)(bpage->data + e);
+
+ switch (event->type_len) {
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ delta = ring_buffer_event_time_stamp(event);
+ ts += delta;
+ break;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ delta = ring_buffer_event_time_stamp(event);
+ ts = delta;
+ break;
+
+ case RINGBUF_TYPE_PADDING:
+ if (event->time_delta == 1)
+ break;
+ /* fall through */
+ case RINGBUF_TYPE_DATA:
+ ts += event->time_delta;
+ break;
+
+ default:
+ RB_WARN_ON(cpu_buffer, 1);
+ }
+ }
+ if ((full && ts > info->ts) ||
+ (!full && ts + info->delta != info->ts)) {
+ /* If another report is happening, ignore this one */
+ if (atomic_inc_return(&ts_dump) != 1) {
+ atomic_dec(&ts_dump);
+ goto out;
+ }
+ atomic_inc(&cpu_buffer->record_disabled);
+ pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld after:%lld\n",
+ cpu_buffer->cpu,
+ ts + info->delta, info->ts, info->delta, info->after);
+ dump_buffer_page(bpage, info, tail);
+ atomic_dec(&ts_dump);
+ /* Do not re-enable checking */
+ return;
+ }
+out:
+ atomic_dec(this_cpu_ptr(&checking));
+}
+#else
+static inline void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
+ struct rb_event_info *info,
+ unsigned long tail)
+{
+}
+#endif /* CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS */
+
static struct ring_buffer_event *
__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
struct rb_event_info *info)
@@ -3240,6 +3390,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
if (a_ok && b_ok && info->before != info->after)
(void)rb_time_cmpxchg(&cpu_buffer->before_stamp,
info->before, info->after);
+ if (a_ok && b_ok)
+ check_buffer(cpu_buffer, info, CHECK_FULL_PAGE);
return rb_move_tail(cpu_buffer, tail, info);
}
@@ -3257,9 +3409,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
/* This did not interrupt any time update */
info->delta = info->ts - info->after;
else
- /* Just use full timestamp for inerrupting event */
+ /* Just use full timestamp for interrupting event */
info->delta = info->ts;
barrier();
+ check_buffer(cpu_buffer, info, tail);
if (unlikely(info->ts != save_before)) {
/* SLOW PATH - Interrupted between C and E */
@@ -3293,7 +3446,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
info->ts = ts;
} else {
/*
- * Interrupted beween C and E:
+ * Interrupted between C and E:
* Lost the previous events time stamp. Just set the
* delta to zero, and this will be the same time as
* the event this event interrupted. And the events that
@@ -3500,7 +3653,7 @@ rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
}
/**
- * ring_buffer_commit_discard - discard an event that has not been committed
+ * ring_buffer_discard_commit - discard an event that has not been committed
* @buffer: the ring buffer
* @event: non committed event to discard
*
diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c
index edd912cd14aa..a4b4bbf8c3bf 100644
--- a/kernel/trace/synth_event_gen_test.c
+++ b/kernel/trace/synth_event_gen_test.c
@@ -307,7 +307,7 @@ static int __init test_create_synth_event(void)
return ret;
delete:
/* We got an error after creating the event, delete it */
- ret = synth_event_delete("create_synth_test");
+ synth_event_delete("create_synth_test");
goto out;
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 06134189e9a7..b8a2d786b503 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -68,10 +68,21 @@ bool ring_buffer_expanded;
static bool __read_mostly tracing_selftest_running;
/*
- * If a tracer is running, we do not want to run SELFTEST.
+ * If boot-time tracing including tracers/events via kernel cmdline
+ * is running, we do not want to run SELFTEST.
*/
bool __read_mostly tracing_selftest_disabled;
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+void __init disable_tracing_selftest(const char *reason)
+{
+ if (!tracing_selftest_disabled) {
+ tracing_selftest_disabled = true;
+ pr_info("Ftrace startup test is disabled due to %s\n", reason);
+ }
+}
+#endif
+
/* Pipe tracepoints to printk */
struct trace_iterator *tracepoint_print_iter;
int tracepoint_printk;
@@ -2113,11 +2124,7 @@ int __init register_tracer(struct tracer *type)
apply_trace_boot_options();
/* disable other selftests, since this will break it. */
- tracing_selftest_disabled = true;
-#ifdef CONFIG_FTRACE_STARTUP_TEST
- printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n",
- type->name);
-#endif
+ disable_tracing_selftest("running a tracer");
out_unlock:
return ret;
@@ -3121,7 +3128,7 @@ struct trace_buffer_struct {
static struct trace_buffer_struct *trace_percpu_buffer;
/*
- * Thise allows for lockless recording. If we're nested too deeply, then
+ * This allows for lockless recording. If we're nested too deeply, then
* this returns NULL.
*/
static char *get_trace_buf(void)
@@ -9062,7 +9069,10 @@ int tracing_init_dentry(void)
extern struct trace_eval_map *__start_ftrace_eval_maps[];
extern struct trace_eval_map *__stop_ftrace_eval_maps[];
-static void __init trace_eval_init(void)
+static struct workqueue_struct *eval_map_wq __initdata;
+static struct work_struct eval_map_work __initdata;
+
+static void __init eval_map_work_func(struct work_struct *work)
{
int len;
@@ -9070,6 +9080,33 @@ static void __init trace_eval_init(void)
trace_insert_eval_map(NULL, __start_ftrace_eval_maps, len);
}
+static int __init trace_eval_init(void)
+{
+ INIT_WORK(&eval_map_work, eval_map_work_func);
+
+ eval_map_wq = alloc_workqueue("eval_map_wq", WQ_UNBOUND, 0);
+ if (!eval_map_wq) {
+ pr_err("Unable to allocate eval_map_wq\n");
+ /* Do work here */
+ eval_map_work_func(&eval_map_work);
+ return -ENOMEM;
+ }
+
+ queue_work(eval_map_wq, &eval_map_work);
+ return 0;
+}
+
+static int __init trace_eval_sync(void)
+{
+ /* Make sure the eval map updates are finished */
+ if (eval_map_wq)
+ destroy_workqueue(eval_map_wq);
+ return 0;
+}
+
+late_initcall_sync(trace_eval_sync);
+
+
#ifdef CONFIG_MODULES
static void trace_module_add_evals(struct module *mod)
{
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 1dadef445cd1..e448d2da0b99 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -558,183 +558,6 @@ struct tracer {
bool noboot;
};
-
-/* Only current can touch trace_recursion */
-
-/*
- * For function tracing recursion:
- * The order of these bits are important.
- *
- * When function tracing occurs, the following steps are made:
- * If arch does not support a ftrace feature:
- * call internal function (uses INTERNAL bits) which calls...
- * If callback is registered to the "global" list, the list
- * function is called and recursion checks the GLOBAL bits.
- * then this function calls...
- * The function callback, which can use the FTRACE bits to
- * check for recursion.
- *
- * Now if the arch does not support a feature, and it calls
- * the global list function which calls the ftrace callback
- * all three of these steps will do a recursion protection.
- * There's no reason to do one if the previous caller already
- * did. The recursion that we are protecting against will
- * go through the same steps again.
- *
- * To prevent the multiple recursion checks, if a recursion
- * bit is set that is higher than the MAX bit of the current
- * check, then we know that the check was made by the previous
- * caller, and we can skip the current check.
- */
-enum {
- /* Function recursion bits */
- TRACE_FTRACE_BIT,
- TRACE_FTRACE_NMI_BIT,
- TRACE_FTRACE_IRQ_BIT,
- TRACE_FTRACE_SIRQ_BIT,
-
- /* INTERNAL_BITs must be greater than FTRACE_BITs */
- TRACE_INTERNAL_BIT,
- TRACE_INTERNAL_NMI_BIT,
- TRACE_INTERNAL_IRQ_BIT,
- TRACE_INTERNAL_SIRQ_BIT,
-
- TRACE_BRANCH_BIT,
-/*
- * Abuse of the trace_recursion.
- * As we need a way to maintain state if we are tracing the function
- * graph in irq because we want to trace a particular function that
- * was called in irq context but we have irq tracing off. Since this
- * can only be modified by current, we can reuse trace_recursion.
- */
- TRACE_IRQ_BIT,
-
- /* Set if the function is in the set_graph_function file */
- TRACE_GRAPH_BIT,
-
- /*
- * In the very unlikely case that an interrupt came in
- * at a start of graph tracing, and we want to trace
- * the function in that interrupt, the depth can be greater
- * than zero, because of the preempted start of a previous
- * trace. In an even more unlikely case, depth could be 2
- * if a softirq interrupted the start of graph tracing,
- * followed by an interrupt preempting a start of graph
- * tracing in the softirq, and depth can even be 3
- * if an NMI came in at the start of an interrupt function
- * that preempted a softirq start of a function that
- * preempted normal context!!!! Luckily, it can't be
- * greater than 3, so the next two bits are a mask
- * of what the depth is when we set TRACE_GRAPH_BIT
- */
-
- TRACE_GRAPH_DEPTH_START_BIT,
- TRACE_GRAPH_DEPTH_END_BIT,
-
- /*
- * To implement set_graph_notrace, if this bit is set, we ignore
- * function graph tracing of called functions, until the return
- * function is called to clear it.
- */
- TRACE_GRAPH_NOTRACE_BIT,
-
- /*
- * When transitioning between context, the preempt_count() may
- * not be correct. Allow for a single recursion to cover this case.
- */
- TRACE_TRANSITION_BIT,
-};
-
-#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0)
-#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0)
-#define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit)))
-
-#define trace_recursion_depth() \
- (((current)->trace_recursion >> TRACE_GRAPH_DEPTH_START_BIT) & 3)
-#define trace_recursion_set_depth(depth) \
- do { \
- current->trace_recursion &= \
- ~(3 << TRACE_GRAPH_DEPTH_START_BIT); \
- current->trace_recursion |= \
- ((depth) & 3) << TRACE_GRAPH_DEPTH_START_BIT; \
- } while (0)
-
-#define TRACE_CONTEXT_BITS 4
-
-#define TRACE_FTRACE_START TRACE_FTRACE_BIT
-#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1)
-
-#define TRACE_LIST_START TRACE_INTERNAL_BIT
-#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1)
-
-#define TRACE_CONTEXT_MASK TRACE_LIST_MAX
-
-static __always_inline int trace_get_context_bit(void)
-{
- int bit;
-
- if (in_interrupt()) {
- if (in_nmi())
- bit = 0;
-
- else if (in_irq())
- bit = 1;
- else
- bit = 2;
- } else
- bit = 3;
-
- return bit;
-}
-
-static __always_inline int trace_test_and_set_recursion(int start, int max)
-{
- unsigned int val = current->trace_recursion;
- int bit;
-
- /* A previous recursion check was made */
- if ((val & TRACE_CONTEXT_MASK) > max)
- return 0;
-
- bit = trace_get_context_bit() + start;
- if (unlikely(val & (1 << bit))) {
- /*
- * It could be that preempt_count has not been updated during
- * a switch between contexts. Allow for a single recursion.
- */
- bit = TRACE_TRANSITION_BIT;
- if (trace_recursion_test(bit))
- return -1;
- trace_recursion_set(bit);
- barrier();
- return bit + 1;
- }
-
- /* Normal check passed, clear the transition to allow it again */
- trace_recursion_clear(TRACE_TRANSITION_BIT);
-
- val |= 1 << bit;
- current->trace_recursion = val;
- barrier();
-
- return bit + 1;
-}
-
-static __always_inline void trace_clear_recursion(int bit)
-{
- unsigned int val = current->trace_recursion;
-
- if (!bit)
- return;
-
- bit--;
- bit = 1 << bit;
- val &= ~bit;
-
- barrier();
- current->trace_recursion = val;
-}
-
static inline struct ring_buffer_iter *
trace_buffer_iter(struct trace_iterator *iter, int cpu)
{
@@ -896,6 +719,8 @@ extern bool ring_buffer_expanded;
extern bool tracing_selftest_disabled;
#ifdef CONFIG_FTRACE_STARTUP_TEST
+extern void __init disable_tracing_selftest(const char *reason);
+
extern int trace_selftest_startup_function(struct tracer *trace,
struct trace_array *tr);
extern int trace_selftest_startup_function_graph(struct tracer *trace,
@@ -919,6 +744,9 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
*/
#define __tracer_data __refdata
#else
+static inline void __init disable_tracing_selftest(const char *reason)
+{
+}
/* Tracers are seldom changed. Optimize when selftests are disabled. */
#define __tracer_data __read_mostly
#endif /* CONFIG_FTRACE_STARTUP_TEST */
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index 2e9a4746ea85..801c2a7f7605 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -31,7 +31,7 @@ static bool ok_to_run;
* it simply writes "START". As the first write is cold cache and
* the rest is hot, we save off that time in bm_first and it is
* reported as "first", which is shown in the second write to the
- * tracepoint. The "first" field is writen within the statics from
+ * tracepoint. The "first" field is written within the statics from
* then on but never changes.
*/
static void trace_do_benchmark(void)
@@ -112,7 +112,7 @@ static void trace_do_benchmark(void)
int i = 0;
/*
* stddev is the square of standard deviation but
- * we want the actualy number. Use the average
+ * we want the actually number. Use the average
* as our seed to find the std.
*
* The next try is:
@@ -155,7 +155,7 @@ static int benchmark_event_kthread(void *arg)
/*
* We don't go to sleep, but let others run as well.
- * This is bascially a "yield()" to let any task that
+ * This is basically a "yield()" to let any task that
* wants to run, schedule in, but if the CPU is idle,
* we'll keep burning cycles.
*
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index c22a152ef0b4..a82f03f385f8 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -344,6 +344,8 @@ static int __init trace_boot_init(void)
trace_boot_init_one_instance(tr, trace_node);
trace_boot_init_instances(trace_node);
+ disable_tracing_selftest("running boot-time tracing");
+
return 0;
}
/*
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index 5fa49cfd2bb6..4f967d5cd917 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -276,7 +276,7 @@ int dynevent_arg_add(struct dynevent_cmd *cmd,
* arguments of the form 'type variable_name;' or 'x+y'.
*
* The lhs argument string will be appended to the current cmd string,
- * followed by an operator, if applicable, followd by the rhs string,
+ * followed by an operator, if applicable, followed by the rhs string,
* followed finally by a separator, if applicable. Before the
* argument is added, the @check_arg function, if present, will be
* used to check the sanity of the current arg strings.
diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h
index d6857a254ede..d6f72dcb7269 100644
--- a/kernel/trace/trace_dynevent.h
+++ b/kernel/trace/trace_dynevent.h
@@ -29,10 +29,10 @@ struct dyn_event;
* @show: Showing method. This is invoked when user reads the event definitions
* via dynamic_events interface.
* @is_busy: Check whether given event is busy so that it can not be deleted.
- * Return true if it is busy, otherwides false.
- * @free: Delete the given event. Return 0 if success, otherwides error.
+ * Return true if it is busy, otherwise false.
+ * @free: Delete the given event. Return 0 if success, otherwise error.
* @match: Check whether given event and system name match this event. The argc
- * and argv is used for exact match. Return true if it matches, otherwides
+ * and argv is used for exact match. Return true if it matches, otherwise
* false.
*
* Except for @create, these methods are called under holding event_mutex.
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 18c4a58aff79..4547ac59da61 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -32,7 +32,7 @@
* to be deciphered for the format file. Although these macros
* may become out of sync with the internal structure, they
* will create a compile error if it happens. Since the
- * internel structures are just tracing helpers, this is not
+ * internal structures are just tracing helpers, this is not
* an issue.
*
* When an internal structure is used, it should use:
@@ -93,10 +93,10 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
F_STRUCT(
__field_struct( struct ftrace_graph_ret, ret )
__field_packed( unsigned long, ret, func )
- __field_packed( unsigned long, ret, overrun )
+ __field_packed( int, ret, depth )
+ __field_packed( unsigned int, ret, overrun )
__field_packed( unsigned long long, ret, calltime)
__field_packed( unsigned long long, ret, rettime )
- __field_packed( int, ret, depth )
),
F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d",
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 643e0b19920d..a71181655958 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -432,17 +432,25 @@ NOKPROBE_SYMBOL(perf_trace_buf_update);
#ifdef CONFIG_FUNCTION_TRACER
static void
perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *ops, struct pt_regs *pt_regs)
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
struct ftrace_entry *entry;
struct perf_event *event;
struct hlist_head head;
struct pt_regs regs;
int rctx;
+ int bit;
+
+ if (!rcu_is_watching())
+ return;
if ((unsigned long)ops->private != smp_processor_id())
return;
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (bit < 0)
+ return;
+
event = container_of(ops, struct perf_event, ftrace_ops);
/*
@@ -463,13 +471,15 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
entry = perf_trace_buf_alloc(ENTRY_SIZE, NULL, &rctx);
if (!entry)
- return;
+ goto out;
entry->ip = ip;
entry->parent_ip = parent_ip;
perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
1, &regs, &head, NULL);
+out:
+ ftrace_test_recursion_unlock(bit);
#undef ENTRY_SIZE
}
@@ -477,7 +487,6 @@ static int perf_ftrace_function_register(struct perf_event *event)
{
struct ftrace_ops *ops = &event->ftrace_ops;
- ops->flags = FTRACE_OPS_FL_RCU;
ops->func = perf_ftrace_function_call;
ops->private = (void *)(unsigned long)nr_cpu_ids;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index adf65b502453..e9d28eeccb7e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2436,7 +2436,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
/*
* Since calls are grouped by systems, the likelyhood that the
* next call in the iteration belongs to the same system as the
- * previous call is high. As an optimization, we skip seaching
+ * previous call is high. As an optimization, we skip searching
* for a map[] that matches the call's system if the last call
* was from the same system. That's what last_i is for. If the
* call has the same system as the previous call, then last_i
@@ -3201,7 +3201,7 @@ static __init int setup_trace_event(char *str)
{
strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
ring_buffer_expanded = true;
- tracing_selftest_disabled = true;
+ disable_tracing_selftest("running event tracing");
return 1;
}
@@ -3271,7 +3271,7 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
*
* When a new instance is created, it needs to set up its events
* directory, as well as other files associated with events. It also
- * creates the event hierachry in the @parent/events directory.
+ * creates the event hierarchy in the @parent/events directory.
*
* Returns 0 on success.
*
@@ -3673,7 +3673,7 @@ static struct trace_event_file event_trace_file __initdata;
static void __init
function_test_events_call(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *pt_regs)
+ struct ftrace_ops *op, struct ftrace_regs *regs)
{
struct trace_buffer *buffer;
struct ring_buffer_event *event;
@@ -3712,7 +3712,6 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip,
static struct ftrace_ops trace_ops __initdata =
{
.func = function_test_events_call,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE,
};
static __init void event_trace_self_test_with_function(void)
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 78a678eeb140..e91259f6a722 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1561,27 +1561,6 @@ static inline void event_clear_filter(struct trace_event_file *file)
RCU_INIT_POINTER(file->filter, NULL);
}
-static inline void
-event_set_no_set_filter_flag(struct trace_event_file *file)
-{
- file->flags |= EVENT_FILE_FL_NO_SET_FILTER;
-}
-
-static inline void
-event_clear_no_set_filter_flag(struct trace_event_file *file)
-{
- file->flags &= ~EVENT_FILE_FL_NO_SET_FILTER;
-}
-
-static inline bool
-event_no_set_filter_flag(struct trace_event_file *file)
-{
- if (file->flags & EVENT_FILE_FL_NO_SET_FILTER)
- return true;
-
- return false;
-}
-
struct filter_list {
struct list_head list;
struct event_filter *filter;
@@ -1950,7 +1929,7 @@ static int __ftrace_function_set_filter(int filter, char *buf, int len,
/*
* The 'ip' field could have multiple filters set, separated
* either by space or comma. We first cut the filter and apply
- * all pieces separatelly.
+ * all pieces separately.
*/
re = ftrace_function_filter_re(buf, len, &re_cnt);
if (!re)
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 96c3f86b81c5..39ebe1826fc3 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -3355,7 +3355,7 @@ trace_action_create_field_var(struct hist_trigger_data *hist_data,
} else {
field_var = NULL;
/*
- * If no explicit system.event is specfied, default to
+ * If no explicit system.event is specified, default to
* looking for fields on the onmatch(system.event.xxx)
* event.
*/
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 881df991742a..5a8bc0b421f1 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -1276,7 +1276,7 @@ static int __create_synth_event(int argc, const char *name, const char **argv)
/**
* synth_event_create - Create a new synthetic event
- * @name: The name of the new sythetic event
+ * @name: The name of the new synthetic event
* @fields: An array of type/name field descriptions
* @n_fields: The number of field descriptions contained in the fields array
* @mod: The module creating the event, NULL if not created from a module
@@ -1446,7 +1446,7 @@ __synth_event_trace_init(struct trace_event_file *file,
* this code to be called, etc). Because this is called
* directly by the user, we don't have that but we still need
* to honor not logging when disabled. For the iterated
- * trace case, we save the enabed state upon start and just
+ * trace case, we save the enabled state upon start and just
* ignore the following data calls.
*/
if (!(file->flags & EVENT_FILE_FL_ENABLED) ||
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 90f81d33fa3f..d960f6b11b5e 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -26,7 +26,7 @@ static int ftrace_event_register(struct trace_event_call *call,
/*
* The FTRACE_ENTRY_REG macro allows ftrace entry to define register
- * function and thus become accesible via perf.
+ * function and thus become accessible via perf.
*/
#undef FTRACE_ENTRY_REG
#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 2c2126e1871d..c5095dd28e20 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -23,10 +23,10 @@ static void tracing_start_function_trace(struct trace_array *tr);
static void tracing_stop_function_trace(struct trace_array *tr);
static void
function_trace_call(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *pt_regs);
+ struct ftrace_ops *op, struct ftrace_regs *fregs);
static void
function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *pt_regs);
+ struct ftrace_ops *op, struct ftrace_regs *fregs);
static struct tracer_flags func_flags;
/* Our option */
@@ -48,7 +48,7 @@ int ftrace_allocate_ftrace_ops(struct trace_array *tr)
/* Currently only the non stack version is supported */
ops->func = function_trace_call;
- ops->flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_PID;
+ ops->flags = FTRACE_OPS_FL_PID;
tr->ops = ops;
ops->private = tr;
@@ -89,7 +89,6 @@ void ftrace_destroy_function_files(struct trace_array *tr)
static int function_trace_init(struct trace_array *tr)
{
ftrace_func_t func;
-
/*
* Instance trace_arrays get their ops allocated
* at instance creation. Unless it failed
@@ -129,7 +128,7 @@ static void function_trace_start(struct trace_array *tr)
static void
function_trace_call(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *pt_regs)
+ struct ftrace_ops *op, struct ftrace_regs *fregs)
{
struct trace_array *tr = op->private;
struct trace_array_cpu *data;
@@ -141,22 +140,20 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
if (unlikely(!tr->function_enabled))
return;
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (bit < 0)
+ return;
+
pc = preempt_count();
preempt_disable_notrace();
- bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX);
- if (bit < 0)
- goto out;
-
cpu = smp_processor_id();
data = per_cpu_ptr(tr->array_buffer.data, cpu);
if (!atomic_read(&data->disabled)) {
local_save_flags(flags);
trace_function(tr, ip, parent_ip, flags, pc);
}
- trace_clear_recursion(bit);
-
- out:
+ ftrace_test_recursion_unlock(bit);
preempt_enable_notrace();
}
@@ -180,7 +177,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
static void
function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *pt_regs)
+ struct ftrace_ops *op, struct ftrace_regs *fregs)
{
struct trace_array *tr = op->private;
struct trace_array_cpu *data;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 60d66278aa0d..d874dec87131 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -957,7 +957,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
/* Overrun */
if (flags & TRACE_GRAPH_PRINT_OVERRUN)
- trace_seq_printf(s, " (Overruns: %lu)\n",
+ trace_seq_printf(s, " (Overruns: %u)\n",
trace->overrun);
print_graph_irq(iter, trace->func, TRACE_GRAPH_RET,
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index d071fc271eef..c0df9b97f147 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -485,11 +485,11 @@ hwlat_width_write(struct file *filp, const char __user *ubuf,
* @ppos: The current position in @file
*
* This function provides a write implementation for the "window" interface
- * to the hardware latency detetector. The window is the total time
+ * to the hardware latency detector. The window is the total time
* in us that will be considered one sample period. Conceptually, windows
* occur back-to-back and contain a sample width period during which
* actual sampling occurs. Can be used to write a new total window size. It
- * is enfoced that any value written must be greater than the sample width
+ * is enforced that any value written must be greater than the sample width
* size, or an error results.
*/
static ssize_t
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 10bbb0f381d5..d06aab4dcbb8 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -138,7 +138,7 @@ static int func_prolog_dec(struct trace_array *tr,
*/
static void
irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *pt_regs)
+ struct ftrace_ops *op, struct ftrace_regs *fregs)
{
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 97c7a7782db7..9c31f42245e9 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -25,11 +25,12 @@
/* Kprobe early definition from command line */
static char kprobe_boot_events_buf[COMMAND_LINE_SIZE] __initdata;
-static bool kprobe_boot_events_enabled __initdata;
static int __init set_kprobe_boot_events(char *str)
{
strlcpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE);
+ disable_tracing_selftest("running kprobe events");
+
return 0;
}
__setup("kprobe_event=", set_kprobe_boot_events);
@@ -1888,8 +1889,6 @@ static __init void setup_boot_kprobe_events(void)
ret = trace_run_command(cmd, create_or_delete_trace_kprobe);
if (ret)
pr_warn("Failed to add event(%d): %s\n", ret, cmd);
- else
- kprobe_boot_events_enabled = true;
cmd = p;
}
@@ -1974,10 +1973,8 @@ static __init int kprobe_trace_self_tests_init(void)
if (tracing_is_disabled())
return -ENODEV;
- if (kprobe_boot_events_enabled) {
- pr_info("Skipping kprobe tests due to kprobe_event on cmdline\n");
+ if (tracing_selftest_disabled)
return 0;
- }
target = kprobe_trace_selftest_target;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 000e9dc224c6..92b1575ae0ca 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -353,8 +353,8 @@ static inline const char *kretprobed(const char *name)
}
#endif /* CONFIG_KRETPROBES */
-static void
-seq_print_sym(struct trace_seq *s, unsigned long address, bool offset)
+void
+trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset)
{
#ifdef CONFIG_KALLSYMS
char str[KSYM_SYMBOL_LEN];
@@ -420,7 +420,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
goto out;
}
- seq_print_sym(s, ip, sym_flags & TRACE_ITER_SYM_OFFSET);
+ trace_seq_print_sym(s, ip, sym_flags & TRACE_ITER_SYM_OFFSET);
if (sym_flags & TRACE_ITER_SYM_ADDR)
trace_seq_printf(s, " <" IP_FMT ">", ip);
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 2f742b74e7e6..4c954636caf0 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -16,6 +16,7 @@ extern int
seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
unsigned long sym_flags);
+extern void trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset);
extern int trace_print_context(struct trace_iterator *iter);
extern int trace_print_lat_context(struct trace_iterator *iter);
diff --git a/kernel/trace/trace_recursion_record.c b/kernel/trace/trace_recursion_record.c
new file mode 100644
index 000000000000..b2edac1fe156
--- /dev/null
+++ b/kernel/trace/trace_recursion_record.c
@@ -0,0 +1,236 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/seq_file.h>
+#include <linux/kallsyms.h>
+#include <linux/module.h>
+#include <linux/ftrace.h>
+#include <linux/fs.h>
+
+#include "trace_output.h"
+
+struct recursed_functions {
+ unsigned long ip;
+ unsigned long parent_ip;
+};
+
+static struct recursed_functions recursed_functions[CONFIG_FTRACE_RECORD_RECURSION_SIZE];
+static atomic_t nr_records;
+
+/*
+ * Cache the last found function. Yes, updates to this is racey, but
+ * so is memory cache ;-)
+ */
+static unsigned long cached_function;
+
+void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip)
+{
+ int index = 0;
+ int i;
+ unsigned long old;
+
+ again:
+ /* First check the last one recorded */
+ if (ip == cached_function)
+ return;
+
+ i = atomic_read(&nr_records);
+ /* nr_records is -1 when clearing records */
+ smp_mb__after_atomic();
+ if (i < 0)
+ return;
+
+ /*
+ * If there's two writers and this writer comes in second,
+ * the cmpxchg() below to update the ip will fail. Then this
+ * writer will try again. It is possible that index will now
+ * be greater than nr_records. This is because the writer
+ * that succeeded has not updated the nr_records yet.
+ * This writer could keep trying again until the other writer
+ * updates nr_records. But if the other writer takes an
+ * interrupt, and that interrupt locks up that CPU, we do
+ * not want this CPU to lock up due to the recursion protection,
+ * and have a bug report showing this CPU as the cause of
+ * locking up the computer. To not lose this record, this
+ * writer will simply use the next position to update the
+ * recursed_functions, and it will update the nr_records
+ * accordingly.
+ */
+ if (index < i)
+ index = i;
+ if (index >= CONFIG_FTRACE_RECORD_RECURSION_SIZE)
+ return;
+
+ for (i = index - 1; i >= 0; i--) {
+ if (recursed_functions[i].ip == ip) {
+ cached_function = ip;
+ return;
+ }
+ }
+
+ cached_function = ip;
+
+ /*
+ * We only want to add a function if it hasn't been added before.
+ * Add to the current location before incrementing the count.
+ * If it fails to add, then increment the index (save in i)
+ * and try again.
+ */
+ old = cmpxchg(&recursed_functions[index].ip, 0, ip);
+ if (old != 0) {
+ /* Did something else already added this for us? */
+ if (old == ip)
+ return;
+ /* Try the next location (use i for the next index) */
+ index++;
+ goto again;
+ }
+
+ recursed_functions[index].parent_ip = parent_ip;
+
+ /*
+ * It's still possible that we could race with the clearing
+ * CPU0 CPU1
+ * ---- ----
+ * ip = func
+ * nr_records = -1;
+ * recursed_functions[0] = 0;
+ * i = -1
+ * if (i < 0)
+ * nr_records = 0;
+ * (new recursion detected)
+ * recursed_functions[0] = func
+ * cmpxchg(recursed_functions[0],
+ * func, 0)
+ *
+ * But the worse that could happen is that we get a zero in
+ * the recursed_functions array, and it's likely that "func" will
+ * be recorded again.
+ */
+ i = atomic_read(&nr_records);
+ smp_mb__after_atomic();
+ if (i < 0)
+ cmpxchg(&recursed_functions[index].ip, ip, 0);
+ else if (i <= index)
+ atomic_cmpxchg(&nr_records, i, index + 1);
+}
+EXPORT_SYMBOL_GPL(ftrace_record_recursion);
+
+static DEFINE_MUTEX(recursed_function_lock);
+static struct trace_seq *tseq;
+
+static void *recursed_function_seq_start(struct seq_file *m, loff_t *pos)
+{
+ void *ret = NULL;
+ int index;
+
+ mutex_lock(&recursed_function_lock);
+ index = atomic_read(&nr_records);
+ if (*pos < index) {
+ ret = &recursed_functions[*pos];
+ }
+
+ tseq = kzalloc(sizeof(*tseq), GFP_KERNEL);
+ if (!tseq)
+ return ERR_PTR(-ENOMEM);
+
+ trace_seq_init(tseq);
+
+ return ret;
+}
+
+static void *recursed_function_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ int index;
+ int p;
+
+ index = atomic_read(&nr_records);
+ p = ++(*pos);
+
+ return p < index ? &recursed_functions[p] : NULL;
+}
+
+static void recursed_function_seq_stop(struct seq_file *m, void *v)
+{
+ kfree(tseq);
+ mutex_unlock(&recursed_function_lock);
+}
+
+static int recursed_function_seq_show(struct seq_file *m, void *v)
+{
+ struct recursed_functions *record = v;
+ int ret = 0;
+
+ if (record) {
+ trace_seq_print_sym(tseq, record->parent_ip, true);
+ trace_seq_puts(tseq, ":\t");
+ trace_seq_print_sym(tseq, record->ip, true);
+ trace_seq_putc(tseq, '\n');
+ ret = trace_print_seq(m, tseq);
+ }
+
+ return ret;
+}
+
+static const struct seq_operations recursed_function_seq_ops = {
+ .start = recursed_function_seq_start,
+ .next = recursed_function_seq_next,
+ .stop = recursed_function_seq_stop,
+ .show = recursed_function_seq_show
+};
+
+static int recursed_function_open(struct inode *inode, struct file *file)
+{
+ int ret = 0;
+
+ mutex_lock(&recursed_function_lock);
+ /* If this file was opened for write, then erase contents */
+ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
+ /* disable updating records */
+ atomic_set(&nr_records, -1);
+ smp_mb__after_atomic();
+ memset(recursed_functions, 0, sizeof(recursed_functions));
+ smp_wmb();
+ /* enable them again */
+ atomic_set(&nr_records, 0);
+ }
+ if (file->f_mode & FMODE_READ)
+ ret = seq_open(file, &recursed_function_seq_ops);
+ mutex_unlock(&recursed_function_lock);
+
+ return ret;
+}
+
+static ssize_t recursed_function_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ return count;
+}
+
+static int recursed_function_release(struct inode *inode, struct file *file)
+{
+ if (file->f_mode & FMODE_READ)
+ seq_release(inode, file);
+ return 0;
+}
+
+static const struct file_operations recursed_functions_fops = {
+ .open = recursed_function_open,
+ .write = recursed_function_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = recursed_function_release,
+};
+
+__init static int create_recursed_functions(void)
+{
+ struct dentry *dentry;
+
+ dentry = trace_create_file("recursed_functions", 0644, NULL, NULL,
+ &recursed_functions_fops);
+ if (!dentry)
+ pr_warn("WARNING: Failed to create recursed_functions\n");
+ return 0;
+}
+
+fs_initcall(create_recursed_functions);
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 97b10bb31a1f..c0181066dbe9 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -212,7 +212,7 @@ static void wakeup_print_header(struct seq_file *s)
*/
static void
wakeup_tracer_call(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *pt_regs)
+ struct ftrace_ops *op, struct ftrace_regs *fregs)
{
struct trace_array *tr = wakeup_trace;
struct trace_array_cpu *data;
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 4738ad48a667..73ef12092250 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -107,7 +107,7 @@ static int trace_selftest_test_probe1_cnt;
static void trace_selftest_test_probe1_func(unsigned long ip,
unsigned long pip,
struct ftrace_ops *op,
- struct pt_regs *pt_regs)
+ struct ftrace_regs *fregs)
{
trace_selftest_test_probe1_cnt++;
}
@@ -116,7 +116,7 @@ static int trace_selftest_test_probe2_cnt;
static void trace_selftest_test_probe2_func(unsigned long ip,
unsigned long pip,
struct ftrace_ops *op,
- struct pt_regs *pt_regs)
+ struct ftrace_regs *fregs)
{
trace_selftest_test_probe2_cnt++;
}
@@ -125,7 +125,7 @@ static int trace_selftest_test_probe3_cnt;
static void trace_selftest_test_probe3_func(unsigned long ip,
unsigned long pip,
struct ftrace_ops *op,
- struct pt_regs *pt_regs)
+ struct ftrace_regs *fregs)
{
trace_selftest_test_probe3_cnt++;
}
@@ -134,7 +134,7 @@ static int trace_selftest_test_global_cnt;
static void trace_selftest_test_global_func(unsigned long ip,
unsigned long pip,
struct ftrace_ops *op,
- struct pt_regs *pt_regs)
+ struct ftrace_regs *fregs)
{
trace_selftest_test_global_cnt++;
}
@@ -143,24 +143,21 @@ static int trace_selftest_test_dyn_cnt;
static void trace_selftest_test_dyn_func(unsigned long ip,
unsigned long pip,
struct ftrace_ops *op,
- struct pt_regs *pt_regs)
+ struct ftrace_regs *fregs)
{
trace_selftest_test_dyn_cnt++;
}
static struct ftrace_ops test_probe1 = {
.func = trace_selftest_test_probe1_func,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE,
};
static struct ftrace_ops test_probe2 = {
.func = trace_selftest_test_probe2_func,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE,
};
static struct ftrace_ops test_probe3 = {
.func = trace_selftest_test_probe3_func,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE,
};
static void print_counts(void)
@@ -417,7 +414,7 @@ static int trace_selftest_recursion_cnt;
static void trace_selftest_test_recursion_func(unsigned long ip,
unsigned long pip,
struct ftrace_ops *op,
- struct pt_regs *pt_regs)
+ struct ftrace_regs *fregs)
{
/*
* This function is registered without the recursion safe flag.
@@ -432,7 +429,7 @@ static void trace_selftest_test_recursion_func(unsigned long ip,
static void trace_selftest_test_recursion_safe_func(unsigned long ip,
unsigned long pip,
struct ftrace_ops *op,
- struct pt_regs *pt_regs)
+ struct ftrace_regs *fregs)
{
/*
* We said we would provide our own recursion. By calling
@@ -448,11 +445,11 @@ static void trace_selftest_test_recursion_safe_func(unsigned long ip,
static struct ftrace_ops test_rec_probe = {
.func = trace_selftest_test_recursion_func,
+ .flags = FTRACE_OPS_FL_RECURSION,
};
static struct ftrace_ops test_recsafe_probe = {
.func = trace_selftest_test_recursion_safe_func,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE,
};
static int
@@ -551,9 +548,11 @@ static enum {
static void trace_selftest_test_regs_func(unsigned long ip,
unsigned long pip,
struct ftrace_ops *op,
- struct pt_regs *pt_regs)
+ struct ftrace_regs *fregs)
{
- if (pt_regs)
+ struct pt_regs *regs = ftrace_get_regs(fregs);
+
+ if (regs)
trace_selftest_regs_stat = TRACE_SELFTEST_REGS_FOUND;
else
trace_selftest_regs_stat = TRACE_SELFTEST_REGS_NOT_FOUND;
@@ -561,7 +560,7 @@ static void trace_selftest_test_regs_func(unsigned long ip,
static struct ftrace_ops test_regs_probe = {
.func = trace_selftest_test_regs_func,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_SAVE_REGS,
+ .flags = FTRACE_OPS_FL_SAVE_REGS,
};
static int
@@ -787,7 +786,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
/* Have we just recovered from a hang? */
if (graph_hang_thresh > GRAPH_MAX_FUNC_TEST) {
- tracing_selftest_disabled = true;
+ disable_tracing_selftest("recovering from a hang");
ret = -1;
goto out;
}
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index c408423e5d65..63c285042051 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -290,7 +290,7 @@ static void check_stack(unsigned long ip, unsigned long *stack)
static void
stack_trace_call(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *pt_regs)
+ struct ftrace_ops *op, struct ftrace_regs *fregs)
{
unsigned long stack;
@@ -318,7 +318,6 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
static struct ftrace_ops trace_ops __read_mostly =
{
.func = stack_trace_call,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE,
};
static ssize_t
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 4b50fc0cb12c..d6bddb157ef2 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -609,7 +609,7 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
* signal that state. There are two user-visible tracing_map
* variables, 'hits' and 'drops', which are updated by this function.
* Every time an element is either successfully inserted or retrieved,
- * the 'hits' value is incrememented. Every time an element insertion
+ * the 'hits' value is incremented. Every time an element insertion
* fails, the 'drops' value is incremented.
*
* This is a lock-free tracing map insertion function implementing a
@@ -642,9 +642,9 @@ struct tracing_map_elt *tracing_map_insert(struct tracing_map *map, void *key)
* tracing_map_elt. This is a lock-free lookup; see
* tracing_map_insert() for details on tracing_map and how it works.
* Every time an element is retrieved, the 'hits' value is
- * incrememented. There is one user-visible tracing_map variable,
+ * incremented. There is one user-visible tracing_map variable,
* 'hits', which is updated by this function. Every time an element
- * is successfully retrieved, the 'hits' value is incrememented. The
+ * is successfully retrieved, the 'hits' value is incremented. The
* 'drops' value is never updated by this function.
*
* Return: the tracing_map_elt pointer val associated with the key.
diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h
index a6de61fc22de..2c765ee2a4d4 100644
--- a/kernel/trace/tracing_map.h
+++ b/kernel/trace/tracing_map.h
@@ -50,7 +50,7 @@ typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b);
* an instance of tracing_map_elt, where 'elt' in the latter part of
* that variable name is short for 'element'. The purpose of a
* tracing_map_elt is to hold values specific to the particular
- * 32-bit hashed key it's assocated with. Things such as the unique
+ * 32-bit hashed key it's associated with. Things such as the unique
* set of aggregated sums associated with the 32-bit hashed key, along
* with a copy of the full key associated with the entry, and which
* was used to produce the 32-bit hashed key.