summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/hashtab.c84
-rw-r--r--kernel/bpf/verifier.c7
-rw-r--r--kernel/configs/android-base.config152
-rw-r--r--kernel/configs/android-recommended.config121
-rw-r--r--kernel/cpuset.c15
-rw-r--r--kernel/events/core.c176
-rw-r--r--kernel/events/uprobes.c5
-rw-r--r--kernel/exit.c2
-rw-r--r--kernel/fork.c4
-rw-r--r--kernel/futex.c23
-rw-r--r--kernel/irq/affinity.c2
-rw-r--r--kernel/irq/chip.c11
-rw-r--r--kernel/irq/manage.c8
-rw-r--r--kernel/irq/msi.c11
-rw-r--r--kernel/jump_label.c61
-rw-r--r--kernel/kexec.c3
-rw-r--r--kernel/kexec_core.c69
-rw-r--r--kernel/ksysfs.c6
-rw-r--r--kernel/livepatch/core.c2
-rw-r--r--kernel/locking/qspinlock_paravirt.h2
-rw-r--r--kernel/locking/qspinlock_stat.h1
-rw-r--r--kernel/module.c122
-rw-r--r--kernel/panic.c13
-rw-r--r--kernel/power/hibernate.c4
-rw-r--r--kernel/power/snapshot.c10
-rw-r--r--kernel/printk/braille.c4
-rw-r--r--kernel/printk/printk.c170
-rw-r--r--kernel/ptrace.c4
-rw-r--r--kernel/relay.c34
-rw-r--r--kernel/sched/core.c19
-rw-r--r--kernel/sched/cpudeadline.c2
-rw-r--r--kernel/sched/cputime.c41
-rw-r--r--kernel/sched/deadline.c5
-rw-r--r--kernel/sched/fair.c2
-rw-r--r--kernel/seccomp.c6
-rw-r--r--kernel/sysctl.c52
-rw-r--r--kernel/task_work.c10
-rw-r--r--kernel/time/timekeeping.c5
-rw-r--r--kernel/time/timekeeping_debug.c9
-rw-r--r--kernel/time/timer.c5
-rw-r--r--kernel/trace/Makefile4
-rw-r--r--kernel/trace/blktrace.c8
-rw-r--r--kernel/trace/trace_events_hist.c14
43 files changed, 1108 insertions, 200 deletions
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index fff3650d52fc..570eeca7bdfa 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -26,11 +26,18 @@ struct bpf_htab {
struct bucket *buckets;
void *elems;
struct pcpu_freelist freelist;
+ void __percpu *extra_elems;
atomic_t count; /* number of elements in this hashtable */
u32 n_buckets; /* number of hash buckets */
u32 elem_size; /* size of each element in bytes */
};
+enum extra_elem_state {
+ HTAB_NOT_AN_EXTRA_ELEM = 0,
+ HTAB_EXTRA_ELEM_FREE,
+ HTAB_EXTRA_ELEM_USED
+};
+
/* each htab element is struct htab_elem + key + value */
struct htab_elem {
union {
@@ -38,7 +45,10 @@ struct htab_elem {
struct bpf_htab *htab;
struct pcpu_freelist_node fnode;
};
- struct rcu_head rcu;
+ union {
+ struct rcu_head rcu;
+ enum extra_elem_state state;
+ };
u32 hash;
char key[0] __aligned(8);
};
@@ -113,6 +123,23 @@ free_elems:
return err;
}
+static int alloc_extra_elems(struct bpf_htab *htab)
+{
+ void __percpu *pptr;
+ int cpu;
+
+ pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN);
+ if (!pptr)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ ((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state =
+ HTAB_EXTRA_ELEM_FREE;
+ }
+ htab->extra_elems = pptr;
+ return 0;
+}
+
/* Called from syscall */
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
@@ -185,6 +212,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
if (percpu)
cost += (u64) round_up(htab->map.value_size, 8) *
num_possible_cpus() * htab->map.max_entries;
+ else
+ cost += (u64) htab->elem_size * num_possible_cpus();
if (cost >= U32_MAX - PAGE_SIZE)
/* make sure page count doesn't overflow */
@@ -212,14 +241,22 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
raw_spin_lock_init(&htab->buckets[i].lock);
}
+ if (!percpu) {
+ err = alloc_extra_elems(htab);
+ if (err)
+ goto free_buckets;
+ }
+
if (!(attr->map_flags & BPF_F_NO_PREALLOC)) {
err = prealloc_elems_and_freelist(htab);
if (err)
- goto free_buckets;
+ goto free_extra_elems;
}
return &htab->map;
+free_extra_elems:
+ free_percpu(htab->extra_elems);
free_buckets:
kvfree(htab->buckets);
free_htab:
@@ -349,7 +386,6 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
kfree(l);
-
}
static void htab_elem_free_rcu(struct rcu_head *head)
@@ -370,6 +406,11 @@ static void htab_elem_free_rcu(struct rcu_head *head)
static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
+ if (l->state == HTAB_EXTRA_ELEM_USED) {
+ l->state = HTAB_EXTRA_ELEM_FREE;
+ return;
+ }
+
if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
pcpu_freelist_push(&htab->freelist, &l->fnode);
} else {
@@ -381,25 +422,44 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
void *value, u32 key_size, u32 hash,
- bool percpu, bool onallcpus)
+ bool percpu, bool onallcpus,
+ bool old_elem_exists)
{
u32 size = htab->map.value_size;
bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
struct htab_elem *l_new;
void __percpu *pptr;
+ int err = 0;
if (prealloc) {
l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist);
if (!l_new)
- return ERR_PTR(-E2BIG);
+ err = -E2BIG;
} else {
if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
atomic_dec(&htab->count);
- return ERR_PTR(-E2BIG);
+ err = -E2BIG;
+ } else {
+ l_new = kmalloc(htab->elem_size,
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (!l_new)
+ return ERR_PTR(-ENOMEM);
}
- l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
- if (!l_new)
- return ERR_PTR(-ENOMEM);
+ }
+
+ if (err) {
+ if (!old_elem_exists)
+ return ERR_PTR(err);
+
+ /* if we're updating the existing element and the hash table
+ * is full, use per-cpu extra elems
+ */
+ l_new = this_cpu_ptr(htab->extra_elems);
+ if (l_new->state != HTAB_EXTRA_ELEM_FREE)
+ return ERR_PTR(-E2BIG);
+ l_new->state = HTAB_EXTRA_ELEM_USED;
+ } else {
+ l_new->state = HTAB_NOT_AN_EXTRA_ELEM;
}
memcpy(l_new->key, key, key_size);
@@ -489,7 +549,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
if (ret)
goto err;
- l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false);
+ l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
+ !!l_old);
if (IS_ERR(l_new)) {
/* all pre-allocated elements are in use or memory exhausted */
ret = PTR_ERR(l_new);
@@ -563,7 +624,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
}
} else {
l_new = alloc_htab_elem(htab, key, value, key_size,
- hash, true, onallcpus);
+ hash, true, onallcpus, false);
if (IS_ERR(l_new)) {
ret = PTR_ERR(l_new);
goto err;
@@ -652,6 +713,7 @@ static void htab_map_free(struct bpf_map *map)
htab_free_elems(htab);
pcpu_freelist_destroy(&htab->freelist);
}
+ free_percpu(htab->extra_elems);
kvfree(htab->buckets);
kfree(htab);
}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f72f23b8fdab..daea765d72e6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -194,6 +194,7 @@ struct verifier_env {
struct verifier_state_list **explored_states; /* search pruning optimization */
struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
u32 used_map_cnt; /* number of used maps */
+ u32 id_gen; /* used to generate unique reg IDs */
bool allow_ptr_leaks;
};
@@ -1052,7 +1053,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
goto error;
break;
case BPF_MAP_TYPE_CGROUP_ARRAY:
- if (func_id != BPF_FUNC_skb_in_cgroup)
+ if (func_id != BPF_FUNC_skb_under_cgroup)
goto error;
break;
default:
@@ -1074,7 +1075,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
goto error;
break;
- case BPF_FUNC_skb_in_cgroup:
+ case BPF_FUNC_skb_under_cgroup:
if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
goto error;
break;
@@ -1301,7 +1302,7 @@ add_imm:
/* dst_reg stays as pkt_ptr type and since some positive
* integer value was added to the pointer, increment its 'id'
*/
- dst_reg->id++;
+ dst_reg->id = ++env->id_gen;
/* something was added to pkt_ptr, set range and off to zero */
dst_reg->off = 0;
diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config
new file mode 100644
index 000000000000..9f748ed7bea8
--- /dev/null
+++ b/kernel/configs/android-base.config
@@ -0,0 +1,152 @@
+# KEEP ALPHABETICALLY SORTED
+# CONFIG_DEVKMEM is not set
+# CONFIG_DEVMEM is not set
+# CONFIG_INET_LRO is not set
+# CONFIG_MODULES is not set
+# CONFIG_OABI_COMPAT is not set
+# CONFIG_SYSVIPC is not set
+CONFIG_ANDROID=y
+CONFIG_ANDROID_BINDER_IPC=y
+CONFIG_ANDROID_LOW_MEMORY_KILLER=y
+CONFIG_ARMV8_DEPRECATED=y
+CONFIG_ASHMEM=y
+CONFIG_AUDIT=y
+CONFIG_BLK_DEV_DM=y
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_DEBUG=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_CP15_BARRIER_EMULATION=y
+CONFIG_DM_CRYPT=y
+CONFIG_DM_VERITY=y
+CONFIG_DM_VERITY_FEC=y
+CONFIG_EMBEDDED=y
+CONFIG_FB=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_INET6_AH=y
+CONFIG_INET6_ESP=y
+CONFIG_INET6_IPCOMP=y
+CONFIG_INET=y
+CONFIG_INET_DIAG_DESTROY=y
+CONFIG_INET_ESP=y
+CONFIG_INET_XFRM_MODE_TUNNEL=y
+CONFIG_IP6_NF_FILTER=y
+CONFIG_IP6_NF_IPTABLES=y
+CONFIG_IP6_NF_MANGLE=y
+CONFIG_IP6_NF_RAW=y
+CONFIG_IP6_NF_TARGET_REJECT=y
+CONFIG_IPV6=y
+CONFIG_IPV6_MIP6=y
+CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_IPV6_OPTIMISTIC_DAD=y
+CONFIG_IPV6_PRIVACY=y
+CONFIG_IPV6_ROUTER_PREF=y
+CONFIG_IPV6_ROUTE_INFO=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_NF_ARPFILTER=y
+CONFIG_IP_NF_ARPTABLES=y
+CONFIG_IP_NF_ARP_MANGLE=y
+CONFIG_IP_NF_FILTER=y
+CONFIG_IP_NF_IPTABLES=y
+CONFIG_IP_NF_MANGLE=y
+CONFIG_IP_NF_MATCH_AH=y
+CONFIG_IP_NF_MATCH_ECN=y
+CONFIG_IP_NF_MATCH_TTL=y
+CONFIG_IP_NF_NAT=y
+CONFIG_IP_NF_RAW=y
+CONFIG_IP_NF_SECURITY=y
+CONFIG_IP_NF_TARGET_MASQUERADE=y
+CONFIG_IP_NF_TARGET_NETMAP=y
+CONFIG_IP_NF_TARGET_REDIRECT=y
+CONFIG_IP_NF_TARGET_REJECT=y
+CONFIG_NET=y
+CONFIG_NETDEVICES=y
+CONFIG_NETFILTER=y
+CONFIG_NETFILTER_TPROXY=y
+CONFIG_NETFILTER_XT_MATCH_COMMENT=y
+CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_CONNMARK=y
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
+CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_HELPER=y
+CONFIG_NETFILTER_XT_MATCH_IPRANGE=y
+CONFIG_NETFILTER_XT_MATCH_LENGTH=y
+CONFIG_NETFILTER_XT_MATCH_LIMIT=y
+CONFIG_NETFILTER_XT_MATCH_MAC=y
+CONFIG_NETFILTER_XT_MATCH_MARK=y
+CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y
+CONFIG_NETFILTER_XT_MATCH_POLICY=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA=y
+CONFIG_NETFILTER_XT_MATCH_SOCKET=y
+CONFIG_NETFILTER_XT_MATCH_STATE=y
+CONFIG_NETFILTER_XT_MATCH_STATISTIC=y
+CONFIG_NETFILTER_XT_MATCH_STRING=y
+CONFIG_NETFILTER_XT_MATCH_TIME=y
+CONFIG_NETFILTER_XT_MATCH_U32=y
+CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y
+CONFIG_NETFILTER_XT_TARGET_CONNMARK=y
+CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
+CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y
+CONFIG_NETFILTER_XT_TARGET_MARK=y
+CONFIG_NETFILTER_XT_TARGET_NFLOG=y
+CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y
+CONFIG_NETFILTER_XT_TARGET_SECMARK=y
+CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
+CONFIG_NETFILTER_XT_TARGET_TPROXY=y
+CONFIG_NETFILTER_XT_TARGET_TRACE=y
+CONFIG_NET_CLS_ACT=y
+CONFIG_NET_CLS_U32=y
+CONFIG_NET_EMATCH=y
+CONFIG_NET_EMATCH_U32=y
+CONFIG_NET_KEY=y
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_HTB=y
+CONFIG_NF_CONNTRACK=y
+CONFIG_NF_CONNTRACK_AMANDA=y
+CONFIG_NF_CONNTRACK_EVENTS=y
+CONFIG_NF_CONNTRACK_FTP=y
+CONFIG_NF_CONNTRACK_H323=y
+CONFIG_NF_CONNTRACK_IPV4=y
+CONFIG_NF_CONNTRACK_IPV6=y
+CONFIG_NF_CONNTRACK_IRC=y
+CONFIG_NF_CONNTRACK_NETBIOS_NS=y
+CONFIG_NF_CONNTRACK_PPTP=y
+CONFIG_NF_CONNTRACK_SANE=y
+CONFIG_NF_CONNTRACK_SECMARK=y
+CONFIG_NF_CONNTRACK_TFTP=y
+CONFIG_NF_CT_NETLINK=y
+CONFIG_NF_CT_PROTO_DCCP=y
+CONFIG_NF_CT_PROTO_SCTP=y
+CONFIG_NF_CT_PROTO_UDPLITE=y
+CONFIG_NF_NAT=y
+CONFIG_NO_HZ=y
+CONFIG_PACKET=y
+CONFIG_PM_AUTOSLEEP=y
+CONFIG_PM_WAKELOCKS=y
+CONFIG_PPP=y
+CONFIG_PPP_BSDCOMP=y
+CONFIG_PPP_DEFLATE=y
+CONFIG_PPP_MPPE=y
+CONFIG_PREEMPT=y
+CONFIG_QUOTA=y
+CONFIG_RTC_CLASS=y
+CONFIG_RT_GROUP_SCHED=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_SELINUX=y
+CONFIG_SETEND_EMULATION=y
+CONFIG_STAGING=y
+CONFIG_SWP_EMULATION=y
+CONFIG_SYNC=y
+CONFIG_TUN=y
+CONFIG_UNIX=y
+CONFIG_USB_GADGET=y
+CONFIG_USB_CONFIGFS=y
+CONFIG_USB_CONFIGFS_F_FS=y
+CONFIG_USB_CONFIGFS_F_MIDI=y
+CONFIG_USB_OTG_WAKELOCK=y
+CONFIG_XFRM_USER=y
diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config
new file mode 100644
index 000000000000..e3b953e966d2
--- /dev/null
+++ b/kernel/configs/android-recommended.config
@@ -0,0 +1,121 @@
+# KEEP ALPHABETICALLY SORTED
+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+# CONFIG_INPUT_MOUSE is not set
+# CONFIG_LEGACY_PTYS is not set
+# CONFIG_NF_CONNTRACK_SIP is not set
+# CONFIG_PM_WAKELOCKS_GC is not set
+# CONFIG_VT is not set
+CONFIG_BACKLIGHT_LCD_SUPPORT=y
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=8192
+CONFIG_COMPACTION=y
+CONFIG_DEBUG_RODATA=y
+CONFIG_DM_UEVENT=y
+CONFIG_DRAGONRISE_FF=y
+CONFIG_ENABLE_DEFAULT_TRACERS=y
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_FS_SECURITY=y
+CONFIG_FUSE_FS=y
+CONFIG_GREENASIA_FF=y
+CONFIG_HIDRAW=y
+CONFIG_HID_A4TECH=y
+CONFIG_HID_ACRUX=y
+CONFIG_HID_ACRUX_FF=y
+CONFIG_HID_APPLE=y
+CONFIG_HID_BELKIN=y
+CONFIG_HID_CHERRY=y
+CONFIG_HID_CHICONY=y
+CONFIG_HID_CYPRESS=y
+CONFIG_HID_DRAGONRISE=y
+CONFIG_HID_ELECOM=y
+CONFIG_HID_EMS_FF=y
+CONFIG_HID_EZKEY=y
+CONFIG_HID_GREENASIA=y
+CONFIG_HID_GYRATION=y
+CONFIG_HID_HOLTEK=y
+CONFIG_HID_KENSINGTON=y
+CONFIG_HID_KEYTOUCH=y
+CONFIG_HID_KYE=y
+CONFIG_HID_LCPOWER=y
+CONFIG_HID_LOGITECH=y
+CONFIG_HID_LOGITECH_DJ=y
+CONFIG_HID_MAGICMOUSE=y
+CONFIG_HID_MICROSOFT=y
+CONFIG_HID_MONTEREY=y
+CONFIG_HID_MULTITOUCH=y
+CONFIG_HID_NTRIG=y
+CONFIG_HID_ORTEK=y
+CONFIG_HID_PANTHERLORD=y
+CONFIG_HID_PETALYNX=y
+CONFIG_HID_PICOLCD=y
+CONFIG_HID_PRIMAX=y
+CONFIG_HID_PRODIKEYS=y
+CONFIG_HID_ROCCAT=y
+CONFIG_HID_SAITEK=y
+CONFIG_HID_SAMSUNG=y
+CONFIG_HID_SMARTJOYPLUS=y
+CONFIG_HID_SONY=y
+CONFIG_HID_SPEEDLINK=y
+CONFIG_HID_SUNPLUS=y
+CONFIG_HID_THRUSTMASTER=y
+CONFIG_HID_TIVO=y
+CONFIG_HID_TOPSEED=y
+CONFIG_HID_TWINHAN=y
+CONFIG_HID_UCLOGIC=y
+CONFIG_HID_WACOM=y
+CONFIG_HID_WALTOP=y
+CONFIG_HID_WIIMOTE=y
+CONFIG_HID_ZEROPLUS=y
+CONFIG_HID_ZYDACRON=y
+CONFIG_INPUT_EVDEV=y
+CONFIG_INPUT_GPIO=y
+CONFIG_INPUT_JOYSTICK=y
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_TABLET=y
+CONFIG_INPUT_UINPUT=y
+CONFIG_ION=y
+CONFIG_JOYSTICK_XPAD=y
+CONFIG_JOYSTICK_XPAD_FF=y
+CONFIG_JOYSTICK_XPAD_LEDS=y
+CONFIG_KALLSYMS_ALL=y
+CONFIG_KSM=y
+CONFIG_LOGIG940_FF=y
+CONFIG_LOGIRUMBLEPAD2_FF=y
+CONFIG_LOGITECH_FF=y
+CONFIG_MD=y
+CONFIG_MEDIA_SUPPORT=y
+CONFIG_MSDOS_FS=y
+CONFIG_PANIC_TIMEOUT=5
+CONFIG_PANTHERLORD_FF=y
+CONFIG_PERF_EVENTS=y
+CONFIG_PM_DEBUG=y
+CONFIG_PM_RUNTIME=y
+CONFIG_PM_WAKELOCKS_LIMIT=0
+CONFIG_POWER_SUPPLY=y
+CONFIG_PSTORE=y
+CONFIG_PSTORE_CONSOLE=y
+CONFIG_PSTORE_RAM=y
+CONFIG_SCHEDSTATS=y
+CONFIG_SMARTJOYPLUS_FF=y
+CONFIG_SND=y
+CONFIG_SOUND=y
+CONFIG_SUSPEND_TIME=y
+CONFIG_TABLET_USB_ACECAD=y
+CONFIG_TABLET_USB_AIPTEK=y
+CONFIG_TABLET_USB_GTCO=y
+CONFIG_TABLET_USB_HANWANG=y
+CONFIG_TABLET_USB_KBTAB=y
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_TASK_IO_ACCOUNTING=y
+CONFIG_TASK_XACCT=y
+CONFIG_TIMER_STATS=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_UHID=y
+CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
+CONFIG_USB_EHCI_HCD=y
+CONFIG_USB_HIDDEV=y
+CONFIG_USB_USBNET=y
+CONFIG_VFAT_FS=y
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index c7fd2778ed50..c27e53326bef 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2069,6 +2069,20 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
mutex_unlock(&cpuset_mutex);
}
+/*
+ * Make sure the new task conform to the current state of its parent,
+ * which could have been changed by cpuset just after it inherits the
+ * state from the parent and before it sits on the cgroup's task list.
+ */
+void cpuset_fork(struct task_struct *task)
+{
+ if (task_css_is_root(task, cpuset_cgrp_id))
+ return;
+
+ set_cpus_allowed_ptr(task, &current->cpus_allowed);
+ task->mems_allowed = current->mems_allowed;
+}
+
struct cgroup_subsys cpuset_cgrp_subsys = {
.css_alloc = cpuset_css_alloc,
.css_online = cpuset_css_online,
@@ -2079,6 +2093,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
.attach = cpuset_attach,
.post_attach = cpuset_post_attach,
.bind = cpuset_bind,
+ .fork = cpuset_fork,
.legacy_cftypes = files,
.early_init = true,
};
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 356a6c7cb52a..3cfabdf7b942 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -242,18 +242,6 @@ unlock:
return ret;
}
-static void event_function_local(struct perf_event *event, event_f func, void *data)
-{
- struct event_function_struct efs = {
- .event = event,
- .func = func,
- .data = data,
- };
-
- int ret = event_function(&efs);
- WARN_ON_ONCE(ret);
-}
-
static void event_function_call(struct perf_event *event, event_f func, void *data)
{
struct perf_event_context *ctx = event->ctx;
@@ -303,6 +291,54 @@ again:
raw_spin_unlock_irq(&ctx->lock);
}
+/*
+ * Similar to event_function_call() + event_function(), but hard assumes IRQs
+ * are already disabled and we're on the right CPU.
+ */
+static void event_function_local(struct perf_event *event, event_f func, void *data)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct task_struct *task = READ_ONCE(ctx->task);
+ struct perf_event_context *task_ctx = NULL;
+
+ WARN_ON_ONCE(!irqs_disabled());
+
+ if (task) {
+ if (task == TASK_TOMBSTONE)
+ return;
+
+ task_ctx = ctx;
+ }
+
+ perf_ctx_lock(cpuctx, task_ctx);
+
+ task = ctx->task;
+ if (task == TASK_TOMBSTONE)
+ goto unlock;
+
+ if (task) {
+ /*
+ * We must be either inactive or active and the right task,
+ * otherwise we're screwed, since we cannot IPI to somewhere
+ * else.
+ */
+ if (ctx->is_active) {
+ if (WARN_ON_ONCE(task != current))
+ goto unlock;
+
+ if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
+ goto unlock;
+ }
+ } else {
+ WARN_ON_ONCE(&cpuctx->ctx != ctx);
+ }
+
+ func(event, cpuctx, ctx, data);
+unlock:
+ perf_ctx_unlock(cpuctx, task_ctx);
+}
+
#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
PERF_FLAG_FD_OUTPUT |\
PERF_FLAG_PID_CGROUP |\
@@ -448,7 +484,7 @@ static u64 __report_allowed;
static void perf_duration_warn(struct irq_work *w)
{
- printk_ratelimited(KERN_WARNING
+ printk_ratelimited(KERN_INFO
"perf: interrupt took too long (%lld > %lld), lowering "
"kernel.perf_event_max_sample_rate to %d\n",
__report_avg, __report_allowed,
@@ -843,6 +879,32 @@ perf_cgroup_mark_enabled(struct perf_event *event,
}
}
}
+
+/*
+ * Update cpuctx->cgrp so that it is set when first cgroup event is added and
+ * cleared when last cgroup event is removed.
+ */
+static inline void
+list_update_cgroup_event(struct perf_event *event,
+ struct perf_event_context *ctx, bool add)
+{
+ struct perf_cpu_context *cpuctx;
+
+ if (!is_cgroup_event(event))
+ return;
+
+ if (add && ctx->nr_cgroups++)
+ return;
+ else if (!add && --ctx->nr_cgroups)
+ return;
+ /*
+ * Because cgroup events are always per-cpu events,
+ * this will always be called from the right CPU.
+ */
+ cpuctx = __get_cpu_context(ctx);
+ cpuctx->cgrp = add ? event->cgrp : NULL;
+}
+
#else /* !CONFIG_CGROUP_PERF */
static inline bool
@@ -920,6 +982,13 @@ perf_cgroup_mark_enabled(struct perf_event *event,
struct perf_event_context *ctx)
{
}
+
+static inline void
+list_update_cgroup_event(struct perf_event *event,
+ struct perf_event_context *ctx, bool add)
+{
+}
+
#endif
/*
@@ -1392,6 +1461,7 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
+
lockdep_assert_held(&ctx->lock);
WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
@@ -1412,8 +1482,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
list_add_tail(&event->group_entry, list);
}
- if (is_cgroup_event(event))
- ctx->nr_cgroups++;
+ list_update_cgroup_event(event, ctx, true);
list_add_rcu(&event->event_entry, &ctx->event_list);
ctx->nr_events++;
@@ -1581,8 +1650,6 @@ static void perf_group_attach(struct perf_event *event)
static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
- struct perf_cpu_context *cpuctx;
-
WARN_ON_ONCE(event->ctx != ctx);
lockdep_assert_held(&ctx->lock);
@@ -1594,20 +1661,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
event->attach_state &= ~PERF_ATTACH_CONTEXT;
- if (is_cgroup_event(event)) {
- ctx->nr_cgroups--;
- /*
- * Because cgroup events are always per-cpu events, this will
- * always be called from the right CPU.
- */
- cpuctx = __get_cpu_context(ctx);
- /*
- * If there are no more cgroup events then clear cgrp to avoid
- * stale pointer in update_cgrp_time_from_cpuctx().
- */
- if (!ctx->nr_cgroups)
- cpuctx->cgrp = NULL;
- }
+ list_update_cgroup_event(event, ctx, false);
ctx->nr_events--;
if (event->attr.inherit_stat)
@@ -1716,8 +1770,8 @@ static inline int pmu_filter_match(struct perf_event *event)
static inline int
event_filter_match(struct perf_event *event)
{
- return (event->cpu == -1 || event->cpu == smp_processor_id())
- && perf_cgroup_match(event) && pmu_filter_match(event);
+ return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
+ perf_cgroup_match(event) && pmu_filter_match(event);
}
static void
@@ -1737,8 +1791,8 @@ event_sched_out(struct perf_event *event,
* maintained, otherwise bogus information is return
* via read() for time_enabled, time_running:
*/
- if (event->state == PERF_EVENT_STATE_INACTIVE
- && !event_filter_match(event)) {
+ if (event->state == PERF_EVENT_STATE_INACTIVE &&
+ !event_filter_match(event)) {
delta = tstamp - event->tstamp_stopped;
event->tstamp_running += delta;
event->tstamp_stopped = tstamp;
@@ -2236,10 +2290,15 @@ perf_install_in_context(struct perf_event_context *ctx,
lockdep_assert_held(&ctx->mutex);
- event->ctx = ctx;
if (event->cpu != -1)
event->cpu = cpu;
+ /*
+ * Ensures that if we can observe event->ctx, both the event and ctx
+ * will be 'complete'. See perf_iterate_sb_cpu().
+ */
+ smp_store_release(&event->ctx, ctx);
+
if (!task) {
cpu_function_call(cpu, __perf_install_in_context, event);
return;
@@ -3490,9 +3549,10 @@ static int perf_event_read(struct perf_event *event, bool group)
.group = group,
.ret = 0,
};
- smp_call_function_single(event->oncpu,
- __perf_event_read, &data, 1);
- ret = data.ret;
+ ret = smp_call_function_single(event->oncpu, __perf_event_read, &data, 1);
+ /* The event must have been read from an online CPU: */
+ WARN_ON_ONCE(ret);
+ ret = ret ? : data.ret;
} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
struct perf_event_context *ctx = event->ctx;
unsigned long flags;
@@ -5969,6 +6029,14 @@ static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
struct perf_event *event;
list_for_each_entry_rcu(event, &pel->list, sb_list) {
+ /*
+ * Skip events that are not fully formed yet; ensure that
+ * if we observe event->ctx, both event and ctx will be
+ * complete enough. See perf_install_in_context().
+ */
+ if (!smp_load_acquire(&event->ctx))
+ continue;
+
if (event->state < PERF_EVENT_STATE_INACTIVE)
continue;
if (!event_filter_match(event))
@@ -6098,7 +6166,7 @@ static int __perf_pmu_output_stop(void *info)
{
struct perf_event *event = info;
struct pmu *pmu = event->pmu;
- struct perf_cpu_context *cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
struct remote_output ro = {
.rb = event->rb,
};
@@ -6553,15 +6621,6 @@ got_name:
}
/*
- * Whether this @filter depends on a dynamic object which is not loaded
- * yet or its load addresses are not known.
- */
-static bool perf_addr_filter_needs_mmap(struct perf_addr_filter *filter)
-{
- return filter->filter && filter->inode;
-}
-
-/*
* Check whether inode and address range match filter criteria.
*/
static bool perf_addr_filter_match(struct perf_addr_filter *filter,
@@ -6622,6 +6681,13 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma)
struct perf_event_context *ctx;
int ctxn;
+ /*
+ * Data tracing isn't supported yet and as such there is no need
+ * to keep track of anything that isn't related to executable code:
+ */
+ if (!(vma->vm_flags & VM_EXEC))
+ return;
+
rcu_read_lock();
for_each_task_context_nr(ctxn) {
ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
@@ -7774,7 +7840,11 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
list_for_each_entry(filter, &ifh->list, entry) {
event->addr_filters_offs[count] = 0;
- if (perf_addr_filter_needs_mmap(filter))
+ /*
+ * Adjust base offset if the filter is associated to a binary
+ * that needs to be mapped:
+ */
+ if (filter->inode)
event->addr_filters_offs[count] =
perf_addr_filter_apply(filter, mm);
@@ -7905,8 +7975,10 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
goto fail;
}
- if (token == IF_SRC_FILE) {
- filename = match_strdup(&args[2]);
+ if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
+ int fpos = filter->range ? 2 : 1;
+
+ filename = match_strdup(&args[fpos]);
if (!filename) {
ret = -ENOMEM;
goto fail;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index b7a525ab2083..8c50276b60d1 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -172,8 +172,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
err = -EAGAIN;
ptep = page_check_address(page, mm, addr, &ptl, 0);
- if (!ptep)
+ if (!ptep) {
+ mem_cgroup_cancel_charge(kpage, memcg, false);
goto unlock;
+ }
get_page(kpage);
page_add_new_anon_rmap(kpage, vma, addr, false);
@@ -200,7 +202,6 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
err = 0;
unlock:
- mem_cgroup_cancel_charge(kpage, memcg, false);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
unlock_page(page);
return err;
diff --git a/kernel/exit.c b/kernel/exit.c
index 84ae830234f8..2f974ae042a6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -715,7 +715,7 @@ static void check_stack_usage(void)
spin_lock(&low_water_lock);
if (free < lowest_to_date) {
- pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n",
+ pr_info("%s (%d) used greatest stack depth: %lu bytes left\n",
current->comm, task_pid_nr(current), free);
lowest_to_date = free;
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 52e725d4a866..aaf782327bf3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1404,7 +1404,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->real_start_time = ktime_get_boot_ns();
p->io_context = NULL;
p->audit_context = NULL;
- threadgroup_change_begin(current);
cgroup_fork(p);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
@@ -1556,6 +1555,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL;
+ threadgroup_change_begin(current);
/*
* Ensure that the cgroup subsystem policies allow the new process to be
* forked. It should be noted the the new process's css_set can be changed
@@ -1656,6 +1656,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
bad_fork_cancel_cgroup:
cgroup_cancel_fork(p);
bad_fork_free_pid:
+ threadgroup_change_end(current);
if (pid != &init_struct_pid)
free_pid(pid);
bad_fork_cleanup_thread:
@@ -1688,7 +1689,6 @@ bad_fork_cleanup_policy:
mpol_put(p->mempolicy);
bad_fork_cleanup_threadgroup_lock:
#endif
- threadgroup_change_end(current);
delayacct_tsk_free(p);
bad_fork_cleanup_count:
atomic_dec(&p->cred->user->processes);
diff --git a/kernel/futex.c b/kernel/futex.c
index 33664f70e2d2..46cb3a301bc1 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -179,7 +179,15 @@ int __read_mostly futex_cmpxchg_enabled;
* Futex flags used to encode options to functions and preserve them across
* restarts.
*/
-#define FLAGS_SHARED 0x01
+#ifdef CONFIG_MMU
+# define FLAGS_SHARED 0x01
+#else
+/*
+ * NOMMU does not have per process address space. Let the compiler optimize
+ * code away.
+ */
+# define FLAGS_SHARED 0x00
+#endif
#define FLAGS_CLOCKRT 0x02
#define FLAGS_HAS_TIMEOUT 0x04
@@ -405,6 +413,16 @@ static void get_futex_key_refs(union futex_key *key)
if (!key->both.ptr)
return;
+ /*
+ * On MMU less systems futexes are always "private" as there is no per
+ * process address space. We need the smp wmb nevertheless - yes,
+ * arch/blackfin has MMU less SMP ...
+ */
+ if (!IS_ENABLED(CONFIG_MMU)) {
+ smp_mb(); /* explicit smp_mb(); (B) */
+ return;
+ }
+
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
case FUT_OFF_INODE:
ihold(key->shared.inode); /* implies smp_mb(); (B) */
@@ -436,6 +454,9 @@ static void drop_futex_key_refs(union futex_key *key)
return;
}
+ if (!IS_ENABLED(CONFIG_MMU))
+ return;
+
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
case FUT_OFF_INODE:
iput(key->shared.inode);
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index f68959341c0f..32f6cfcff212 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -39,6 +39,7 @@ struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs)
return NULL;
}
+ get_online_cpus();
if (max_vecs >= num_online_cpus()) {
cpumask_copy(affinity_mask, cpu_online_mask);
*nr_vecs = num_online_cpus();
@@ -56,6 +57,7 @@ struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs)
}
*nr_vecs = vecs;
}
+ put_online_cpus();
return affinity_mask;
}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index b4c1bc7c9ca2..637389088b3f 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -820,6 +820,17 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
desc->name = name;
if (handle != handle_bad_irq && is_chained) {
+ /*
+ * We're about to start this interrupt immediately,
+ * hence the need to set the trigger configuration.
+ * But the .set_type callback may have overridden the
+ * flow handler, ignoring that we're dealing with a
+ * chained interrupt. Reset it immediately because we
+ * do know better.
+ */
+ __irq_set_trigger(desc, irqd_get_trigger_type(&desc->irq_data));
+ desc->handle_irq = handle;
+
irq_settings_set_noprobe(desc);
irq_settings_set_norequest(desc);
irq_settings_set_nothread(desc);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 73a2b786b5e9..9530fcd27704 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1681,8 +1681,10 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
action->dev_id = dev_id;
retval = irq_chip_pm_get(&desc->irq_data);
- if (retval < 0)
+ if (retval < 0) {
+ kfree(action);
return retval;
+ }
chip_bus_lock(desc);
retval = __setup_irq(irq, desc, action);
@@ -1985,8 +1987,10 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
action->percpu_dev_id = dev_id;
retval = irq_chip_pm_get(&desc->irq_data);
- if (retval < 0)
+ if (retval < 0) {
+ kfree(action);
return retval;
+ }
chip_bus_lock(desc);
retval = __setup_irq(irq, desc, action);
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 54999350162c..19e9dfbe97fa 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -359,6 +359,17 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
else
dev_dbg(dev, "irq [%d-%d] for MSI\n",
virq, virq + desc->nvec_used - 1);
+ /*
+ * This flag is set by the PCI layer as we need to activate
+ * the MSI entries before the PCI layer enables MSI in the
+ * card. Otherwise the card latches a random msi message.
+ */
+ if (info->flags & MSI_FLAG_ACTIVATE_EARLY) {
+ struct irq_data *irq_data;
+
+ irq_data = irq_domain_get_irq_data(domain, desc->irq);
+ irq_domain_activate_irq(irq_data);
+ }
}
return 0;
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 0dbea887d625..93ad6c1fb9b6 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -14,6 +14,7 @@
#include <linux/err.h>
#include <linux/static_key.h>
#include <linux/jump_label_ratelimit.h>
+#include <linux/bug.h>
#ifdef HAVE_JUMP_LABEL
@@ -56,6 +57,49 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
static void jump_label_update(struct static_key *key);
+/*
+ * There are similar definitions for the !HAVE_JUMP_LABEL case in jump_label.h.
+ * The use of 'atomic_read()' requires atomic.h and its problematic for some
+ * kernel headers such as kernel.h and others. Since static_key_count() is not
+ * used in the branch statements as it is for the !HAVE_JUMP_LABEL case its ok
+ * to have it be a function here. Similarly, for 'static_key_enable()' and
+ * 'static_key_disable()', which require bug.h. This should allow jump_label.h
+ * to be included from most/all places for HAVE_JUMP_LABEL.
+ */
+int static_key_count(struct static_key *key)
+{
+ /*
+ * -1 means the first static_key_slow_inc() is in progress.
+ * static_key_enabled() must return true, so return 1 here.
+ */
+ int n = atomic_read(&key->enabled);
+
+ return n >= 0 ? n : 1;
+}
+EXPORT_SYMBOL_GPL(static_key_count);
+
+void static_key_enable(struct static_key *key)
+{
+ int count = static_key_count(key);
+
+ WARN_ON_ONCE(count < 0 || count > 1);
+
+ if (!count)
+ static_key_slow_inc(key);
+}
+EXPORT_SYMBOL_GPL(static_key_enable);
+
+void static_key_disable(struct static_key *key)
+{
+ int count = static_key_count(key);
+
+ WARN_ON_ONCE(count < 0 || count > 1);
+
+ if (count)
+ static_key_slow_dec(key);
+}
+EXPORT_SYMBOL_GPL(static_key_disable);
+
void static_key_slow_inc(struct static_key *key)
{
int v, v1;
@@ -235,6 +279,18 @@ void __init jump_label_init(void)
struct static_key *key = NULL;
struct jump_entry *iter;
+ /*
+ * Since we are initializing the static_key.enabled field with
+ * with the 'raw' int values (to avoid pulling in atomic.h) in
+ * jump_label.h, let's make sure that is safe. There are only two
+ * cases to check since we initialize to 0 or 1.
+ */
+ BUILD_BUG_ON((int)ATOMIC_INIT(0) != 0);
+ BUILD_BUG_ON((int)ATOMIC_INIT(1) != 1);
+
+ if (static_key_initialized)
+ return;
+
jump_label_lock();
jump_label_sort_entries(iter_start, iter_stop);
@@ -284,11 +340,14 @@ static int __jump_label_mod_text_reserved(void *start, void *end)
{
struct module *mod;
+ preempt_disable();
mod = __module_text_address((unsigned long)start);
+ WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
+ preempt_enable();
+
if (!mod)
return 0;
- WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
return __jump_label_text_reserved(mod->jump_entries,
mod->jump_entries + mod->num_jump_entries,
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 4384672d3245..980936a90ee6 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -48,7 +48,8 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
if (kexec_on_panic) {
/* Verify we have a valid entry point */
- if ((entry < crashk_res.start) || (entry > crashk_res.end))
+ if ((entry < phys_to_boot_phys(crashk_res.start)) ||
+ (entry > phys_to_boot_phys(crashk_res.end)))
return -EADDRNOTAVAIL;
}
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 56b3ed0927b0..561675589511 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -95,6 +95,12 @@ int kexec_should_crash(struct task_struct *p)
return 0;
}
+int kexec_crash_loaded(void)
+{
+ return !!kexec_crash_image;
+}
+EXPORT_SYMBOL_GPL(kexec_crash_loaded);
+
/*
* When kexec transitions to the new kernel there is a one-to-one
* mapping between physical and virtual addresses. On processors
@@ -140,6 +146,7 @@ int kexec_should_crash(struct task_struct *p)
* allocating pages whose destination address we do not care about.
*/
#define KIMAGE_NO_DEST (-1UL)
+#define PAGE_COUNT(x) (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
static struct page *kimage_alloc_page(struct kimage *image,
gfp_t gfp_mask,
@@ -147,8 +154,9 @@ static struct page *kimage_alloc_page(struct kimage *image,
int sanity_check_segment_list(struct kimage *image)
{
- int result, i;
+ int i;
unsigned long nr_segments = image->nr_segments;
+ unsigned long total_pages = 0;
/*
* Verify we have good destination addresses. The caller is
@@ -163,16 +171,17 @@ int sanity_check_segment_list(struct kimage *image)
* simply because addresses are changed to page size
* granularity.
*/
- result = -EADDRNOTAVAIL;
for (i = 0; i < nr_segments; i++) {
unsigned long mstart, mend;
mstart = image->segment[i].mem;
mend = mstart + image->segment[i].memsz;
+ if (mstart > mend)
+ return -EADDRNOTAVAIL;
if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
- return result;
+ return -EADDRNOTAVAIL;
if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
- return result;
+ return -EADDRNOTAVAIL;
}
/* Verify our destination addresses do not overlap.
@@ -180,7 +189,6 @@ int sanity_check_segment_list(struct kimage *image)
* through very weird things can happen with no
* easy explanation as one segment stops on another.
*/
- result = -EINVAL;
for (i = 0; i < nr_segments; i++) {
unsigned long mstart, mend;
unsigned long j;
@@ -194,7 +202,7 @@ int sanity_check_segment_list(struct kimage *image)
pend = pstart + image->segment[j].memsz;
/* Do the segments overlap ? */
if ((mend > pstart) && (mstart < pend))
- return result;
+ return -EINVAL;
}
}
@@ -203,12 +211,26 @@ int sanity_check_segment_list(struct kimage *image)
* and it is easier to check up front than to be surprised
* later on.
*/
- result = -EINVAL;
for (i = 0; i < nr_segments; i++) {
if (image->segment[i].bufsz > image->segment[i].memsz)
- return result;
+ return -EINVAL;
+ }
+
+ /*
+ * Verify that no more than half of memory will be consumed. If the
+ * request from userspace is too large, a large amount of time will be
+ * wasted allocating pages, which can cause a soft lockup.
+ */
+ for (i = 0; i < nr_segments; i++) {
+ if (PAGE_COUNT(image->segment[i].memsz) > totalram_pages / 2)
+ return -EINVAL;
+
+ total_pages += PAGE_COUNT(image->segment[i].memsz);
}
+ if (total_pages > totalram_pages / 2)
+ return -EINVAL;
+
/*
* Verify we have good destination addresses. Normally
* the caller is responsible for making certain we don't
@@ -220,16 +242,15 @@ int sanity_check_segment_list(struct kimage *image)
*/
if (image->type == KEXEC_TYPE_CRASH) {
- result = -EADDRNOTAVAIL;
for (i = 0; i < nr_segments; i++) {
unsigned long mstart, mend;
mstart = image->segment[i].mem;
mend = mstart + image->segment[i].memsz - 1;
/* Ensure we are within the crash kernel limits */
- if ((mstart < crashk_res.start) ||
- (mend > crashk_res.end))
- return result;
+ if ((mstart < phys_to_boot_phys(crashk_res.start)) ||
+ (mend > phys_to_boot_phys(crashk_res.end)))
+ return -EADDRNOTAVAIL;
}
}
@@ -352,7 +373,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
if (!pages)
break;
- pfn = page_to_pfn(pages);
+ pfn = page_to_boot_pfn(pages);
epfn = pfn + count;
addr = pfn << PAGE_SHIFT;
eaddr = epfn << PAGE_SHIFT;
@@ -478,7 +499,7 @@ static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
return -ENOMEM;
ind_page = page_address(page);
- *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+ *image->entry = virt_to_boot_phys(ind_page) | IND_INDIRECTION;
image->entry = ind_page;
image->last_entry = ind_page +
((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
@@ -533,13 +554,13 @@ void kimage_terminate(struct kimage *image)
#define for_each_kimage_entry(image, ptr, entry) \
for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
ptr = (entry & IND_INDIRECTION) ? \
- phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
+ boot_phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
static void kimage_free_entry(kimage_entry_t entry)
{
struct page *page;
- page = pfn_to_page(entry >> PAGE_SHIFT);
+ page = boot_pfn_to_page(entry >> PAGE_SHIFT);
kimage_free_pages(page);
}
@@ -633,7 +654,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
* have a match.
*/
list_for_each_entry(page, &image->dest_pages, lru) {
- addr = page_to_pfn(page) << PAGE_SHIFT;
+ addr = page_to_boot_pfn(page) << PAGE_SHIFT;
if (addr == destination) {
list_del(&page->lru);
return page;
@@ -648,12 +669,12 @@ static struct page *kimage_alloc_page(struct kimage *image,
if (!page)
return NULL;
/* If the page cannot be used file it away */
- if (page_to_pfn(page) >
+ if (page_to_boot_pfn(page) >
(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
list_add(&page->lru, &image->unusable_pages);
continue;
}
- addr = page_to_pfn(page) << PAGE_SHIFT;
+ addr = page_to_boot_pfn(page) << PAGE_SHIFT;
/* If it is the destination page we want use it */
if (addr == destination)
@@ -676,7 +697,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
struct page *old_page;
old_addr = *old & PAGE_MASK;
- old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+ old_page = boot_pfn_to_page(old_addr >> PAGE_SHIFT);
copy_highpage(page, old_page);
*old = addr | (*old & ~PAGE_MASK);
@@ -732,7 +753,7 @@ static int kimage_load_normal_segment(struct kimage *image,
result = -ENOMEM;
goto out;
}
- result = kimage_add_page(image, page_to_pfn(page)
+ result = kimage_add_page(image, page_to_boot_pfn(page)
<< PAGE_SHIFT);
if (result < 0)
goto out;
@@ -793,7 +814,7 @@ static int kimage_load_crash_segment(struct kimage *image,
char *ptr;
size_t uchunk, mchunk;
- page = pfn_to_page(maddr >> PAGE_SHIFT);
+ page = boot_pfn_to_page(maddr >> PAGE_SHIFT);
if (!page) {
result = -ENOMEM;
goto out;
@@ -921,7 +942,7 @@ void __weak crash_free_reserved_phys_range(unsigned long begin,
unsigned long addr;
for (addr = begin; addr < end; addr += PAGE_SIZE)
- free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
+ free_reserved_page(boot_pfn_to_page(addr >> PAGE_SHIFT));
}
int crash_shrink_memory(unsigned long new_size)
@@ -1374,7 +1395,7 @@ void vmcoreinfo_append_str(const char *fmt, ...)
void __weak arch_crash_save_vmcoreinfo(void)
{}
-unsigned long __weak paddr_vmcoreinfo_note(void)
+phys_addr_t __weak paddr_vmcoreinfo_note(void)
{
return __pa((unsigned long)(char *)&vmcoreinfo_note);
}
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 152da4a48867..ee1bc1bb8feb 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -101,7 +101,7 @@ KERNEL_ATTR_RO(kexec_loaded);
static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", !!kexec_crash_image);
+ return sprintf(buf, "%d\n", kexec_crash_loaded());
}
KERNEL_ATTR_RO(kexec_crash_loaded);
@@ -128,8 +128,8 @@ KERNEL_ATTR_RW(kexec_crash_size);
static ssize_t vmcoreinfo_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%lx %x\n",
- paddr_vmcoreinfo_note(),
+ phys_addr_t vmcore_base = paddr_vmcoreinfo_note();
+ return sprintf(buf, "%pa %x\n", &vmcore_base,
(unsigned int)sizeof(vmcoreinfo_note));
}
KERNEL_ATTR_RO(vmcoreinfo);
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 5c2bc1052691..8bbe50704621 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -309,7 +309,7 @@ static int klp_write_object_relocations(struct module *pmod,
break;
}
- module_enable_ro(pmod);
+ module_enable_ro(pmod, true);
return ret;
}
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 37649e69056c..8a99abf58080 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -450,7 +450,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
goto gotlock;
}
}
- WRITE_ONCE(pn->state, vcpu_halted);
+ WRITE_ONCE(pn->state, vcpu_hashed);
qstat_inc(qstat_pv_wait_head, true);
qstat_inc(qstat_pv_wait_again, waitcnt);
pv_wait(&l->locked, _Q_SLOW_VAL);
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index 22e025309845..b9d031516254 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -153,7 +153,6 @@ static ssize_t qstat_read(struct file *file, char __user *user_buf,
*/
if ((counter == qstat_pv_latency_kick) ||
(counter == qstat_pv_latency_wake)) {
- stat = 0;
if (kicks)
stat = DIV_ROUND_CLOSEST_ULL(stat, kicks);
}
diff --git a/kernel/module.c b/kernel/module.c
index 5f71aa63ed2a..529efae9f481 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -60,6 +60,7 @@
#include <linux/jump_label.h>
#include <linux/pfn.h>
#include <linux/bsearch.h>
+#include <linux/dynamic_debug.h>
#include <uapi/linux/module.h>
#include "module-internal.h"
@@ -264,7 +265,7 @@ static void module_assert_mutex_or_preempt(void)
if (unlikely(!debug_locks))
return;
- WARN_ON(!rcu_read_lock_sched_held() &&
+ WARN_ON_ONCE(!rcu_read_lock_sched_held() &&
!lockdep_is_held(&module_mutex));
#endif
}
@@ -336,7 +337,7 @@ static inline void add_taint_module(struct module *mod, unsigned flag,
* A thread that wants to hold a reference to a module only while it
* is running can call this to safely exit. nfsd and lockd use this.
*/
-void __module_put_and_exit(struct module *mod, long code)
+void __noreturn __module_put_and_exit(struct module *mod, long code)
{
module_put(mod);
do_exit(code);
@@ -1693,8 +1694,7 @@ static int module_add_modinfo_attrs(struct module *mod)
temp_attr = mod->modinfo_attrs;
for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) {
- if (!attr->test ||
- (attr->test && attr->test(mod))) {
+ if (!attr->test || attr->test(mod)) {
memcpy(temp_attr, attr, sizeof(*temp_attr));
sysfs_attr_init(&temp_attr->attr);
error = sysfs_create_file(&mod->mkobj.kobj,
@@ -1858,10 +1858,11 @@ static void mod_sysfs_teardown(struct module *mod)
* from modification and any data from execution.
*
* General layout of module is:
- * [text] [read-only-data] [writable data]
- * text_size -----^ ^ ^
- * ro_size ------------------------| |
- * size -------------------------------------------|
+ * [text] [read-only-data] [ro-after-init] [writable data]
+ * text_size -----^ ^ ^ ^
+ * ro_size ------------------------| | |
+ * ro_after_init_size -----------------------------| |
+ * size -----------------------------------------------------------|
*
* These values are always page-aligned (as is base)
*/
@@ -1884,14 +1885,24 @@ static void frob_rodata(const struct module_layout *layout,
(layout->ro_size - layout->text_size) >> PAGE_SHIFT);
}
+static void frob_ro_after_init(const struct module_layout *layout,
+ int (*set_memory)(unsigned long start, int num_pages))
+{
+ BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
+ BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
+ BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1));
+ set_memory((unsigned long)layout->base + layout->ro_size,
+ (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT);
+}
+
static void frob_writable_data(const struct module_layout *layout,
int (*set_memory)(unsigned long start, int num_pages))
{
BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
- BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
+ BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1));
BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1));
- set_memory((unsigned long)layout->base + layout->ro_size,
- (layout->size - layout->ro_size) >> PAGE_SHIFT);
+ set_memory((unsigned long)layout->base + layout->ro_after_init_size,
+ (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT);
}
/* livepatching wants to disable read-only so it can frob module. */
@@ -1899,21 +1910,26 @@ void module_disable_ro(const struct module *mod)
{
frob_text(&mod->core_layout, set_memory_rw);
frob_rodata(&mod->core_layout, set_memory_rw);
+ frob_ro_after_init(&mod->core_layout, set_memory_rw);
frob_text(&mod->init_layout, set_memory_rw);
frob_rodata(&mod->init_layout, set_memory_rw);
}
-void module_enable_ro(const struct module *mod)
+void module_enable_ro(const struct module *mod, bool after_init)
{
frob_text(&mod->core_layout, set_memory_ro);
frob_rodata(&mod->core_layout, set_memory_ro);
frob_text(&mod->init_layout, set_memory_ro);
frob_rodata(&mod->init_layout, set_memory_ro);
+
+ if (after_init)
+ frob_ro_after_init(&mod->core_layout, set_memory_ro);
}
static void module_enable_nx(const struct module *mod)
{
frob_rodata(&mod->core_layout, set_memory_nx);
+ frob_ro_after_init(&mod->core_layout, set_memory_nx);
frob_writable_data(&mod->core_layout, set_memory_nx);
frob_rodata(&mod->init_layout, set_memory_nx);
frob_writable_data(&mod->init_layout, set_memory_nx);
@@ -1922,6 +1938,7 @@ static void module_enable_nx(const struct module *mod)
static void module_disable_nx(const struct module *mod)
{
frob_rodata(&mod->core_layout, set_memory_x);
+ frob_ro_after_init(&mod->core_layout, set_memory_x);
frob_writable_data(&mod->core_layout, set_memory_x);
frob_rodata(&mod->init_layout, set_memory_x);
frob_writable_data(&mod->init_layout, set_memory_x);
@@ -1964,6 +1981,8 @@ static void disable_ro_nx(const struct module_layout *layout)
frob_text(layout, set_memory_rw);
frob_rodata(layout, set_memory_rw);
frob_rodata(layout, set_memory_x);
+ frob_ro_after_init(layout, set_memory_rw);
+ frob_ro_after_init(layout, set_memory_x);
frob_writable_data(layout, set_memory_x);
}
@@ -2306,6 +2325,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
* finder in the two loops below */
{ SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL },
{ SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL },
+ { SHF_RO_AFTER_INIT | SHF_ALLOC, ARCH_SHF_SMALL },
{ SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL },
{ ARCH_SHF_SMALL | SHF_ALLOC, 0 }
};
@@ -2337,7 +2357,11 @@ static void layout_sections(struct module *mod, struct load_info *info)
mod->core_layout.size = debug_align(mod->core_layout.size);
mod->core_layout.ro_size = mod->core_layout.size;
break;
- case 3: /* whole core */
+ case 2: /* RO after init */
+ mod->core_layout.size = debug_align(mod->core_layout.size);
+ mod->core_layout.ro_after_init_size = mod->core_layout.size;
+ break;
+ case 4: /* whole core */
mod->core_layout.size = debug_align(mod->core_layout.size);
break;
}
@@ -2367,7 +2391,14 @@ static void layout_sections(struct module *mod, struct load_info *info)
mod->init_layout.size = debug_align(mod->init_layout.size);
mod->init_layout.ro_size = mod->init_layout.size;
break;
- case 3: /* whole init */
+ case 2:
+ /*
+ * RO after init doesn't apply to init_layout (only
+ * core_layout), so it just takes the value of ro_size.
+ */
+ mod->init_layout.ro_after_init_size = mod->init_layout.ro_size;
+ break;
+ case 4: /* whole init */
mod->init_layout.size = debug_align(mod->init_layout.size);
break;
}
@@ -2687,13 +2718,18 @@ static inline void kmemleak_load_module(const struct module *mod,
#endif
#ifdef CONFIG_MODULE_SIG
-static int module_sig_check(struct load_info *info)
+static int module_sig_check(struct load_info *info, int flags)
{
int err = -ENOKEY;
const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
const void *mod = info->hdr;
- if (info->len > markerlen &&
+ /*
+ * Require flags == 0, as a module with version information
+ * removed is no longer the module that was signed
+ */
+ if (flags == 0 &&
+ info->len > markerlen &&
memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
/* We truncate the module to discard the signature */
info->len -= markerlen;
@@ -2712,7 +2748,7 @@ static int module_sig_check(struct load_info *info)
return err;
}
#else /* !CONFIG_MODULE_SIG */
-static int module_sig_check(struct load_info *info)
+static int module_sig_check(struct load_info *info, int flags)
{
return 0;
}
@@ -2920,8 +2956,12 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
return -ENOEXEC;
}
- if (!get_modinfo(info, "intree"))
+ if (!get_modinfo(info, "intree")) {
+ if (!test_taint(TAINT_OOT_MODULE))
+ pr_warn("%s: loading out-of-tree module taints kernel.\n",
+ mod->name);
add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
+ }
if (get_modinfo(info, "staging")) {
add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
@@ -3090,6 +3130,8 @@ static int move_module(struct module *mod, struct load_info *info)
static int check_module_license_and_versions(struct module *mod)
{
+ int prev_taint = test_taint(TAINT_PROPRIETARY_MODULE);
+
/*
* ndiswrapper is under GPL by itself, but loads proprietary modules.
* Don't use add_taint_module(), as it would prevent ndiswrapper from
@@ -3108,6 +3150,9 @@ static int check_module_license_and_versions(struct module *mod)
add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
LOCKDEP_NOW_UNRELIABLE);
+ if (!prev_taint && test_taint(TAINT_PROPRIETARY_MODULE))
+ pr_warn("%s: module license taints kernel.\n", mod->name);
+
#ifdef CONFIG_MODVERSIONS
if ((mod->num_syms && !mod->crcs)
|| (mod->num_gpl_syms && !mod->gpl_crcs)
@@ -3155,16 +3200,41 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
return 0;
}
+/* module_blacklist is a comma-separated list of module names */
+static char *module_blacklist;
+static bool blacklisted(char *module_name)
+{
+ const char *p;
+ size_t len;
+
+ if (!module_blacklist)
+ return false;
+
+ for (p = module_blacklist; *p; p += len) {
+ len = strcspn(p, ",");
+ if (strlen(module_name) == len && !memcmp(module_name, p, len))
+ return true;
+ if (p[len] == ',')
+ len++;
+ }
+ return false;
+}
+core_param(module_blacklist, module_blacklist, charp, 0400);
+
static struct module *layout_and_allocate(struct load_info *info, int flags)
{
/* Module within temporary copy. */
struct module *mod;
+ unsigned int ndx;
int err;
mod = setup_load_info(info, flags);
if (IS_ERR(mod))
return mod;
+ if (blacklisted(mod->name))
+ return ERR_PTR(-EPERM);
+
err = check_modinfo(mod, info, flags);
if (err)
return ERR_PTR(err);
@@ -3178,6 +3248,15 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
/* We will do a special allocation for per-cpu sections later. */
info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;
+ /*
+ * Mark ro_after_init section with SHF_RO_AFTER_INIT so that
+ * layout_sections() can put it in the right place.
+ * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set.
+ */
+ ndx = find_sec(info, ".data..ro_after_init");
+ if (ndx)
+ info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
+
/* Determine total sizes, and put offsets in sh_entsize. For now
this is done generically; there doesn't appear to be any
special cases for the architectures. */
@@ -3344,12 +3423,14 @@ static noinline int do_init_module(struct module *mod)
/* Switch to core kallsyms now init is done: kallsyms may be walking! */
rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);
#endif
+ module_enable_ro(mod, true);
mod_tree_remove_init(mod);
disable_ro_nx(&mod->init_layout);
module_arch_freeing_init(mod);
mod->init_layout.base = NULL;
mod->init_layout.size = 0;
mod->init_layout.ro_size = 0;
+ mod->init_layout.ro_after_init_size = 0;
mod->init_layout.text_size = 0;
/*
* We want to free module_init, but be aware that kallsyms may be
@@ -3441,8 +3522,7 @@ static int complete_formation(struct module *mod, struct load_info *info)
/* This relies on module_mutex for list integrity. */
module_bug_finalize(info->hdr, info->sechdrs, mod);
- /* Set RO and NX regions */
- module_enable_ro(mod);
+ module_enable_ro(mod, false);
module_enable_nx(mod);
/* Mark state as coming so strong_try_module_get() ignores us,
@@ -3498,7 +3578,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
long err;
char *after_dashes;
- err = module_sig_check(info);
+ err = module_sig_check(info, flags);
if (err)
goto free_copy;
diff --git a/kernel/panic.c b/kernel/panic.c
index 8aa74497cc5a..ca8cea1ef673 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -108,6 +108,7 @@ void panic(const char *fmt, ...)
long i, i_next = 0;
int state = 0;
int old_cpu, this_cpu;
+ bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers;
/*
* Disable local interrupts. This will prevent panic_smp_self_stop
@@ -160,7 +161,7 @@ void panic(const char *fmt, ...)
*
* Bypass the panic_cpu check and call __crash_kexec directly.
*/
- if (!crash_kexec_post_notifiers) {
+ if (!_crash_kexec_post_notifiers) {
printk_nmi_flush_on_panic();
__crash_kexec(NULL);
}
@@ -191,7 +192,7 @@ void panic(const char *fmt, ...)
*
* Bypass the panic_cpu check and call __crash_kexec directly.
*/
- if (crash_kexec_post_notifiers)
+ if (_crash_kexec_post_notifiers)
__crash_kexec(NULL);
bust_spinlocks(0);
@@ -571,13 +572,7 @@ EXPORT_SYMBOL(__stack_chk_fail);
core_param(panic, panic_timeout, int, 0644);
core_param(pause_on_oops, pause_on_oops, int, 0644);
core_param(panic_on_warn, panic_on_warn, int, 0644);
-
-static int __init setup_crash_kexec_post_notifiers(char *s)
-{
- crash_kexec_post_notifiers = true;
- return 0;
-}
-early_param("crash_kexec_post_notifiers", setup_crash_kexec_post_notifiers);
+core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644);
static int __init oops_setup(char *s)
{
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index a881c6a7ba74..33c79b6105c5 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -300,12 +300,12 @@ static int create_image(int platform_mode)
save_processor_state();
trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true);
error = swsusp_arch_suspend();
+ /* Restore control flow magically appears here */
+ restore_processor_state();
trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false);
if (error)
printk(KERN_ERR "PM: Error %d creating hibernation image\n",
error);
- /* Restore control flow magically appears here */
- restore_processor_state();
if (!in_suspend)
events_check_enabled = false;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 9a0178c2ac1d..b02228411d57 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -835,9 +835,9 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
*/
static bool rtree_next_node(struct memory_bitmap *bm)
{
- bm->cur.node = list_entry(bm->cur.node->list.next,
- struct rtree_node, list);
- if (&bm->cur.node->list != &bm->cur.zone->leaves) {
+ if (!list_is_last(&bm->cur.node->list, &bm->cur.zone->leaves)) {
+ bm->cur.node = list_entry(bm->cur.node->list.next,
+ struct rtree_node, list);
bm->cur.node_pfn += BM_BITS_PER_BLOCK;
bm->cur.node_bit = 0;
touch_softlockup_watchdog();
@@ -845,9 +845,9 @@ static bool rtree_next_node(struct memory_bitmap *bm)
}
/* No more nodes, goto next zone */
- bm->cur.zone = list_entry(bm->cur.zone->list.next,
+ if (!list_is_last(&bm->cur.zone->list, &bm->zones)) {
+ bm->cur.zone = list_entry(bm->cur.zone->list.next,
struct mem_zone_bm_rtree, list);
- if (&bm->cur.zone->list != &bm->zones) {
bm->cur.node = list_entry(bm->cur.zone->leaves.next,
struct rtree_node, list);
bm->cur.node_pfn = 0;
diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c
index 276762f3a460..d5760c42f042 100644
--- a/kernel/printk/braille.c
+++ b/kernel/printk/braille.c
@@ -9,10 +9,10 @@
char *_braille_console_setup(char **str, char **brl_options)
{
- if (!memcmp(*str, "brl,", 4)) {
+ if (!strncmp(*str, "brl,", 4)) {
*brl_options = "";
*str += 4;
- } else if (!memcmp(str, "brl=", 4)) {
+ } else if (!strncmp(*str, "brl=", 4)) {
*brl_options = *str + 4;
*str = strchr(*brl_options, ',');
if (!*str)
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index d4de33934dac..eea6dbc2d8cf 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -26,7 +26,6 @@
#include <linux/nmi.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
-#include <linux/interrupt.h> /* For in_interrupt() */
#include <linux/delay.h>
#include <linux/smp.h>
#include <linux/security.h>
@@ -48,7 +47,7 @@
#include <linux/uio.h>
#include <asm/uaccess.h>
-#include <asm-generic/sections.h>
+#include <asm/sections.h>
#define CREATE_TRACE_POINTS
#include <trace/events/printk.h>
@@ -86,6 +85,111 @@ static struct lockdep_map console_lock_dep_map = {
};
#endif
+enum devkmsg_log_bits {
+ __DEVKMSG_LOG_BIT_ON = 0,
+ __DEVKMSG_LOG_BIT_OFF,
+ __DEVKMSG_LOG_BIT_LOCK,
+};
+
+enum devkmsg_log_masks {
+ DEVKMSG_LOG_MASK_ON = BIT(__DEVKMSG_LOG_BIT_ON),
+ DEVKMSG_LOG_MASK_OFF = BIT(__DEVKMSG_LOG_BIT_OFF),
+ DEVKMSG_LOG_MASK_LOCK = BIT(__DEVKMSG_LOG_BIT_LOCK),
+};
+
+/* Keep both the 'on' and 'off' bits clear, i.e. ratelimit by default: */
+#define DEVKMSG_LOG_MASK_DEFAULT 0
+
+static unsigned int __read_mostly devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT;
+
+static int __control_devkmsg(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strncmp(str, "on", 2)) {
+ devkmsg_log = DEVKMSG_LOG_MASK_ON;
+ return 2;
+ } else if (!strncmp(str, "off", 3)) {
+ devkmsg_log = DEVKMSG_LOG_MASK_OFF;
+ return 3;
+ } else if (!strncmp(str, "ratelimit", 9)) {
+ devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT;
+ return 9;
+ }
+ return -EINVAL;
+}
+
+static int __init control_devkmsg(char *str)
+{
+ if (__control_devkmsg(str) < 0)
+ return 1;
+
+ /*
+ * Set sysctl string accordingly:
+ */
+ if (devkmsg_log == DEVKMSG_LOG_MASK_ON) {
+ memset(devkmsg_log_str, 0, DEVKMSG_STR_MAX_SIZE);
+ strncpy(devkmsg_log_str, "on", 2);
+ } else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF) {
+ memset(devkmsg_log_str, 0, DEVKMSG_STR_MAX_SIZE);
+ strncpy(devkmsg_log_str, "off", 3);
+ }
+ /* else "ratelimit" which is set by default. */
+
+ /*
+ * Sysctl cannot change it anymore. The kernel command line setting of
+ * this parameter is to force the setting to be permanent throughout the
+ * runtime of the system. This is a precation measure against userspace
+ * trying to be a smarta** and attempting to change it up on us.
+ */
+ devkmsg_log |= DEVKMSG_LOG_MASK_LOCK;
+
+ return 0;
+}
+__setup("printk.devkmsg=", control_devkmsg);
+
+char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit";
+
+int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ char old_str[DEVKMSG_STR_MAX_SIZE];
+ unsigned int old;
+ int err;
+
+ if (write) {
+ if (devkmsg_log & DEVKMSG_LOG_MASK_LOCK)
+ return -EINVAL;
+
+ old = devkmsg_log;
+ strncpy(old_str, devkmsg_log_str, DEVKMSG_STR_MAX_SIZE);
+ }
+
+ err = proc_dostring(table, write, buffer, lenp, ppos);
+ if (err)
+ return err;
+
+ if (write) {
+ err = __control_devkmsg(devkmsg_log_str);
+
+ /*
+ * Do not accept an unknown string OR a known string with
+ * trailing crap...
+ */
+ if (err < 0 || (err + 1 != *lenp)) {
+
+ /* ... and restore old setting. */
+ devkmsg_log = old;
+ strncpy(devkmsg_log_str, old_str, DEVKMSG_STR_MAX_SIZE);
+
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
/*
* Number of registered extended console drivers.
*
@@ -614,6 +718,7 @@ struct devkmsg_user {
u64 seq;
u32 idx;
enum log_flags prev;
+ struct ratelimit_state rs;
struct mutex lock;
char buf[CONSOLE_EXT_LOG_MAX];
};
@@ -623,11 +728,24 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
char *buf, *line;
int level = default_message_loglevel;
int facility = 1; /* LOG_USER */
+ struct file *file = iocb->ki_filp;
+ struct devkmsg_user *user = file->private_data;
size_t len = iov_iter_count(from);
ssize_t ret = len;
- if (len > LOG_LINE_MAX)
+ if (!user || len > LOG_LINE_MAX)
return -EINVAL;
+
+ /* Ignore when user logging is disabled. */
+ if (devkmsg_log & DEVKMSG_LOG_MASK_OFF)
+ return len;
+
+ /* Ratelimit when not explicitly enabled. */
+ if (!(devkmsg_log & DEVKMSG_LOG_MASK_ON)) {
+ if (!___ratelimit(&user->rs, current->comm))
+ return ret;
+ }
+
buf = kmalloc(len+1, GFP_KERNEL);
if (buf == NULL)
return -ENOMEM;
@@ -800,19 +918,24 @@ static int devkmsg_open(struct inode *inode, struct file *file)
struct devkmsg_user *user;
int err;
- /* write-only does not need any file context */
- if ((file->f_flags & O_ACCMODE) == O_WRONLY)
- return 0;
+ if (devkmsg_log & DEVKMSG_LOG_MASK_OFF)
+ return -EPERM;
- err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
- SYSLOG_FROM_READER);
- if (err)
- return err;
+ /* write-only does not need any file context */
+ if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
+ err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
+ SYSLOG_FROM_READER);
+ if (err)
+ return err;
+ }
user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL);
if (!user)
return -ENOMEM;
+ ratelimit_default_init(&user->rs);
+ ratelimit_set_flags(&user->rs, RATELIMIT_MSG_ON_RELEASE);
+
mutex_init(&user->lock);
raw_spin_lock_irq(&logbuf_lock);
@@ -831,6 +954,8 @@ static int devkmsg_release(struct inode *inode, struct file *file)
if (!user)
return 0;
+ ratelimit_state_exit(&user->rs);
+
mutex_destroy(&user->lock);
kfree(user);
return 0;
@@ -986,6 +1111,11 @@ module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(ignore_loglevel,
"ignore loglevel setting (prints all kernel messages to the console)");
+static bool suppress_message_printing(int level)
+{
+ return (level >= console_loglevel && !ignore_loglevel);
+}
+
#ifdef CONFIG_BOOT_PRINTK_DELAY
static int boot_delay; /* msecs delay after each printk during bootup */
@@ -1015,7 +1145,7 @@ static void boot_delay_msec(int level)
unsigned long timeout;
if ((boot_delay == 0 || system_state != SYSTEM_BOOTING)
- || (level >= console_loglevel && !ignore_loglevel)) {
+ || suppress_message_printing(level)) {
return;
}
@@ -1439,8 +1569,6 @@ static void call_console_drivers(int level,
trace_console(text, len);
- if (level >= console_loglevel && !ignore_loglevel)
- return;
if (!console_drivers)
return;
@@ -1888,6 +2016,7 @@ static void call_console_drivers(int level,
static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
bool syslog, char *buf, size_t size) { return 0; }
static size_t cont_print_text(char *text, size_t size) { return 0; }
+static bool suppress_message_printing(int level) { return false; }
/* Still needs to be defined for users */
DEFINE_PER_CPU(printk_func_t, printk_func);
@@ -2167,6 +2296,13 @@ static void console_cont_flush(char *text, size_t size)
if (!cont.len)
goto out;
+ if (suppress_message_printing(cont.level)) {
+ cont.cons = cont.len;
+ if (cont.flushed)
+ cont.len = 0;
+ goto out;
+ }
+
/*
* We still queue earlier records, likely because the console was
* busy. The earlier ones need to be printed before this one, we
@@ -2270,10 +2406,13 @@ skip:
break;
msg = log_from_idx(console_idx);
- if (msg->flags & LOG_NOCONS) {
+ level = msg->level;
+ if ((msg->flags & LOG_NOCONS) ||
+ suppress_message_printing(level)) {
/*
* Skip record we have buffered and already printed
- * directly to the console when we received it.
+ * directly to the console when we received it, and
+ * record that has level above the console loglevel.
*/
console_idx = log_next(console_idx);
console_seq++;
@@ -2287,7 +2426,6 @@ skip:
goto skip;
}
- level = msg->level;
len += msg_print_text(msg, console_prev, false,
text + len, sizeof(text) - len);
if (nr_ext_console_drivers) {
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index d49bfa1e53e6..1d3b7665d0be 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -585,8 +585,8 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data)
return -EINVAL;
if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) {
- if (!config_enabled(CONFIG_CHECKPOINT_RESTORE) ||
- !config_enabled(CONFIG_SECCOMP))
+ if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) ||
+ !IS_ENABLED(CONFIG_SECCOMP))
return -EINVAL;
if (!capable(CAP_SYS_ADMIN))
diff --git a/kernel/relay.c b/kernel/relay.c
index 04d7cf3ef8cf..d797502140b9 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -451,6 +451,13 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
if (!dentry)
goto free_buf;
relay_set_buf_dentry(buf, dentry);
+ } else {
+ /* Only retrieve global info, nothing more, nothing less */
+ dentry = chan->cb->create_buf_file(NULL, NULL,
+ S_IRUSR, buf,
+ &chan->is_global);
+ if (WARN_ON(dentry))
+ goto free_buf;
}
buf->cpu = cpu;
@@ -562,6 +569,10 @@ static int relay_hotcpu_callback(struct notifier_block *nb,
* attributes specified. The created channel buffer files
* will be named base_filename0...base_filenameN-1. File
* permissions will be %S_IRUSR.
+ *
+ * If opening a buffer (@parent = NULL) that you later wish to register
+ * in a filesystem, call relay_late_setup_files() once the @parent dentry
+ * is available.
*/
struct rchan *relay_open(const char *base_filename,
struct dentry *parent,
@@ -640,8 +651,12 @@ static void __relay_set_buf_dentry(void *info)
*
* Returns 0 if successful, non-zero otherwise.
*
- * Use to setup files for a previously buffer-only channel.
- * Useful to do early tracing in kernel, before VFS is up, for example.
+ * Use to setup files for a previously buffer-only channel created
+ * by relay_open() with a NULL parent dentry.
+ *
+ * For example, this is useful for perfomring early tracing in kernel,
+ * before VFS is up and then exposing the early results once the dentry
+ * is available.
*/
int relay_late_setup_files(struct rchan *chan,
const char *base_filename,
@@ -666,6 +681,20 @@ int relay_late_setup_files(struct rchan *chan,
}
chan->has_base_filename = 1;
chan->parent = parent;
+
+ if (chan->is_global) {
+ err = -EINVAL;
+ if (!WARN_ON_ONCE(!chan->buf[0])) {
+ dentry = relay_create_buf_file(chan, chan->buf[0], 0);
+ if (dentry && !WARN_ON_ONCE(!chan->is_global)) {
+ relay_set_buf_dentry(chan->buf[0], dentry);
+ err = 0;
+ }
+ }
+ mutex_unlock(&relay_channels_mutex);
+ return err;
+ }
+
curr_cpu = get_cpu();
/*
* The CPU hotplug notifier ran before us and created buffers with
@@ -706,6 +735,7 @@ int relay_late_setup_files(struct rchan *chan,
return err;
}
+EXPORT_SYMBOL_GPL(relay_late_setup_files);
/**
* relay_switch_subbuf - switch to a new sub-buffer
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5c883fe8e440..2a906f20fba7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -74,6 +74,7 @@
#include <linux/context_tracking.h>
#include <linux/compiler.h>
#include <linux/frame.h>
+#include <linux/prefetch.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
@@ -2972,6 +2973,23 @@ EXPORT_PER_CPU_SYMBOL(kstat);
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
/*
+ * The function fair_sched_class.update_curr accesses the struct curr
+ * and its field curr->exec_start; when called from task_sched_runtime(),
+ * we observe a high rate of cache misses in practice.
+ * Prefetching this data results in improved performance.
+ */
+static inline void prefetch_curr_exec_start(struct task_struct *p)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct sched_entity *curr = (&p->se)->cfs_rq->curr;
+#else
+ struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
+#endif
+ prefetch(curr);
+ prefetch(&curr->exec_start);
+}
+
+/*
* Return accounted runtime for the task.
* In case the task is currently running, return the runtime plus current's
* pending runtime that have not been accounted yet.
@@ -3005,6 +3023,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
* thread, breaking clock_gettime().
*/
if (task_current(rq, p) && task_on_rq_queued(p)) {
+ prefetch_curr_exec_start(p);
update_rq_clock(rq);
p->sched_class->update_curr(rq);
}
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5be58820465c..d4184498c9f5 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -168,7 +168,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
if (old_idx == IDX_INVALID) {
cp->size++;
- cp->elements[cp->size - 1].dl = 0;
+ cp->elements[cp->size - 1].dl = dl;
cp->elements[cp->size - 1].cpu = cpu;
cp->elements[cpu].idx = cp->size - 1;
cpudl_change_key(cp, cp->size - 1, dl);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 1934f658c036..a846cf89eb96 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -263,6 +263,11 @@ void account_idle_time(cputime_t cputime)
cpustat[CPUTIME_IDLE] += (__force u64) cputime;
}
+/*
+ * When a guest is interrupted for a longer amount of time, missed clock
+ * ticks are not redelivered later. Due to that, this function may on
+ * occasion account more time than the calling functions think elapsed.
+ */
static __always_inline cputime_t steal_account_process_time(cputime_t maxtime)
{
#ifdef CONFIG_PARAVIRT
@@ -371,7 +376,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
* idle, or potentially user or system time. Due to rounding,
* other time can exceed ticks occasionally.
*/
- other = account_other_time(cputime);
+ other = account_other_time(ULONG_MAX);
if (other >= cputime)
return;
cputime -= other;
@@ -486,7 +491,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
}
cputime = cputime_one_jiffy;
- steal = steal_account_process_time(cputime);
+ steal = steal_account_process_time(ULONG_MAX);
if (steal >= cputime)
return;
@@ -508,13 +513,21 @@ void account_process_tick(struct task_struct *p, int user_tick)
*/
void account_idle_ticks(unsigned long ticks)
{
+ cputime_t cputime, steal;
if (sched_clock_irqtime) {
irqtime_account_idle_ticks(ticks);
return;
}
- account_idle_time(jiffies_to_cputime(ticks));
+ cputime = jiffies_to_cputime(ticks);
+ steal = steal_account_process_time(ULONG_MAX);
+
+ if (steal >= cputime)
+ return;
+
+ cputime -= steal;
+ account_idle_time(cputime);
}
/*
@@ -606,19 +619,25 @@ static void cputime_adjust(struct task_cputime *curr,
stime = curr->stime;
utime = curr->utime;
- if (utime == 0) {
- stime = rtime;
+ /*
+ * If either stime or both stime and utime are 0, assume all runtime is
+ * userspace. Once a task gets some ticks, the monotonicy code at
+ * 'update' will ensure things converge to the observed ratio.
+ */
+ if (stime == 0) {
+ utime = rtime;
goto update;
}
- if (stime == 0) {
- utime = rtime;
+ if (utime == 0) {
+ stime = rtime;
goto update;
}
stime = scale_stime((__force u64)stime, (__force u64)rtime,
(__force u64)(stime + utime));
+update:
/*
* Make sure stime doesn't go backwards; this preserves monotonicity
* for utime because rtime is monotonic.
@@ -641,7 +660,6 @@ static void cputime_adjust(struct task_cputime *curr,
stime = rtime - utime;
}
-update:
prev->stime = stime;
prev->utime = utime;
out:
@@ -686,6 +704,13 @@ static cputime_t get_vtime_delta(struct task_struct *tsk)
unsigned long now = READ_ONCE(jiffies);
cputime_t delta, other;
+ /*
+ * Unlike tick based timing, vtime based timing never has lost
+ * ticks, and no need for steal time accounting to make up for
+ * lost ticks. Vtime accounts a rounded version of actual
+ * elapsed time. Limit account_other_time to prevent rounding
+ * errors from causing elapsed vtime to go negative.
+ */
delta = jiffies_to_cputime(now - tsk->vtime_snap);
other = account_other_time(delta);
WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fcb7f0217ff4..1ce8867283dc 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -658,8 +658,11 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
*
* XXX figure out if select_task_rq_dl() deals with offline cpus.
*/
- if (unlikely(!rq->online))
+ if (unlikely(!rq->online)) {
+ lockdep_unpin_lock(&rq->lock, rf.cookie);
rq = dl_task_offline_migration(rq, p);
+ rf.cookie = lockdep_pin_lock(&rq->lock);
+ }
/*
* Queueing this task back might have overloaded rq, check if we need
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4088eedea763..039de34f1521 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4269,7 +4269,7 @@ static void sync_throttle(struct task_group *tg, int cpu)
pcfs_rq = tg->parent->cfs_rq[cpu];
cfs_rq->throttle_count = pcfs_rq->throttle_count;
- pcfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
+ cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
}
/* conditionally throttle active cfs_rq's from put_prev_entity() */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 54d15eb2b701..ef6c6c3f9d8a 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -347,7 +347,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
{
struct seccomp_filter *sfilter;
int ret;
- const bool save_orig = config_enabled(CONFIG_CHECKPOINT_RESTORE);
+ const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE);
if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
return ERR_PTR(-EINVAL);
@@ -542,7 +542,7 @@ void secure_computing_strict(int this_syscall)
{
int mode = current->seccomp.mode;
- if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
+ if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
return;
@@ -655,7 +655,7 @@ int __secure_computing(const struct seccomp_data *sd)
int mode = current->seccomp.mode;
int this_syscall;
- if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
+ if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
return 0;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 53954631a4e1..a13bbdaab47d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -814,6 +814,13 @@ static struct ctl_table kern_table[] = {
.extra2 = &ten_thousand,
},
{
+ .procname = "printk_devkmsg",
+ .data = devkmsg_log_str,
+ .maxlen = DEVKMSG_STR_MAX_SIZE,
+ .mode = 0644,
+ .proc_handler = devkmsg_sysctl_set_loglvl,
+ },
+ {
.procname = "dmesg_restrict",
.data = &dmesg_restrict,
.maxlen = sizeof(int),
@@ -2133,6 +2140,21 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
return 0;
}
+static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ if (write) {
+ if (*negp)
+ return -EINVAL;
+ *valp = *lvalp;
+ } else {
+ unsigned int val = *valp;
+ *lvalp = (unsigned long)val;
+ }
+ return 0;
+}
+
static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
@@ -2252,8 +2274,27 @@ static int do_proc_dointvec(struct ctl_table *table, int write,
int proc_dointvec(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
- return do_proc_dointvec(table,write,buffer,lenp,ppos,
- NULL,NULL);
+ return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL);
+}
+
+/**
+ * proc_douintvec - read a vector of unsigned integers
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * Returns 0 on success.
+ */
+int proc_douintvec(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table, write, buffer, lenp, ppos,
+ do_proc_douintvec_conv, NULL);
}
/*
@@ -2851,6 +2892,12 @@ int proc_dointvec(struct ctl_table *table, int write,
return -ENOSYS;
}
+int proc_douintvec(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
int proc_dointvec_minmax(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -2896,6 +2943,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
* exception granted :-)
*/
EXPORT_SYMBOL(proc_dointvec);
+EXPORT_SYMBOL(proc_douintvec);
EXPORT_SYMBOL(proc_dointvec_jiffies);
EXPORT_SYMBOL(proc_dointvec_minmax);
EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 6ab4842b00e8..d513051fcca2 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -29,7 +29,7 @@ task_work_add(struct task_struct *task, struct callback_head *work, bool notify)
struct callback_head *head;
do {
- head = ACCESS_ONCE(task->task_works);
+ head = READ_ONCE(task->task_works);
if (unlikely(head == &work_exited))
return -ESRCH;
work->next = head;
@@ -57,6 +57,9 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
struct callback_head **pprev = &task->task_works;
struct callback_head *work;
unsigned long flags;
+
+ if (likely(!task->task_works))
+ return NULL;
/*
* If cmpxchg() fails we continue without updating pprev.
* Either we raced with task_work_add() which added the
@@ -64,8 +67,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
* we raced with task_work_run(), *pprev == NULL/exited.
*/
raw_spin_lock_irqsave(&task->pi_lock, flags);
- while ((work = ACCESS_ONCE(*pprev))) {
- smp_read_barrier_depends();
+ while ((work = lockless_dereference(*pprev))) {
if (work->func != func)
pprev = &work->next;
else if (cmpxchg(pprev, work, work->next) == work)
@@ -95,7 +97,7 @@ void task_work_run(void)
* work_exited unless the list is empty.
*/
do {
- work = ACCESS_ONCE(task->task_works);
+ work = READ_ONCE(task->task_works);
head = !work && (task->flags & PF_EXITING) ?
&work_exited : NULL;
} while (cmpxchg(&task->task_works, work, head) != work);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 3b65746c7f15..e07fb093f819 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -401,7 +401,10 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
do {
seq = raw_read_seqcount_latch(&tkf->seq);
tkr = tkf->base + (seq & 0x01);
- now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr);
+ now = ktime_to_ns(tkr->base);
+
+ now += clocksource_delta(tkr->read(tkr->clock),
+ tkr->cycle_last, tkr->mask);
} while (read_seqcount_retry(&tkf->seq, seq));
return now;
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index f6bd65236712..107310a6f36f 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -23,7 +23,9 @@
#include "timekeeping_internal.h"
-static unsigned int sleep_time_bin[32] = {0};
+#define NUM_BINS 32
+
+static unsigned int sleep_time_bin[NUM_BINS] = {0};
static int tk_debug_show_sleep_time(struct seq_file *s, void *data)
{
@@ -69,6 +71,9 @@ late_initcall(tk_debug_sleep_time_init);
void tk_debug_account_sleep_time(struct timespec64 *t)
{
- sleep_time_bin[fls(t->tv_sec)]++;
+ /* Cap bin index so we don't overflow the array */
+ int bin = min(fls(t->tv_sec), NUM_BINS-1);
+
+ sleep_time_bin[bin]++;
}
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 555670a5143c..32bf6f75a8fe 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1496,6 +1496,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
u64 expires = KTIME_MAX;
unsigned long nextevt;
+ bool is_max_delta;
/*
* Pretend that there is no timer pending if the cpu is offline.
@@ -1506,6 +1507,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
spin_lock(&base->lock);
nextevt = __next_timer_interrupt(base);
+ is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
base->next_expiry = nextevt;
/*
* We have a fresh next event. Check whether we can forward the base:
@@ -1519,7 +1521,8 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
expires = basem;
base->is_idle = false;
} else {
- expires = basem + (nextevt - basej) * TICK_NSEC;
+ if (!is_max_delta)
+ expires = basem + (nextevt - basej) * TICK_NSEC;
/*
* If we expect to sleep more than a tick, mark the base idle:
*/
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 979e7bfbde7a..d0a1617b52b4 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -1,4 +1,8 @@
+# We are fully aware of the dangers of __builtin_return_address()
+FRAME_CFLAGS := $(call cc-disable-warning,frame-address)
+KBUILD_CFLAGS += $(FRAME_CFLAGS)
+
# Do not instrument the tracer itself:
ifdef CONFIG_FUNCTION_TRACER
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index fb345cd11883..dbafc5df03f3 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -223,7 +223,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
what |= MASK_TC_BIT(op_flags, META);
what |= MASK_TC_BIT(op_flags, PREFLUSH);
what |= MASK_TC_BIT(op_flags, FUA);
- if (op == REQ_OP_DISCARD)
+ if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)
what |= BLK_TC_ACT(BLK_TC_DISCARD);
if (op == REQ_OP_FLUSH)
what |= BLK_TC_ACT(BLK_TC_FLUSH);
@@ -776,7 +776,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
return;
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio_op(bio), bio->bi_rw, what, error, 0, NULL);
+ bio_op(bio), bio->bi_opf, what, error, 0, NULL);
}
static void blk_add_trace_bio_bounce(void *ignore,
@@ -881,7 +881,7 @@ static void blk_add_trace_split(void *ignore,
__be64 rpdu = cpu_to_be64(pdu);
__blk_add_trace(bt, bio->bi_iter.bi_sector,
- bio->bi_iter.bi_size, bio_op(bio), bio->bi_rw,
+ bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf,
BLK_TA_SPLIT, bio->bi_error, sizeof(rpdu),
&rpdu);
}
@@ -915,7 +915,7 @@ static void blk_add_trace_bio_remap(void *ignore,
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio_op(bio), bio->bi_rw, BLK_TA_REMAP, bio->bi_error,
+ bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_error,
sizeof(r), &r);
}
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 0c05b8a99806..f3a960ed75a1 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1441,6 +1441,9 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
goto out;
}
+ if (hist_data->attrs->pause)
+ data->paused = true;
+
if (named_data) {
destroy_hist_data(data->private_data);
data->private_data = named_data->private_data;
@@ -1448,9 +1451,6 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
data->ops = &event_hist_trigger_named_ops;
}
- if (hist_data->attrs->pause)
- data->paused = true;
-
if (data->ops->init) {
ret = data->ops->init(data->ops, data);
if (ret < 0)
@@ -1500,9 +1500,9 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
static void hist_unreg_all(struct trace_event_file *file)
{
- struct event_trigger_data *test;
+ struct event_trigger_data *test, *n;
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ list_for_each_entry_safe(test, n, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
list_del_rcu(&test->list);
trace_event_trigger_enable_disable(file, 0);
@@ -1699,9 +1699,9 @@ hist_enable_get_trigger_ops(char *cmd, char *param)
static void hist_enable_unreg_all(struct trace_event_file *file)
{
- struct event_trigger_data *test;
+ struct event_trigger_data *test, *n;
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ list_for_each_entry_safe(test, n, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_HIST_ENABLE) {
list_del_rcu(&test->list);
update_cond_flag(file);